In [1]:
import glob
import pandas as pd
from pathlib import Path

train = glob.glob('data/dataTrainComplete/*')
train = [Path(file).read_text(encoding='utf-8') for file in train]
train_labels = pd.read_csv('data/TrainLabel.csv').values

test = glob.glob('data/dataPublicComplete/*')
test = [Path(file).read_text(encoding='utf-8') for file in test]

In [2]:
# TODO: 
# 1. extract meaningful content from train data (most relevant 254 words, because max_length=512, [CLS]x1, [SEP]x2, 2 documents)
# 2. concat documents and provide labels ([doc1 [SEP] doc2])

dummy_labels = [len(text)%2 for text in train]
train_labels = dummy_labels

In [3]:
from datasets import load_dataset, Dataset

# dataset = load_dataset('text', data_files={'train': train, 'test': test})
dataset = Dataset.from_dict({'text': train, 'labels': train_labels})
dataset_test = Dataset.from_dict({'text': test})
dataset[0]

{'text': '梅雨季來臨，文旦黑點病易發生，請注意病徵，以及早加強防治措施。\n5月已進入梅雨季節，近日連續降雨，為文旦黑點病開始感染的時機，往年文旦在經過4-6月的春雨及梅雨季後，原來長得亮麗的果實外表，會開始出現許多小黑點，現在文旦已開始進入中果期，花蓮區農業改良場呼籲應注意防治。\n除冬季清園作業外，在4-8月時應每月施用一次56%貝芬硫\x7f可濕性粉劑800倍、或22.7%\x7f硫\x7f水懸劑1000倍、或80%鋅錳乃浦可濕性粉劑500倍、或33%鋅錳乃浦水懸劑500倍等政府核准登記使用之藥劑防治，並依登記使用方法使用，尤其雨前及雨後要特別加強防治，若遇連續降雨時則可利用間歇時分區進行施藥以即時達到防治效果。\n',
 'labels': 0}

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

def preprocess_fn(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(preprocess_fn, batched=True)
tokenized_dataset_test = dataset_test.map(preprocess_fn, batched=True)

# train_dataset = tokenized_dataset.shuffle(seed=42)
# eval_dataset = tokenized_dataset.shuffle(seed=42).select(range(100))
train_dataset, eval_dataset = tokenized_dataset.train_test_split(test_size=0.2).values()
test_dataset = tokenized_dataset_test
# train_dataset['input_ids'][0][:10]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
#special tokens
tokenizer("[CLS][SEP][UNK][PAD][MASK]")

{'input_ids': [101, 101, 102, 100, 0, 103, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [6]:
import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('ckiplab/albert-tiny-chinese')
model

Some weights of the model checkpoint at ckiplab/albert-tiny-chinese were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.decoder.weight', 'predictions.bias', 'predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.bias', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ckiplab/albert-tiny-chinese and are newly initialized: ['albert.pooler.weight

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(21128, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=312, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=312, out_features=312, bias=True)
                (key): Linear(in_features=312, out_features=312, bias=True)
                (value): Linear(in_features=312, out_features=31

In [7]:
import numpy as np
from datasets import list_metrics, load_metric

# metric = load_metric("glue", "mrpc")
accuracy = load_metric("accuracy")
f1_score = load_metric("f1")
precision = load_metric("precision")
recall = load_metric("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1_score.compute(predictions=predictions, references=labels)["f1"],
        "precision": precision.compute(predictions=predictions, references=labels)["precision"],
        "recall": recall.compute(predictions=predictions, references=labels)["recall"]
    }

In [8]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=32,
    num_train_epochs=10,
    evaluation_strategy='steps', # epoch
    eval_steps=500,
    save_steps=500,
    logging_steps=500
)


trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics
)
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=test_trainer\runs\Dec05_01-09-21_DESKTOP-UUMLSE6,
logging_first_step=False,
logging_nan_inf_filter=True,
log

In [9]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 448
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 140


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=140, training_loss=0.6723639896937779, metrics={'train_runtime': 48.072, 'train_samples_per_second': 93.193, 'train_steps_per_second': 2.912, 'total_flos': 18042303283200.0, 'train_loss': 0.6723639896937779, 'epoch': 10.0})

In [10]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 112
  Batch size = 8


{'eval_loss': 0.7122237086296082,
 'eval_accuracy': 0.4732142857142857,
 'eval_f1': 0.42718446601941745,
 'eval_precision': 0.4782608695652174,
 'eval_recall': 0.38596491228070173,
 'eval_runtime': 0.5892,
 'eval_samples_per_second': 190.096,
 'eval_steps_per_second': 23.762,
 'epoch': 10.0}

In [11]:
logits = trainer.predict(test_dataset)[0]
predictions = np.argmax(logits, axis=-1)
predictions

The following columns in the test set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 421
  Batch size = 8


array([0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,

In [12]:
# LEGACY CODE: manual training using pytorch
# from tqdm import tqdm

# small_train_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
# dataloader = torch.utils.data.DataLoader(small_train_dataset, batch_size=32)

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model.train().to(device)
# optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)
# for epoch in range(3):
#     for i, batch in enumerate(tqdm(dataloader)):
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs[0]
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()
#         if i % 10 == 0:
#             print(f"loss: {loss}")