In [74]:
import pandas as pd
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import datasets
from datasets import load_dataset

In [75]:
config = AutoConfig.from_pretrained("emeraldgoose/bad-korean-tokenizer")
model = AutoModelForSequenceClassification.from_pretrained("beomi/KcELECTRA-base")
tokenizer = AutoTokenizer.from_pretrained('emeraldgoose/bad-korean-tokenizer')

loading configuration file https://huggingface.co/emeraldgoose/bad-korean-tokenizer/resolve/main/config.json from cache at /opt/ml/.cache/huggingface/transformers/440db990a6715f3d4c1f93091ed1da220bb5fc5c7c2f6e21adbb20adb249923b.2ccb1233a18c0cf90e39ee2d88f08019fe65c376fa142a098d3108cb5fab9d28
Model config ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.12.5"

In [76]:
train_df = pd.read_csv('./curse.csv', index_col=0)
# eval_df = pd.read_csv('./eval_data.csv', index_col=0)
# len(train_df), len(eval_df)
len(train_df)

1107

In [77]:
train_dataset = load_dataset('csv', data_files=['curse.csv'], delimiter=',')
# eval_dataset = load_dataset('csv', data_files=['eval_data.csv'], delimiter=',')

Using custom data configuration default-c5ea465d669f8c36
Reusing dataset csv (/opt/ml/.cache/huggingface/datasets/csv/default-c5ea465d669f8c36/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

In [78]:
# type(train_dataset), type(eval_dataset)
type(train_dataset)

datasets.dataset_dict.DatasetDict

In [79]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 1107
    })
})

In [80]:
# eval_dataset

In [81]:
train_dataset = train_dataset['train'].remove_columns('Unnamed: 0')
# eval_dataset = eval_dataset['train'].remove_columns('Unnamed: 0')

In [82]:
# type(train_dataset), type(eval_dataset)
type(train_dataset)

In [83]:
def tokenized_dataset(dataset, tokenizer):
    tokenized = tokenizer(
        dataset['text'],
        return_tensors='pt',
        padding=True,
        max_length=256,
        add_special_tokens=True,
    )
    return tokenized

In [84]:
train_label = train_dataset['label']
# eval_label = eval_dataset['label']

train_dataset = tokenized_dataset(train_dataset, tokenizer)
# eval_dataset = tokenized_dataset(eval_dataset, tokenizer)

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


In [85]:
# type(train_dataset), type(eval_dataset)
type(train_dataset)

transformers.tokenization_utils_base.BatchEncoding

In [86]:
class CL_Dataset(torch.utils.data.Dataset):
    """ Dataset 구성을 위한 class."""

    def __init__(self, dataset, labels):
        self.dataset = dataset
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach()
                for key, val in self.dataset.items()}
        item['label'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [87]:
train_set = CL_Dataset(train_dataset, train_label)
# eval_set = CL_Dataset(eval_dataset, eval_label)
# len(train_set), len(eval_set)
len(train_dataset)

3

In [88]:
train_set[0]

{'input_ids': tensor([    2,  2896,  4225, 14257,  4030,   116,  4770,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1,

In [89]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device = {device}')

device = cuda


In [90]:
model.resize_token_embeddings(tokenizer.vocab_size + len(tokenizer.get_added_vocab()))

Embedding(50144, 768)

In [91]:
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50144, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [98]:
from sklearn.metrics import f1_score, accuracy_score
def compute_metrics(preds):
    labels = preds.label_ids
    preds = preds.predictions.argmax(-1)
    return {'f1_score': f1_score(labels,preds), 'acc' : accuracy_score(labels,preds)}

In [99]:
training_arguments = TrainingArguments(
    do_train=True,
    output_dir=f'./results/',
    save_total_limit=5,
    save_steps=10,
    num_train_epochs=5,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    # weight_decay=1e-6,
    evaluation_strategy='steps',
    eval_steps=10,
    load_best_model_at_end=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [100]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_set,
    # eval_dataset=eval_set,
    eval_dataset=train_set,
    compute_metrics=compute_metrics,
    
)

In [101]:
trainer.train()

***** Running training *****
  Num examples = 1107
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 350


Step,Training Loss,Validation Loss,F1 Score,Acc
10,0.4245,0.359991,0.068441,0.778681
20,0.3838,0.326976,0.370607,0.822042
30,0.3991,0.293939,0.7343,0.900632
40,0.3493,0.265771,0.838174,0.929539
50,0.3103,0.241394,0.856597,0.932249
60,0.2706,0.215556,0.868526,0.940379
70,0.2765,0.196098,0.88391,0.948509
80,0.2007,0.177484,0.891616,0.952123
90,0.2139,0.164356,0.901639,0.95664
100,0.2021,0.146851,0.923077,0.963866


***** Running Evaluation *****
  Num examples = 1107
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-10
Configuration saved in ./results/checkpoint-10/config.json
Model weights saved in ./results/checkpoint-10/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1107
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-20
Configuration saved in ./results/checkpoint-20/config.json
Model weights saved in ./results/checkpoint-20/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1107
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-30
Configuration saved in ./results/checkpoint-30/config.json
Model weights saved in ./results/checkpoint-30/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-50] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1107
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-40
Configuration saved in ./results/checkpoint-40/config.json
Mo

TrainOutput(global_step=350, training_loss=0.1546224845307214, metrics={'train_runtime': 187.2843, 'train_samples_per_second': 29.554, 'train_steps_per_second': 1.869, 'total_flos': 230394326181300.0, 'train_loss': 0.1546224845307214, 'epoch': 5.0})

In [102]:
model.save_pretrained('./')

Configuration saved in ./config.json
Model weights saved in ./pytorch_model.bin
