In [1]:
import pandas as pd
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# Read the training and validation spreadsheets into DataFrames.
train_df = pd.read_excel(io='./data/emotion/Training.xlsx')
valid_df = pd.read_excel(io='./data/emotion/Validation.xlsx')

In [3]:
#train_df.head()

In [4]:
# Trim stray leading/trailing whitespace from the coarse emotion column so the
# values match the label names used for encoding exactly. The vectorised
# ``.str.strip()`` replaces the per-row lambda and lets missing values pass
# through as NaN instead of failing with AttributeError on a float.
train_df['감정_대분류'] = train_df['감정_대분류'].str.strip()
valid_df['감정_대분류'] = valid_df['감정_대분류'].str.strip()
# Same clean-up for the fine-grained column (disabled alternative):
# train_df['감정_소분류'] = train_df['감정_소분류'].str.strip()
# valid_df['감정_소분류'] = valid_df['감정_소분류'].str.strip()

In [5]:
# Class names for the coarse emotion column; a name's position in this list is
# its integer class id.
labels = ['기쁨', '불안', '당황', '슬픔', '분노', '상처']
#labels = valid_df['감정_소분류'].unique().tolist()
# Forward (name -> id) and reverse (id -> name) lookup tables.
label_idx_dict = dict(zip(labels, range(len(labels))))
idx_label_dict = dict(enumerate(labels))

In [6]:
# Display the label name -> class id mapping as a sanity check.
label_idx_dict

{'기쁨': 0, '불안': 1, '당황': 2, '슬픔': 3, '분노': 4, '상처': 5}

In [7]:
# Encode each row's coarse emotion name as its integer class id. The dictionary
# is indexed directly, so a name missing from label_idx_dict raises KeyError.
train_df['label'] = train_df['감정_대분류'].apply(label_idx_dict.__getitem__)
valid_df['label'] = valid_df['감정_대분류'].apply(label_idx_dict.__getitem__)

# Fine-grained alternative (disabled):
# train_df['label'] = train_df['감정_소분류'].apply(lambda x: label_idx_dict[x])
# valid_df['label'] = valid_df['감정_소분류'].apply(lambda x: label_idx_dict[x])

In [8]:
# Input texts: the '사람문장1' column of each split as a plain Python list.
train_data = train_df['사람문장1'].tolist()
valid_data = valid_df['사람문장1'].tolist()

# Targets: the integer class ids stored in the 'label' column.
train_label = train_df['label'].tolist()
valid_label = valid_df['label'].tolist()

In [9]:
# Checkpoint to fine-tune. "kykim/albert-kor-base" was noted as an alternative
# in the original comment.
model_name_or_path = 'klue/bert-base'
# Size the classification head to the label set and record the label names in
# the config, so the saved config.json carries the real class names rather than
# generic LABEL_<n> placeholders.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label=idx_label_dict,
    label2id=label_idx_dict,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [10]:
# Tokenize straight from the DataFrame columns rather than from `train_data` /
# `valid_data`. Those names are overwritten by this cell, so using them as the
# input made the cell non-idempotent: a second run would feed the previous
# tokenizer output back into the tokenizer.
# padding=True pads each split to its longest sequence; truncation=True cuts
# inputs that exceed the model's maximum length.
train_data = tokenizer(train_df['사람문장1'].to_list(), padding=True, truncation=True)
valid_data = tokenizer(valid_df['사람문장1'].to_list(), padding=True, truncation=True)

In [11]:
class EmotionDataset(Dataset):
    """Map-style dataset that pairs encoded inputs with class labels.

    Args:
        data: Mapping from field name to an indexable collection holding one
            entry per example (it must support ``.items()``).
        labels: Indexable collection of per-example labels; its length
            defines the dataset size.
    """

    def __init__(self, data, labels) -> None:
        super().__init__()
        self.data = data
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return example ``index`` as a dict of tensors, plus a ``'labels'`` entry."""
        example = {}
        for field, values in self.data.items():
            example[field] = torch.tensor(values[index])
        # Added last, so it overrides any 'labels' field present in the data.
        example['labels'] = torch.tensor(self.labels[index])
        return example


In [12]:
# Wrap the tokenized inputs and their class ids for each split.
train_dataset = EmotionDataset(data=train_data, labels=train_label)
valid_dataset = EmotionDataset(data=valid_data, labels=valid_label)

In [13]:
def compute_metrics(pred):
    """Compute the validation metrics reported during training.

    Args:
        pred: Prediction object exposing ``label_ids`` (gold class ids) and
            ``predictions`` (per-class scores with the class axis last).

    Returns:
        dict with the keys ``"micro f1 score"`` and ``"accuracy"``.
    """
    gold = pred.label_ids
    scores = pred.predictions
    # The highest-scoring class along the last axis is the predicted class id.
    predicted = scores.argmax(-1)
    # Take the class ids from the width of the score matrix instead of a
    # hard-coded [0..5], so the function follows the model's number of labels.
    label_indices = list(range(scores.shape[-1]))

    # With every class included, micro-averaged F1 coincides with accuracy for
    # single-label predictions; both are kept because callers read both keys.
    f1 = f1_score(gold, predicted, average="micro", labels=label_indices)
    acc = accuracy_score(gold, predicted)

    return {
        "micro f1 score": f1,
        "accuracy": acc,
    }

In [14]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
# Load the checkpoint with a sequence-classification head configured by
# `config`, then move the model to the selected device. The trailing `.to()`
# expression is the cell's displayed value.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    config=config,
)
model.to(device)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [16]:
# Checkpoints and evaluations share one cadence, so every evaluated step has a
# matching checkpoint that load_best_model_at_end can restore.
save_steps = 250
training_args = TrainingArguments(
    output_dir='./results',
    save_total_limit=2,
    save_steps=save_steps,
    num_train_epochs=5,
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=save_steps,

    # Must match the "accuracy" key returned by compute_metrics.
    metric_for_best_model="accuracy",

    # Mixed precision requires a CUDA device. Gate it on availability so the
    # CPU fallback chosen for `device` still works instead of failing here.
    fp16=torch.cuda.is_available(),
    fp16_opt_level='O1',

    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,                      # model instance to fine-tune
    args=training_args,               # training arguments defined above
    train_dataset=train_dataset,      # training split
    eval_dataset=valid_dataset,       # split evaluated every eval_steps
    compute_metrics=compute_metrics,  # adds micro-F1 and accuracy to each evaluation
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using amp fp16 backend


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
# Run fine-tuning with the datasets and arguments configured on `trainer`.
trainer.train()

***** Running training *****
  Num examples = 40879
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 3195
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mddobokki[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss,Micro f1 score,Accuracy
250,1.0979,1.05218,0.618324,0.618324
500,1.0225,1.013326,0.632359,0.632359
750,0.8991,1.008643,0.638986,0.638986
1000,0.8398,1.003902,0.648148,0.648148
1250,0.8642,0.967174,0.65653,0.65653
1500,0.7326,1.002491,0.655361,0.655361
1750,0.6469,1.009539,0.65614,0.65614
2000,0.4671,1.06987,0.657115,0.657115
2250,0.4339,1.099076,0.651267,0.651267
2500,0.4974,1.083571,0.655945,0.655945


***** Running Evaluation *****
  Num examples = 5130
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-250
Configuration saved in ./results/checkpoint-250/config.json
Model weights saved in ./results/checkpoint-250/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1250] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 5130
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-6250] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 5130
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-750
Configuration saved in ./results/checkpoint-750/config.json
Model weights saved in ./results/checkpoint-750/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-250] due to args.save_total_limit
***** Runni

TrainOutput(global_step=3195, training_loss=0.7173074919293184, metrics={'train_runtime': 513.3175, 'train_samples_per_second': 398.184, 'train_steps_per_second': 6.224, 'total_flos': 9663686394204240.0, 'train_loss': 0.7173074919293184, 'epoch': 5.0})

In [18]:
# Write the model's configuration and weights to ./results. Only the model is
# saved by this call; the tokenizer is not written here.
model.save_pretrained('./results')

Configuration saved in ./results/config.json
Model weights saved in ./results/pytorch_model.bin
