In [155]:
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
import json
import sklearn
import random
import os

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch.nn.functional as F
import torch
from torch.utils.data import DataLoader
import pickle as pickle

In [156]:
def set_seed(seed: int = 42) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    Also switches cuDNN to deterministic mode and disables its benchmark
    mode.

    Args:
        seed: value handed to every generator (default 42).
    """
    # Python's and NumPy's global generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: default generator, current CUDA device, then every CUDA device
    # (the last call matters when several GPUs are used).
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN flags.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(42)

In [157]:
class RE_Dataset(torch.utils.data.Dataset):
  """Dataset that pairs tokenizer output with one label per example.

  `pair_dataset` is a mapping from field name to an indexable collection of
  tensors; `labels` is an indexable collection of labels. The dataset length
  is the length of `labels`.
  """

  def __init__(self, pair_dataset, labels):
    self.labels = labels
    self.pair_dataset = pair_dataset

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    # Every field contributes a detached copy of its idx-th entry.
    item = {}
    for key, val in self.pair_dataset.items():
      item[key] = val[idx].clone().detach()
    # The label is wrapped in a fresh tensor under the 'labels' key.
    item['labels'] = torch.tensor(self.labels[idx])
    return item
  
def preprocessing_dataset(dataset):
  """Return a new DataFrame with only the url, context, main and detail columns of `dataset`."""
  kept_columns = ('url', 'context', 'main', 'detail')
  selected = {column: dataset[column] for column in kept_columns}
  return pd.DataFrame(selected)

def load_data(dataset_dir):
  """Read the CSV file at `dataset_dir` and return it after `preprocessing_dataset`."""
  raw_frame = pd.read_csv(dataset_dir)
  return preprocessing_dataset(raw_frame)

def tokenized_dataset(context, tokenizer, max_length=256):
  """Tokenize `context` with `tokenizer` and return the tokenizer's output.

  Args:
    context: text input forwarded to the tokenizer unchanged (the notebook
      passes a list of strings).
    tokenizer: callable that accepts the keyword arguments used below.
    max_length: truncation limit forwarded to the tokenizer. Defaults to 256,
      the value that used to be hard-coded, so existing calls are unaffected.

  Returns:
    Whatever `tokenizer` returns when called with return_tensors="pt",
    padding=True, truncation=True and add_special_tokens=True.
  """
  return tokenizer(
      context,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=max_length,
      add_special_tokens=True,
  )

In [159]:
def klue_re_micro_f1_for_main(preds, labels):
    """Micro-averaged F1, scaled to a percentage, over the ten main-emotion class ids.

    `labels` are the reference class ids and `preds` the predicted class ids;
    both are forwarded to sklearn's `f1_score`.
    """
    label_list = ['기쁨', '슬픔', '싫어함(상태)', '분노', '미움(상대방)', '두려움', '수치심', '욕망', '사랑', '중립']
    # Only the positions matter: class ids are 0 .. len(label_list) - 1.
    class_ids = [class_id for class_id, _ in enumerate(label_list)]
    micro_f1 = sklearn.metrics.f1_score(labels, preds, average="micro", labels=class_ids)
    return micro_f1 * 100.0

def klue_re_auprc_for_main(probs, labels):
    """Mean one-vs-rest area under the precision-recall curve, as a percentage.

    `labels` holds class ids in 0..9 and is one-hot encoded here; `probs` is
    indexed per class column. For each of the ten classes the PR curve is
    computed from that class's column and its area taken; the ten areas are
    averaged.
    """
    num_classes = 10
    one_hot = np.eye(num_classes)[labels]

    per_class_area = np.zeros((num_classes,))
    for class_id in range(num_classes):
        class_targets = one_hot[:, class_id]
        class_scores = probs[:, class_id]
        precision, recall, _ = sklearn.metrics.precision_recall_curve(class_targets, class_scores)
        per_class_area[class_id] = sklearn.metrics.auc(recall, precision)
    return per_class_area.mean() * 100.0

def klue_re_micro_f1_for_detail(preds, labels):
    """Micro-averaged F1, scaled to a percentage, over the 64 detail-emotion class ids.

    `labels` are the reference class ids and `preds` the predicted class ids;
    both are forwarded to sklearn's `f1_score`.
    """
    label_list = ['만족감', '무기력', '즐거움', '답답함', '타오름', '불쾌', '자랑스러움', '절망', '치사함', '걱정', '부끄러움', '궁금함', '놀람', '아쉬움', '싫증', '공감', '감동', '냉담', '경멸',
                  '매력적', '반가움', '불만', '실망', '미안함', '다정함', '공포', '억울함', '난처함', '날카로움', '불신감', '동정(슬픔)', '불편함', '아픔', '고마움', '호감', '귀중함', '기대감', '고통',
                  '수치심', '초조함', '원망', '위축감', '후회', '욕심', '시기심', '안정감', '너그러움', '외면', '그리움', '허망', '편안함', '신명남', '비위상함', '반감', '죄책감', '아른거림', '외로움',
                  '서먹함', '자신감', '두근거림', '심심함', '갈등', '신뢰감', '열정적인']
    # Only the positions matter: class ids are 0 .. len(label_list) - 1.
    class_ids = [class_id for class_id, _ in enumerate(label_list)]
    micro_f1 = sklearn.metrics.f1_score(labels, preds, average="micro", labels=class_ids)
    return micro_f1 * 100.0

def klue_re_auprc_for_detail(probs, labels):
    """Mean one-vs-rest area under the precision-recall curve, as a percentage.

    `labels` holds class ids in 0..63 and is one-hot encoded here; `probs` is
    indexed per class column. For each of the 64 classes the PR curve is
    computed from that class's column and its area taken; the 64 areas are
    averaged.
    """
    num_classes = 64
    one_hot = np.eye(num_classes)[labels]

    per_class_area = np.zeros((num_classes,))
    for class_id in range(num_classes):
        class_targets = one_hot[:, class_id]
        class_scores = probs[:, class_id]
        precision, recall, _ = sklearn.metrics.precision_recall_curve(class_targets, class_scores)
        per_class_area[class_id] = sklearn.metrics.auc(recall, precision)
    return per_class_area.mean() * 100.0

def compute_metrics_for_main(pred):
  """Validation metrics for the main-emotion classifier.

  Reads `pred.label_ids` and `pred.predictions`, takes the argmax over the
  last axis as the predicted class, and returns micro F1, AUPRC and accuracy
  in a dict.
  """
  scores = pred.predictions
  gold = pred.label_ids
  predicted = scores.argmax(-1)

  # NOTE(review): `pred.predictions` is handed to the AUPRC helper unchanged;
  # presumably these are raw logits rather than probabilities - confirm.
  return {
      'micro f1 score': klue_re_micro_f1_for_main(predicted, gold),
      'auprc' : klue_re_auprc_for_main(scores, gold),
      'accuracy': accuracy_score(gold, predicted),
  }

def compute_metrics_for_detail(pred):
  """Validation metrics for the detail-emotion classifier.

  Reads `pred.label_ids` and `pred.predictions`, takes the argmax over the
  last axis as the predicted class, and returns micro F1, AUPRC and accuracy
  in a dict.
  """
  scores = pred.predictions
  gold = pred.label_ids
  predicted = scores.argmax(-1)

  # NOTE(review): `pred.predictions` is handed to the AUPRC helper unchanged;
  # presumably these are raw logits rather than probabilities - confirm.
  return {
      'micro f1 score': klue_re_micro_f1_for_detail(predicted, gold),
      'auprc' : klue_re_auprc_for_detail(scores, gold),
      'accuracy': accuracy_score(gold, predicted),
  }

In [160]:
def _load_pickle(path):
  """Unpickle and return the object stored in the file at `path`."""
  # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
  with open(path, 'rb') as f:
    return pickle.load(f)

# Lookup tables between label names and ids, for the detail and main emotion sets.
detail_to_num = _load_pickle('./detail_to_num.pkl')
num_to_detail = _load_pickle('./num_to_detail.pkl')
main_to_num = _load_pickle('./main_to_num.pkl')
num_to_main = _load_pickle('./num_to_main.pkl')

In [161]:
# Checkpoint used for the tokenizer and for both classifiers.
MODEL_NAME = 'google-bert/bert-base-multilingual-uncased'

# Use the first CUDA device when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')
print(device)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

cuda:0


In [162]:
# Read the reviewed emotion CSV through load_data, then show its first rows as the cell output.
df = load_data('../data/reviewed_emotion_0311.csv')
df.head()

Unnamed: 0,url,context,main,detail
0,https://x.com/1hhaa_/status/175373236692813461...,보는동안 너무 행복했고 초콜렛이 너무 먹고싶었고 티모시가 잘생겼고 울어!!하는부분이...,기쁨,만족감
1,https://x.com/makki_home/status/17552181165049...,어릴 때 가 보고 빕스는 거의 처음인데(기억에 없음) 지금 딸기축제 기간이라 만족스...,기쁨,만족감
2,https://x.com/302NOW/status/175539358101844788...,미리 계좌로 환전해둔 돈을 해외에서 환전수수료 없이 인출 가능한 트레블로그라는 카드...,기쁨,만족감
3,https://x.com/Hassen_cos/status/17556459885792...,요즘 번아웃도 자꾸 올라오고 무기력해서 종강하고 교류하기도 버거운 상태가 와부렀으요ㅠㅠ,슬픔,무기력
4,https://x.com/ssosohae1/status/175618221059468...,크라임씬 장똥민이 범행 도구 찾으려고 화장실 탱크 뒤지는데 거기에 진짜 똥 넣어놓은...,기쁨,즐거움


In [163]:
# Convert each row's label string to its integer id using the loaded lookup tables.
main = [main_to_num[label] for label in df['main']]
detail = [detail_to_num[label] for label in df['detail']]

In [164]:
# Tokenize all context strings in a single call with the shared tokenizer.
context = tokenized_dataset(df['context'].to_list(), tokenizer)

In [169]:
# train_test_split: stratified splitting is not possible because some labels have only one sample.
# context_train, context_test, main_label_train, main_label_test, detail_label_train, detail_label_test = train_test_split(df['context'], main, detail, test_size=0.3)

# RE_main_train_dataset = RE_Dataset(context_train, main_label_train)
# RE_main_test_dataset = RE_Dataset(context_test, main_label_test)

# RE_detail_train_dataset = RE_Dataset(context_train, detail_label_train)
# RE_detail_test_dataset = RE_Dataset(context_test, detail_label_test)

# Without a split: both datasets use the same tokenized contexts and differ only in their labels.
RE_main_dataset = RE_Dataset(context, main)
RE_detail_dataset = RE_Dataset(context, detail)

In [170]:
# Size each classification head from the label-id space, not from the labels
# that happen to occur in the CSV. The training labels are ids looked up in
# main_to_num / detail_to_num, so the head needs an output for every id; if a
# class is missing from the data, counting unique labels gives a head that is
# too small for the largest id (and for the fixed 10 / 64 columns the metric
# functions iterate over).
num_main_labels = max(len(main_to_num), max(main, default=-1) + 1)
num_detail_labels = max(len(detail_to_num), max(detail, default=-1) + 1)

model_config_for_main = AutoConfig.from_pretrained(MODEL_NAME)
model_config_for_main.num_labels = num_main_labels

model_config_for_detail = AutoConfig.from_pretrained(MODEL_NAME)
model_config_for_detail.num_labels = num_detail_labels

# Two independent classifiers built from the same checkpoint, one per label set.
model_for_main = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config_for_main)
model_for_detail = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config_for_detail)
model_for_main.to(device)
model_for_detail.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [171]:
def _build_training_args(output_dir):
  """Return the TrainingArguments shared by both classifiers, writing to `output_dir`."""
  return TrainingArguments(
    output_dir=output_dir,           # where checkpoints are written
    num_train_epochs=20,             # total number of training epochs
    learning_rate=1e-5,              # learning rate
    per_device_train_batch_size=16,  # per-device batch size while training
    per_device_eval_batch_size=16,   # per-device batch size while evaluating
    warmup_steps=500,                # warmup steps for the learning-rate scheduler
    weight_decay=0.01,               # strength of weight decay
    # Evaluation strategy during training:
    #   `no`: no evaluation, `steps`: every `eval_steps`, `epoch`: at the end of every epoch.
    evaluation_strategy='steps',
    eval_steps=500,                  # evaluate every 500 steps
    logging_steps=500,               # log every 500 steps
    save_steps=500,                  # save a checkpoint every 500 steps
    load_best_model_at_end=True,
  )


# Main-emotion classifier. The full dataset is used for both training and evaluation.
training_args = _build_training_args('results_for_main')
trainer_for_main = Trainer(
  model=model_for_main,
  args=training_args,
  train_dataset=RE_main_dataset,
  eval_dataset=RE_main_dataset,
  compute_metrics=compute_metrics_for_main,
)

# Detail-emotion classifier: same settings, separate output directory.
training_args = _build_training_args('results_for_detail')
trainer_for_detail = Trainer(
  model=model_for_detail,
  args=training_args,
  train_dataset=RE_detail_dataset,
  eval_dataset=RE_detail_dataset,
  compute_metrics=compute_metrics_for_detail,
)

In [172]:
# Fine-tune the main-emotion classifier first, then the detail-emotion classifier.
trainer_for_main.train()
trainer_for_detail.train()

Step,Training Loss,Validation Loss


# aihub

In [None]:
def inference(model, tokenized_sent, device):
  """Run `model` over `tokenized_sent` in batches of 16 and collect its predictions.

  Args:
    model: callable model with an `eval()` method; calling it must return an
      object whose element 0 is the logits tensor.
    tokenized_sent: dataset whose items are dicts of tensors containing at
      least 'input_ids' and 'attention_mask'. 'token_type_ids' is forwarded
      only when the batch actually contains it, so tokenizers that do not
      emit that field no longer cause a KeyError.
    device: device the input tensors are moved to before the forward pass.

  Returns:
    A pair `(predicted_class_ids, class_probabilities)` of Python lists with
    one entry per example, in dataset order. Both lists are empty when the
    dataset yields no batches.
  """
  dataloader = DataLoader(tokenized_sent, batch_size=16, shuffle=False)
  model.eval()
  output_pred = []
  output_prob = []
  input_keys = ('input_ids', 'attention_mask', 'token_type_ids')
  with torch.no_grad():
    for data in dataloader:
      model_inputs = {key: data[key].to(device) for key in input_keys if key in data}
      logits = model(**model_inputs)[0]
      # Softmax over the class axis gives the per-class probabilities.
      prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
      result = np.argmax(logits.detach().cpu().numpy(), axis=-1)

      output_pred.append(result)
      output_prob.append(prob)

  # np.concatenate rejects an empty list, so handle "no batches" explicitly.
  if not output_pred:
    return [], []
  return np.concatenate(output_pred).tolist(), np.concatenate(output_prob, axis=0).tolist()

# active learning

In [None]:
# Read the AI Hub JSON file. The encoding is pinned to UTF-8 so the text is
# decoded the same way regardless of the platform's default encoding.
with open('../data/aihub.json', encoding='utf-8') as aihub:
  parsed_json = json.load(aihub)

# The 'HS01' entry under talk -> content of every record becomes the context text.
aihub_context = [data['talk']['content']['HS01'] for data in parsed_json]
# Every row gets label 0 (presumably a placeholder, since RE_Dataset needs one label per example).
aihub_label = [0] * len(aihub_context)
aihub_df = pd.DataFrame({'context':aihub_context, 'label':aihub_label})

In [None]:
# Training settings shared by both trainers in the loop below.
training_args = TrainingArguments(
  output_dir='results',            # where checkpoints are written
  num_train_epochs=20,             # total number of training epochs
  learning_rate=1e-5,              # learning rate
  per_device_train_batch_size=16,  # per-device batch size while training
  per_device_eval_batch_size=16,   # per-device batch size while evaluating
  warmup_steps=500,                # warmup steps for the learning-rate scheduler
  weight_decay=0.01,               # strength of weight decay
  # Evaluation strategy during training:
  #   `no`: no evaluation, `steps`: every `eval_steps`, `epoch`: at the end of every epoch.
  evaluation_strategy='steps',
  eval_steps=500,                  # evaluate every 500 steps
  logging_steps=500,               # log every 500 steps
  save_steps=500,                  # save a checkpoint every 500 steps
  load_best_model_at_end=True,
)


# Self-training loop: in each of `learning_count` rounds, label the remaining
# AI Hub rows with both classifiers, move the `cut_count` most confident rows
# out of the pool, and fine-tune both classifiers on those rows using the
# models' own predictions as labels.
learning_count = 10
cut_count = len(aihub_label) // learning_count

for n in range(learning_count):
  print(f'{n + 1}번째 시작')
  aihub_tokenized_context = tokenized_dataset(aihub_df['context'].to_list(), tokenizer)
  aihub_label = aihub_df['label'].to_list()

  RE_test_dataset = RE_Dataset(aihub_tokenized_context, aihub_label)

  print('예측 시작')
  main_pred_answer, main_output_prob = inference(model_for_main, RE_test_dataset, device)  # predicted main classes
  detail_pred_answer, detail_output_prob = inference(model_for_detail, RE_test_dataset, device)  # predicted detail classes
  print('예측 종료')
  # main_pred_answer_label = [num_to_main[value] for value in main_pred_answer]
  # detail_pred_answer_label = [num_to_detail[value] for value in detail_pred_answer]

  # `assign` builds a fresh frame, so columns are never written onto the
  # `iloc` slice left over from the previous round. Each *_prob column keeps
  # only the probability of the predicted (most likely) class.
  aihub_df = aihub_df.assign(
    main_pred=main_pred_answer,
    main_prob=[max(class_probs) for class_probs in main_output_prob],
    detail_pred=detail_pred_answer,
    detail_prob=[max(class_probs) for class_probs in detail_output_prob],
  )
  # Mean confidence of the two classifiers. This used to be
  # (main_prob * detail_prob) / 2, i.e. half the product, which does not
  # match the column name and ranks rows differently from the mean.
  aihub_df['prob_mean'] = (aihub_df['main_prob'] + aihub_df['detail_prob']) / 2
  aihub_df = aihub_df.sort_values(by='prob_mean', ascending=False)

  # The most confident rows become training data; the rest stay in the pool.
  new_learning_data = aihub_df.iloc[:cut_count]
  aihub_df = aihub_df.iloc[cut_count:]

  tokenized_context = tokenized_dataset(new_learning_data['context'].to_list(), tokenizer)
  NEW_main_train_dataset = RE_Dataset(tokenized_context, new_learning_data['main_pred'].to_list())
  NEW_detail_train_dataset = RE_Dataset(tokenized_context, new_learning_data['detail_pred'].to_list())

  # NOTE(review): both trainers receive the same `training_args`, so they
  # write checkpoints into the same output directory - confirm this is intended.
  trainer_for_main = Trainer(
    model=model_for_main,                      # model being fine-tuned
    args=training_args,                        # shared training arguments
    train_dataset=NEW_main_train_dataset,      # rows labelled with the model's own predictions
    eval_dataset=RE_main_dataset,              # evaluated on the originally labelled data
    compute_metrics=compute_metrics_for_main   # metric callback
  )

  trainer_for_detail = Trainer(
    model=model_for_detail,                      # model being fine-tuned
    args=training_args,                          # shared training arguments
    train_dataset=NEW_detail_train_dataset,      # rows labelled with the model's own predictions
    eval_dataset=RE_detail_dataset,              # evaluated on the originally labelled data
    compute_metrics=compute_metrics_for_detail   # metric callback
  )

  print('학습 시작')
  trainer_for_main.train()
  trainer_for_detail.train()
  print('학습 종료')
  print(f'{n + 1}번째 종료')