In [2]:
import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn


from glob import glob
from tqdm.notebook import tqdm

save_path = "../dataset/curriculum"
data_path = "../dataset/*/*.tsv"

# Read every TSV file matched by the pattern; the unpacking below requires
# exactly three matches.
# NOTE(review): which file lands in test/valid/train depends on the order in
# which glob() returns the paths — confirm that order is stable on this machine.
tsv_frames = [pd.read_csv(tsv_file, sep='\t') for tsv_file in tqdm(glob(data_path))]
test_data, valid_data, train_data = tsv_frames



  0%|          | 0/3 [00:00<?, ?it/s]

### 데이터 비율 확인

In [3]:
# Show the normalized label distribution of each split, in the order
# train -> valid -> test.
for split_frame in (train_data, valid_data, test_data):
    display(split_frame['label'].value_counts(normalize=True))

0    0.441489
2    0.316489
1    0.242021
Name: label, dtype: float64

2    0.401274
0    0.339703
1    0.259023
Name: label, dtype: float64

0    1.0
Name: label, dtype: float64

In [4]:
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

model_name = "beomi/KcELECTRA-base"

# The classification head is configured for three output labels.
config = AutoConfig.from_pretrained(model_name, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_plm_model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

# Last expression of the cell: renders the module tree of the loaded model.
base_plm_model

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.out_proj.weight', 'classifi

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

# 실험 방법
---
- EXP 1 : 명확한 욕설이 있는 데이터로 먼저 학습 -> 이후 욕설이 명확하지 않은(애매한) 데이터로 학습
- EXP 2 : 명확한 욕설 스페셜 토큰 처리 -> 스페셜 토큰 처리 풀어버리기

In [5]:
train_data.columns

Index(['comments', 'label'], dtype='object')

In [6]:
import json
# Load the bad-word vocabulary. The encoding is stated explicitly so the read
# does not depend on the platform's default text encoding.
with open("../dataset/curriculum/abuse_voca.json", "r", encoding="utf-8") as f:
    abuse_voca = json.load(f)

def _check_badword(string):
    voca = abuse_voca['badwords']
    for v in voca:
        if v in string:
            return True
    return False
            

def build_curriculrum_train(data):
    """Split `data` into an (easy, difficult) pair for curriculum training.

    A row is "easy" when `_check_badword` returns True for its 'comments'
    value; every other row is "difficult".

    Args:
        data: Table with a 'comments' column.

    Returns:
        tuple: (rows flagged by `_check_badword`, remaining rows).
    """
    has_badword = data['comments'].apply(_check_badword)
    easy_rows = data.loc[has_badword]
    difficult_rows = data.loc[~has_badword]
    return easy_rows, difficult_rows

In [7]:
from copy import deepcopy

In [8]:
from torch.utils.data import Dataset, DataLoader

class CurriculumDataset(Dataset):
    """Map-style dataset pairing tokenized comments with their labels.

    All comments are tokenized once at construction time, and the labels are
    converted to a single tensor once, so that fetching an item is a plain
    index operation.

    Args:
        data: Table with a 'comments' column (text) and a 'label' column.
        tokenizer: Callable invoked with the list of comments plus the keyword
            arguments shown in ``__init__``; its result must expose
            'input_ids' and 'attention_mask' entries that can be indexed by
            row position.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        self.tokenized_text = tokenizer(
            self.data['comments'].tolist(),
            max_length=140,
            padding=True,
            truncation=True,
            return_tensors='pt',
            return_token_type_ids=False
        )
        # Build the label tensor once. Going through tolist() keeps the
        # lookup positional, so a non-contiguous DataFrame index is harmless.
        self.labels = torch.tensor(self.data['label'].tolist())

    def __getitem__(self, idx):
        """Return ({'input_ids', 'attention_mask'}, label) for position `idx`."""
        features = {
            "input_ids": self.tokenized_text['input_ids'][idx],
            "attention_mask": self.tokenized_text['attention_mask'][idx]
        }
        return features, self.labels[idx]

    def __len__(self):
        """Number of rows in the underlying table."""
        return len(self.data)

In [9]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [10]:
# Tokenize every validation comment once, with the same settings that
# CurriculumDataset applies to the training comments.
val_comments = valid_data['comments'].tolist()
tokenized_val = tokenizer(
    val_comments,
    truncation=True,
    padding=True,
    max_length=140,
    return_token_type_ids=False,
    return_tensors='pt',
)

In [11]:
from sklearn.metrics import f1_score

In [12]:
# Scratch split for inspection: `a` receives the rows whose comment contains a
# listed bad word, `b` receives all remaining rows.
a, b = build_curriculrum_train(train_data)

In [17]:
# Inspect the rows of `a` whose label is 0.
a.loc[a['label'] == 0]

Unnamed: 0,comments,label
18,180넘어갈정도의 기럭지는 아님78에서79180넘는애들은 의심.의구심을 안하고안듬그...,0
27,1부는 눈 썩는 거 같아서 2부만 봤다 졸라 웃기더만 ㅋㅋ,0
34,2002년도 여름에 봤는데 서지영이랑 스포츠카타고온거...18년전이네 ㅎㅎ,0
39,2018년 마마 진정한 승자 화사,0
95,53살 아빠면 어때 아빠가 곽부성인데.... 축하해요,0
...,...,...
6903,지드래곤 누가 지랄해도 화이팅! 너의 능력은 국보급!,0
6940,지효 남친임 넘보지 마셈,0
7224,친한 사이였나보지,0
7566,한동안 안보이더니 불편하게 요즘 갑자기 왜이래???이상민 재기 성공하고 열심히 사는...,0


In [11]:
# EXP 1

# Split the training data: rows containing an explicit bad word come first,
# the remaining rows second. Each stage gets its own name instead of reusing
# one name for DataFrame, Dataset and DataLoader.
exp1_first_frame, exp1_second_frame = build_curriculrum_train(train_data)

exp1_first_set = CurriculumDataset(exp1_first_frame, tokenizer)
exp1_second_set = CurriculumDataset(exp1_second_frame, tokenizer)

exp1_train_first = DataLoader(exp1_first_set, batch_size=32, shuffle=True)
exp1_train_second = DataLoader(exp1_second_set, batch_size=32, shuffle=True)

# Model setup: fine-tune a private copy so base_plm_model stays untouched.
exp1_model = deepcopy(base_plm_model).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(exp1_model.parameters(), lr=1e-4)

# step 1 train start
epochs = 10
val_batch_size = 64  # validation is evaluated in chunks to bound memory use
val_real = valid_data['label'].tolist()
n_val = tokenized_val['input_ids'].size(0)

for epoch in tqdm(range(1, epochs+1)):

    exp1_model.train()
    loss_per_batch = 0.0
    n_batches = 0

    for inputs, real in exp1_train_first:
        inputs = {key: val.to(device) for key, val in inputs.items()}
        real = real.to(device)

        pred = exp1_model(inputs['input_ids'], inputs['attention_mask'])['logits']
        loss = criterion(pred, real)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a plain float so no graph-attached tensor is kept alive.
        loss_per_batch += loss.item()
        n_batches += 1

    exp1_model.eval()

    # Gradients are not needed for evaluation; disabling them avoids building
    # an autograd graph over the validation set.
    val_logits = []
    with torch.no_grad():
        for start in range(0, n_val, val_batch_size):
            stop = start + val_batch_size
            batch_logits = exp1_model(
                tokenized_val['input_ids'][start:stop].to(device),
                tokenized_val['attention_mask'][start:stop].to(device),
            )['logits']
            val_logits.append(batch_logits.cpu())

    val_pred = np.argmax(torch.cat(val_logits).numpy(), axis=1)
    val_f1 = f1_score(val_real, val_pred, average='macro')

    # max(..., 1) keeps the average defined even if the loader yielded no batch.
    print(f"epoch:{epoch}, loss:{loss_per_batch / max(n_batches, 1)}, val_scroe : {val_f1}")

  0%|          | 0/5 [00:00<?, ?it/s]

471 471
epoch:1, loss:0.9441004991531372, val_scroe : 0.13715570545250141
471 471
epoch:2, loss:0.8515154123306274, val_scroe : 0.22516593437412358
471 471
epoch:3, loss:0.7212725281715393, val_scroe : 0.4787132234707104
471 471
epoch:4, loss:0.578576922416687, val_scroe : 0.5579391196123562
471 471
epoch:5, loss:0.38740721344947815, val_scroe : 0.5624587718745405
