In [1]:
import os
import numpy as np
import pandas as pd
import re
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.optim import Adam
# from dataset_task2 import Dataset_task2
from sklearn.metrics import matthews_corrcoef

In [2]:
# Load dataset
train_df = pd.read_csv('./data/NIKL_CoLA_train.tsv', sep='\t')[['sentence', 'acceptability_label']]
valid_df = pd.read_csv('./data/NIKL_CoLA_dev.tsv', sep='\t')[['sentence', 'acceptability_label']]


In [3]:
train_df.head()

Unnamed: 0,sentence,acceptability_label
0,높은 달이 떴다.,1
1,달이 뜸이 높았다.,0
2,실없는 사람이 까불까불한다.,1
3,나는 철수에게 공을 던졌다.,1
4,내가 순이와 둘이서 다툰다.,1


In [4]:
from transformers import ElectraModel, ElectraTokenizer
# Use funnel-kor-base tokenizer
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")


In [5]:
# Generate dataset
class Dataset_task1(Dataset):
    def __init__(self, _df, _tokenizer):
        sents = list(_df['sentence'])
        self.inputs = _tokenizer(sents, padding="longest", return_tensors="pt")
        self.labels = torch.tensor(_df['acceptability_label'].values).float()
        
    def __len__(self):
        return self.labels.shape[0]
    
    def get_batch_inputs(self, _inputs, _idx):
        _dict = {}
        for _key in _inputs.keys():
            _dict[_key] = _inputs[_key][_idx]
        return _dict
    
    def __getitem__(self, idx):
        _x = self.get_batch_inputs(self.inputs, idx)
        _y = self.labels[idx]
        return _x, _y

In [6]:
train_dataset = Dataset_task1(train_df, tokenizer)
valid_dataset = Dataset_task1(valid_df, tokenizer)

In [7]:
print(train_dataset.inputs['input_ids'].shape)
print(valid_dataset.inputs['input_ids'].shape)

torch.Size([15876, 40])
torch.Size([2032, 32])


In [8]:
# kor_model = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator") 

In [9]:
@torch.no_grad()
def evaluation(_dataset, device, model, thres=0., bs=8):
    model.eval()
    _perm = np.arange(len(_dataset))
    y_preds = []; y_true = []
    for _idx in range(_perm.shape[0]//bs + 1):
        if _idx == _perm.shape[0]//bs:
            if _idx*bs < _perm.shape[0]:
                indices = _perm[_idx*bs:]
            else:
                break
        else:
            indices = _perm[_idx * bs: (_idx+1)*(bs)]
        
        x, y= _dataset[indices]
        for _k in x:
            x[_k] = x[_k].to(device)
        y = y.to(device)
        y_hat = model(x).squeeze()
        
        preds = (y_hat > thres).float()
        y_preds.append(preds.to('cpu').numpy())
        y_true.append(y.to('cpu').numpy())
    y_true = np.concatenate(y_true)
    y_preds = np.concatenate(y_preds)    
    return matthews_corrcoef(y_true, y_preds)

In [10]:
class task1_model(nn.Module):
    def __init__(self, dropout=0.1, _k=1):
        super(task1_model, self).__init__()
        self.hid_dim = 768+31 # 768  + 30
        self.kor_model = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator") 
        self.mlp = nn.Sequential(                
                torch.nn.Dropout(p=dropout),
                nn.Linear(self.hid_dim, self.hid_dim),
                nn.BatchNorm1d(self.hid_dim),
                nn.GELU(),
               torch.nn.Dropout(p=dropout),
                nn.Linear(self.hid_dim, 1),
                )
        
    def forward(self, x):
        # x: [inputs1, idx_list1, inputs2, idx_lists2]
        # [batch, seq_len, hid_dim] --> [batch, hid_dim] 
        # [batch, seq, emb]
        cls_emb = self.kor_model(**x).last_hidden_state[:, 0, :]
#         print(cls_emb.shape)
        _mask = (x['input_ids'][0]==0)
        seq_emb = self.kor_model(**x).last_hidden_state[:, 1:32, :]
#         print(seq_emb.shape)
        new_emb = (cls_emb.unsqueeze(1) @ seq_emb.transpose(1,2)).squeeze(1)
        out = torch.cat([cls_emb, new_emb], dim=1)
        return self.mlp(out)    

In [14]:
# TODO: implement codes for saving the best model
device = torch.device("cuda:2") #  if torch.cuda.is_available() else torch.device("cpu")

DROPOUT = 0.2 # 0.1 # 0.1/ 3 layer/ w/o batchnorm에서 88.4
EPOCHS = 15
bs = 32
# MAX_NORM = 1. # 

model = task1_model(dropout=DROPOUT, _k=1).to(device)
optimizer = Adam(model.parameters(), lr=15e-7) # 1e-6 --> 54.3
bce_loss = nn.BCEWithLogitsLoss()

for epoch in range(1, EPOCHS+1):
    progress_bar = tqdm(range(len(train_dataset)//bs+1))
    # generate random permutation for a batch
    _perm = np.random.permutation(len(train_dataset))
    y_preds = []; y_true = []
    loss_sum = 0.
    for _idx in range(_perm.shape[0]//bs+1):
        model.train()
        model.zero_grad()
        optimizer.zero_grad()
        
        # Get batch items            
        if _idx == _perm.shape[0]//bs:
            if _idx*bs < _perm.shape[0]:
                indices = _perm[_idx*bs:]
            else:
                break
        else:
            indices = _perm[_idx * bs: (_idx+1)*(bs)]
            
            
        x, y= train_dataset[indices]
        for _k in x:
            x[_k] = x[_k].to(device)  
        y = y.to(device).squeeze()
        
        y_hat = model(x).squeeze()
        preds = (y_hat > 0.).float()
    
        loss = bce_loss(y_hat, y)
        loss_sum += loss.item()
        
        # Gradient clipping?
        loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_NORM)
        progress_bar.update(1)
        optimizer.step()
        
        y_preds.append(preds.to('cpu').numpy())
        y_true.append(y.to('cpu').numpy())    
        
    y_true = np.concatenate(y_true)
    y_preds = np.concatenate(y_preds)        
    # train_acc = evaluation(train_dataset, model, bs=8)
    loss_sum /= (len(train_dataset)//bs+1)
    train_mcc = matthews_corrcoef(y_true, y_preds)
    valid_mcc = evaluation(valid_dataset, device, model, bs=64)
    print(f"Epoch {epoch}| loss: {loss_sum:6.2f} | train mcc: {train_mcc:.3f} | valid mcc: {valid_mcc:.3f}")

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 1| loss:   0.64 | train mcc: 0.283 | valid mcc: 0.423


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 2| loss:   0.55 | train mcc: 0.460 | valid mcc: 0.463


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 3| loss:   0.50 | train mcc: 0.530 | valid mcc: 0.482


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 4| loss:   0.47 | train mcc: 0.577 | valid mcc: 0.504


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 5| loss:   0.44 | train mcc: 0.610 | valid mcc: 0.512


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 6| loss:   0.40 | train mcc: 0.651 | valid mcc: 0.510


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 7| loss:   0.38 | train mcc: 0.687 | valid mcc: 0.510


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 8| loss:   0.35 | train mcc: 0.715 | valid mcc: 0.514


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 9| loss:   0.33 | train mcc: 0.734 | valid mcc: 0.514


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 10| loss:   0.30 | train mcc: 0.759 | valid mcc: 0.519


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 11| loss:   0.29 | train mcc: 0.782 | valid mcc: 0.528


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 12| loss:   0.26 | train mcc: 0.805 | valid mcc: 0.524


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 13| loss:   0.24 | train mcc: 0.822 | valid mcc: 0.517


  0%|          | 0/497 [00:00<?, ?it/s]

Epoch 14| loss:   0.23 | train mcc: 0.843 | valid mcc: 0.512


  0%|          | 0/497 [00:00<?, ?it/s]

KeyboardInterrupt: 