## Setting

In [None]:
# Mount Google Drive so the project files stored there are reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m101.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import random
import os

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from tqdm import tqdm
from transformers import BertTokenizer, RobertaModel, AutoModel
from transformers import BertModel, RobertaTokenizer, AutoTokenizer
from torch.optim import Adam

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Set TOKENIZERS_PARALLELISM (read by the Hugging Face tokenizers library) to "false".
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [None]:
# Work from the project folder on the mounted Drive so the relative paths used below resolve.
%cd /content/drive/MyDrive/Colab Notebooks/230109

/content/drive/MyDrive/Colab Notebooks/230109


In [None]:
# List the working directory to confirm the expected data files are present.
os.listdir()

['sample_submission.csv',
 'data_info.csv',
 'test.csv',
 'train.csv',
 'train_fold.csv',
 'results']

## Load

In [None]:
# Read the labelled training utterances and show the last rows as a quick sanity check.
data = pd.read_csv('train.csv')
data.tail()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
9984,TRAIN_9984,You or me?,Chandler,1038,neutral
9985,TRAIN_9985,"I got it. Uh, Joey, women don't have Adam's ap...",Ross,1038,neutral
9986,TRAIN_9986,"You guys are messing with me, right?",Joey,1038,surprise
9987,TRAIN_9987,Yeah.,All,1038,neutral
9988,TRAIN_9988,"That was a good one. For a second there, I was...",Joey,1038,joy


In [None]:
# Keep only the columns used for training. The explicit .copy() makes train_ds an
# independent frame, so the column assignments made to it later in the notebook write
# to train_ds itself instead of to a temporary slice of `data` (the chained-assignment
# warning this would otherwise raise is hidden by the global warnings filter).
train_ds = data[["ID", "Utterance", "Dialogue_ID", "Target"]].copy()
train_ds

Unnamed: 0,ID,Utterance,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,0,neutral
1,TRAIN_0001,You must’ve had your hands full.,0,neutral
2,TRAIN_0002,That I did. That I did.,0,neutral
3,TRAIN_0003,So let’s talk a little bit about your duties.,0,neutral
4,TRAIN_0004,My duties? All right.,0,surprise
...,...,...,...,...
9984,TRAIN_9984,You or me?,1038,neutral
9985,TRAIN_9985,"I got it. Uh, Joey, women don't have Adam's ap...",1038,neutral
9986,TRAIN_9986,"You guys are messing with me, right?",1038,surprise
9987,TRAIN_9987,Yeah.,1038,neutral


In [None]:
# Class distribution of the target labels.
train_ds["Target"].value_counts()

neutral     4710
joy         1743
surprise    1205
anger       1109
sadness      683
disgust      271
fear         268
Name: Target, dtype: int64

## Hyperparameter Setting

In [None]:
# Experiment configuration shared by the cells below.
CFG = dict(
    EPOCHS=5,
    LEARNING_RATE=1e-6,
    BATCH_SIZE=8,
    SEED=42,
    PLM="tae898/emoberta-large",
    OPTIMIZER="Adam",
    split='8-2',
    NFOLD=7,
)

## Fix Random Seed

In [None]:
def seed_everything(seed):
    """Seed every random number generator the notebook uses.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices), records
    the seed in PYTHONHASHSEED, and configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also cover every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick a different kernel on each run, which defeats the
    # deterministic flag above, so it has to be switched off for reproducibility.
    torch.backends.cudnn.benchmark = False

# Seed all generators with the configured seed before any data splitting or model creation.
seed_everything(CFG['SEED'])

## Label encoding

In [None]:
# Fit on the raw string labels in `data` and write the integer codes into train_ds.
# `data` is never modified, so this cell is idempotent: re-running it does not refit
# the encoder on already-encoded integers, which would make le.inverse_transform
# return numbers instead of emotion names. train_ds has the same rows as `data`, so
# the encoded array lines up row for row.
le = LabelEncoder()
le = le.fit(data['Target'])
train_ds['Target'] = le.transform(data['Target'])

In [None]:
# Show which integer code the encoder assigned to each label.
for code in range(len(le.classes_)):
    print(code, '->', le.classes_[code])

0 -> anger
1 -> disgust
2 -> fear
3 -> joy
4 -> neutral
5 -> sadness
6 -> surprise


## StratifiedKFold

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold

skf = StratifiedKFold(n_splits=CFG['NFOLD'], shuffle=True, random_state=CFG['SEED'])

# split() yields positional row indices, so the fold ids are collected in a plain
# integer array and attached in one step. This does not depend on train_ds having a
# default 0..n-1 index and avoids an intermediate float column containing NaN.
# The loop variable keeps the name `fold` because it stays bound after the loop and
# the train/validation split cell reads it.
fold_ids = np.full(len(train_ds), -1, dtype=int)
for fold, (_, val_) in enumerate(skf.split(X=train_ds, y=train_ds.Target)):
    fold_ids[val_] = fold

train_ds["Kfold"] = fold_ids
train_ds.head()

Unnamed: 0,ID,Utterance,Dialogue_ID,Target,Kfold
0,TRAIN_0000,also I was the point person on my company’s tr...,0,4,1
1,TRAIN_0001,You must’ve had your hands full.,0,4,2
2,TRAIN_0002,That I did. That I did.,0,4,4
3,TRAIN_0003,So let’s talk a little bit about your duties.,0,4,5
4,TRAIN_0004,My duties? All right.,0,6,2


In [None]:
# Display the number of folds and epochs configured for this run.
CFG['NFOLD'], CFG['EPOCHS']

(7, 5)

## Train/Validation split

In [None]:
# Hold out the last fold for validation. The fold id is computed explicitly instead of
# reading the loop variable left over from the fold-assignment cell: that variable is
# rebound by the training loop, so re-running this cell later would silently select a
# different split. The value is the same one the leftover variable had (NFOLD - 1).
valid_fold = CFG['NFOLD'] - 1

train_df = train_ds[train_ds.Kfold != valid_fold].reset_index(drop=True)
valid_df = train_ds[train_ds.Kfold == valid_fold].reset_index(drop=True)

train_len = len(train_df)
val_len = len(valid_df)

print(train_len)
print(val_len)

8562
1427


## Define Tokenizer

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint named in CFG["PLM"].
tokenizers = AutoTokenizer.from_pretrained(CFG["PLM"])

Downloading:   0%|          | 0.00/408 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

## CustomDataset

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one utterance per item.

    Args:
        data: DataFrame with an 'Utterance' column and, in "train" mode, a
            'Target' column.
        mode: "train" yields (input_ids, attention_mask, label); any other value
            yields (input_ids, attention_mask) only.
        tokenizer: Callable tokenizer to use. Defaults to the notebook-level
            `tokenizers` object when omitted.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, data, mode = "train", tokenizer=None, max_length=512):
        self.dataset = data
        # Fall back to the module-level tokenizer so existing two-argument calls keep working.
        self.tokenizer = tokenizer if tokenizer is not None else tokenizers
        self.mode = mode
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Samplers hand out positions in range(len(self)), so look rows up by position;
        # label-based lookup only works when the frame has a fresh 0..n-1 index.
        text = self.dataset['Utterance'].iloc[idx]
        inputs = self.tokenizer(text, padding='max_length', max_length=self.max_length,
                                truncation=True, return_tensors="pt")
        # The tokenizer returns a batch of one; drop that leading dimension.
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        if self.mode == "train":
            y = self.dataset['Target'].iloc[idx]
            return input_ids, attention_mask, y

        return input_ids, attention_mask

In [None]:
# The datasets get their own names rather than `train` / `valid`: the notebook later
# defines a function called train(), and binding a dataset to that name means that
# re-running this cell afterwards would replace the function with a dataset object.
train_dataset = CustomDataset(train_df, mode = "train")
valid_dataset = CustomDataset(valid_df, mode = "train")

# Shuffle only the training data; validation order is kept fixed.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size= CFG['BATCH_SIZE'], shuffle=True)
val_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size= CFG['BATCH_SIZE'], shuffle=False)

## Define Model

In [None]:
class BaseModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        dropout: Dropout probability applied to the encoder's pooled output.
        num_classes: Number of output logits. The default is the number of classes
            in the fitted label encoder, evaluated once when the class is defined.
    """

    def __init__(self, dropout=0.5, num_classes=len(le.classes_)):

        super(BaseModel, self).__init__()

        self.bert = AutoModel.from_pretrained(CFG["PLM"])

        self.dropout = nn.Dropout(dropout)
        # Take the input width from the encoder's config instead of hard-coding 1024,
        # so changing CFG["PLM"] to a checkpoint of another size still works.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_id, mask):
        """Return raw (unnormalised) class logits for a batch of token ids and masks."""
        # With return_dict=False the encoder returns a tuple; the second element is
        # the pooled sentence representation.
        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,
                                     return_dict=False
                                    )
        dropout_output = self.dropout(pooled_output)
        # No activation on the output: CrossEntropyLoss applies log-softmax itself and
        # expects unbounded logits. A ReLU here clamps every negative logit to zero and
        # blocks the gradient for those classes. The removed layer had no parameters,
        # so previously saved state dicts still load.
        return self.linear(dropout_output)

## Train

In [None]:
def train(model, optimizer, train_loader, test_loader, device, fold=CFG["NFOLD"]):
    """Fine-tune `model` for CFG["EPOCHS"] epochs and keep the best checkpoint.

    After every epoch the model is scored on `test_loader`; whenever the macro F1
    improves, the weights are written to
    RECORDER_DIR/best_model-Fold-{fold}.pt.

    Args:
        model: Network to train; moved to `device` in place.
        optimizer: Optimizer already bound to `model`'s parameters.
        train_loader: Loader yielding (input_ids, attention_mask, label) batches.
        test_loader: Loader with the same batch layout, used for validation.
        device: Device on which training and validation run.
        fold: Fold identifier used in the log line and the checkpoint file name.

    Returns:
        `model` carrying the weights of its best-scoring epoch, or the string
        "None" when no epoch achieved a validation score above 0.
    """

    model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)

    best_score = 0
    best_model = "None"
    best_path = os.path.join(RECORDER_DIR, f"best_model-Fold-{fold}.pt")
    for epoch_num in range(CFG["EPOCHS"]):

        model.train()
        train_loss = []
        for input_ids, attention_mask, train_label in tqdm(train_loader):

            optimizer.zero_grad()

            train_label = train_label.to(device)
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label.long())
            train_loss.append(batch_loss.item())

            batch_loss.backward()
            optimizer.step()

        val_loss, val_score = validation(model, criterion, test_loader, device)
        print(f'Epoch [{epoch_num}] of {fold}th Fold, Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')

        if best_score < val_score:
            best_model = model
            best_score = val_score
            torch.save(model.state_dict(), best_path)

    # `best_model` is only another reference to `model`, which keeps training after its
    # best epoch. Reload the saved checkpoint so the returned object really holds the
    # best weights rather than those of the final epoch.
    if best_model is model:
        model.load_state_dict(torch.load(best_path, map_location=device))

    return best_model

In [None]:
def competition_metric(true, pred):
    """Return the macro-averaged F1 score of `pred` against `true`."""
    macro_f1 = f1_score(true, pred, average="macro")
    return macro_f1

def validation(model, criterion, test_loader, device):
    """Evaluate `model` on `test_loader` without tracking gradients.

    Args:
        model: Network to evaluate; switched to eval mode.
        criterion: Loss function applied to each batch.
        test_loader: Loader yielding (input_ids, attention_mask, label) batches.
        device: Device the batches are moved to.

    Returns:
        A tuple (per-batch loss values as a list, macro F1 over all samples).
    """
    model.eval()

    losses, predicted, expected = [], [], []
    with torch.no_grad():
        for input_ids, attention_mask, valid_label in tqdm(test_loader):
            labels = valid_label.to(device)
            logits = model(input_ids.to(device), attention_mask.to(device))

            losses.append(criterion(logits, labels.long()).item())

            # The predicted class is the index of the largest output per row.
            predicted.extend(logits.argmax(1).detach().cpu().numpy().tolist())
            expected.extend(labels.detach().cpu().numpy().tolist())

    return losses, competition_metric(expected, predicted)

## Run

In [None]:
from datetime import datetime, timezone, timedelta

PROJECT_DIR = './'
os.chdir(PROJECT_DIR)

# Name the run after its start time at UTC+9 so every run gets its own results folder.
kst = timezone(timedelta(hours=9))
run_started_at = datetime.now(tz=kst)
train_serial = run_started_at.strftime("%Y%m%d_%H%M%S")

# Checkpoints written during training go under results/<train_serial>/.
RECORDER_DIR = os.path.join(PROJECT_DIR, 'results', train_serial)
os.makedirs(RECORDER_DIR, exist_ok=True)

In [None]:
import torch, gc

# Drop unreachable Python objects first, then release the unused memory held by
# PyTorch's CUDA caching allocator.
gc.collect()
# The notebook falls back to the CPU when no GPU is present, and the CUDA memory
# report is only available with a GPU, so guard the CUDA-specific calls.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------

In [None]:
for fold in range(0, CFG['NFOLD']):
    print(f"======== Fold: {fold} =========")

    # Build this fold's own split. Reusing one fixed pair of loaders for every
    # iteration would train each "fold" model on identical data, so the saved
    # checkpoints would not form a real cross-validation ensemble.
    fold_train_df = train_ds[train_ds.Kfold != fold].reset_index(drop=True)
    fold_valid_df = train_ds[train_ds.Kfold == fold].reset_index(drop=True)

    fold_train_loader = DataLoader(CustomDataset(fold_train_df, mode="train"),
                                   batch_size=CFG['BATCH_SIZE'], shuffle=True)
    fold_val_loader = DataLoader(CustomDataset(fold_valid_df, mode="train"),
                                 batch_size=CFG['BATCH_SIZE'], shuffle=False)

    # Start every fold from the pretrained weights with a fresh optimizer.
    model = BaseModel()
    optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])

    infer_model = train(model, optimizer, fold_train_loader, fold_val_loader, device, fold)



Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at tae898/emoberta-large were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at tae898/emoberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1071/

Epoch [0] of 0th Fold, Train Loss : [1.04776] Val Loss : [0.77239] Val F1 Score : [0.51600]


100%|██████████| 1071/1071 [10:46<00:00,  1.66it/s]
100%|██████████| 179/179 [00:34<00:00,  5.16it/s]


Epoch [1] of 0th Fold, Train Loss : [0.79412] Val Loss : [0.72116] Val F1 Score : [0.59225]


100%|██████████| 1071/1071 [10:46<00:00,  1.66it/s]
100%|██████████| 179/179 [00:34<00:00,  5.16it/s]


Epoch [2] of 0th Fold, Train Loss : [0.71718] Val Loss : [0.70572] Val F1 Score : [0.66346]


100%|██████████| 1071/1071 [10:46<00:00,  1.66it/s]
100%|██████████| 179/179 [00:34<00:00,  5.16it/s]


Epoch [3] of 0th Fold, Train Loss : [0.65296] Val Loss : [0.72209] Val F1 Score : [0.64772]


100%|██████████| 1071/1071 [10:46<00:00,  1.66it/s]
100%|██████████| 179/179 [00:34<00:00,  5.16it/s]


Epoch [4] of 0th Fold, Train Loss : [0.60051] Val Loss : [0.72634] Val F1 Score : [0.67382]


Some weights of the model checkpoint at tae898/emoberta-large were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at tae898/emoberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1071/

Epoch [0] of 1th Fold, Train Loss : [1.03822] Val Loss : [0.79029] Val F1 Score : [0.53646]


100%|██████████| 1071/1071 [10:46<00:00,  1.66it/s]
100%|██████████| 179/179 [00:34<00:00,  5.17it/s]


Epoch [1] of 1th Fold, Train Loss : [0.79001] Val Loss : [0.73489] Val F1 Score : [0.65376]


100%|██████████| 1071/1071 [10:46<00:00,  1.66it/s]
100%|██████████| 179/179 [00:34<00:00,  5.17it/s]


Epoch [2] of 1th Fold, Train Loss : [0.71074] Val Loss : [0.71646] Val F1 Score : [0.63861]


100%|██████████| 1071/1071 [10:46<00:00,  1.66it/s]
100%|██████████| 179/179 [00:34<00:00,  5.17it/s]


Epoch [3] of 1th Fold, Train Loss : [0.64848] Val Loss : [0.72617] Val F1 Score : [0.66973]


100%|██████████| 1071/1071 [10:46<00:00,  1.66it/s]
100%|██████████| 179/179 [00:34<00:00,  5.17it/s]


Epoch [4] of 1th Fold, Train Loss : [0.60499] Val Loss : [0.72142] Val F1 Score : [0.67532]


Some weights of the model checkpoint at tae898/emoberta-large were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at tae898/emoberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1071/

Epoch [0] of 2th Fold, Train Loss : [1.02496] Val Loss : [0.78367] Val F1 Score : [0.50725]


 33%|███▎      | 356/1071 [03:35<07:12,  1.65it/s]


KeyboardInterrupt: ignored

## Inference

In [None]:
base_path = f'results/{train_serial}/'
print(base_path)

# One checkpoint per fold, named exactly as the training function saves them. Deriving
# the list from CFG['NFOLD'] keeps it in step with the configured number of folds.
model_paths = [
    base_path + f"best_model-Fold-{fold_id}.pt"
    for fold_id in range(CFG['NFOLD'])
]

results/20230112_145849/


In [None]:
# Read the test utterances to predict on.
test = pd.read_csv('test.csv')

In [None]:
# Give the dataset its own name so `test` keeps referring to the DataFrame; rebinding
# `test` would make this cell fail on a second run (a dataset wrapping a dataset).
test_dataset = CustomDataset(test, mode = "test")
# Inference uses its own fixed batch size of 10 instead of CFG['BATCH_SIZE'], and the
# order is left unshuffled so predictions stay in the order of the rows in `test`.
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size= 10,
                                              shuffle=False)

In [None]:
def inference(model_paths, test_loader, device):
    """Average the outputs of several saved models over the test set.

    Args:
        model_paths: Paths to state-dict checkpoints compatible with BaseModel.
        test_loader: Unshuffled loader yielding (input_ids, attention_mask) batches.
        device: Device the models and batches are placed on.

    Returns:
        Array of shape [num_samples, num_classes] holding the element-wise mean of
        the per-model outputs.
    """

    test_predicts = []

    with torch.no_grad():

        for i, path in enumerate(model_paths):
            test_predict = []

            model = BaseModel().to(device)
            # map_location lets checkpoints written on a GPU load on a CPU-only runtime.
            model.load_state_dict(torch.load(path, map_location=device))
            model.eval()

            print(f"Prediction for model {i+1}")
            for input_ids, attention_mask in tqdm(test_loader):
                input_id = input_ids.to(device)
                mask = attention_mask.to(device)
                y_pred = model(input_id, mask)
                test_predict.append(y_pred.detach().cpu().numpy())

            # Concatenate the list of batches directly. Wrapping it in np.array() first
            # fails on recent NumPy when the last batch is smaller than the others.
            test_predict1 = np.concatenate(test_predict, axis = 0) # test_predict1: [total_bs, 7]
            print(test_predict1.shape)

            test_predicts.append(test_predict1) # test_predicts: [[total_bs, 7],  [total_bs, 7],  .... ]

    # Ensemble by averaging the models' outputs sample by sample.
    test_predicts_final = np.mean(test_predicts, axis=0)

    return test_predicts_final

In [None]:
# Run every checkpoint in model_paths over the test loader and average their outputs.
preds = inference(model_paths, test_dataloader, device)

In [None]:
# Pick the highest-scoring class per row, then map the integer codes back to label names.
n_preds = np.argmax(preds, axis = 1)
N_preds = le.inverse_transform(n_preds) 

## Submit

In [None]:
# Load the submission template.
submit = pd.read_csv('sample_submission.csv')
submit.head()

In [None]:
# Fill in the predicted labels.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv — TODO confirm.
submit['Target'] = N_preds
submit.head()

In [None]:
# Write the submission file without the index column. The path is a plain string: the
# f-prefix was dropped because the literal has no placeholders.
submit.to_csv("results/submit.csv", index=False)