<a href="https://colab.research.google.com/github/Yewon9/Emotion_Recognition_STT/blob/main/Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORT

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch
!pip install transformers
!pip install kobert-transformers
!pip install sentencepiece

Collecting kobert-transformers
  Downloading kobert_transformers-0.6.0-py3-none-any.whl.metadata (7.3 kB)
Downloading kobert_transformers-0.6.0-py3-none-any.whl (12 kB)
Installing collected packages: kobert-transformers
Successfully installed kobert-transformers-0.6.0


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
import torch.nn.functional as F
import pandas as pd

# DATA

In [4]:
emotion_data = pd.read_csv('emotion_tokenized_data.csv')
emotion_data

Unnamed: 0,Emotion,Path,Text,cleaned_text,tokenized
0,disgust,/content/drive/MyDrive/project/data_aihub/4/5e...,고등학교 동창인데 아 이렇게 더럽게 쓸 줄은 몰랐어,고등학교 동창인데 아 이렇게 더럽게 쓸 줄은 몰랐어,"{'input_ids': tensor([[ 2, 5441, 0, 6797,..."
1,sad,/content/drive/MyDrive/project/data_aihub/4/5e...,그럴 시간도 없다,그럴 시간도 없다,"{'input_ids': tensor([[2, 0, 0, 0, 3, 1, 1, 1,..."
2,neutral,/content/drive/MyDrive/project/data_aihub/4/5e...,그래 고마워,그래 고마워,"{'input_ids': tensor([[ 2, 5540, 0, 3,..."
3,angry,/content/drive/MyDrive/project/data_aihub/4/5e...,맨날 그래 얘,맨날 그래 얘,"{'input_ids': tensor([[ 2, 0, 5540, 6854,..."
4,surprise,/content/drive/MyDrive/project/data_aihub/4/5e...,일주일에 다섯 번을 먹는다니까,일주일에 다섯 번을 먹는다니까,"{'input_ids': tensor([[2, 0, 0, 0, 0, 3, 1, 1,..."
...,...,...,...,...,...
11717,fear,/content/drive/MyDrive/project/KEMDy20/wav/Ses...,무서운데,무서운데,"{'input_ids': tensor([[2, 0, 3, 1, 1, 1, 1, 1,..."
11718,neutral,/content/drive/MyDrive/project/KEMDy20/wav/Ses...,그거 뭐야,그거 뭐야,"{'input_ids': tensor([[2, 0, 0, 3, 1, 1, 1, 1,..."
11719,surprise,/content/drive/MyDrive/project/KEMDy20/wav/Ses...,거기 안에다가 AI 넣으면 사람이야,거기 안에다가 ai 넣으면 사람이야,"{'input_ids': tensor([[2, 0, 0, 0, 0, 0, 3, 1,..."
11720,neutral,/content/drive/MyDrive/project/KEMDy20/wav/Ses...,그치 그,그치 그,"{'input_ids': tensor([[ 2, 0, 5538, 3,..."


In [5]:
label_mapping = {'happy': 0, 'sad': 1, 'angry': 2, 'neutral': 3, 'fear': 4, 'disgust': 5, 'surprise': 6}

In [6]:
emotion_data['Emotion'] = emotion_data['Emotion'].map(label_mapping)

# KoBERT

In [7]:
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [8]:
class EmotionDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx]['cleaned_text']
        label = self.df.iloc[idx]['Emotion']

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens = True,
            max_length = self.max_len,
            return_token_type_ids = False,
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt',
            truncation = True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [9]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = EmotionDataset(df, tokenizer, max_len)
    return DataLoader(ds, batch_size = batch_size, num_workers = 4)

In [10]:
train_data_loader = create_data_loader(emotion_data, tokenizer, max_len = 64, batch_size = 16)

In [11]:
model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels = len(emotion_data['Emotion'].unique()))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
optimizer = AdamW(model.parameters(), lr = 2e-5, correct_bias = False)



In [13]:
def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids = input_ids, attention_mask = attention_mask, labels = labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

In [14]:
epochs = 3
for epoch in range(epochs):
    loss = train_epoch(model, train_data_loader, optimizer, device)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss}')

Epoch 1/3, Loss: 1.6784866470919788
Epoch 2/3, Loss: 1.7206479968022845
Epoch 3/3, Loss: 1.7504840440346794


In [15]:
def eval_model(model, data_loader, device):
    model.eval()
    total_acc = 0
    total_loss = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids = input_ids, attention_mask = attention_mask, labels = labels)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim = 1)
            total_acc += torch.sum(preds == labels)

    return total_acc.double() / len(data_loader.dataset), total_loss / len(data_loader)

In [16]:
accuracy, eval_loss = eval_model(model, train_data_loader, device)
print(f'Accuracy: {accuracy}, Loss: {eval_loss}')

Accuracy: 0.12386964681794917, Loss: 3.0646920973818754
