<a href="https://colab.research.google.com/github/Yewon9/Emotion_Recognition_STT/blob/main/Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORT

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch
!pip install transformers
!pip install kobert-transformers
!pip install sentencepiece
!pip install scikit-learn

Collecting kobert-transformers
  Downloading kobert_transformers-0.6.0-py3-none-any.whl.metadata (7.3 kB)
Downloading kobert_transformers-0.6.0-py3-none-any.whl (12 kB)
Installing collected packages: kobert-transformers
Successfully installed kobert-transformers-0.6.0


In [46]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
import torch.nn.functional as F
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# DATA

In [4]:
emotion_data = pd.read_csv('emotion_tokenized_data.csv')
emotion_data

Unnamed: 0,Emotion,Path,Text,cleaned_text,tokenized
0,disgust,/content/drive/MyDrive/project/data_aihub/4/5e...,고등학교 동창인데 아 이렇게 더럽게 쓸 줄은 몰랐어,고등학교 동창인데 아 이렇게 더럽게 쓸 줄은 몰랐어,"{'input_ids': tensor([[ 2, 5441, 0, 6797,..."
1,sad,/content/drive/MyDrive/project/data_aihub/4/5e...,그럴 시간도 없다,그럴 시간도 없다,"{'input_ids': tensor([[2, 0, 0, 0, 3, 1, 1, 1,..."
2,neutral,/content/drive/MyDrive/project/data_aihub/4/5e...,그래 고마워,그래 고마워,"{'input_ids': tensor([[ 2, 5540, 0, 3,..."
3,angry,/content/drive/MyDrive/project/data_aihub/4/5e...,맨날 그래 얘,맨날 그래 얘,"{'input_ids': tensor([[ 2, 0, 5540, 6854,..."
4,surprise,/content/drive/MyDrive/project/data_aihub/4/5e...,일주일에 다섯 번을 먹는다니까,일주일에 다섯 번을 먹는다니까,"{'input_ids': tensor([[2, 0, 0, 0, 0, 3, 1, 1,..."
...,...,...,...,...,...
11717,fear,/content/drive/MyDrive/project/KEMDy20/wav/Ses...,무서운데,무서운데,"{'input_ids': tensor([[2, 0, 3, 1, 1, 1, 1, 1,..."
11718,neutral,/content/drive/MyDrive/project/KEMDy20/wav/Ses...,그거 뭐야,그거 뭐야,"{'input_ids': tensor([[2, 0, 0, 3, 1, 1, 1, 1,..."
11719,surprise,/content/drive/MyDrive/project/KEMDy20/wav/Ses...,거기 안에다가 AI 넣으면 사람이야,거기 안에다가 ai 넣으면 사람이야,"{'input_ids': tensor([[2, 0, 0, 0, 0, 0, 3, 1,..."
11720,neutral,/content/drive/MyDrive/project/KEMDy20/wav/Ses...,그치 그,그치 그,"{'input_ids': tensor([[ 2, 0, 5538, 3,..."


In [5]:
label_mapping = {'happy': 0, 'sad': 1, 'angry': 2, 'neutral': 3, 'fear': 4, 'disgust': 5, 'surprise': 6}

In [6]:
emotion_data['Emotion'] = emotion_data['Emotion'].map(label_mapping)

# KoBERT

In [7]:
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [8]:
class EmotionDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx]['cleaned_text']
        label = self.df.iloc[idx]['Emotion']

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens = True,
            max_length = self.max_len,
            return_token_type_ids = False,
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt',
            truncation = True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [9]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = EmotionDataset(df, tokenizer, max_len)
    return DataLoader(ds, batch_size = batch_size, num_workers = 4)

In [10]:
train_data_loader = create_data_loader(emotion_data, tokenizer, max_len = 64, batch_size = 16)

In [11]:
model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels = len(emotion_data['Emotion'].unique()))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
optimizer = AdamW(model.parameters(), lr = 5e-6, correct_bias = False)



In [18]:
def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids = input_ids, attention_mask = attention_mask, labels = labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

In [22]:
epochs = 10
for epoch in range(epochs):
    loss = train_epoch(model, train_data_loader, optimizer, device)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss}')

Epoch 1/10, Loss: 1.7466356766500382
Epoch 2/10, Loss: 1.7147604104278846
Epoch 3/10, Loss: 1.6889800559449815
Epoch 4/10, Loss: 1.6747060719811413
Epoch 5/10, Loss: 1.745796323391783
Epoch 6/10, Loss: 1.6746181626111716
Epoch 7/10, Loss: 1.7588608002272341
Epoch 8/10, Loss: 1.6941215732575763
Epoch 9/10, Loss: 1.7193126027125454
Epoch 10/10, Loss: 1.6702555670094197


In [23]:
def eval_model(model, data_loader, device):
    model.eval()
    total_acc = 0
    total_loss = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids = input_ids, attention_mask = attention_mask, labels = labels)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim = 1)
            total_acc += torch.sum(preds == labels)

    return total_acc.double() / len(data_loader.dataset), total_loss / len(data_loader)

In [24]:
accuracy, eval_loss = eval_model(model, train_data_loader, device)
print(f'Accuracy: {accuracy}, Loss: {eval_loss}')

Accuracy: 0.12386964681794917, Loss: 2.789454570327254


# Logistic Regression

In [28]:
X = emotion_data['cleaned_text']
y = emotion_data['Emotion']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [30]:
vectorizer = TfidfVectorizer(max_features = 5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [31]:
lr_model = LogisticRegression(max_iter = 1000)
lr_model.fit(X_train_tfidf, y_train)

In [32]:
y_pred = lr_model.predict(X_test_tfidf)

In [33]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.7023454157782516
              precision    recall  f1-score   support

           0       0.73      0.74      0.73       327
           1       0.60      0.70      0.64       368
           2       0.77      0.71      0.74       418
           3       0.46      0.45      0.45       296
           4       0.80      0.82      0.81       354
           5       0.78      0.82      0.80       379
           6       0.81      0.59      0.68       203

    accuracy                           0.70      2345
   macro avg       0.70      0.69      0.69      2345
weighted avg       0.71      0.70      0.70      2345



# SVM

In [35]:
X = emotion_data['cleaned_text']
y = emotion_data['Emotion']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [37]:
vectorizer = TfidfVectorizer(max_features = 5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [38]:
svm_model = SVC(kernel = 'linear', C = 1.0)
svm_model.fit(X_train_tfidf, y_train)

In [39]:
y_pred = svm_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

SVM Accuracy: 0.7023454157782516
              precision    recall  f1-score   support

           0       0.76      0.74      0.75       327
           1       0.62      0.68      0.65       368
           2       0.76      0.71      0.73       418
           3       0.43      0.50      0.46       296
           4       0.82      0.82      0.82       354
           5       0.78      0.77      0.77       379
           6       0.79      0.63      0.70       203

    accuracy                           0.70      2345
   macro avg       0.71      0.69      0.70      2345
weighted avg       0.71      0.70      0.71      2345



# Random Forest

In [41]:
X = emotion_data['cleaned_text']
y = emotion_data['Emotion']

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [43]:
vectorizer = TfidfVectorizer(max_features = 5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [44]:
rf_model = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf_model.fit(X_train_tfidf, y_train)

In [45]:
y_pred = rf_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Random Forest Accuracy: 0.6460554371002132
              precision    recall  f1-score   support

           0       0.69      0.69      0.69       327
           1       0.56      0.60      0.58       368
           2       0.76      0.63      0.69       418
           3       0.35      0.45      0.39       296
           4       0.79      0.75      0.77       354
           5       0.74      0.75      0.75       379
           6       0.71      0.59      0.64       203

    accuracy                           0.65      2345
   macro avg       0.66      0.64      0.64      2345
weighted avg       0.66      0.65      0.65      2345



# LSTM

In [47]:
X = emotion_data['cleaned_text']
y = emotion_data['Emotion']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [49]:
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [50]:
max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen = max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen = max_len)

In [51]:
model = Sequential()
model.add(Embedding(input_dim = 5000, output_dim = 128, input_length = max_len))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))



In [52]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [53]:
model.fit(X_train_pad, y_train, epochs = 5, batch_size = 32, validation_data = (X_test_pad, y_test))

Epoch 1/5
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.1618 - loss: -37.2724 - val_accuracy: 0.1569 - val_loss: -95.6190
Epoch 2/5
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.1578 - loss: -116.3191 - val_accuracy: 0.1569 - val_loss: -165.4363
Epoch 3/5
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.1657 - loss: -185.6034 - val_accuracy: 0.1569 - val_loss: -234.4525
Epoch 4/5
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.1670 - loss: -256.1508 - val_accuracy: 0.1569 - val_loss: -302.9210
Epoch 5/5
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.1635 - loss: -327.8466 - val_accuracy: 0.1569 - val_loss: -371.2405


<keras.src.callbacks.history.History at 0x7f5d29eef0a0>

In [54]:
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"LSTM Accuracy: {accuracy}")

[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1451 - loss: -371.1404
LSTM Accuracy: 0.1569296419620514
