<a href="https://colab.research.google.com/github/Yewon9/Emotion_Recognition/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORT

In [120]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [121]:
!pip install transformers sentencepiece torch



In [122]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import BertTokenizerFast
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from torch.utils.data import Dataset
import numpy as np
from torch.optim import AdamW
from transformers import BertModel
from torch.optim.lr_scheduler import ReduceLROnPlateau

# DATA

In [139]:
df = pd.read_csv('emotion_tokenized_data.csv')
df

Unnamed: 0,Emotion,Path,Length,Text,cleaned_text,tokenized
0,angry,/content/drive/MyDrive/project/data_aihub/4/5e...,4.565333,10년도 넘었어 고등학교 동창,10년도 넘었어 고등학교 동창,"{'input_ids': tensor([[ 2, 0, 0, 5441,..."
1,angry,/content/drive/MyDrive/project/data_aihub/4/5e...,8.576000,10분마다 연락을 해 봤지 근데 아직도이 녀석 온다고 말하지 오지 않고 있어,10분마다 연락을 해 봤지 근데 아직도이 녀석 온다고 말하지 오지 않고 있어,"{'input_ids': tensor([[ 2, 0, 0, 7848,..."
2,angry,/content/drive/MyDrive/project/data_aihub/4/5e...,7.040000,1년 채우고 그만두려고 했는데 아 지금 같아서는 진짜 회사 옮기고 싶다,1년 채우고 그만두려고 했는데 아 지금 같아서는 진짜 회사 옮기고 싶다,"{'input_ids': tensor([[ 2, 0, 0, 0,..."
3,angry,/content/drive/MyDrive/project/data_aihub/4/5e...,5.034667,1시간 기다렸나 봐 다리도 아프고 짜증 나,1시간 기다렸나 봐 다리도 아프고 짜증 나,"{'input_ids': tensor([[ 2, 0, 0, 6393,..."
4,angry,/content/drive/MyDrive/project/data_aihub/4/5e...,4.266667,1시간 기다렸는데 아직도 안 와,1시간 기다렸는데 아직도 안 와,"{'input_ids': tensor([[ 2, 0, 0, 0,..."
...,...,...,...,...,...,...
14377,sadness,/content/drive/MyDrive/project/data_aihub/4/5e...,10.880000,힘들다 그만둬야 할까 어떻게 해야 될까 정말 고민이 많이 되는 거 같아 어떡하면 좋겠니,힘들다 그만둬야 할까 어떻게 해야 될까 정말 고민이 많이 되는 거 같아 어떡하면 좋겠니,"{'input_ids': tensor([[ 2, 0, 0, 0,..."
14378,sadness,/content/drive/MyDrive/project/data_aihub/5_2/...,4.821333,힘들다 힘들어,힘들다 힘들어,"{'input_ids': tensor([[2, 0, 0, 3, 1, 1, 1, 1,..."
14379,sadness,/content/drive/MyDrive/project/data_aihub/4/5e...,5.248000,힘들어 다른 회사 또 준비를 할 거야 나는,힘들어 다른 회사 또 준비를 할 거야 나는,"{'input_ids': tensor([[ 2, 0, 5783, 7957,..."
14380,sadness,/content/drive/MyDrive/project/data_aihub/4/5e...,3.114667,힘들어하고 나도 너무 힘들어,힘들어하고 나도 너무 힘들어,"{'input_ids': tensor([[2, 0, 0, 0, 0, 3, 1, 1,..."


In [140]:
label_mapping = {'happiness': 0, 'sadness': 1, 'angry': 2, 'neutral': 3, 'fear': 4, 'disgust': 5}

In [141]:
df['Emotion'] = df['Emotion'].map(label_mapping)

In [142]:
df['Emotion'].unique()

array([2, 5, 4, 0, 3, 1])

# KoBERT

In [152]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [143]:
class EmotionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]["cleaned_text"]
        emotion = self.data.iloc[index]["Emotion"]

        # KoBERT 토큰화
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': torch.tensor(emotion, dtype=torch.long)  # 감정 레이블
        }

In [144]:
class KoBERTEmotionClassifier(nn.Module):
    def __init__(self, n_classes):
        super(KoBERTEmotionClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('monologg/kobert')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [145]:
def train_epoch(model, data_loader, optimizer, device):
    model = model.train()
    total_loss = 0

    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        labels = data['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        loss = nn.CrossEntropyLoss()(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)

In [146]:
def eval_model(model, data_loader, device):
    model = model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            labels = data['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            loss = nn.CrossEntropyLoss()(outputs, labels)

            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)

    return correct_predictions.double() / len(data_loader.dataset), total_loss / len(data_loader)

In [147]:
BATCH_SIZE = 16
MAX_LEN = 64
EPOCHS = 30
N_CLASSES = len(set(df['Emotion']))

In [148]:
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [149]:
train_dataset = EmotionDataset(df, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [150]:
model = KoBERTEmotionClassifier(n_classes=N_CLASSES).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

In [153]:
for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    accuracy, val_loss = eval_model(model, train_loader, device)

    print(f'Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss}, Validation Accuracy: {accuracy}, Validation Loss: {val_loss}')

Epoch 1/30, Train Loss: 1.7021667681493537, Validation Accuracy: 0.40091781393408427, Validation Loss: 1.4992945049708093
Epoch 2/30, Train Loss: 1.4828873832313316, Validation Accuracy: 0.4827562230565985, Validation Loss: 1.3365545149507194
Epoch 3/30, Train Loss: 1.3840375947740107, Validation Accuracy: 0.5157836184119037, Validation Loss: 1.2593018608178128
Epoch 4/30, Train Loss: 1.3035063598392007, Validation Accuracy: 0.553191489361702, Validation Loss: 1.1607770812392102
Epoch 5/30, Train Loss: 1.2250178917761771, Validation Accuracy: 0.5812126268947295, Validation Loss: 1.0833046991581117
Epoch 6/30, Train Loss: 1.1615974574916488, Validation Accuracy: 0.6111806424697538, Validation Loss: 1.002157277936267
Epoch 7/30, Train Loss: 1.0937285200175773, Validation Accuracy: 0.6280767626199415, Validation Loss: 0.959095823320584
Epoch 8/30, Train Loss: 1.0388928636063193, Validation Accuracy: 0.6503963287442636, Validation Loss: 0.8927882678715617
Epoch 9/30, Train Loss: 0.98746638

In [154]:
final_accuracy, final_loss = eval_model(model, train_loader, device)
print(f"Final Accuracy: {final_accuracy}, Final Loss: {final_loss}")

Final Accuracy: 0.7341816159087748, Final Loss: 0.6269935143073488


# SVM, Logistic Regression, Random Forest

In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df['cleaned_text'], df['Emotion'], test_size=0.2, random_state=42)

In [31]:
vectorizer = TfidfVectorizer(max_features = 5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [32]:
svm = SVC()
svm.fit(X_train, train_labels)
svm_preds = svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(test_labels, svm_preds))
print("SVM F1 Score:", f1_score(test_labels, svm_preds, average="weighted"))

SVM Accuracy: 0.7413972888425443
SVM F1 Score: 0.7439418678902551


In [33]:
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train, train_labels)
log_reg_preds = log_reg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(test_labels, log_reg_preds))
print("Logistic Regression F1 Score:", f1_score(test_labels, log_reg_preds, average="weighted"))

Logistic Regression Accuracy: 0.721584984358707
Logistic Regression F1 Score: 0.7224331753254929


In [34]:
rf = RandomForestClassifier()
rf.fit(X_train, train_labels)
rf_preds = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(test_labels, rf_preds))
print("Random Forest F1 Score:", f1_score(test_labels, rf_preds, average="weighted"))

Random Forest Accuracy: 0.708029197080292
Random Forest F1 Score: 0.7085823365525048
