<a href="https://colab.research.google.com/github/Yewon9/Emotion_Recognition/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORT

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [100]:
!pip install transformers sentencepiece torch



In [101]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import BertTokenizerFast
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from torch.utils.data import Dataset
import numpy as np
from torch.optim import AdamW
from transformers import BertModel
from torch.optim.lr_scheduler import ReduceLROnPlateau

# DATA

In [102]:
df = pd.read_csv('final_data.csv')
df

Unnamed: 0,Emotion,Path,Length,Text,cleaned_text,tokenized_text
0,angry,/content/drive/MyDrive/project/data_aihub/4/5e...,4.565333,10년도 넘었어 고등학교 동창,10년도 넘었어 고등학교 동창,"['10년', '도', '넘다', '고등학교', '동창']"
1,angry,/content/drive/MyDrive/project/data_aihub/4/5e...,8.576000,10분마다 연락을 해 봤지 근데 아직도이 녀석 온다고 말하지 오지 않고 있어,10분마다 연락을 해 봤지 근데 아직도이 녀석 온다고 말하지 오지 않고 있어,"['10분', '마다', '연락', '을', '해', '보다', '근데', '아직도..."
2,angry,/content/drive/MyDrive/project/data_aihub/4/5e...,7.040000,1년 채우고 그만두려고 했는데 아 지금 같아서는 진짜 회사 옮기고 싶다,1년 채우고 그만두려고 했는데 아 지금 같아서는 진짜 회사 옮기고 싶다,"['1년', '채우다', '그만두다', '하다', '아', '지금', '같다', '..."
3,angry,/content/drive/MyDrive/project/data_aihub/4/5e...,5.034667,1시간 기다렸나 봐 다리도 아프고 짜증 나,1시간 기다렸나 봐 다리도 아프고 짜증 나,"['1시간', '기다리다', '보다', '다리', '도', '아프다', '짜증', ..."
4,angry,/content/drive/MyDrive/project/data_aihub/4/5e...,4.266667,1시간 기다렸는데 아직도 안 와,1시간 기다렸는데 아직도 안 와,"['1시간', '기다리다', '아직도', '안', '오다']"
...,...,...,...,...,...,...
14377,sadness,/content/drive/MyDrive/project/data_aihub/4/5e...,10.880000,힘들다 그만둬야 할까 어떻게 해야 될까 정말 고민이 많이 되는 거 같아 어떡하면 좋겠니,힘들다 그만둬야 할까 어떻게 해야 될까 정말 고민이 많이 되는 거 같아 어떡하면 좋겠니,"['힘들다', '그만두다', '하다', '어떻다', '하다', '되다', '정말',..."
14378,sadness,/content/drive/MyDrive/project/data_aihub/5_2/...,4.821333,힘들다 힘들어,힘들다 힘들어,"['힘들다', '힘들다']"
14379,sadness,/content/drive/MyDrive/project/data_aihub/4/5e...,5.248000,힘들어 다른 회사 또 준비를 할 거야 나는,힘들어 다른 회사 또 준비를 할 거야 나는,"['힘들다', '다른', '회사', '또', '준비', '를', '하다', '거야'..."
14380,sadness,/content/drive/MyDrive/project/data_aihub/4/5e...,3.114667,힘들어하고 나도 너무 힘들어,힘들어하고 나도 너무 힘들어,"['힘들다', '나다', '너무', '힘들다']"


In [103]:
label_encoder = LabelEncoder()
df['Emotion'] = label_encoder.fit_transform(df['Emotion'])

In [104]:
df['Emotion'].unique()

array([0, 1, 2, 3, 4, 5])

In [105]:
emotion_counts = df['Emotion'].value_counts()
emotion_counts

Unnamed: 0_level_0,count
Emotion,Unnamed: 1_level_1
4,2489
3,2464
1,2413
0,2403
5,2392
2,2221


In [66]:
#train_texts, test_texts, train_labels, test_labels = train_test_split(df['cleaned_text'], df['Emotion'], test_size=0.2, random_state=42)

# KoBERT

In [106]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [107]:
BATCH_SIZE = 16
MAX_LEN = 64
EPOCHS = 50  # Early Stopping으로 중간에 멈출 수 있도록 크게 설정
LEARNING_RATE = 2e-5
N_CLASSES = len(set(df['Emotion']))  # 감정 레이블 수

In [108]:
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [109]:
class EmotionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]["cleaned_text"]
        emotion = self.data.iloc[index]["Emotion"]

        # KoBERT 토큰화
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': torch.tensor(emotion, dtype=torch.long)  # 감정 레이블
        }

In [110]:
class KoBERTEmotionClassifier(nn.Module):
    def __init__(self, n_classes):
        super(KoBERTEmotionClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('monologg/kobert')
        self.drop = nn.Dropout(p=0.3)  # Dropout 비율 조정
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [111]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = EmotionDataset(train_df, tokenizer, MAX_LEN)
val_dataset = EmotionDataset(val_df, tokenizer, MAX_LEN)

In [112]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [113]:
model = KoBERTEmotionClassifier(n_classes=N_CLASSES).to(device)

In [114]:
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters() if "encoder.layer" not in n], "lr": LEARNING_RATE},
    {"params": [p for n, p in model.named_parameters() if "encoder.layer" in n], "lr": LEARNING_RATE * 0.5}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)

In [115]:
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True)

In [116]:
def train_epoch(model, data_loader, optimizer, device):
    model = model.train()
    total_loss = 0

    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        labels = data['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        loss = nn.CrossEntropyLoss()(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)

In [117]:
def eval_model(model, data_loader, device):
    model = model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            labels = data['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            loss = nn.CrossEntropyLoss()(outputs, labels)

            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)

    return correct_predictions.double() / len(data_loader.dataset), total_loss / len(data_loader)

In [118]:
best_accuracy = 0
patience = 3  # Early Stopping patience 설정
early_stopping_counter = 0

In [119]:
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")

    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_accuracy, val_loss = eval_model(model, val_loader, device)

    print(f"Train Loss: {train_loss}, Validation Accuracy: {val_accuracy}, Validation Loss: {val_loss}")

    # 스케줄러 호출
    scheduler.step(val_loss)

    # Best Model 저장
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_model = model.state_dict()
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1

    # Early Stopping 조건 체크
    if early_stopping_counter >= patience:
        print("Early stopping triggered.")
        break

Epoch 1/50
Train Loss: 1.7992483493354585, Validation Accuracy: 0.17031630170316303, Validation Loss: 1.7983086572753058
Epoch 2/50
Train Loss: 1.8007528997129865, Validation Accuracy: 0.17031630170316303, Validation Loss: 1.7915170007281833
Epoch 3/50
Train Loss: 1.7978847806652387, Validation Accuracy: 0.17274939172749393, Validation Loss: 1.7933992584546408
Epoch 4/50
Train Loss: 1.7993118560976453, Validation Accuracy: 0.1616266944734098, Validation Loss: 1.7944509393639034
Epoch 5/50
Train Loss: 1.7977581116888257, Validation Accuracy: 0.17031630170316303, Validation Loss: 1.7931141217549642
Epoch 6/50
Train Loss: 1.7940685543749066, Validation Accuracy: 0.17935349322210636, Validation Loss: 1.7906808469030593
Epoch 7/50
Train Loss: 1.7945228671034177, Validation Accuracy: 0.16962113312478277, Validation Loss: 1.7904384983910455
Epoch 8/50


KeyboardInterrupt: 

In [None]:
model.load_state_dict(best_model)
print("Training completed.")

# SVM, Logistic Regression, Random Forest

In [31]:
vectorizer = TfidfVectorizer(max_features = 5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [32]:
svm = SVC()
svm.fit(X_train, train_labels)
svm_preds = svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(test_labels, svm_preds))
print("SVM F1 Score:", f1_score(test_labels, svm_preds, average="weighted"))

SVM Accuracy: 0.7413972888425443
SVM F1 Score: 0.7439418678902551


In [33]:
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train, train_labels)
log_reg_preds = log_reg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(test_labels, log_reg_preds))
print("Logistic Regression F1 Score:", f1_score(test_labels, log_reg_preds, average="weighted"))

Logistic Regression Accuracy: 0.721584984358707
Logistic Regression F1 Score: 0.7224331753254929


In [34]:
rf = RandomForestClassifier()
rf.fit(X_train, train_labels)
rf_preds = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(test_labels, rf_preds))
print("Random Forest F1 Score:", f1_score(test_labels, rf_preds, average="weighted"))

Random Forest Accuracy: 0.708029197080292
Random Forest F1 Score: 0.7085823365525048


# LSTM

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts.tolist(), padding=True, truncation=True, max_length=512)
test_encodings = tokenizer(test_texts.tolist(), padding=True, truncation=True, max_length=512)

In [7]:
X_train = torch.tensor(train_encodings['input_ids'])
X_test = torch.tensor(test_encodings['input_ids'])
y_train = torch.tensor(LabelEncoder().fit_transform(train_labels))
y_test = torch.tensor(LabelEncoder().fit_transform(test_labels))

In [8]:
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)

In [9]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        final_feature_map = lstm_out[:, -1, :]
        return self.fc(final_feature_map)

In [10]:
input_dim = 30522  # 예: 토크나이저의 단어 사전 크기
hidden_dim = 256
output_dim = len(df['Emotion'].unique())
n_layers = 2
model = LSTMClassifier(input_dim, hidden_dim, output_dim, n_layers).to(device)

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 5

In [14]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for inputs, labels in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{num_epochs}"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")



Epoch 1/5, Loss: 1.7954696612225638




Epoch 2/5, Loss: 1.7923242264323764




Epoch 3/5, Loss: 1.7919709934128656




Epoch 4/5, Loss: 1.792147852314843


                                                                       

Epoch 5/5, Loss: 1.791700851254993




In [15]:
def calculate_accuracy(model, data_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in tqdm(data_loader, desc="Evaluating"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    return accuracy_score(all_labels, all_preds)

In [16]:
train_accuracy = calculate_accuracy(model, train_loader)
test_accuracy = calculate_accuracy(model, test_loader)

Evaluating:   0%|          | 0/360 [00:00<?, ?it/s]


NameError: name 'device' is not defined

In [None]:
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")