<a href="https://colab.research.google.com/github/Yewon9/Emotion_Recognition_STT/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!pip install transformers sentencepiece torch



In [6]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
df = pd.read_csv('emotion_tokenized_data.csv')
df

Unnamed: 0,Emotion,Path,Text,cleaned_text,tokenized
0,angry,/content/drive/MyDrive/project/data_aihub/4/5e...,약속 있어서 친구 기다리고 있는데 아직 안 왔어,약속 있어서 친구 기다리고 있는데 아직 안 왔어,"{'input_ids': tensor([[ 2, 0, 0, 0,..."
1,sad,/content/drive/MyDrive/project/data_aihub/4/5e...,나 결국 여자 친구와 헤어졌어,나 결국 여자 친구와 헤어졌어,"{'input_ids': tensor([[ 2, 5655, 0, 6924,..."
2,sad,/content/drive/MyDrive/project/data_aihub/5/5e...,이 회사 들어올 때 얼마나 힘들었는지 알잖아 조금만 더 버텨 보고 싶어,이 회사 들어올 때 얼마나 힘들었는지 알잖아 조금만 더 버텨 보고 싶어,"{'input_ids': tensor([[ 2, 7096, 7957, 0,..."
3,sad,/content/drive/MyDrive/project/data_aihub/4/5e...,회초리도 맞고 빠따도 맞아서 혼났지,회초리도 맞고 빠따도 맞아서 혼났지,"{'input_ids': tensor([[2, 0, 0, 0, 0, 0, 3, 1,..."
4,sad,/content/drive/MyDrive/project/data_aihub/4/5e...,거의 다 떨어져가,거의 다 떨어져가,"{'input_ids': tensor([[ 2, 0, 5782, 0,..."
...,...,...,...,...,...
16551,surprise,/content/drive/MyDrive/project/data_aihub/5_2/...,그런 영상도 있었어,그런 영상도 있었어,"{'input_ids': tensor([[ 2, 5541, 0, 0,..."
16552,surprise,/content/drive/MyDrive/project/data_aihub/5_2/...,야 나 이벤트 당첨됐어,야 나 이벤트 당첨됐어,"{'input_ids': tensor([[ 2, 6844, 5655, 0,..."
16553,surprise,/content/drive/MyDrive/project/data_aihub/4/5e...,엄청 놀랐어,엄청 놀랐어,"{'input_ids': tensor([[2, 0, 0, 3, 1, 1, 1, 1,..."
16554,surprise,/content/drive/MyDrive/project/data_aihub/5_2/...,갑자기 해피가 목줄을 끊고 막 뛰어 가잖아 그래서 못 잡을까 봐 너무 놀랐어,갑자기 해피가 목줄을 끊고 막 뛰어 가잖아 그래서 못 잡을까 봐 너무 놀랐어,"{'input_ids': tensor([[ 2, 0, 0, 0,..."


In [19]:
label_mapping = {'happy': 0, 'sad': 1, 'angry': 2, 'neutral': 3, 'fear': 4, 'disgust': 5, 'surprise': 6}

In [20]:
df['Emotion'] = df['Emotion'].map(label_mapping)

In [8]:
class EmotionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]["cleaned_text"]
        emotion = self.data.iloc[index]["Emotion"]

        # KoBERT 토큰화
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': torch.tensor(emotion, dtype=torch.long)  # 감정 레이블
        }

In [9]:
class KoBERTEmotionClassifier(nn.Module):
    def __init__(self, n_classes):
        super(KoBERTEmotionClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('monologg/kobert')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [10]:
def train_epoch(model, data_loader, optimizer, device):
    model = model.train()
    total_loss = 0

    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        labels = data['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        loss = nn.CrossEntropyLoss()(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)

In [11]:
def eval_model(model, data_loader, device):
    model = model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            labels = data['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            loss = nn.CrossEntropyLoss()(outputs, labels)

            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)

    return correct_predictions.double() / len(data_loader.dataset), total_loss / len(data_loader)

In [14]:
BATCH_SIZE = 16
MAX_LEN = 64
EPOCHS = 10
N_CLASSES = len(set(df['Emotion']))

In [15]:
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [16]:
train_dataset = EmotionDataset(df, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [17]:
model = KoBERTEmotionClassifier(n_classes=N_CLASSES).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]



In [21]:
for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    accuracy, val_loss = eval_model(model, train_loader, device)

    print(f'Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss}, Validation Accuracy: {accuracy}, Validation Loss: {val_loss}')

Epoch 1/10, Train Loss: 1.9146511157353718, Validation Accuracy: 0.2564629137472819, Validation Loss: 1.8405430794913988
Epoch 2/10, Train Loss: 1.7847372491578548, Validation Accuracy: 0.38119110896351777, Validation Loss: 1.6570483928717277
Epoch 3/10, Train Loss: 1.652733079592387, Validation Accuracy: 0.4269147136989611, Validation Loss: 1.5404509318623565
Epoch 4/10, Train Loss: 1.5463326753625548, Validation Accuracy: 0.48949021502778445, Validation Loss: 1.3988793852248629
Epoch 5/10, Train Loss: 1.4534983325119755, Validation Accuracy: 0.5194491423049046, Validation Loss: 1.3121600261632946
Epoch 6/10, Train Loss: 1.3699607435631866, Validation Accuracy: 0.5568978014013046, Validation Loss: 1.232083277022781
Epoch 7/10, Train Loss: 1.3023654811336223, Validation Accuracy: 0.5837762744624305, Validation Loss: 1.1465725967561564
Epoch 8/10, Train Loss: 1.2432146596447857, Validation Accuracy: 0.6028630103889828, Validation Loss: 1.0789618409198263
Epoch 9/10, Train Loss: 1.189148