In [1]:
import os
import sys
sys.path.append(os.path.abspath('..'))
from utils.data_preprocessing import DataPreprocessor
from transformers import DistilBertTokenizerFast
import pandas as pd
import numpy as np

In [2]:
df = DataPreprocessor('../data/track-b.csv')
df.preprocess()
data = df.data
data

Unnamed: 0,id,text,anger,fear,joy,sadness,surprise
0,eng_train_track_b_00001,colorado middle of nowhere,0,1,0,0,1
1,eng_train_track_b_00002,this involved swimming a pretty large lake tha...,0,2,0,0,0
2,eng_train_track_b_00003,it was one of my most shameful experiences,0,1,0,3,0
3,eng_train_track_b_00004,after all i had vegetables coming out my ears ...,0,0,0,0,0
4,eng_train_track_b_00005,then the screaming started,0,3,0,1,2
...,...,...,...,...,...,...,...
2763,eng_train_track_b_02764,she cants her hip against my waist into my sid...,0,0,2,0,1
2764,eng_train_track_b_02765,i then did the dishes whitened my teeth watche...,0,0,0,0,0
2765,eng_train_track_b_02766,it just kind of gradually vanished over a coup...,0,0,0,0,1
2766,eng_train_track_b_02767,i didnt look out of my hands,0,1,0,0,0


In [10]:
# emotion_cols = ['anger', 'fear', 'joy', 'sadness', 'surprise']
# X = data['text'].values
# y = data[emotion_cols].values


In [20]:
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit


class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts.tolist()
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(label, dtype=torch.float)
        }


X = data["text"]  # This should already be a Series
y = data[["anger", "fear", "joy", "sadness", "surprise"]].values
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)


tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# encodings = tokenizer(
#     list(X),
#     truncation=True,
#     padding=True,
#     return_tensors='pt'
# )

for train_idx, val_idx in msss.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
# train_idx, val_idx = train_test_split(range(len(X)), test_size=0.2, random_state=42)
# train_dataset = EmotionDataset({k: v[train_idx] for k, v in encodings.items()}, y[train_idx])
# val_dataset = EmotionDataset({k: v[val_idx] for k, v in encodings.items()}, y[val_idx])
train_dataset = EmotionDataset(X_train, y_train, tokenizer)
val_dataset = EmotionDataset(X_val, y_val, tokenizer)



In [21]:
import torch.nn as nn
from transformers import DistilBertModel

class EmotionRegressor(nn.Module):
    def __init__(self, dropout=0.5, num_emotions=5):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(dropout)
        self.regressor = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_emotions)
        )

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = output.last_hidden_state[:, 0]  # [CLS] token
        pooled = self.dropout(pooled)
        return self.regressor(pooled)


In [22]:
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from tqdm import tqdm

def train_model(model, train_dataset, val_dataset, num_epochs=10, batch_size=16, patience=3, learning_rate=2e-5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    loss_fn = nn.MSELoss()

    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=int(0.1 * total_steps), 
        num_training_steps=total_steps
    )

    best_val_loss = float("inf")
    patience_counter = 0
    best_model_state = None

    for epoch in range(num_epochs):
        model.train()
        train_losses = []

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            train_losses.append(loss.item())

        avg_train_loss = np.mean(train_losses)

        # Validation
        model.eval()
        val_losses = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs, labels)
                val_losses.append(loss.item())

        avg_val_loss = np.mean(val_losses)
        print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f} | Val Loss = {avg_val_loss:.4f}")

        # Early Stopping Logic
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            best_model_state = model.state_dict()
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered!")
                break

    # Load best model
    if best_model_state:
        model.load_state_dict(best_model_state)

    return model


In [23]:
model = EmotionRegressor(dropout=0.5)
trained_model = train_model(model, train_dataset, val_dataset, num_epochs=10, batch_size=16, patience=3)


Epoch 1/10: 100%|██████████| 139/139 [00:35<00:00,  3.89it/s]


Epoch 1: Train Loss = 0.6698 | Val Loss = 0.5591


Epoch 2/10: 100%|██████████| 139/139 [00:35<00:00,  3.89it/s]


Epoch 2: Train Loss = 0.4503 | Val Loss = 0.3947


Epoch 3/10: 100%|██████████| 139/139 [00:36<00:00,  3.85it/s]


Epoch 3: Train Loss = 0.3271 | Val Loss = 0.3681


Epoch 4/10: 100%|██████████| 139/139 [00:35<00:00,  3.93it/s]


Epoch 4: Train Loss = 0.2503 | Val Loss = 0.3599


Epoch 5/10: 100%|██████████| 139/139 [00:35<00:00,  3.91it/s]


Epoch 5: Train Loss = 0.2032 | Val Loss = 0.3454


Epoch 6/10: 100%|██████████| 139/139 [00:35<00:00,  3.90it/s]


Epoch 6: Train Loss = 0.1684 | Val Loss = 0.3467


Epoch 7/10: 100%|██████████| 139/139 [00:35<00:00,  3.90it/s]


Epoch 7: Train Loss = 0.1466 | Val Loss = 0.3453


Epoch 8/10: 100%|██████████| 139/139 [00:35<00:00,  3.90it/s]


Epoch 8: Train Loss = 0.1313 | Val Loss = 0.3461


Epoch 9/10: 100%|██████████| 139/139 [00:35<00:00,  3.90it/s]


Epoch 9: Train Loss = 0.1232 | Val Loss = 0.3502


Epoch 10/10: 100%|██████████| 139/139 [00:35<00:00,  3.90it/s]


Epoch 10: Train Loss = 0.1180 | Val Loss = 0.3480
Early stopping triggered!


In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def evaluate_model(model, test_loader):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            all_preds.append(outputs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    return all_preds, all_labels



In [25]:
def print_metrics(preds, labels, emotion_labels=["anger", "fear", "joy", "sadness", "surprise"]):
    for i, emotion in enumerate(emotion_labels):
        mae = mean_absolute_error(labels[:, i], preds[:, i])
        rmse = np.sqrt(mean_squared_error(labels[:, i], preds[:, i]))
        r2 = r2_score(labels[:, i], preds[:, i])
        print(f"\n🧠 Emotion: {emotion}")
        print(f"   - MAE:  {mae:.4f}")
        print(f"   - RMSE: {rmse:.4f}")
        print(f"   - R²:   {r2:.4f}")


In [26]:
# Assuming test_loader is your DataLoader for test set
test_loader = DataLoader(val_dataset, batch_size=16)  # Using validation set as test set for demonstration
preds, labels = evaluate_model(trained_model, test_loader )
print_metrics(preds, labels)



🧠 Emotion: anger
   - MAE:  0.2437
   - RMSE: 0.4697
   - R²:   0.3812

🧠 Emotion: fear
   - MAE:  0.5265
   - RMSE: 0.7099
   - R²:   0.4393

🧠 Emotion: joy
   - MAE:  0.2965
   - RMSE: 0.5237
   - R²:   0.4631

🧠 Emotion: sadness
   - MAE:  0.4481
   - RMSE: 0.6858
   - R²:   0.3917

🧠 Emotion: surprise
   - MAE:  0.3575
   - RMSE: 0.5275
   - R²:   0.3860


In [27]:

# Save the model's state_dict
torch.save(model.state_dict(), "emotion_classifier_model_1.pt")

# Save the tokenizer
# tokenizer.save_pretrained("emotion_classifier_tokenizer")