In [None]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation and numerical warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# !pip install transformers sentencepiece accelerate -q

# !pip install optuna transformers -q
!pip install transformers contractions -q


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizerFast, RobertaModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np
import re
import emoji
from contractions import fix as fix_contractions
import torch.optim as optim


In [None]:

# Seed NumPy and PyTorch so that batch shuffling, dropout and the head's weight
# initialisation repeat from run to run (as far as the backend allows).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN = 256
BATCH_SIZE = 8           # roberta-large => keep small
EPOCHS = 13              # enough for large model
LR = 1e-5                # slower LR for large models


print("The Variables")
print(f"Max Length : {MAX_LEN} | Batch Size: {BATCH_SIZE} | Epochs: {EPOCHS} | Learning Rate: {LR}")

In [2]:
# Relative path to the training CSV.
data_path= '../Data/train.csv'

In [3]:
import pandas as pd

# Load the training CSV located at `data_path` into a DataFrame.
df = pd.read_csv(data_path)

In [None]:
# Model input: the text column of every row.
X = df.loc[:, 'text']

# Multi-label target: one column per emotion.
y = df.loc[:, ['anger', 'fear', 'joy', 'sadness', 'surprise']]

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
split = train_test_split(
    X.tolist(),
    y.values,
    test_size=0.2,
    random_state=42,
)
train_texts, val_texts, train_labels, val_labels = split

In [None]:
def clean_text(text):
    """Normalise one raw text before tokenisation.

    Steps, in order: replace URLs with ``<URL>``, replace @-mentions with
    ``<USER>``, expand contractions, convert emoji to their textual form,
    and collapse every whitespace run to a single space.

    Args:
        text: the raw input string.

    Returns:
        The cleaned string, with leading and trailing whitespace removed.
    """
    # The dot after "www" must be escaped: unescaped it matches ANY character,
    # so ordinary text such as "awwww cute" was rewritten to "aw<URL>".
    text = re.sub(r'http\S+|www\.\S+', '<URL>', text)
    text = re.sub(r'@\w+', '<USER>', text)
    text = fix_contractions(text)
    text = emoji.demojize(text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [None]:
# Apply the same cleaning to both splits before tokenisation.
X_train = list(map(clean_text, train_texts))
X_val = list(map(clean_text, val_texts))

In [None]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-large")


def encode_batch(texts):
    """Tokenise a list of texts into padded PyTorch tensors.

    Sequences longer than MAX_LEN tokens are truncated; the rest are padded to
    the longest sequence in `texts`.
    """
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt",
    )
    encoded = tokenizer(texts, **tokenizer_options)
    return encoded

In [None]:
# Tokenise each split once, up front.
train_enc, val_enc = (encode_batch(split_texts) for split_texts in (X_train, X_val))

In [None]:
class EmotionDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with multi-label targets."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> indexable values, one entry per sample
        # labels: indexable collection holding one label row per sample
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One sample per label row.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the idx-th value of every encoding field plus a float32 "labels" tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample


In [None]:
# Wrap each split's encodings and labels in a dataset.
train_dataset = EmotionDataset(encodings=train_enc, labels=train_labels)
val_dataset = EmotionDataset(encodings=val_enc, labels=val_labels)

In [None]:
# Shuffle only the training batches; validation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class EmotionClassifier(nn.Module):
    """RoBERTa encoder followed by masked mean pooling and a linear multi-label head.

    Args:
        num_labels: number of output logits (one per emotion).
        model_name: Hugging Face checkpoint loaded as the encoder.
        dropout: dropout probability applied before the linear head.
    """

    def __init__(self, num_labels=5, model_name="roberta-large", dropout=0.3):
        super().__init__()

        self.roberta = RobertaModel.from_pretrained(model_name)
        # Read the width from the loaded checkpoint instead of hard-coding 1024,
        # so the head always matches the encoder (roberta-large -> 1024).
        hidden_size = self.roberta.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(hidden_size)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits, one row per input sequence."""
        out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden = out.last_hidden_state  # (B, L, hidden_size)

        # Masked mean pooling: padding positions contribute nothing to the sum.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        # Guard the denominator so a row with no attended tokens yields 0, not NaN.
        token_counts = mask.sum(1).clamp(min=1)
        pooled = (hidden * mask).sum(1) / token_counts

        x = self.norm(pooled)
        x = self.dropout(x)
        return self.classifier(x)   # raw logits


In [None]:
# Instantiate the classifier with five output logits and move it to `device`.
model = EmotionClassifier(num_labels=5).to(device)

In [None]:
# Multi-label objective: an independent binary cross-entropy term per emotion,
# computed directly on the raw logits.
criterion = nn.BCEWithLogitsLoss()

# Everything outside the encoder (LayerNorm + linear head) trains at the higher
# rate. `model.norm` used to be in no parameter group at all, so its weights
# never received an update.
head_params = list(model.norm.parameters()) + list(model.classifier.parameters())

optimizer = optim.AdamW([
    {"params": model.roberta.parameters(), "lr": LR},
    {"params": head_params, "lr": 3e-5},
])

In [None]:


# Linear warm-up over the first 10% of optimisation steps, then linear decay.
num_training_steps = EPOCHS * len(train_loader)
warmup_steps = int(0.1 * num_training_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_training_steps,
)


In [None]:
print("Number of epochs:", EPOCHS)
print("Max Length: ", MAX_LEN)
def train_epoch(model, loader, optimizer, scheduler, criterion):
    """Run one optimisation pass over `loader` and return the mean batch loss.

    For every batch: forward pass, loss, backward pass, gradient-norm clipping
    at 1.0, then one optimizer step and one scheduler step.
    """
    model.train()
    batch_losses = []

    progress = tqdm(loader, desc="Training", leave=False)
    for batch in progress:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask), targets)
        loss.backward()

        # Clip the global gradient norm before applying the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(loader)


In [None]:
def evaluate(model, loader, threshold=0.5):
    """Return the macro-averaged F1 of thresholded sigmoid outputs over `loader`.

    A label is predicted positive when its sigmoid probability exceeds `threshold`.
    """
    model.eval()
    predicted_rows = []
    target_rows = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validating", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            probabilities = torch.sigmoid(model(input_ids, attention_mask))
            hard_labels = (probabilities > threshold).long()

            predicted_rows += hard_labels.cpu().tolist()
            target_rows += targets.cpu().tolist()

    return f1_score(target_rows, predicted_rows, average="macro")


In [None]:
# Train for EPOCHS passes, reporting training loss and validation macro-F1 after each one.
for epoch in range(EPOCHS):
    train_loss = train_epoch(
        model=model,
        loader=train_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        criterion=criterion,
    )
    val_f1 = evaluate(model=model, loader=val_loader)

    epoch_summary = f"Epoch {epoch+1}/{EPOCHS} | Loss: {train_loss:.4f} | Val F1: {val_f1:.4f}"
    print(epoch_summary)


# On to the test set:

In [5]:
# Relative path to the test CSV.
test_data_path = "../Data/test.csv"

In [6]:
# Load the test CSV into its own DataFrame.
df_test = pd.read_csv(test_data_path)

In [7]:
# Display the (rows, columns) shape of the test frame.
df_test.shape

(1707, 2)

In [None]:
# Keep only the rows that have text, and only the text column (this drops 'id').
df_test = df_test.dropna(subset=['text'])[['text']]
print("Test size:", len(df_test))
df_test.head()

In [None]:
# Apply the same cleaning used for the training/validation texts: the model was
# fine-tuned on clean_text() output, so raw test text would not match the
# inputs it was trained on.
test_texts = [clean_text(text) for text in df_test['text'].tolist()]

test_encodings = tokenizer(
    test_texts,
    truncation=True,
    padding='max_length',
    max_length=MAX_LEN,
    return_tensors=None # keeps the lists (good for dataset)
)

In [None]:
class TestDataset(Dataset):
    """Label-free dataset over tokenizer output, indexed field by field."""

    def __init__(self, encodings):
        # encodings: mapping of field name -> per-sample rows; must contain 'input_ids'
        self.encodings = encodings

    def __len__(self):
        # One sample per row of input ids.
        input_id_rows = self.encodings['input_ids']
        return len(input_id_rows)

    def __getitem__(self, idx):
        """Return every encoding field of row `idx` as an int64 tensor."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx], dtype=torch.long)
        return sample

# Fixed order (no shuffling) so predictions stay in the same order as the test rows.
test_dataset = TestDataset(encodings=test_encodings)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [None]:
model.eval()
batch_predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # The model returns raw logits; sigmoid maps them to per-label probabilities.
        probabilities = torch.sigmoid(
            model(input_ids=input_ids, attention_mask=attention_mask)
        )

        # Decision threshold of 0.50 for every label.
        batch_predictions.append((probabilities > 0.50).int().cpu())

# Stack the per-batch results into a single tensor, one row per test sample.
predictions = torch.cat(batch_predictions, dim=0)

print("Predictions shape:", predictions.shape)

In [None]:
import numpy as np

In [None]:
emotion_cols = ['anger', 'fear', 'joy', 'sadness', 'surprise']

# Take the ids from the test file itself instead of regenerating 0..n-1: once
# rows with missing text have been dropped, a fresh range no longer lines up
# with the ids of the rows that were actually predicted.
_test_rows = pd.read_csv(test_data_path).dropna(subset=['text'])
if 'id' in _test_rows.columns:
    ids = _test_rows['id'].to_numpy()
else:
    # No id column in the file: fall back to positional ids.
    ids = np.arange(0, df_test.shape[0])
assert len(ids) == df_test.shape[0], "id count must match the number of predicted rows"

In [None]:
# Build the prediction frame with one column per emotion in `emotion_cols`.
df_preds = pd.DataFrame(predictions.numpy(), columns=emotion_cols)

In [None]:
# Adding the id column as the first column.
# DataFrame.insert raises ValueError when the column already exists, so drop a
# previous 'id' first; this keeps the cell safe to re-run.
if 'id' in df_preds.columns:
    df_preds = df_preds.drop(columns='id')
df_preds.insert(0, 'id', ids)

In [None]:
# Sanity check: display the number of rows in the prediction frame.
len(df_preds)

In [None]:
# Write the predictions to the submission CSV, without the DataFrame index.
df_preds.to_csv("submission.csv", index=False)

In [None]:
torch.cuda.empty_cache()

# Releases the cached GPU memory that PyTorch is holding but no longer using,
# so that it becomes available again.