In [4]:
!# run once if wandb not installed
!pip install -q wandb transformers datasets ftfy emoji scikit-learn


In [5]:
import os, re, random, math, gc
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import wandb
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import (
    RobertaTokenizerFast,
    AutoConfig,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from ftfy import fix_text
import emoji

In [61]:
class Config:
    batch_size = 32
    lr = 1e-3
    epochs = 5
    max_len = 100
    embedding_dim = 128
    hidden_dim = 128
    num_layers = 1
    dropout = 0.2
    vocab_size = 11000
    num_labels = 5
    device = "cuda" if torch.cuda.is_available() else "cpu"

In [7]:
wandb.login(key="96513a18cdfc4585db6e8a169369fcc713e8ef2c")

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
config = Config()

In [38]:
wandb.init(project="22f3000757-t32025",name="Scratchh Model",config=vars(config))

In [9]:
LABEL_COLS = ['anger','fear','joy','sadness','surprise']

# ------------------- LOAD DATA -------------------
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")

In [10]:
def tokenize(text):
    return text.lower().split()

# Build vocab from training set
from collections import Counter
counter = Counter()
for t in train_df['text']:
    counter.update(tokenize(t))

In [11]:
vocab = [w for w,_ in counter.most_common(config.vocab_size-2)]
word2idx = {w: idx+2 for idx, w in enumerate(vocab)}
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1

In [12]:
def encode_sentence(sentence):
    tokens = tokenize(sentence)
    ids = [word2idx.get(t, 1) for t in tokens]
    ids = ids[:config.max_len]
    ids += [0] * (config.max_len - len(ids))
    return ids

In [30]:
class TextDataset(Dataset):
    def __init__(self, df):
        self.texts = df["text"].tolist()
        self.labels = df[LABEL_COLS].values.astype(np.float32)

    def __getitem__(self, idx):
       x = encode_sentence(self.texts[idx])
       y = self.labels[idx]
       return torch.tensor(x), torch.tensor(y, dtype=torch.float32)


    def __len__(self):
        return len(self.texts)

In [14]:
from torch.utils.data import random_split

# Train / Validation split
full_ds = TextDataset(train_df)
val_size = int(0.2 * len(full_ds))
train_size = len(full_ds) - val_size
train_ds, val_ds = random_split(full_ds, [train_size, val_size])

In [15]:
train_loader = DataLoader(train_ds, batch_size=config.batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=config.batch_size, shuffle=False)
# test_ds = TextDataset(test_df)
# test_loader = DataLoader(test_ds, batch_size=config.batch_size, shuffle=False)

In [60]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, num_labels, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim,
            num_layers=num_layers, batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_dim, num_labels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        emb = self.embedding(x)
        out, _ = self.lstm(emb)
        out = out[:, -1, :]  # last hidden state
        out = self.dropout(out)
        logits = self.fc(out)
        return logits


In [62]:
model = LSTMClassifier(
    vocab_size=config.vocab_size,
    embedding_dim=config.embedding_dim,
    hidden_dim=config.hidden_dim,
    num_layers=config.num_layers,
    num_labels=config.num_labels,   # = 5
    dropout=config.dropout
).to(config.device)


In [63]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)


In [64]:
# ------------------- FUNCTIONS -------------------

def train_one_epoch(model, loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for x_batch, y_batch in tqdm(loader, desc="Training"):
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        logits = model(x_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * x_batch.size(0)

    epoch_loss = running_loss / len(loader.dataset)
    return epoch_loss

In [65]:
from sklearn.metrics import f1_score
import numpy as np

def validate(model, loader, criterion, device):
    model.eval()

    all_logits = []
    all_labels = []
    running_loss = 0.0

    with torch.no_grad():
        for x_batch, y_batch in loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(x_batch)
            loss = criterion(logits, y_batch)
            running_loss += loss.item() * x_batch.size(0)

            all_logits.append(logits.sigmoid().cpu().numpy())
            all_labels.append(y_batch.cpu().numpy())

    all_logits = np.concatenate(all_logits)
    all_labels = np.concatenate(all_labels)

    # ----- THRESHOLD TUNING -----
    thresholds = []
    for i in range(all_labels.shape[1]):
        best_t = 0.3
        best_f1 = -1
        for t in np.arange(0.2, 0.51, 0.02):
            preds = (all_logits[:, i] > t).astype(int)
            f1 = f1_score(all_labels[:, i], preds)
            if f1 > best_f1:
                best_f1 = f1
                best_t = t
        thresholds.append(best_t)

    # ----- APPLY THRESHOLDS -----
    final_preds = np.zeros_like(all_logits)
    for i, t in enumerate(thresholds):
        final_preds[:, i] = (all_logits[:, i] > t).astype(int)

    macro_f1 = f1_score(all_labels, final_preds, average="macro")

    val_loss = running_loss / len(loader.dataset)
    return val_loss, macro_f1, thresholds


In [66]:
config.epochs = 5
# ------------------- TRAINING LOOP -------------------
for epoch in range(1, config.epochs + 1):
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, config.device)
    val_loss, val_f1, thresholds = validate(model, val_loader, criterion, config.device)

    print(f"Epoch {epoch} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    wandb.log({"epoch": epoch, "train_loss": train_loss,"val_f1":val_f1, "val_loss": val_loss, "ths":thresholds})

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Epoch 1 | Train Loss: 0.5780 | Val Loss: 0.5677


Training:   0%|          | 0/171 [00:00<?, ?it/s]

Epoch 2 | Train Loss: 0.5682 | Val Loss: 0.5664


Training:   0%|          | 0/171 [00:00<?, ?it/s]

Epoch 3 | Train Loss: 0.5682 | Val Loss: 0.5685


Training:   0%|          | 0/171 [00:00<?, ?it/s]

Epoch 4 | Train Loss: 0.5684 | Val Loss: 0.5658


Training:   0%|          | 0/171 [00:00<?, ?it/s]

Epoch 5 | Train Loss: 0.5678 | Val Loss: 0.5689


### The test dataset

In [54]:
class TestDataset(Dataset):
    def __init__(self, df):
        self.texts = df["text"].tolist()

    def __getitem__(self, idx):
        x = encode_sentence(self.texts[idx])
        return torch.tensor(x, dtype=torch.long)

    def __len__(self):
        return len(self.texts)

# ------------------- TEST DATA LOADER -------------------
test_ds = TestDataset(test_df)
test_loader = DataLoader(test_ds, batch_size=config.batch_size, shuffle=False)


In [55]:
model.eval()
final_preds = []

with torch.no_grad():
    for x_batch in test_loader:
        x_batch = x_batch.to(config.device)

        logits = model(x_batch)
        probs = torch.sigmoid(logits)
        preds = (probs > 0.5).int().cpu().numpy()

        final_preds.append(preds)

final_preds = np.vstack(final_preds)


In [56]:
final_preds

array([[0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0]], dtype=int32)

In [57]:
df_scratch = pd.DataFrame(final_preds, columns=LABEL_COLS)

In [58]:
df_scratch.insert(0, "id", np.arange(0,1707))

In [59]:
df_scratch

Unnamed: 0,id,anger,fear,joy,sadness,surprise
0,0,0,1,0,0,0
1,1,0,1,0,0,0
2,2,0,1,0,0,0
3,3,0,1,0,0,0
4,4,0,1,0,0,0
...,...,...,...,...,...,...
1702,1702,0,1,0,0,0
1703,1703,0,1,0,0,0
1704,1704,0,1,0,0,0
1705,1705,0,1,0,0,0


In [68]:
df_scratch.to_csv("submission.csv", index=False)

In [21]:
wandb.finish()