In [1]:
import os
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from tqdm.auto import tqdm

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

  from .autonotebook import tqdm as notebook_tqdm


Device: cuda


In [2]:
# Load preprocessed CSVs from the 'without context' directory
data_dir = "../data/de_berta_v3"

train_df = pd.read_csv(os.path.join(data_dir, "train.csv"))
val_df   = pd.read_csv(os.path.join(data_dir, "val.csv"))
test_df  = pd.read_csv(os.path.join(data_dir, "test.csv"))

print("Train shape:", train_df.shape)
print("Val shape:",   val_df.shape)
print("Test shape:",  test_df.shape)
print(train_df.head())

Train shape: (461323, 3)
Val shape: (56607, 3)
Test shape: (55983, 3)
    movie_id                                               text  label
0  tt0111161  In its Oscar year, Shawshank Redemption (writt...      1
1  tt0111161  The Shawshank Redemption is without a doubt on...      1
2  tt0111161  I believe that this film is the best story eve...      1
3  tt0111161  **Yes, there are SPOILERS here**This film has ...      1
4  tt0111161  At the heart of this extraordinary movie is a ...      1


In [3]:
# Tokenizer for RoBERTa
MODEL_NAME = "roberta-base"
MAX_LEN = 256

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("Tokenizer vocab size:", tokenizer.vocab_size)

Tokenizer vocab size: 50265


In [4]:
# Dataset Class
class IMDBSpoilerDataset(Dataset):
    def __init__(self, df, tokenizer, max_len: int = 256):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text  = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids":      encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels":         torch.tensor(label, dtype=torch.long)
        }

train_dataset = IMDBSpoilerDataset(train_df, tokenizer, max_len=MAX_LEN)
val_dataset   = IMDBSpoilerDataset(val_df,   tokenizer, max_len=MAX_LEN)
test_dataset  = IMDBSpoilerDataset(test_df,  tokenizer, max_len=MAX_LEN)

In [5]:
# DataLoaders
BATCH_SIZE = 16

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,  num_workers=2, pin_memory=True)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

batch = next(iter(train_loader))
for k, v in batch.items():
    print(k, v.shape, v.dtype)

input_ids torch.Size([16, 256]) torch.int64
attention_mask torch.Size([16, 256]) torch.int64
labels torch.Size([16]) torch.int64


In [6]:
# Load Model
NUM_LABELS = 2

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="single_label_classification"
)

model.to(DEVICE)
print("Model loaded on:", DEVICE)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on: cuda


In [7]:
# Class weights, optimizer, scheduler
from collections import Counter

# Compute class weights from train_df
label_counts = Counter(train_df["label"].tolist())
total = sum(label_counts.values())
class_weights = [
    total / (NUM_LABELS * label_counts.get(i, 1)) for i in range(NUM_LABELS)
]

print("Label counts:", label_counts)
print("Class weights:", class_weights)

class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

EPOCHS = 4
LR = 2e-5
WARMUP_RATIO = 0.1

optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

total_steps = len(train_loader) * EPOCHS
warmup_steps = int(WARMUP_RATIO * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

print("Total steps:", total_steps, "| Warmup steps:", warmup_steps)

Label counts: Counter({0: 341802, 1: 119521})
Class weights: [0.674839526977607, 1.929882614770626]
Total steps: 115332 | Warmup steps: 11533


In [8]:
# Training and evaluation functions
def train_one_epoch(model, data_loader, optimizer, scheduler, criterion, device):
    model.train()
    epoch_loss = 0.0

    all_preds = []
    all_labels = []

    pbar = tqdm(data_loader, desc="Train", leave=False)
    for batch in pbar:
        input_ids      = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels         = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item() * input_ids.size(0)

        preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
        labels_np = labels.detach().cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels_np)

    avg_loss = epoch_loss / len(data_loader.dataset)
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="binary")

    return avg_loss, acc, f1

def eval_model(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0.0

    all_preds = []
    all_labels = []

    with torch.no_grad():
        pbar = tqdm(data_loader, desc="Eval", leave=False)
        for batch in pbar:
            input_ids      = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels         = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            logits = outputs.logits

            epoch_loss += loss.item() * input_ids.size(0)

            preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
            labels_np = labels.detach().cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels_np)

    avg_loss = epoch_loss / len(data_loader.dataset)
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="binary")

    return avg_loss, acc, f1, all_labels, all_preds

In [9]:
# Training Loop
best_val_f1 = 0.0

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    
    train_loss, train_acc, train_f1 = train_one_epoch(
        model, train_loader, optimizer, scheduler, criterion, DEVICE
    )
    print(f"Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")
    
    val_loss, val_acc, val_f1, _, _ = eval_model(
        model, val_loader, criterion, DEVICE
    )
    print(f"Val   Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")
    
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), "best_roberta_model.pt")
        print("Saved best model!")


Epoch 1/4


                                                              

Train Loss: 0.4878 | Acc: 0.7767 | F1: 0.4145


                                                         

Val   Loss: 0.4814 | Acc: 0.7824 | F1: 0.3797
Saved best model!

Epoch 2/4


                                                              

Train Loss: 0.4554 | Acc: 0.7971 | F1: 0.4947


                                                         

Val   Loss: 0.4888 | Acc: 0.7889 | F1: 0.4645
Saved best model!

Epoch 3/4


                                                              

Train Loss: 0.4208 | Acc: 0.8153 | F1: 0.5664


                                                         

Val   Loss: 0.4698 | Acc: 0.7852 | F1: 0.5385
Saved best model!

Epoch 4/4


                                                              

Train Loss: 0.3819 | Acc: 0.8368 | F1: 0.6342


                                                         

Val   Loss: 0.5038 | Acc: 0.7844 | F1: 0.5391
Saved best model!


In [10]:
# Test Evaluation
model.load_state_dict(torch.load("best_roberta_model.pt"))
test_loss, test_acc, test_f1, test_labels, test_preds = eval_model(
    model, test_loader, criterion, DEVICE
)

print(f"Test Loss: {test_loss:.4f} | Acc: {test_acc:.4f} | F1: {test_f1:.4f}")
print("\nClassification Report:\n")
print(classification_report(test_labels, test_preds))

                                                         

Test Loss: 0.5346 | Acc: 0.7713 | F1: 0.5541

Classification Report:

              precision    recall  f1-score   support

           0       0.81      0.89      0.85     39586
           1       0.65      0.49      0.55     16397

    accuracy                           0.77     55983
   macro avg       0.73      0.69      0.70     55983
weighted avg       0.76      0.77      0.76     55983

