In [1]:

import os
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import kagglehub
from transformers import AutoTokenizer

import torch
from torch.utils.data import Dataset

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)


  from .autonotebook import tqdm as notebook_tqdm


Device: cuda


In [2]:
# Download dataset via kagglehub
path = kagglehub.dataset_download("rmisra/imdb-spoiler-dataset")
print("Path to dataset files:", path)

# Load JSON lines into DataFrames
movie_details = pd.read_json(os.path.join(path, "IMDB_movie_details.json"), lines=True)
reviews       = pd.read_json(os.path.join(path, "IMDB_reviews.json"),       lines=True)

print("movie_details shape:", movie_details.shape)
print("reviews shape:", reviews.shape)
print(reviews[["movie_id", "is_spoiler", "review_text"]].head())


Path to dataset files: /home/electronic/.cache/kagglehub/datasets/rmisra/imdb-spoiler-dataset/versions/1
movie_details shape: (1572, 7)
reviews shape: (573913, 7)
    movie_id  is_spoiler                                        review_text
0  tt0111161        True  In its Oscar year, Shawshank Redemption (writt...
1  tt0111161        True  The Shawshank Redemption is without a doubt on...
2  tt0111161        True  I believe that this film is the best story eve...
3  tt0111161        True  **Yes, there are SPOILERS here**This film has ...
4  tt0111161        True  At the heart of this extraordinary movie is a ...


In [3]:
import re
import html

def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    
    # Unescape HTML entities
    text = html.unescape(text)
    
    # Remove URLs
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    
    # Replace multiple whitespace with single space
    text = re.sub(r"\s+", " ", text)
    
    # Strip leading/trailing spaces
    text = text.strip()
    
    return text


In [4]:
# Cell 4: Merge movie metadata (optional) and build the final text column

# We merge so that later we can experiment with adding plot_summary / synopsis if needed
merged = reviews.merge(
    movie_details[["movie_id", "plot_summary", "plot_synopsis"]],
    on="movie_id",
    how="left",
    suffixes=("", "_movie")
)

# # Option 1: Use ONLY review_text as input (recommended baseline)
# def build_input_text(row):
#     return clean_text(row["review_text"])

# Option 2 (later): concat review + plot_summary / synopsis:
def build_input_text(row):
    parts = [row["review_text"], row.get("plot_summary", ""), row.get("plot_synopsis", "")]
    parts = [clean_text(p) for p in parts if isinstance(p, str) and p.strip() != ""]
    return " ".join(parts)

merged["text"] = merged.apply(build_input_text, axis=1)

# Keep only what we need
data = merged[["movie_id", "text", "is_spoiler"]].copy()

# Drop rows with missing labels or empty text
data = data.dropna(subset=["is_spoiler"])
data = data[data["text"].str.len() > 0].reset_index(drop=True)

print(data.head())
print(data["is_spoiler"].value_counts())


    movie_id                                               text  is_spoiler
0  tt0111161  In its Oscar year, Shawshank Redemption (writt...        True
1  tt0111161  The Shawshank Redemption is without a doubt on...        True
2  tt0111161  I believe that this film is the best story eve...        True
3  tt0111161  **Yes, there are SPOILERS here**This film has ...        True
4  tt0111161  At the heart of this extraordinary movie is a ...        True
is_spoiler
False    422989
True     150924
Name: count, dtype: int64


In [5]:
# Cell 5: Encode labels as integers (0 = non-spoiler, 1 = spoiler)

label_map = {False: 0, True: 1}
data["label"] = data["is_spoiler"].map(label_map).astype(int)

data = data[["movie_id", "text", "label"]].reset_index(drop=True)
print(data.head())
print(data["label"].value_counts(normalize=True))


    movie_id                                               text  label
0  tt0111161  In its Oscar year, Shawshank Redemption (writt...      1
1  tt0111161  The Shawshank Redemption is without a doubt on...      1
2  tt0111161  I believe that this film is the best story eve...      1
3  tt0111161  **Yes, there are SPOILERS here**This film has ...      1
4  tt0111161  At the heart of this extraordinary movie is a ...      1
label
0    0.737026
1    0.262974
Name: proportion, dtype: float64


In [6]:
# Cell 6: Movie-level train/val/test split (to avoid leakage across splits)

# Get unique movie IDs
movie_ids = data["movie_id"].unique()
print("Unique movies:", len(movie_ids))

# First: train vs temp (val+test)
train_movie_ids, temp_movie_ids = train_test_split(
    movie_ids,
    test_size=0.2,
    random_state=SEED,
    shuffle=True
)

# Then: val vs test from temp
val_movie_ids, test_movie_ids = train_test_split(
    temp_movie_ids,
    test_size=0.5,
    random_state=SEED,
    shuffle=True
)

print("Train movies:", len(train_movie_ids))
print("Val movies:  ", len(val_movie_ids))
print("Test movies: ", len(test_movie_ids))

# Map back to reviews
train_df = data[data["movie_id"].isin(train_movie_ids)].reset_index(drop=True)
val_df   = data[data["movie_id"].isin(val_movie_ids)].reset_index(drop=True)
test_df  = data[data["movie_id"].isin(test_movie_ids)].reset_index(drop=True)

print("Train samples:", len(train_df))
print("Val samples:  ", len(val_df))
print("Test samples: ", len(test_df))

print("Train label distribution:\n", train_df["label"].value_counts(normalize=True))
print("Val label distribution:\n",   val_df["label"].value_counts(normalize=True))
print("Test label distribution:\n",  test_df["label"].value_counts(normalize=True))


Unique movies: 1572
Train movies: 1257
Val movies:   157
Test movies:  158
Train samples: 461323
Val samples:   56607
Test samples:  55983
Train label distribution:
 label
0    0.740917
1    0.259083
Name: proportion, dtype: float64
Val label distribution:
 label
0    0.734909
1    0.265091
Name: proportion, dtype: float64
Test label distribution:
 label
0    0.707108
1    0.292892
Name: proportion, dtype: float64


In [7]:
# Cell 7: Load DeBERTa-v3-base tokenizer (slow, explicit class)

from transformers import DebertaV2Tokenizer

MODEL_NAME = "microsoft/deberta-v3-base"

tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)

MAX_LEN = 256  # spoiler cues can be later in the review

print("Tokenizer vocab size:", tokenizer.vocab_size)
print("Tokenizer type:", type(tokenizer))


Tokenizer vocab size: 128000
Tokenizer type: <class 'transformers.models.deberta_v2.tokenization_deberta_v2.DebertaV2Tokenizer'>


In [8]:
# Cell 9: PyTorch Dataset for on-the-fly tokenization with DeBERTa-v3-base

class IMDBSpoilerDataset(Dataset):
    def __init__(self, df, tokenizer, max_len: int = 256):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text  = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        # Remove batch dimension (since return_tensors="pt" adds it)
        item = {
            "input_ids":      encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels":         torch.tensor(label, dtype=torch.long)
        }

        # DeBERTa-v3 does not need token_type_ids
        return item

train_dataset = IMDBSpoilerDataset(train_df, tokenizer, max_len=MAX_LEN)
val_dataset   = IMDBSpoilerDataset(val_df,   tokenizer, max_len=MAX_LEN)
test_dataset  = IMDBSpoilerDataset(test_df,  tokenizer, max_len=MAX_LEN)

print("Train dataset size:", len(train_dataset))
print("Val dataset size:",   len(val_dataset))
print("Test dataset size:",  len(test_dataset))

# Quick sanity check
sample = train_dataset[0]
for k, v in sample.items():
    print(k, v.shape, v.dtype)
print("Decoded text:", tokenizer.decode(sample["input_ids"], skip_special_tokens=True))
print("Label:", sample["labels"].item())


Train dataset size: 461323
Val dataset size: 56607
Test dataset size: 55983
input_ids torch.Size([256]) torch.int64
attention_mask torch.Size([256]) torch.int64
labels torch.Size([]) torch.int64
Decoded text: In its Oscar year, Shawshank Redemption (written and directed by Frank Darabont, after the novella Rita Hayworth and the Shawshank Redemption, by Stephen King) was nominated for seven Academy Awards, and walked away with zero. Best Picture went to Forrest Gump, while Shawshank and Pulp Fiction were "just happy to be nominated." Of course hindsight is 20/20, but while history looks back on Gump as a good film, Pulp and Redemption are remembered as some of the all-time best. Pulp, however, was a success from the word "go," making a huge splash at Cannes and making its writer-director an American master after only two films. For Andy Dufresne and Co., success didn't come easy. Fortunately, failure wasn't a life sentence.After opening on 33 screens with take of $727,327, the $25M film

In [9]:
# Cell 10: Save preprocessed splits for DeBERTa-v3

import os
import json

save_dir = "../data/de_berta_v3"
os.makedirs(save_dir, exist_ok=True)

train_path = os.path.join(save_dir, "train.csv")
val_path   = os.path.join(save_dir, "val.csv")
test_path  = os.path.join(save_dir, "test.csv")
label_map_path = os.path.join(save_dir, "label_map.json")

# Save splits
train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)

# Save label map (just for documentation / reuse)
with open(label_map_path, "w") as f:
    json.dump(label_map, f)

print("Saved:")
print("  ", train_path)
print("  ", val_path)
print("  ", test_path)
print("  ", label_map_path)


Saved:
   ../data/de_berta_v3/train.csv
   ../data/de_berta_v3/val.csv
   ../data/de_berta_v3/test.csv
   ../data/de_berta_v3/label_map.json


In [1]:
# Cell 1: Imports and load preprocessed CSVs

import os
import pandas as pd

from transformers import DebertaV2Tokenizer
import torch
from torch.utils.data import Dataset, DataLoader

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

data_dir = "../data/de_berta_v3"

train_df = pd.read_csv(os.path.join(data_dir, "train.csv"))
val_df   = pd.read_csv(os.path.join(data_dir, "val.csv"))
test_df  = pd.read_csv(os.path.join(data_dir, "test.csv"))

print("Train shape:", train_df.shape)
print("Val shape:",   val_df.shape)
print("Test shape:",  test_df.shape)
print(train_df.head())


  from .autonotebook import tqdm as notebook_tqdm


Device: cuda
Train shape: (461323, 3)
Val shape: (56607, 3)
Test shape: (55983, 3)
    movie_id                                               text  label
0  tt0111161  In its Oscar year, Shawshank Redemption (writt...      1
1  tt0111161  The Shawshank Redemption is without a doubt on...      1
2  tt0111161  I believe that this film is the best story eve...      1
3  tt0111161  **Yes, there are SPOILERS here**This film has ...      1
4  tt0111161  At the heart of this extraordinary movie is a ...      1


In [2]:
# Cell 2: Tokenizer

MODEL_NAME = "microsoft/deberta-v3-base"
MAX_LEN = 256

tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)
print("Tokenizer vocab size:", tokenizer.vocab_size)


Tokenizer vocab size: 128000


In [3]:
# Cell 3: Dataset class (same as before)

class IMDBSpoilerDataset(Dataset):
    def __init__(self, df, tokenizer, max_len: int = 256):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text  = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids":      encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels":         torch.tensor(label, dtype=torch.long)
        }

train_dataset = IMDBSpoilerDataset(train_df, tokenizer, max_len=MAX_LEN)
val_dataset   = IMDBSpoilerDataset(val_df,   tokenizer, max_len=MAX_LEN)
test_dataset  = IMDBSpoilerDataset(test_df,  tokenizer, max_len=MAX_LEN)


In [4]:
# Cell 4: DataLoaders (what was previously optional Cell 10)

BATCH_SIZE = 16

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,  num_workers=2, pin_memory=True)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

batch = next(iter(train_loader))
for k, v in batch.items():
    print(k, v.shape, v.dtype)


input_ids torch.Size([16, 256]) torch.int64
attention_mask torch.Size([16, 256]) torch.int64
labels torch.Size([16]) torch.int64


In [5]:
# Cell 11: Model + training imports

import torch
import torch.nn as nn
from transformers import (
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix


In [6]:
# Cell 12: Load DeBERTa-v3-base model for binary classification

NUM_LABELS = 2
MODEL_NAME = "microsoft/deberta-v3-base"  # same as tokenizer

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="single_label_classification"
)

model.to(DEVICE)
print("Model loaded on:", DEVICE)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on: cuda


In [7]:
# Cell 13: Class weights, optimizer, scheduler

from collections import Counter

# Compute class weights from train_df
label_counts = Counter(train_df["label"].tolist())
total = sum(label_counts.values())
class_weights = [
    total / (NUM_LABELS * label_counts.get(i, 1)) for i in range(NUM_LABELS)
]

print("Label counts:", label_counts)
print("Class weights:", class_weights)

class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

EPOCHS = 4
LR = 2e-5
WARMUP_RATIO = 0.1

optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

total_steps = len(train_loader) * EPOCHS
warmup_steps = int(WARMUP_RATIO * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

print("Total steps:", total_steps, "| Warmup steps:", warmup_steps)


Label counts: Counter({0: 341802, 1: 119521})
Class weights: [0.674839526977607, 1.929882614770626]
Total steps: 115332 | Warmup steps: 11533


In [8]:
# Cell 14: Training and evaluation functions

def train_one_epoch(model, data_loader, optimizer, scheduler, criterion, device):
    model.train()
    epoch_loss = 0.0

    all_preds = []
    all_labels = []

    pbar = tqdm(data_loader, desc="Train", leave=False)
    for batch in pbar:
        input_ids      = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels         = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item() * input_ids.size(0)

        preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
        labels_np = labels.detach().cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels_np)

    avg_loss = epoch_loss / len(data_loader.dataset)
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="binary")

    return avg_loss, acc, f1


def eval_model(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0.0

    all_preds = []
    all_labels = []

    with torch.no_grad():
        pbar = tqdm(data_loader, desc="Eval", leave=False)
        for batch in pbar:
            input_ids      = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels         = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            logits = outputs.logits

            epoch_loss += loss.item() * input_ids.size(0)

            preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
            labels_np = labels.detach().cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels_np)

    avg_loss = epoch_loss / len(data_loader.dataset)
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="binary")

    return avg_loss, acc, f1, all_labels, all_preds


In [9]:
# Cell 15: Main training loop

BEST_MODEL_PATH = "deberta_v3_best.pt"

best_val_f1 = 0.0

for epoch in range(1, EPOCHS + 1):
    print("=" * 70)
    print(f"Epoch {epoch}/{EPOCHS}")

    train_loss, train_acc, train_f1 = train_one_epoch(
        model,
        train_loader,
        optimizer,
        scheduler,
        criterion,
        DEVICE
    )

    val_loss, val_acc, val_f1, _, _ = eval_model(
        model,
        val_loader,
        criterion,
        DEVICE
    )

    print(f"Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, F1: {train_f1:.4f}")
    print(f"Val   - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}, F1: {val_f1:.4f}")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print(f"✓ New best model saved with F1 = {best_val_f1:.4f}")

print("=" * 70)
print("Training completed.")
print(f"Best validation F1: {best_val_f1:.4f}")


Epoch 1/4


                                                                

Train - Loss: 0.4742, Acc: 0.7838, F1: 0.4349
Val   - Loss: 0.4590, Acc: 0.7834, F1: 0.5337
✓ New best model saved with F1 = 0.5337
Epoch 2/4


                                                                

Train - Loss: 0.4278, Acc: 0.8095, F1: 0.5441
Val   - Loss: 0.4634, Acc: 0.7927, F1: 0.5211
Epoch 3/4


                                                                

Train - Loss: 0.3822, Acc: 0.8351, F1: 0.6281
Val   - Loss: 0.4811, Acc: 0.7831, F1: 0.5585
✓ New best model saved with F1 = 0.5585
Epoch 4/4


                                                                

Train - Loss: 0.3290, Acc: 0.8651, F1: 0.7098
Val   - Loss: 0.5339, Acc: 0.7830, F1: 0.5500
Training completed.
Best validation F1: 0.5585




In [None]:
# Cell 16: Test evaluation with the best saved model

# Reload model weights from best checkpoint
best_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="single_label_classification"
)
best_model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=DEVICE))
best_model.to(DEVICE)

test_loss, test_acc, test_f1, test_labels, test_preds = eval_model(
    best_model,
    test_loader,
    criterion,
    DEVICE
)

print(f"Test - Loss: {test_loss:.4f}, Acc: {test_acc:.4f}, F1: {test_f1:.4f}")

print("\nClassification report:\n")
print(classification_report(test_labels, test_preds, digits=4))

print("Confusion matrix:")
print(confusion_matrix(test_labels, test_preds))


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                         

Test - Loss: 0.5019, Acc: 0.7744, F1: 0.5839

Classification report:

              precision    recall  f1-score   support

           0     0.8207    0.8713    0.8452     39586
           1     0.6349    0.5404    0.5839     16397

    accuracy                         0.7744     55983
   macro avg     0.7278    0.7058    0.7145     55983
weighted avg     0.7663    0.7744    0.7687     55983

Confusion matrix:
[[34491  5095]
 [ 7536  8861]]




: 