In [1]:
# Load the dataset
# from  datasets import load_dataset
# dataset = load_dataset("stanfordnlp/imdb")
# train_data = dataset["train"]
# test_data = dataset["test"]
import pandas as pd

train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

train_data.head()
# test_data[0]
# print(test_data[0])

Unnamed: 0,text,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [3]:
import re
import unicodedata
import pandas as pd
import requests
import os
from dotenv import load_dotenv
from time import sleep
import spacy

# ------------------------------
# Load API key and spaCy model
# ------------------------------
load_dotenv()
API_KEY = os.getenv("API_KEY")
BASE_URL = "https://api.themoviedb.org/3"

nlp = spacy.load("en_core_web_sm")  # NER model

# ------------------------------
# Normalize text function
# ------------------------------
def normalize_text(text):
    return unicodedata.normalize("NFKC", text).lower()

# ------------------------------
# Movie titles
# ------------------------------

#
sample_titles = [
   "The Shawshank Redemption", "The Godfather", "The Dark Knight", "Pulp Fiction", "Forrest Gump",
    "Fight Club", "Inception", "The Matrix", "Goodfellas", "The Lord of the Rings: The Return of the King",
    "Interstellar", "Parasite", "The Silence of the Lambs", "Saving Private Ryan", "Schindler’s List",
    "Gladiator", "Titanic", "The Green Mile", "The Departed", "Django Unchained",
    "The Prestige", "Whiplash", "The Lion King", "Toy Story", "Avengers: Endgame",
    "Avengers: Infinity War", "Iron Man", "Black Panther", "Joker", "The Social Network",
    "The Wolf of Wall Street", "La La Land", "Mad Max: Fury Road", "The Revenant", "Get Out",
    "Oppenheimer", "Barbie", "Dune", "The Batman", "Spider-Man: No Way Home",
    "Everything Everywhere All at Once", "The Irishman", "12 Years a Slave", "Moonlight", "Spotlight",
    "Birdman", "Arrival", "Blade Runner 2049", "No Country for Old Men", "The Big Short",
    "The Hateful Eight", "Once Upon a Time in Hollywood", "There Will Be Blood",
    "The Curious Case of Benjamin Button", "The Shape of Water", "The Theory of Everything",
    "Bohemian Rhapsody", "Rocketman", "A Star Is Born", "The Imitation Game", "The King's Speech",
    "Slumdog Millionaire", "Life of Pi", "Gravity", "Cast Away", "The Truman Show",
    "Eternal Sunshine of the Spotless Mind", "Requiem for a Dream", "American Beauty", "The Sixth Sense",
    "Se7en", "The Usual Suspects", "Memento", "Oldboy", "Pan’s Labyrinth",
    "Amélie", "The Pianist", "The Lives of Others", "City of God", "Crouching Tiger, Hidden Dragon",
    "Spirited Away", "Howl’s Moving Castle", "Princess Mononoke", "My Neighbor Totoro", "WALL·E",
    "Up", "Inside Out", "Coco", "Soul", "Minari", "The Banshees of Inisherin",
    "Casino Royale", "South Park: Bigger, Longer & Uncut", "A Fistful of Dollars", "Rosemary's Baby",
    "The Incredibles", "Black Swan", "Deadpool", "The Breakfast Club", "The Untouchables",
    "Shaun of the Dead", "True Romance", "Harry Potter and the Prisoner of Azkaban", "Hot Fuzz",
    "In Bruges", "Boyhood", "Straight Outta Compton", "Drive", "Moneyball", "Brazil", "Chronicle",
    "Still Alice", "Triangle", "The Endless", "The Man from Earth",
    "The Secret in Their Eyes", "The Fall", "The Hunt", "Incendies", "The Intouchables",
    "Prisoners", "Enemy", "Locke", "The Lobster", "Under the Skin",
    "Ex Machina", "Annihilation", "The Florida Project", "Room", "Brooklyn",
    "Carol", "The Farewell", "Portrait of a Lady on Fire", "The Handmaiden", "Shoplifters",
    "A Separation", "Toni Erdmann", "Cold War", "Wild Tales", "The Square"
]

# normalize titles for regex patterns
sample_titles_norm = [normalize_text(t) for t in sample_titles]

title_patterns = {
    title: re.compile(r"(?<!\w)" + re.escape(title) + r"(?!\w)", re.IGNORECASE)
    for title in sample_titles_norm
}

# ------------------------------
# Detect titles function
# ------------------------------
def detect_titles_regex(text, patterns):
    text_norm = normalize_text(text)
    detected = []
    for title, pattern in patterns.items():
        if pattern.search(text_norm):
            detected.append(title)
    return detected

# ------------------------------
# TMDb metadata caching
# ------------------------------
metadata_cache = {}

def get_movie_metadata(title):
    title_key = title.strip().lower()
    if title_key in metadata_cache:
        return metadata_cache[title_key]

    query_title = title.strip().title()  # proper case for API
    search_url = f"{BASE_URL}/search/movie?api_key={API_KEY}&query={query_title}"
    try:
        search_response = requests.get(search_url).json()
    except requests.RequestException:
        return None

    if not search_response.get("results"):
        return None

    movie_id = search_response["results"][0]["id"]

    credits_url = f"{BASE_URL}/movie/{movie_id}/credits?api_key={API_KEY}"
    try:
        credits_response = requests.get(credits_url).json()
    except requests.RequestException:
        return None

    actors = [member["name"] for member in credits_response.get("cast", [])[:7]]
    directors = [member["name"] for member in credits_response.get("crew", []) if member["job"] == "Director"]

    result = {"actors": actors, "directors": directors}
    metadata_cache[title_key] = result
    sleep(0.25)  # rate limit
    return result

# ------------------------------
# Enrich with metadata
# ------------------------------
def enrich_with_metadata(row):
    titles = row.get("detected_titles", [])
    if not titles:
        row["actors"] = None
        row["directors"] = None
        row["ner_entities"] = []
        return row

    metadata = get_movie_metadata(titles[0])
    row["actors"] = metadata.get("actors") if metadata else []
    row["directors"] = metadata.get("directors") if metadata else []

    # ------------------------------
    # Use spaCy NER to detect persons in review
    # ------------------------------
    doc = nlp(row["text"])
    persons_in_review = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

    # ------------------------------
    # Link detected persons to movie metadata
    # ------------------------------
    relevant_entities = []
    for person in persons_in_review:
        if person in row["actors"] or person in row["directors"]:
            relevant_entities.append(person)

    row["ner_entities"] = relevant_entities
    return row

# ------------------------------
# Load dataset
# ------------------------------
train_df = pd.DataFrame(train_data)

# Detect titles
train_df["detected_titles"] = train_df["text"].apply(lambda x: detect_titles_regex(x, title_patterns))

# Filter rows with at least one detected title
matched_reviews = train_df[train_df["detected_titles"].map(len) > 0].copy()

# Initialize columns
matched_reviews["actors"] = None
matched_reviews["directors"] = None
matched_reviews["ner_entities"] = None

# Enrich with metadata and NER
matched_reviews = matched_reviews.apply(enrich_with_metadata, axis=1)

# ------------------------------
# Save final DataFrame - for development purposes
# ------------------------------
matched_reviews.to_csv("matched_reviews_with_metadata_ner.csv", index=False)
print("Pipeline complete. Saved matched reviews with NER metadata.")

#Took about 38 minutes to run the full pipeline on the training set.


  from .autonotebook import tqdm as notebook_tqdm


Pipeline complete. Saved matched reviews with NER metadata.


In [4]:
filtered = matched_reviews[matched_reviews["detected_titles"] != "up"][10:]
filtered.head(10)




Unnamed: 0,text,label,detected_titles,actors,directors,ner_entities
30,"Taut and organically gripping, Edward Dmytryk'...",1,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
33,One of the most significant quotes from the en...,1,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
35,"I bought this film at Blockbuster for $3.00, b...",0,[requiem for a dream],"[Ellen Burstyn, Jared Leto, Jennifer Connelly,...",[Darren Aronofsky],[]
37,"Ever watched a movie that lost the plot? Well,...",0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
39,"After sitting through this pile of dung, my hu...",0,"[gravity, up]","[Sandra Bullock, George Clooney, Ed Harris, Or...",[Alfonso Cuarón],[]
40,It had all the clichés of movies of this type ...,0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
41,"This movie is based on the book, ""A Many Splen...",1,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
42,"Of all the films I have seen, this one, The Ra...",0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
44,"This movie struck home for me. Being 29, I rem...",1,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
47,"How this film could be classified as Drama, I ...",0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]


In [5]:
# Baseline Classification Models without NER Metadata

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Prepare data for modeling
df = matched_reviews.copy()

# Ensure text is string
df["text"] = df["text"].astype(str)

X = df["text"]
y = df["label"]     # 0 = negative, 1 = positive

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(
    max_features=20_000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)



In [6]:
# Logistic Regression Model
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train_vec, y_train)

# Evaluation
pred_lr = log_reg.predict(X_test_vec)

print("==== Logistic Regression ====")
print("Accuracy:", accuracy_score(y_test, pred_lr))
print("F1 Score:", f1_score(y_test, pred_lr))
print(classification_report(y_test, pred_lr))

==== Logistic Regression ====
Accuracy: 0.8583132530120482
F1 Score: 0.8579710144927536
              precision    recall  f1-score   support

           0       0.87      0.84      0.86      1059
           1       0.84      0.87      0.86      1016

    accuracy                           0.86      2075
   macro avg       0.86      0.86      0.86      2075
weighted avg       0.86      0.86      0.86      2075



In [7]:
# SVM Model
svm_clf = LinearSVC()
svm_clf.fit(X_train_vec, y_train)

# Evaluation
pred_svm = svm_clf.predict(X_test_vec)

print("==== Linear SVM ====")
print("Accuracy:", accuracy_score(y_test, pred_svm))
print("F1 Score:", f1_score(y_test, pred_svm))
print(classification_report(y_test, pred_svm))


==== Linear SVM ====
Accuracy: 0.8650602409638555
F1 Score: 0.8642095053346266
              precision    recall  f1-score   support

           0       0.88      0.85      0.87      1059
           1       0.85      0.88      0.86      1016

    accuracy                           0.87      2075
   macro avg       0.87      0.87      0.87      2075
weighted avg       0.87      0.87      0.87      2075



In [8]:
# Baseline Classification Models with NER Metadata
import numpy as np
from textblob import TextBlob

def compute_entity_features(row):
    titles = row["detected_titles"]

    # actors/directors are lists, not strings
    actors = row.get("actors", [])
    directors = row.get("directors", [])

    num_titles = len(titles)
    num_actors = len(actors)
    num_directors = len(directors)

    text = row["text"].lower()

    # Count actor mentions
    actor_mentions = 0
    for a in actors:
        actor_mentions += text.count(a.lower())

    # Count director mentions
    director_mentions = 0
    for d in directors:
        director_mentions += text.count(d.lower())

    # Sentiment toward entity names
    entity_tokens = actors + directors
    entity_sentiment = 0

    if entity_tokens:
        combined = " ".join(entity_tokens)
        try:
            entity_sentiment = TextBlob(combined).sentiment.polarity
        except:
            entity_sentiment = 0

    return pd.Series({
        "num_titles": num_titles,
        "num_actors": num_actors,
        "num_directors": num_directors,
        "actor_mentions": actor_mentions,
        "director_mentions": director_mentions,
        "entity_sentiment": entity_sentiment
    })


# Compute entity features
entity_features = matched_reviews.apply(compute_entity_features, axis=1)
full_df = pd.concat([matched_reviews, entity_features], axis=1)

In [9]:
# from scipy.sparse import hstack
#
# # TF-IDF Vectorization on review text with NER features
# tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
# X_text = tfidf.fit_transform(full_df["text"])
#
# X_train_text = tfidf.fit_transform(X_train_text_raw)
# X_test_text = tfidf.transform(X_test_text_raw)
#
# # Combine text features with NER features
# X_entity = full_df[[
#     "num_titles", "num_actors", "num_directors",
#     "actor_mentions", "director_mentions",
#     "entity_sentiment"
# ]].fillna(0).values
#
# X = hstack([X_text, X_entity])
# y = full_df["label"]
#
# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )

In [10]:
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler

# Split BEFORE vectorization
X_text_raw = full_df["text"]
X_entity = full_df[[
    "num_titles", "num_actors", "num_directors",
    "actor_mentions", "director_mentions",
    "entity_sentiment"
]].fillna(0).values
y = full_df["label"]

# Train-test split on raw data
X_text_train, X_text_test, X_entity_train, X_entity_test, y_train, y_test = train_test_split(
    X_text_raw, X_entity, y, test_size=0.2, random_state=42, stratify=y
)

# Now fit TF-IDF only on training text
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_text = tfidf.fit_transform(X_text_train)  # Learn from train only
X_test_text = tfidf.transform(X_text_test)        # Apply to test

# Scale entity features (fit on train, transform both)
scaler = StandardScaler(with_mean=False)  # Sparse-compatible
X_entity_train_scaled = scaler.fit_transform(X_entity_train)
X_entity_test_scaled = scaler.transform(X_entity_test)

# Combine features
X_train = hstack([X_train_text, X_entity_train])
X_test = hstack([X_test_text, X_entity_test])

In [11]:
# Logistic Regression with NER features
log_clf = LogisticRegression(max_iter=500)
log_clf.fit(X_train, y_train)

pred_log = log_clf.predict(X_test)

print("==== Entity-Aware Logistic Regression ====")
print("Accuracy:", accuracy_score(y_test, pred_log))
print("F1 Score:", f1_score(y_test, pred_log))
print(classification_report(y_test, pred_log))

==== Entity-Aware Logistic Regression ====
Accuracy: 0.8674698795180723
F1 Score: 0.8659190638712823
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      1059
           1       0.86      0.87      0.87      1016

    accuracy                           0.87      2075
   macro avg       0.87      0.87      0.87      2075
weighted avg       0.87      0.87      0.87      2075



In [12]:
# SVM with NER features
svm_clf = LinearSVC()
svm_clf.fit(X_train, y_train)

pred_svm = svm_clf.predict(X_test)

print("==== Entity-Aware SVM ====")
print("Accuracy:", accuracy_score(y_test, pred_svm))
print("F1 Score:", f1_score(y_test, pred_svm))
print(classification_report(y_test, pred_svm))

==== Entity-Aware SVM ====
Accuracy: 0.8665060240963856
F1 Score: 0.864812103465105
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      1059
           1       0.86      0.87      0.86      1016

    accuracy                           0.87      2075
   macro avg       0.87      0.87      0.87      2075
weighted avg       0.87      0.87      0.87      2075



In [13]:
# # DistilBERT Model for Sentiment Classification
# import torch
# from torch.utils.data import Dataset, DataLoader
# from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# from torch.optim import AdamW
# from sklearn.metrics import classification_report, accuracy_score, f1_score
# from tqdm import tqdm
# import numpy as np
#
# # Check for GPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")
#
# # ------------------------------
# # Custom Dataset Class
# # ------------------------------
# class ReviewDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_length=512):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_length = max_length
#
#     def __len__(self):
#         return len(self.texts)
#
#     def __getitem__(self, idx):
#         text = str(self.texts.iloc[idx])
#         label = self.labels.iloc[idx]
#
#         encoding = self.tokenizer.encode_plus(
#             text,
#             add_special_tokens=True,
#             max_length=self.max_length,
#             padding='max_length',
#             truncation=True,
#             return_attention_mask=True,
#             return_tensors='pt'
#         )
#
#         return {
#             'input_ids': encoding['input_ids'].flatten(),
#             'attention_mask': encoding['attention_mask'].flatten(),
#             'label': torch.tensor(label, dtype=torch.long)
#         }
#
# # ------------------------------
# # Prepare data
# # ------------------------------
# # Use the same train-test split as before
# X_train_text, X_test_text, y_train, y_test = train_test_split(
#     full_df["text"], full_df["label"],
#     test_size=0.2,
#     random_state=42,
#     stratify=full_df["label"]
# )
#
# # Initialize tokenizer
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
#
# # Create datasets
# train_dataset = ReviewDataset(X_train_text, y_train, tokenizer)
# test_dataset = ReviewDataset(X_test_text, y_test, tokenizer)
#
# # Create dataloaders
# batch_size = 16
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=batch_size)
#
# # ------------------------------
# # Initialize Model
# # ------------------------------
# model = DistilBertForSequenceClassification.from_pretrained(
#     'distilbert-base-uncased',
#     num_labels=2  # binary classification
# )
# model.to(device)
#
# # Optimizer
# optimizer = AdamW(model.parameters(), lr=2e-5)
#
# # ------------------------------
# # Training Function
# # ------------------------------
# def train_epoch(model, dataloader, optimizer, device):
#     model.train()
#     total_loss = 0
#
#     for batch in tqdm(dataloader, desc="Training"):
#         optimizer.zero_grad()
#
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['label'].to(device)
#
#         outputs = model(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             labels=labels
#         )
#
#         loss = outputs.loss
#         total_loss += loss.item()
#
#         loss.backward()
#         optimizer.step()
#
#     return total_loss / len(dataloader)
#
# # ------------------------------
# # Evaluation Function
# # ------------------------------
# def evaluate(model, dataloader, device):
#     model.eval()
#     predictions = []
#     true_labels = []
#
#     with torch.no_grad():
#         for batch in tqdm(dataloader, desc="Evaluating"):
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['label'].to(device)
#
#             outputs = model(
#                 input_ids=input_ids,
#                 attention_mask=attention_mask
#             )
#
#             logits = outputs.logits
#             preds = torch.argmax(logits, dim=1)
#
#             predictions.extend(preds.cpu().numpy())
#             true_labels.extend(labels.cpu().numpy())
#
#     return np.array(predictions), np.array(true_labels)
#
# # ------------------------------
# # Train the Model
# # ------------------------------
# num_epochs = 3
#
# print("Starting training...")
# for epoch in range(num_epochs):
#     print(f"\nEpoch {epoch + 1}/{num_epochs}")
#
#     train_loss = train_epoch(model, train_loader, optimizer, device)
#     print(f"Average training loss: {train_loss:.4f}")
#
#     # Evaluate on test set after each epoch
#     predictions, true_labels = evaluate(model, test_loader, device)
#     accuracy = accuracy_score(true_labels, predictions)
#     f1 = f1_score(true_labels, predictions)
#
#     print(f"Test Accuracy: {accuracy:.4f}")
#     print(f"Test F1 Score: {f1:.4f}")
#
# # ------------------------------
# # Final Evaluation
# # ------------------------------
# print("\n==== DistilBERT Final Results ====")
# predictions, true_labels = evaluate(model, test_loader, device)
#
# print("Accuracy:", accuracy_score(true_labels, predictions))
# print("F1 Score:", f1_score(true_labels, predictions))
# print(classification_report(true_labels, predictions))
#
# # ------------------------------
# # Save model (optional)
# # ------------------------------
# model.save_pretrained("./distilbert_sentiment_model")
# tokenizer.save_pretrained("./distilbert_sentiment_model")
# print("\nModel saved to ./distilbert_sentiment_model")

In [None]:
# TinyBERT Model for Sentiment Classification
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import classification_report, accuracy_score, f1_score
from tqdm import tqdm
import numpy as np

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ------------------------------
# Custom Dataset Class
# ------------------------------
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# ------------------------------
# Prepare data
# ------------------------------
# Use the same train-test split as before
X_train_text, X_test_text, y_train, y_test = train_test_split(
    full_df["text"], full_df["label"],
    test_size=0.2,
    random_state=42,
    stratify=full_df["label"]
)

# Initialize TinyBERT tokenizer and model
print("Loading TinyBERT model...")
tokenizer = AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
model = AutoModelForSequenceClassification.from_pretrained(
    'huawei-noah/TinyBERT_General_4L_312D',
    num_labels=2
)
model.to(device)
print("Model loaded successfully!")

# Create datasets (using max_length=256 for faster training)
train_dataset = ReviewDataset(X_train_text, y_train, tokenizer, max_length=256)
test_dataset = ReviewDataset(X_test_text, y_test, tokenizer, max_length=256)

# Create dataloaders (increased batch size for speed)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# ------------------------------
# Training Function
# ------------------------------
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)

# ------------------------------
# Evaluation Function
# ------------------------------
def evaluate(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return np.array(predictions), np.array(true_labels)

# ------------------------------
# Train the Model
# ------------------------------
num_epochs = 2  # Reduced to 2 for faster training

print("\nStarting training...")
print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")
print(f"Batch size: {batch_size}")
print(f"Epochs: {num_epochs}\n")

for epoch in range(num_epochs):
    print(f"\n{'='*50}")
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print('='*50)

    train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f"Average training loss: {train_loss:.4f}")

    # Evaluate on test set after each epoch
    predictions, true_labels = evaluate(model, test_loader, device)
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)

    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test F1 Score: {f1:.4f}")

# ------------------------------
# Final Evaluation
# ------------------------------
print("\n" + "="*50)
print("==== TinyBERT Final Results ====")
print("="*50)
predictions, true_labels = evaluate(model, test_loader, device)

accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nDetailed Classification Report:")
print(classification_report(true_labels, predictions, target_names=['Negative', 'Positive']))

# ------------------------------
# Save model (optional)
# ------------------------------
save_model = input("\nSave model? (y/n): ").lower()
if save_model == 'y':
    model.save_pretrained("./tinybert_sentiment_model")
    tokenizer.save_pretrained("./tinybert_sentiment_model")
    print("Model saved to ./tinybert_sentiment_model")

Using device: cpu
Loading TinyBERT model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully!

Starting training...
Training samples: 8297
Test samples: 2075
Batch size: 32
Epochs: 2


Epoch 1/2


Training:  74%|███████▍  | 192/260 [18:03<09:10,  8.10s/it]