In [None]:
import pandas as pd

# Load the preprocessed movie table from CSV into a DataFrame.
df = pd.read_csv("movies_preprocessed.csv")

# Preview the first rows, then the cleaned summary and the genre cell of the first movie.
print(df.head())
print(df['clean_summary'].iloc[0], df['genres'].iloc[0], sep="\n")


   movie_id                                       plot_summary  \
0  23890098  Shlykov, a hard-working taxi driver and Lyosha...   
1  31186339  The nation of Panem consists of a wealthy Capi...   
2  20663735  Poovalli Induchoodan  is sentenced for six yea...   
3   2231378  The Lemon Drop Kid , a New York City swindler,...   
4    595909  Seventh-day Adventist Church pastor Michael Ch...   

                                              genres  \
0                          ['Drama', 'World cinema']   
1  ['Action/Adventure', 'Science Fiction', 'Actio...   
2        ['Musical', 'Action', 'Drama', 'Bollywood']   
3                     ['Screwball comedy', 'Comedy']   
4  ['Crime Fiction', 'Drama', 'Docudrama', 'World...   

                                       clean_summary  
0  shlykov hardworking taxi driver lyosha saxopho...  
1  nation panem consists wealthy capitol twelve p...  
2  poovalli induchoodan sentenced six year prison...  
3  lemon drop kid new york city swindler illeg

In [None]:
import ast

def normalize_genres(cell):
    """Return the genre labels held in ``cell`` as a list of trimmed, lowercase strings.

    Parameters
    ----------
    cell : list | str | Any
        Either a real list of labels, or a string containing a Python literal
        such as ``"['Drama', 'World cinema']"``. Any other type yields ``[]``.

    Returns
    -------
    list[str]
        Labels stripped of surrounding whitespace and lowercased. Entries that
        are not strings, or that are empty after stripping, are dropped.
        ``[]`` is returned when the cell cannot be interpreted as a collection
        of labels.
    """
    if isinstance(cell, list):
        genres = cell
    elif isinstance(cell, str):
        try:
            parsed = ast.literal_eval(cell)  # safely parse the stringified literal
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # These are the failures literal_eval documents for malformed input.
            return []
        # A literal that is not a collection (e.g. "5" or "'drama'") is not a genre
        # list: iterating it would either raise or yield single characters.
        genres = parsed if isinstance(parsed, (list, tuple, set)) else []
    else:
        genres = []

    return [g.strip().lower() for g in genres if isinstance(g, str) and g.strip()]



In [None]:
import pandas as pd
import ast
from collections import Counter

# === STEP 1: Normalize Genre Column ===

def safe_parse(genres):
    """Return ``genres`` as a list, parsing it first when it is a stringified list.

    Parameters
    ----------
    genres : list | str | Any
        A real list (returned unchanged), a string holding a Python list
        literal such as ``"['Drama', 'Comedy']"``, or any other value.

    Returns
    -------
    list
        The list itself, the parsed list, or ``[]`` when the value is neither
        a list nor a string that parses to one.
    """
    if isinstance(genres, list):
        return genres
    if not isinstance(genres, str):
        return []
    try:
        parsed = ast.literal_eval(genres)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the failures literal_eval documents for malformed input mean
        # "not a list"; anything else (e.g. KeyboardInterrupt) must propagate.
        return []
    return parsed if isinstance(parsed, list) else []

# Turn every genre cell into a real list, then trim and lowercase each label.
df['genres'] = df['genres'].apply(safe_parse).apply(
    lambda labels: [label.strip().lower() for label in labels]
)

# === STEP 2: Count Individual Genres ===

# One Counter fed with every label of every movie.
genre_counter = Counter(
    label
    for row_labels in df['genres']
    for label in row_labels
)
print(f"Total unique genres: {len(genre_counter)}")
print(genre_counter.most_common(20))

Total unique genres: 363
[('drama', 19134), ('comedy', 10467), ('romance film', 6666), ('thriller', 6530), ('action', 5868), ('world cinema', 5153), ('crime fiction', 4275), ('horror', 4082), ('black-and-white', 3731), ('indie', 3668), ('action/adventure', 3553), ('adventure', 3248), ('family film', 3219), ('short film', 3192), ('romantic drama', 2572), ('animation', 2441), ('musical', 2414), ('science fiction', 2339), ('mystery', 2119), ('romantic comedy', 2075)]


In [None]:
from collections import Counter

# Step 1: collect every normalized label from every movie and tally them
all_genres = [
    label.strip().lower()
    for movie_genres in df['genres']
    for label in movie_genres
]
genre_counts = Counter(all_genres)

# Step 2: a genre is kept only if it occurs at least `min_threshold` times
min_threshold = 1000
valid_genres = {genre for genre in genre_counts if genre_counts[genre] >= min_threshold}

# Step 3: normalize each movie's labels and drop the ones that are too rare
df['genres'] = df['genres'].apply(
    lambda genres: [
        label
        for label in (raw.strip().lower() for raw in genres)
        if label in valid_genres
    ]
)

# Optional: discard movies whose genre list is now empty and renumber the rows
df = df[df['genres'].map(len).gt(0)].reset_index(drop=True)


In [None]:
# Recount the labels that are still present in df['genres'] and list the 33 most frequent.
genre_counter = Counter(
    label
    for row_labels in df['genres']
    for label in row_labels
)
print(f"Total unique genres: {len(genre_counter)}")
print(genre_counter.most_common(33))

Total unique genres: 33
[('drama', 19134), ('comedy', 10467), ('romance film', 6666), ('thriller', 6530), ('action', 5868), ('world cinema', 5153), ('crime fiction', 4275), ('horror', 4082), ('black-and-white', 3731), ('indie', 3668), ('action/adventure', 3553), ('adventure', 3248), ('family film', 3219), ('short film', 3192), ('romantic drama', 2572), ('animation', 2441), ('musical', 2414), ('science fiction', 2339), ('mystery', 2119), ('romantic comedy', 2075), ('fantasy', 2012), ('comedy film', 1778), ('crime thriller', 1682), ('war film', 1556), ('period piece', 1321), ('japanese movies', 1290), ('comedy-drama', 1261), ('film adaptation', 1225), ('documentary', 1213), ('silent film', 1199), ('psychological thriller', 1138), ('bollywood', 1058), ('western', 1022)]


In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils.class_weight import compute_class_weight

# Reload the preprocessed table from disk (this rebinds `df`).
df = pd.read_csv("movies_preprocessed.csv")

# String cells are parsed into Python lists; non-string cells are passed through untouched.
# NOTE(review): literal_eval is unguarded here, so a malformed cell raises — confirm the CSV is clean.
import ast
df['genres'] = df['genres'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

# Keep only genres seen at least `min_threshold` times, then drop movies left without any genre.
from collections import Counter
all_genres = [
    label.strip().lower()
    for movie_genres in df['genres']
    for label in movie_genres
]
genre_counts = Counter(all_genres)
min_threshold = 1000
valid_genres = {g for g in genre_counts if genre_counts[g] >= min_threshold}
df['genres'] = df['genres'].apply(
    lambda g: [x for x in (raw.strip().lower() for raw in g) if x in valid_genres]
)
df = df[df['genres'].map(len).gt(0)].reset_index(drop=True)


In [None]:
# Encode each movie's genre list as one indicator row of the label matrix `y`.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df['clean_summary'], y, random_state=42, test_size=0.2)


In [None]:
from transformers import RobertaTokenizer

# Load the pretrained 'roberta-base' tokenizer; tokenize_data below reads it as a notebook-level global.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_data(texts, max_len=256):
    """Encode ``texts`` with the notebook-level ``tokenizer`` as fixed-length PyTorch tensors.

    Parameters
    ----------
    texts : iterable
        Items to encode (presumably plot-summary strings — confirm against the
        caller). Materialized with ``list()`` before tokenization.
    max_len : int, default 256
        Passed as ``max_length``; with ``padding='max_length'`` and
        ``truncation=True`` every sequence is padded to / cut at this length.

    Returns
    -------
    The object returned by the tokenizer call with ``return_tensors='pt'``.
    """
    encode_kwargs = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_len,
        'return_tensors': 'pt',
    }
    return tokenizer(list(texts), **encode_kwargs)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
from torch.utils.data import Dataset, DataLoader

class GenreDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with multi-label targets.

    ``encodings`` must support ``encodings['input_ids'][idx]`` and
    ``encodings['attention_mask'][idx]``; ``labels`` is converted once to a
    ``float32`` tensor at construction time.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        # Number of samples == number of label rows.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pull the idx-th entry of each encoding field, then attach its label row.
        sample = {
            field: self.encodings[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = self.labels[idx]
        return sample

# Encode both splits up front (train first, then test).
train_encodings, test_encodings = (tokenize_data(texts) for texts in (X_train, X_test))

# Pair each set of encodings with its label matrix.
train_dataset = GenreDataset(encodings=train_encodings, labels=y_train)
test_dataset = GenreDataset(encodings=test_encodings, labels=y_test)

# Batches of 16; only the training batches are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)


In [None]:
from transformers import RobertaModel, RobertaConfig
import torch.nn as nn

class RobertaMultiLabelClassifier(nn.Module):
    """Pretrained ``roberta-base`` encoder with a dropout + linear head, one logit per label.

    NOTE(review): this notebook defines a class with this same name again in
    later cells; whichever definition ran last is the one in effect. Here the
    head is a single ``nn.Sequential`` stored as ``self.classifier``, while the
    later definitions keep ``self.dropout`` and ``self.classifier`` as separate
    attributes — presumably the parameter names in ``state_dict()`` differ, so
    confirm before sharing checkpoints between them.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        hidden_size = self.roberta.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(hidden_size, num_labels),
        )

    def forward(self, input_ids, attention_mask):
        """Return raw logits (no sigmoid is applied here)."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 along the second axis of the last hidden state (the CLS token).
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.classifier(first_token)


In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# One weight per genre column of y_train. compute_class_weight is asked for
# classes [0, 1], so element 1 of its result is the weight of the positive class.
# NOTE(review): this 'balanced' positive weight is later used as BCEWithLogitsLoss
# pos_weight; PyTorch's docs describe pos_weight as a negatives/positives ratio —
# confirm this is the intended scaling.
class_weights = [
    compute_class_weight(
        class_weight='balanced',
        classes=np.array([0, 1]),
        y=y_train[:, column],
    )[1]
    for column in range(y_train.shape[1])
]

import torch

# Pack the per-genre weights into a float tensor.
pos_weight = torch.tensor(class_weights, dtype=torch.float)




In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:
from transformers import RobertaModel
import torch.nn as nn

class RobertaMultiLabelClassifier(nn.Module):
    """Pretrained ``roberta-base`` encoder, then dropout, then a linear layer with one logit per label."""

    def __init__(self, num_labels):
        super().__init__()
        encoder = RobertaModel.from_pretrained("roberta-base")
        self.roberta = encoder
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw logits (no sigmoid is applied here)."""
        hidden_states = self.roberta(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        # Position 0 along the second axis of the last hidden state (the CLS token).
        first_token = hidden_states[:, 0]
        return self.classifier(self.dropout(first_token))

# Build the classifier with one output per binarized genre and place it on the chosen device.
model = RobertaMultiLabelClassifier(num_labels=len(mlb.classes_))
model = model.to(device)

# AdamW over every model parameter.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# Multi-label loss on raw logits, weighted per genre by pos_weight (moved to the same device).
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(device))



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def train_model(epochs=3, model_save_path='best_roberta_model.pt'):
    """Fine-tune the notebook-level ``model`` and checkpoint the weights with the lowest validation loss.

    Relies on globals defined in earlier cells: ``model``, ``optimizer``,
    ``criterion``, ``device``, ``train_loader`` and ``test_loader``.

    Parameters
    ----------
    epochs : int, default 3
        Number of full passes over ``train_loader``.
    model_save_path : str, default 'best_roberta_model.pt'
        Where ``model.state_dict()`` is written each time the validation loss improves.

    Returns
    -------
    None
        Progress is reported through the tqdm bar and printed per-epoch losses.

    NOTE(review): the validation loss is computed on ``test_loader``, so the
    checkpoint is selected on the same split that is later used for evaluation
    — confirm this is acceptable or carve out a separate validation split.
    """
    # Imported here because no cell in the notebook imports tqdm, so a fresh
    # kernel would otherwise raise NameError on the first epoch.
    from tqdm import tqdm

    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        # The description is constant for the whole epoch, so set it once up front.
        loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch + 1}")

        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_train_loss += batch_loss
            loop.set_postfix(train_loss=batch_loss)

        avg_train_loss = total_train_loss / len(train_loader)

        # === VALIDATION STEP ===
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(test_loader)
        print(f"\nEpoch {epoch + 1} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Save if this is the best model so far
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), model_save_path)
            print(f"✅ Saved new best model to {model_save_path}")


In [None]:
train_model(epochs=3)

Epoch 1: 100%|██████████| 2057/2057 [22:48<00:00,  1.50it/s, train_loss=0.582]



Epoch 1 - Train Loss: 0.6521, Val Loss: 0.5554
✅ Saved new best model to best_roberta_model.pt


Epoch 2: 100%|██████████| 2057/2057 [22:50<00:00,  1.50it/s, train_loss=0.643]



Epoch 2 - Train Loss: 0.5373, Val Loss: 0.5321
✅ Saved new best model to best_roberta_model.pt


Epoch 3: 100%|██████████| 2057/2057 [22:50<00:00,  1.50it/s, train_loss=0.668]



Epoch 3 - Train Loss: 0.4798, Val Loss: 0.5203
✅ Saved new best model to best_roberta_model.pt


In [None]:
# Restore the best checkpoint written during training. map_location=device lets a
# checkpoint saved from a GPU run load on a CPU-only session as well.
model.load_state_dict(torch.load('best_roberta_model.pt', map_location=device))
model.to(device)
# Last expression of the cell: switches to eval mode and displays the model repr.
model.eval()


RobertaMultiLabelClassifier(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
import numpy as np
import torch

def evaluate_model(model, dataloader, threshold=0.5, multilabel_binarizer=None, device='cuda'):
    """Score a multi-label classifier on ``dataloader`` and print summary metrics.

    Parameters
    ----------
    model : callable
        Called as ``model(input_ids=..., attention_mask=...)`` and expected to
        return logits; put into eval mode via ``model.eval()``.
    dataloader : iterable
        Yields batches with ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
    threshold : float, default 0.5
        A label is predicted when its sigmoid probability is ``>= threshold``.
    multilabel_binarizer : optional
        When given, its ``classes_`` are used as row names of a per-label
        classification report.
    device : str or torch.device, default 'cuda'
        Device the input tensors are moved to before the forward pass.

    Returns
    -------
    numpy.ndarray
        Integer 0/1 predictions, one row per evaluated sample.

    Raises
    ------
    ValueError
        If ``dataloader`` yields no batches.
    """
    model.eval()
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            all_probs.append(torch.sigmoid(logits).cpu().numpy())
            # Labels are only consumed by the metric functions on the host, so
            # they are not copied to the accelerator and back.
            all_labels.append(batch['labels'].cpu().numpy())

    if not all_probs:
        raise ValueError("dataloader yielded no batches; nothing to evaluate")

    all_probs = np.concatenate(all_probs)
    all_labels = np.concatenate(all_labels)

    y_pred = (all_probs >= threshold).astype(int)

    # === Main Evaluation Metrics ===
    # zero_division=0 matches the classification_report call below: a label with
    # no predicted (or no true) positives scores 0 without emitting a warning.
    print("\n=== Evaluation Metrics ===")
    print(f"Micro F1:        {f1_score(all_labels, y_pred, average='micro', zero_division=0):.4f}")
    print(f"Macro F1:        {f1_score(all_labels, y_pred, average='macro', zero_division=0):.4f}")
    print(f"Weighted F1:     {f1_score(all_labels, y_pred, average='weighted', zero_division=0):.4f}")
    print(f"Micro Precision: {precision_score(all_labels, y_pred, average='micro', zero_division=0):.4f}")
    print(f"Macro Precision: {precision_score(all_labels, y_pred, average='macro', zero_division=0):.4f}")
    print(f"Micro Recall:    {recall_score(all_labels, y_pred, average='micro', zero_division=0):.4f}")
    print(f"Macro Recall:    {recall_score(all_labels, y_pred, average='macro', zero_division=0):.4f}")

    # Explicit None check: the argument is optional and should not depend on the
    # truthiness of whatever object is passed.
    if multilabel_binarizer is not None:
        print("\n=== Per-Genre Performance ===")
        print(classification_report(
            all_labels, y_pred, target_names=multilabel_binarizer.classes_, zero_division=0
        ))

    return y_pred



In [None]:
# Make sure the model sits on the selected device (GPU when available).
model.to(device)

# Score the test split at a 0.5 threshold and keep the thresholded predictions.
predictions = evaluate_model(model, test_loader, threshold=0.5, multilabel_binarizer=mlb, device=device)




=== Evaluation Metrics ===
Micro F1:        0.4877
Macro F1:        0.4504
Weighted F1:     0.5284
Micro Precision: 0.3680
Macro Precision: 0.3421
Micro Recall:    0.7227
Macro Recall:    0.7317

=== Per-Genre Performance ===
                        precision    recall  f1-score   support

                action       0.45      0.78      0.57      1175
      action/adventure       0.33      0.79      0.47       717
             adventure       0.32      0.79      0.46       620
             animation       0.44      0.89      0.58       509
       black-and-white       0.35      0.56      0.43       701
             bollywood       0.22      0.94      0.35       223
                comedy       0.59      0.63      0.61      2083
           comedy film       0.12      0.59      0.20       376
          comedy-drama       0.14      0.56      0.22       261
         crime fiction       0.46      0.64      0.54       788
        crime thriller       0.29      0.68      0.40       338
    

In [None]:
import torch

# Save model weights
# (only the state_dict is written, so the model class is needed again to reload it)
torch.save(model.state_dict(), "roberta_genre_classifier.pt")

# Save tokenizer
# Left as the cell's last expression, so the notebook displays what save_pretrained returns.
tokenizer.save_pretrained("roberta_tokenizer")


('roberta_tokenizer/tokenizer_config.json',
 'roberta_tokenizer/special_tokens_map.json',
 'roberta_tokenizer/vocab.json',
 'roberta_tokenizer/merges.txt',
 'roberta_tokenizer/added_tokens.json')

In [None]:
# Recreate the model with same architecture
model = RobertaMultiLabelClassifier(num_labels=len(mlb.classes_))
# map_location=device lets a checkpoint saved from a GPU run load on a CPU-only session as well.
model.load_state_dict(torch.load("roberta_genre_classifier.pt", map_location=device))
model.to(device)
# A freshly constructed module starts in training mode; switch to eval so dropout is off for inference.
model.eval()

# Load tokenizer
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta_tokenizer")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Re-import necessary components
import torch
from transformers import RobertaTokenizer
import torch.nn as nn

# Define your model class again (same as before)
class RobertaMultiLabelClassifier(nn.Module):
    """Pretrained ``roberta-base`` encoder, then dropout, then a linear layer with one logit per label."""

    def __init__(self, num_labels):
        super().__init__()
        # Imported inside the constructor: this cell only imports RobertaTokenizer at the top.
        from transformers import RobertaModel
        encoder = RobertaModel.from_pretrained("roberta-base")
        self.roberta = encoder
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw logits (no sigmoid is applied here)."""
        hidden_states = self.roberta(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        # Position 0 along the second axis of the last hidden state.
        first_token = hidden_states[:, 0]
        return self.classifier(self.dropout(first_token))

# Load the checkpoint first so the number of output labels is read from the saved
# weights instead of being hard-coded.
state_dict = torch.load("roberta_genre_classifier.pt", map_location=torch.device('cpu'))  # or 'cuda'
# nn.Linear stores its weight as (out_features, in_features): rows == number of labels.
num_labels = state_dict['classifier.weight'].shape[0]
model = RobertaMultiLabelClassifier(num_labels)
model.load_state_dict(state_dict)
del state_dict  # the values were copied into the model; release the extra copy
# Last expression of the cell: switches to eval mode and displays the model repr.
model.eval()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaMultiLabelClassifier(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              

In [None]:
# Load the stock 'roberta-base' tokenizer.
# NOTE(review): this rebinds `tokenizer`, replacing the one an earlier cell loaded
# from the local "roberta_tokenizer" directory — confirm that is intended.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


In [None]:
def predict_genres(text, model, tokenizer, threshold=0.5, label_names=None):
    """Predict genre labels for a plot summary.

    Parameters
    ----------
    text : str
        Text to classify; tokenized with truncation and padding to 256 tokens.
    model : callable
        Called as ``model(input_ids, attention_mask)`` and expected to return
        logits. The caller is responsible for having it in eval mode.
    tokenizer : callable
        Called with ``return_tensors="pt"``; its result must expose
        ``'input_ids'`` and ``'attention_mask'`` tensors.
    threshold : float, default 0.5
        A label is predicted when its sigmoid probability is ``>= threshold``.
    label_names : sequence of str, optional
        Names aligned with the model outputs. May be a list or a NumPy array
        (e.g. ``mlb.classes_``).

    Returns
    -------
    list or numpy.ndarray
        The names of the predicted labels when ``label_names`` is given and
        non-empty, otherwise the 0/1 integer prediction array.
    """
    # Tokenize input
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=256
    )

    # Run on whatever device the model's weights live on; a callable without
    # parameters is assumed to work on CPU tensors.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device("cpu")

    # Inference
    with torch.no_grad():
        logits = model(inputs['input_ids'].to(device), inputs['attention_mask'].to(device))
        probs = torch.sigmoid(logits).squeeze()
        if probs.dim() == 0:
            # A single-label model squeezes down to a scalar; keep one axis so
            # the result can still be zipped with label_names.
            probs = probs.unsqueeze(0)
        # .cpu() is required before .numpy() when the model runs on an accelerator.
        probs = probs.cpu().numpy()

    # Thresholding
    predictions = (probs >= threshold).astype(int)

    # Show labels. The explicit None/length test avoids the "truth value of an
    # array is ambiguous" error that a bare `if label_names:` raises for arrays.
    if label_names is not None and len(label_names) > 0:
        return [label for label, flag in zip(label_names, predictions) if flag == 1]
    return predictions


In [None]:
# Example text
# Written as adjacent string literals inside parentheses: a plain quoted string
# cannot span physical lines, and the pieces below join into one single-line summary.
sample_summary = (
    "Lola  attempts to gain her father's trust fund  by hiring a Hispanic husband, Bo , offering him $100,000 per year if he marries her. "
    "Bo accepts, and then departs from his Texas home and returns with Lola to her home in New Mexico. "
    "Lola finds out he is in fact Caucasian. "
    "Lola's best friend, Nina , and Bo's brother  join the trip, as does Bo's girlfriend, Nikki , and her clumsy friend, Dotty , in secret. "
    "When introduced to Bo, Lola's father, Victor , refuses to allow them to get married, until Bo convinces him by playing an antique guitar. "
    "In the mean time, Lola's ex-fiancÃ©, Marco , and his secret girlfriend, Simona , who is also Lola's sister trying to keep her from having the inheritance, plot against Lola in order to gain the trust fund for themselves. "
    "Throughout the course of the story, Lola and Bo actually fall for each other. "
    "However, after Simona discovers Bo and Lola have a plot of their own, she uses Nikki to ruin the plot. "
    "However, before Nikki could do any damage, Victor and Bo get into an argument, which has both Lola and her father kick Bo out. "
    "In response, Lola's ex-fiancÃ© is able to win back Lola's heart, but Lola only agrees for the money. "
    "Later that night, Nikki and Bo discover Simona and Marco's plot and are consequently held prisoner, so they may not warn anyone. "
    "However, Nikki's clumsy friend Dotty finds Nikki and Bo, frees them, and Bo is able to warn Lola. "
    "Still angry about the previous night, Lola kicks Bo out once more and proceeds to marry for the money. "
    "Fortunately for her, she trusts Bo's words and turns the tables on Simona and Marco, and also turns down the money from her father and departs, claiming that money ruined the family long before that day. "
    "She does not want the money if it means turning into a selfish and greedy person like her father.  "
    "At the end of the film, Bo is working as a bartender, where Lola manages to find him. "
    "They reconcile and show their love for each other once again, and they remain together and happy."
)

# Label names passed to predict_genres, written out by hand.
# With the fitted MultiLabelBinarizer still in memory, `mlb.classes_` holds the names instead.
# NOTE(review): the order here is presumably the same as mlb.classes_ — confirm before relying on it.
labels = [
    'action', 'action/adventure', 'adventure', 'animation',
    'black-and-white', 'bollywood',
    'comedy', 'comedy film', 'comedy-drama', 'crime fiction', 'crime thriller',
    'documentary', 'drama',
    'family film', 'fantasy', 'film adaptation',
    'horror',
    'indie',
    'japanese movies',
    'musical', 'mystery',
    'period piece', 'psychological thriller',
    'romance film', 'romantic comedy', 'romantic drama',
    'science fiction', 'short film', 'silent film',
    'thriller',
    'war film', 'western', 'world cinema',
]

# Classify the sample summary with a strict 0.9 threshold and show the result.
predicted_genres = predict_genres(
    sample_summary,
    model,
    tokenizer,
    threshold=0.9,
    label_names=labels,
)
print("Predicted Genres:", predicted_genres)


Predicted Genres: ['romantic comedy']


In [None]:
# === Save the MultiLabelBinarizer for future use (e.g., in GUI) ===
import pickle

# Pickle the binarizer object to mlb.pkl in the working directory.
# NOTE: unpickling executes code from the file, so only load mlb.pkl from a trusted source.
with open("mlb.pkl", "wb") as f:
    pickle.dump(mlb, f)
