In [1]:
from typing import Any, Dict, Optional, Tuple

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertTokenizer

In [2]:
# Load the pretrained (uncased) BERT tokenizer; it is used below to turn
# descriptions into token ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Read the movie table from a CSV in the current working directory.
movies_df = pd.read_csv('top_movies.csv')
# Last expression of the cell: number of missing values per column.
movies_df.isna().sum()

movie_name     0
genre          3
description    0
dtype: int64

In [3]:
# Drop every row that has at least one missing value. The name is rebound, so
# the unfiltered frame is no longer reachable after this cell runs.
movies_df = movies_df.dropna()
# Re-check: the per-column missing counts should now all be zero.
movies_df.isna().sum()

movie_name     0
genre          0
description    0
dtype: int64

In [4]:
# Each `genre` cell is a comma-separated list; collect every distinct,
# whitespace-trimmed genre name across the whole frame.
all_genres = {
    name.strip()
    for genre_field in movies_df['genre']
    for name in genre_field.split(',')
}

# Sort before numbering so the genre -> column index mapping is deterministic.
genre_to_index = dict(
    (genre, idx) for idx, genre in enumerate(sorted(all_genres))
)

print("Genre to index mapping:", genre_to_index)

Genre to index mapping: {'Action': 0, 'Adventure': 1, 'Animation': 2, 'Comedy': 3, 'Crime': 4, 'Drama': 5, 'Family': 6, 'Fantasy': 7, 'History': 8, 'Horror': 9, 'Music': 10, 'Mystery': 11, 'Romance': 12, 'Science Fiction': 13, 'TV Movie': 14, 'Thriller': 15, 'War': 16, 'Western': 17}


In [5]:
class MovieDescriptionDataset(Dataset):
    """Map-style dataset of tokenized descriptions with multi-hot genre targets.

    Args:
        dataframe: Table with a ``description`` column (text) and a ``genre``
            column (comma-separated genre names).
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping with ``input_ids`` and ``attention_mask``.
        genre_to_index: Genre name -> position in the target vector.
        max_length: Length every description is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, genre_to_index: Dict[str, int], max_length: int = 128):
        self.tokenizer = tokenizer
        self.genre_to_index = genre_to_index
        self.max_length = max_length
        # Plain Python lists so that items are looked up by position,
        # independent of the dataframe's index labels.
        self.descriptions = dataframe['description'].tolist()
        self.genres = dataframe['genre'].tolist()

    def __len__(self) -> int:
        """Number of examples (one per dataframe row)."""
        return len(self.descriptions)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Tokenize example ``idx`` and build its multi-hot label vector.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (leading batch
            dimension of the tokenizer output removed) and ``labels``, a float
            vector with one entry per known genre.
        """
        tokenized = self.tokenizer(
            self.descriptions[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Float zeros: the target is consumed by a BCE-style loss downstream.
        target = torch.zeros(len(self.genre_to_index))
        for raw_name in self.genres[idx].split(','):
            position = self.genre_to_index.get(raw_name.strip())
            # Genre names missing from the mapping are skipped silently.
            if position is not None:
                target[position] = 1

        # return_tensors='pt' yields a leading batch dimension of size 1;
        # drop it so the DataLoader can stack examples itself.
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': target,
        }

In [6]:
# Hold out 20% of the rows for validation; random_state is fixed so the split
# is the same on every run. NOTE: no `stratify` argument is passed, so rare
# genres are not guaranteed to appear in both splits.
train_df, val_df = train_test_split(movies_df, test_size=0.2, random_state=42)

In [7]:
# Wrap both splits with the same tokenizer and genre mapping so label columns
# line up. max_length=128 must not exceed the classifier's max_seq_length.
train_dataset = MovieDescriptionDataset(train_df, tokenizer, genre_to_index, max_length=128)
val_dataset = MovieDescriptionDataset(val_df, tokenizer, genre_to_index, max_length=128)

In [8]:
# Shuffle only the training batches; validation order is kept fixed.
# The batch size is given as a literal here.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [9]:
class TransformerBlock(nn.Module):
    """Post-norm Transformer encoder block: self-attention, then a feed-forward MLP.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        emb_dim: Size of the last dimension of the input and output.
        num_heads: Number of attention heads (``emb_dim`` must be divisible
            by it; ``nn.MultiheadAttention`` enforces this).
        feedforward_dim: Hidden width of the feed-forward MLP.
        dropout: Dropout probability used after attention, inside the MLP and
            after the MLP.
    """

    def __init__(self, emb_dim: int, 
                 num_heads: int, 
                 feedforward_dim: int = 256, 
                 dropout: float = 0.3):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, sequence, emb_dim).
        self.attention = nn.MultiheadAttention(embed_dim=emb_dim, num_heads=num_heads, batch_first=True)
        self.layernorm1 = nn.LayerNorm(emb_dim)
        self.dropout1 = nn.Dropout(dropout)

        self.feedforward = nn.Sequential(
            nn.Linear(emb_dim, feedforward_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(feedforward_dim, emb_dim)
        )
        self.layernorm2 = nn.LayerNorm(emb_dim)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor,
                key_padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Apply self-attention and the feed-forward MLP to ``x``.

        Args:
            x: Tensor of shape ``(batch, sequence, emb_dim)``.
            key_padding_mask: Optional boolean tensor of shape
                ``(batch, sequence)`` where ``True`` marks positions that must
                NOT be attended to (i.e. padding). This is the inverse of a
                tokenizer-style attention mask (1 = real token). When omitted,
                every position is attended to, as before. Every row must keep
                at least one unmasked position, otherwise the softmax over an
                empty set yields NaNs.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The attention weights were always discarded, so skip computing them:
        # need_weights=False lets PyTorch use its optimized attention kernel.
        attn_output, _ = self.attention(
            x, x, x,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )
        x = self.layernorm1(x + self.dropout1(attn_output))  # Residual + LayerNorm

        # Feed-forward block (applied independently at every position)
        ff_output = self.feedforward(x)
        x = self.layernorm2(x + self.dropout2(ff_output))  # Residual + LayerNorm

        return x


In [10]:
class TextClassifier(nn.Module):
    """Small Transformer encoder that maps token ids to per-class logits.

    Pipeline: token embedding + learned positional embedding -> stacked
    ``TransformerBlock`` layers -> mean pooling over the sequence ->
    ``Linear -> ReLU -> Dropout -> LayerNorm -> Linear``.

    Args:
        vocab_size: Number of rows in the token embedding table.
        emb_dim: Embedding / model width.
        hidden_dim: Width of the hidden layer in the classification head.
        num_classes: Number of output logits.
        num_heads: Attention heads per Transformer block.
        max_seq_length: Longest sequence the positional embedding supports.
        num_attention_layers: Number of stacked Transformer blocks.
        feedforward_dim: Hidden width of each block's feed-forward MLP.
    """

    def __init__(
        self,
        vocab_size: int,
        emb_dim: int,
        hidden_dim: int,
        num_classes: int,
        num_heads: int = 4,
        max_seq_length: int = 128,
        num_attention_layers: int = 2,
        feedforward_dim: int = 256
    ) -> None:
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        # Learned positional embedding, initialised to zeros and trained with
        # the rest of the model.
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_seq_length, emb_dim))

        # Stack Transformer blocks
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(emb_dim, num_heads, feedforward_dim) for _ in range(num_attention_layers)
        ])

        self.dropout = nn.Dropout(0.3)

        self.fc1 = nn.Linear(emb_dim, hidden_dim)
        self.layernorm_final = nn.LayerNorm(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x: torch.Tensor,
                attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Compute raw (pre-sigmoid) logits for a batch of token id sequences.

        Args:
            x: Integer token ids of shape ``(batch, sequence)``.
            attention_mask: Optional tensor of shape ``(batch, sequence)`` with
                non-zero entries for real tokens and zeros for padding (the
                layout of the ``attention_mask`` produced by the dataset).
                When given, pooling averages over real tokens only. When
                omitted, all positions are averaged, as before. In this method
                the mask affects the pooling step only.

        Returns:
            Tensor of shape ``(batch, num_classes)``.

        Raises:
            ValueError: If the sequence is longer than the positional
                embedding, or ``attention_mask`` does not match ``x``'s shape.
        """
        seq_len = x.size(1)
        max_len = self.positional_encoding.size(1)
        if seq_len > max_len:
            # Previously this surfaced as an opaque broadcasting error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_length {max_len}"
            )
        if attention_mask is not None and attention_mask.shape != x.shape:
            raise ValueError(
                f"attention_mask shape {tuple(attention_mask.shape)} does not "
                f"match input shape {tuple(x.shape)}"
            )

        x = self.emb(x) + self.positional_encoding[:, :seq_len, :]

        for transformer_block in self.transformer_blocks:
            x = transformer_block(x)

        if attention_mask is None:
            x = x.mean(dim=1)
        else:
            # Masked mean: padded positions get weight 0 so they do not dilute
            # the pooled representation of short descriptions.
            weights = (attention_mask != 0).to(device=x.device, dtype=x.dtype).unsqueeze(-1)
            # clamp guards against division by zero for an all-padding row.
            x = (x * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)

        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.layernorm_final(x)
        x = self.fc2(x)

        return x


In [11]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [12]:
# Size the embedding table from the tokenizer itself instead of a hard-coded
# number, so every id the tokenizer can emit has a row (30522 for
# 'bert-base-uncased'; len() also counts any tokens added later).
vocab_size = len(tokenizer)
emb_dim = 128
hidden_dim = 256

# One output logit per genre in the mapping.
num_classes = len(genre_to_index)
# Must be >= the max_length used when the datasets were built (128).
max_seq_length = 128

# NOTE(review): the DataLoaders above were created with a literal 16; keep
# this value in sync with them.
batch_size = 16

In [13]:
# Seed PyTorch before building the model so weight initialisation (and the
# subsequent shuffling/dropout draws) are repeatable after a kernel restart.
torch.manual_seed(42)

# Pass max_seq_length explicitly so the positional embedding always follows
# the configured constant instead of silently relying on the class default.
model = TextClassifier(
    vocab_size,
    emb_dim,
    hidden_dim,
    num_classes,
    num_heads=8,
    max_seq_length=max_seq_length,
).to(device)
# Multi-label objective: an independent sigmoid + binary cross-entropy per
# genre, applied directly to the raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Train for a fixed number of epochs, reporting the mean batch loss per epoch.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # enable dropout for training
    total_loss = 0

    print(f"\nEpoch [{epoch + 1}/{num_epochs}]")
    loop = tqdm(train_dataloader, leave=True)

    for batch in loop:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass: only the token ids are fed to the model.
        outputs = model(batch['input_ids'].to(device))
        loss = criterion(outputs, batch['labels'].to(device))

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    # Mean of the per-batch losses (each batch weighted equally).
    avg_loss = total_loss / len(train_dataloader)
    print(f"Average Training Loss: {avg_loss:.4f}")


Epoch [1/10]


100%|█████████████████████████████| 471/471 [01:09<00:00,  6.82it/s, loss=0.319]


Average Training Loss: 0.3652

Epoch [2/10]


 43%|████████████▍                | 203/471 [00:32<00:41,  6.48it/s, loss=0.365]

In [None]:
def evaluate_model(model: torch.nn.Module, 
                   dataloader: DataLoader, 
                   threshold: float = 0.5, 
                   device: str = 'cpu') -> Tuple[float, float, float, float]:
    """Score a multi-label classifier on ``dataloader`` and print the results.

    The model is switched to eval mode (and left in it). Logits are passed
    through a sigmoid and a label is predicted when its probability is
    ``>= threshold``.

    Args:
        model: Module called as ``model(input_ids)`` that returns logits.
        dataloader: Yields dict batches with ``input_ids`` and ``labels``.
        threshold: Probability cut-off for predicting a label.
        device: Device the input ids are moved to before the forward pass.

    Returns:
        ``(f1, precision, recall, subset_accuracy)`` where the first three are
        macro-averaged over labels (undefined cases counted as 0) and the last
        is the fraction of samples whose full label set is predicted exactly.
    """
    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['input_ids'].to(device))
            probabilities = torch.sigmoid(logits).cpu().numpy()

            # One row per sample is appended to each list.
            y_true.extend(batch['labels'].cpu().numpy())
            y_pred.extend((probabilities >= threshold).astype(int))

    # Shared keyword arguments for the macro-averaged multi-label metrics.
    macro_kwargs = {'average': 'macro', 'zero_division': 0}
    f1 = f1_score(y_true, y_pred, **macro_kwargs)
    precision = precision_score(y_true, y_pred, **macro_kwargs)
    recall = recall_score(y_true, y_pred, **macro_kwargs)
    subset_acc = accuracy_score(y_true, y_pred)

    print(f"Validation F1 Score: {f1:.4f}")
    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation Subset Accuracy: {subset_acc:.4f}\n")

    return f1, precision, recall, subset_acc

In [None]:
# Score the trained model on the held-out split (default threshold of 0.5);
# the returned metric tuple is displayed as the cell output.
evaluate_model(model=model, dataloader=val_dataloader, device=device)

In [None]:
def predict_genres(model, tokenizer, description, index_to_genre, threshold=0.7, max_length=128, device='cpu'):
    """Predict the genres of a single movie description.

    Args:
        model: Module called as ``model(input_ids)`` that returns logits of
            shape ``(1, num_genres)``; it is switched to eval mode.
        tokenizer: Callable returning a mapping with ``input_ids`` when called
            with the description and padding/truncation options.
        description: Text to classify.
        index_to_genre: Lookup from logit position to genre name.
        threshold: Minimum sigmoid probability for a genre to be returned.
        max_length: Length the description is padded/truncated to.
        device: Device the token ids are moved to before the forward pass.

    Returns:
        List of genre names whose probability is ``>= threshold``, in
        increasing index order (possibly empty).
    """
    model.eval()

    tokenized = tokenizer(
        description,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    token_ids = tokenized['input_ids'].to(device)

    with torch.no_grad():
        probabilities = torch.sigmoid(model(token_ids))

    # Only one description is scored, so read row 0 of the batch.
    above_threshold = probabilities[0] >= threshold
    selected = above_threshold.nonzero(as_tuple=True)[0].tolist()
    return [index_to_genre[position] for position in selected]
