In [3]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from typing import Dict, Any, Tuple, Optional

In [4]:
# Read the raw movie table, then fetch the pretrained BERT tokenizer.
movies_df = pd.read_csv('top_movies.csv')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Cell output: number of missing values in each column.
movies_df.isna().sum()

movie_name     0
genre          3
description    0
dtype: int64

In [5]:
# Discard every row that has a missing value in any column.
movies_df = movies_df.dropna(axis=0, how='any')

# Cell output: re-check the per-column missing-value counts after the drop.
movies_df.isna().sum()

movie_name     0
genre          0
description    0
dtype: int64

In [6]:
# Sanity check: flag descriptions that produce no content tokens.
#
# With special tokens enabled the tokenizer always emits [CLS] and [SEP], so an
# attention mask can never sum to zero, even for an empty string. Counting
# tokens with add_special_tokens=False makes the check able to fire.
# The tokenizer loaded in the earlier cell is reused instead of being reloaded.
empty_sequences = []

# `idx` is the row's position in the filtered frame, not its DataFrame index label.
for idx, description in enumerate(movies_df['description'].tolist()):
    content_ids = tokenizer(
        description,
        add_special_tokens=False,
        truncation=True,
        max_length=128,
    )['input_ids']

    if len(content_ids) == 0:
        print(f"⚠️ Empty sequence found at index {idx}: {description}")
        empty_sequences.append(idx)

print(f"\n✅ Total empty sequences found: {len(empty_sequences)}")


✅ Total empty sequences found: 0


In [7]:
# Each `genre` cell is a comma-separated list; gather the distinct, trimmed names.
all_genres = {
    name.strip()
    for genre_cell in movies_df['genre']
    for name in genre_cell.split(',')
}

# Sorting first makes the genre -> label-column assignment deterministic.
genre_to_index = dict(
    (name, position) for position, name in enumerate(sorted(all_genres))
)

print("Genre to index mapping:", genre_to_index)

Genre to index mapping: {'Action': 0, 'Adventure': 1, 'Animation': 2, 'Comedy': 3, 'Crime': 4, 'Drama': 5, 'Family': 6, 'Fantasy': 7, 'History': 8, 'Horror': 9, 'Music': 10, 'Mystery': 11, 'Romance': 12, 'Science Fiction': 13, 'TV Movie': 14, 'Thriller': 15, 'War': 16, 'Western': 17}


In [8]:
class MovieDescriptionDataset(Dataset):
    """Map-style dataset pairing tokenized descriptions with multi-hot genre labels.

    Args:
        dataframe: Table with a ``description`` column and a ``genre`` column
            holding comma-separated genre names.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` whose
            result exposes ``input_ids`` and ``attention_mask`` with a leading
            batch dimension of 1.
        genre_to_index: Maps a genre name to its column in the label vector.
        max_length: Length every description is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, genre_to_index: Dict[str, int], max_length: int = 128):
        self.tokenizer = tokenizer
        self.genre_to_index = genre_to_index
        self.max_length = max_length
        # Plain Python lists, so items are addressed by position.
        self.descriptions = dataframe['description'].tolist()
        self.genres = dataframe['genre'].tolist()

    def __len__(self) -> int:
        """Number of (description, genres) rows."""
        return len(self.descriptions)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        encoded = self.tokenizer(
            self.descriptions[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Multi-hot target: one slot per known genre; unknown names are ignored.
        label = torch.zeros(len(self.genre_to_index))
        trimmed_names = (part.strip() for part in self.genres[idx].split(','))
        for name in trimmed_names:
            column = self.genre_to_index.get(name)
            if column is not None:
                label[column] = 1

        # Strip the batch dimension the tokenizer adds for return_tensors='pt'.
        item = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        item['labels'] = label
        return item

In [9]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(
    movies_df,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Wrap each split with the same tokenizer, genre mapping and sequence length.
train_dataset, val_dataset = (
    MovieDescriptionDataset(split_df, tokenizer, genre_to_index, max_length=128)
    for split_df in (train_df, val_df)
)

# Number of training examples.
print(len(train_dataset))

7533


In [11]:
# Peek at the first training item to verify the tensors the dataset produces.
sample = train_dataset[0]

for caption, field in (
    ("Input IDs:", 'input_ids'),
    ("Attention Mask:", 'attention_mask'),
    ("Labels:", 'labels'),
):
    print(caption, sample[field])

Input IDs: tensor([  101,  1999,  1996,  3865,  1010,  1037,  7101,  2003,  4704,  2011,
         2010,  2316,  2074,  2077,  2027,  2468,  2600, 18795,  2015,  1012,
         3174,  2086,  2101,  1010,  1996,  7101,  5927,  2010,  2117,  3382,
         2012,  2732,  9527, 13368,  2043,  2002,  2003,  2356,  2000,  4685,
         2007,  2010,  9454,  7833,  1005,  1055,  2152,  2082,  2600,  2316,
         1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,    

In [12]:
# Mini-batches of 8 for both splits; only the training data is reshuffled each epoch.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)

In [13]:
class SelfAttentionPooling(nn.Module):
    """Pool a sequence of embeddings into one vector with a learned attention query.

    Each position is scored by its dot product with a single learned vector,
    padded positions are excluded, and the scores are softmax-normalized into
    weights for a weighted sum over the sequence.

    Args:
        emb_dim: Size of the last dimension of the input embeddings.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.attention_vector = nn.Parameter(torch.randn(emb_dim))

    def forward(self, x, attention_mask):
        """Return the attention-weighted sum of ``x`` over the sequence dimension.

        Args:
            x: Embeddings of shape [batch_size, seq_length, emb_dim].
            attention_mask: [batch_size, seq_length]; nonzero marks real tokens,
                zero marks padding.

        Returns:
            Tensor of shape [batch_size, emb_dim]. A row whose mask is entirely
            zero gets all-zero weights rather than NaN weights.
        """
        padding = ~attention_mask.bool()

        scores = torch.matmul(x, self.attention_vector)  # [batch_size, seq_length]
        scores = scores.masked_fill(padding, float('-inf'))  # padding gets zero weight
        weights = torch.softmax(scores, dim=1)

        # A fully padded row is a softmax over all -inf, which yields NaN.
        # Re-zeroing the masked positions leaves normal rows unchanged (they are
        # already 0 there) and turns the all-NaN row into all zeros.
        weights = weights.masked_fill(padding, 0.0)

        pooled = (x * weights.unsqueeze(-1)).sum(dim=1)  # [batch_size, emb_dim]
        return pooled

In [14]:
class TextClassifier(nn.Module):
    """Transformer-encoder multi-label classifier over token IDs.

    Pipeline: token embedding + learned positional parameter -> TransformerEncoder
    -> SelfAttentionPooling -> two ReLU linear layers with a residual connection
    and LayerNorm -> linear layer producing one logit per class.

    In training mode, while ``mc_dropout_enabled`` is True, the output layer is
    applied to ``num_dropout_samples`` independently dropped-out copies of the
    features and the resulting logits are averaged. Otherwise it is applied once
    with no dropout.

    Args:
        vocab_size: Number of rows in the token embedding table.
        emb_dim: Embedding / transformer model width.
        hidden_dim: Width of the first classifier layer; the second has
            ``hidden_dim // 2`` units.
        num_classes: Number of output logits.
        num_heads: Attention heads per transformer layer.
        max_seq_length: Number of positions in the learned positional parameter.
        num_attention_layers: Number of stacked transformer encoder layers.
        feedforward_dim: Hidden width of each transformer feed-forward sublayer.
        num_dropout_samples: Dropout samples averaged in training mode.
    """

    def __init__(
        self,
        vocab_size: int,
        emb_dim: int,
        hidden_dim: int,
        num_classes: int,
        num_heads: int = 8,
        max_seq_length: int = 512,
        num_attention_layers: int = 4, 
        feedforward_dim: int = 256,
        num_dropout_samples: int = 8
    ) -> None:
        super().__init__()
        self.num_dropout_samples = num_dropout_samples
        self.mc_dropout_enabled = True 

        self.emb = nn.Embedding(vocab_size, emb_dim)
        # Learned positional term, initialized to zero and sliced to the input length.
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_seq_length, emb_dim))

        self.attention_pooling = SelfAttentionPooling(emb_dim)

        transformer_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim,
            nhead=num_heads,
            dim_feedforward=feedforward_dim,
            dropout=0.3,
            batch_first=True
        )

        self.transformer_encoder = nn.TransformerEncoder(
            transformer_layer,
            num_layers=num_attention_layers
        )

        self.dropout = nn.Dropout(0.5)

        self.fc1 = nn.Linear(emb_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, num_classes)

        self.layernorm_final = nn.LayerNorm(hidden_dim // 2)

        # The residual is added to the fc2 output (width hidden_dim // 2), so the
        # pooled vector (width emb_dim) must match that width. When the widths
        # already agree, Identity keeps the computation and the state_dict keys
        # unchanged; otherwise a linear projection aligns them instead of the
        # addition failing with a shape error.
        if emb_dim == hidden_dim // 2:
            self.residual_proj = nn.Identity()
        else:
            self.residual_proj = nn.Linear(emb_dim, hidden_dim // 2)

    def forward(self, x: torch.Tensor, attention_mask: torch.Tensor, debugging: bool = False) -> torch.Tensor:
        """Compute class logits for a batch of token-ID sequences.

        Args:
            x: Token IDs of shape [batch_size, seq_length].
            attention_mask: [batch_size, seq_length]; nonzero marks real tokens.
                Its inverse is passed to the encoder as the key-padding mask.
            debugging: When True, print intermediate tensor shapes.

        Returns:
            Logits of shape [batch_size, num_classes] (no sigmoid applied).
        """
        if debugging:
            # Printed before the embedding lookup, so this is the raw ID tensor.
            print(f"Input IDs shape (before embedding): {x.shape}")

        x = self.emb(x) + self.positional_encoding[:, :x.size(1), :]
        x = self.transformer_encoder(x, src_key_padding_mask=~attention_mask.bool())

        if debugging:
            print(f"Output shape after transformer: {x.shape}")

        x = self.attention_pooling(x, attention_mask)

        if debugging:
            print(f"Output shape after attention pooling: {x.shape}")

        residual = self.residual_proj(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.layernorm_final(x + residual)

        if debugging:
            print(f"Shape before final classifier: {x.shape}")

        if self.training and self.mc_dropout_enabled:
            # Multi-sample dropout: average logits over several dropout masks.
            logits_list = []
            for _ in range(self.num_dropout_samples):
                dropped = self.dropout(x)
                logits = self.fc3(dropped)
                logits_list.append(logits)

            if debugging:
                print(f"Shape of each logits before stacking: {logits.shape}")

            logits = torch.stack(logits_list, dim=0).mean(dim=0)

            if debugging:
                print(f"Final logits shape after averaging dropout samples: {logits.shape}")
        else:
            logits = self.fc3(x)
            if debugging:
                print(f"Final logits shape (single pass): {logits.shape}")

        return logits

In [15]:
class FocalLoss(nn.Module):
    """Binary focal loss on logits for multi-label targets.

    The element-wise BCE-with-logits loss is scaled by ``(1 - p_t) ** gamma``,
    where ``p_t`` is the predicted probability of the true label, so that
    well-classified elements contribute less.

    Args:
        gamma: Focusing exponent applied to the modulating factor.
        pos_weight: Optional per-class weight for positive targets, forwarded to
            ``F.binary_cross_entropy_with_logits``.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced element-wise loss.
    """

    def __init__(self, gamma: float = 2.0, pos_weight: Optional[torch.Tensor] = None, reduction: str = 'mean') -> None:
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.reduction = reduction
        # Registered as a buffer so `.to(device)` / `.cuda()` on the loss module
        # moves it; non-persistent so the module's state_dict stays empty.
        self.register_buffer('pos_weight', pos_weight, persistent=False)

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        """Return the focal loss between ``logits`` and same-shaped 0/1 ``targets``."""
        pos_weight = self.pos_weight
        if pos_weight is not None and pos_weight.device != logits.device:
            # Covers a weight created on CPU while the model runs on another device.
            pos_weight = pos_weight.to(logits.device)

        bce_loss = F.binary_cross_entropy_with_logits(logits, targets, pos_weight=pos_weight, reduction='none')

        probs = torch.sigmoid(logits)
        probs = torch.clamp(probs, min=1e-6, max=1 - 1e-6)

        # Modulating factor: (1 - p) where the target is 1, p everywhere else.
        focal_weight = torch.where(targets == 1, 1 - probs, probs) ** self.gamma
        loss = focal_weight * bce_loss

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss

In [16]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [17]:
# --- Model hyperparameters ---
# Size the embedding table from the tokenizer instead of hard-coding it, so the
# two cannot drift apart if the tokenizer changes (30522 for 'bert-base-uncased').
vocab_size = len(tokenizer)
emb_dim = 128
hidden_dim = 256
num_heads = 8

num_classes = len(genre_to_index)  # one logit per genre
max_seq_length = 512               # positions available in the learned positional parameter
num_attention_layers = 4
feedforward_dim = 256
num_dropout_samples = 5

# The DataLoaders are constructed with batch_size=8; this value previously said
# 16 and was not used by them. Kept in sync with the actual batch size.
batch_size = 8

In [18]:
# Collect the constructor arguments from the hyperparameter cell, build the
# classifier, then move its parameters to the selected device.
model_config = dict(
    vocab_size=vocab_size,
    emb_dim=emb_dim,
    hidden_dim=hidden_dim,
    num_classes=num_classes,
    num_heads=num_heads,
    max_seq_length=max_seq_length,
    num_attention_layers=num_attention_layers,
    feedforward_dim=feedforward_dim,
    num_dropout_samples=num_dropout_samples,
)
model = TextClassifier(**model_config)
model = model.to(device)

In [19]:
# Focal loss with its default settings, optimized with AdamW and a small
# decoupled weight decay.
criterion = FocalLoss()
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=1e-4,
    weight_decay=1e-5,
)

In [20]:
num_epochs = 10
threshold = 0.4  # sigmoid probability at or above which a genre counts as predicted


def _micro_scores(y_true, y_pred):
    """Return (micro precision, micro recall, micro F1, subset accuracy)."""
    shared = dict(average='micro', zero_division=0)
    return (
        precision_score(y_true, y_pred, **shared),
        recall_score(y_true, y_pred, **shared),
        f1_score(y_true, y_pred, **shared),
        accuracy_score(y_true, y_pred),
    )


for epoch in range(num_epochs):
    print(f"\nEpoch [{epoch + 1}/{num_epochs}]")

    # ---- Training pass: update the weights and collect thresholded predictions ----
    model.train()
    total_loss = 0
    all_train_labels, all_train_preds = [], []

    loop = tqdm(train_dataloader, leave=True)

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).float()

        # The mask lets the model ignore padded positions.
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        # Cap the global gradient norm at 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_description(f"Epoch [{epoch+1}/{num_epochs}]")
        loop.set_postfix(loss=batch_loss)

        probs = torch.sigmoid(outputs).detach().cpu().numpy()
        preds = (probs >= threshold).astype(int)

        all_train_labels.extend(labels.cpu().numpy())
        all_train_preds.extend(preds)

    avg_train_loss = total_loss / len(train_dataloader)
    train_precision, train_recall, train_f1, train_subset_acc = _micro_scores(
        all_train_labels, all_train_preds
    )

    # ---- Validation pass: same metrics, no gradient tracking, eval mode ----
    model.eval()
    val_loss = 0
    all_val_labels, all_val_preds = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).float()

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            probs = torch.sigmoid(outputs).cpu().numpy()
            preds = (probs >= threshold).astype(int)

            all_val_labels.extend(labels.cpu().numpy())
            all_val_preds.extend(preds)

    avg_val_loss = val_loss / len(val_dataloader)
    val_precision, val_recall, val_f1, val_subset_acc = _micro_scores(
        all_val_labels, all_val_preds
    )

    # ---- Per-epoch report ----
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
    print(f"Train Precision: {train_precision:.4f} | Val Precision: {val_precision:.4f}")
    print(f"Train Recall:    {train_recall:.4f}    | Val Recall:    {val_recall:.4f}")
    print(f"Train F1 Score:  {train_f1:.4f}        | Val F1 Score:  {val_f1:.4f}")
    print(f"Train Subset Acc: {train_subset_acc:.4f} | Val Subset Acc: {val_subset_acc:.4f}")



Epoch [1/10]


Epoch [1/10]: 100%|████████████████| 942/942 [02:36<00:00,  6.02it/s, loss=0.12]
  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)



Epoch 1 Summary:
Train Loss: 0.1076 | Val Loss: 0.0960
Train Precision: 0.2847 | Val Precision: 0.3621
Train Recall:    0.4156    | Val Recall:    0.4621
Train F1 Score:  0.3379        | Val F1 Score:  0.4060
Train Subset Acc: 0.0049 | Val Subset Acc: 0.0011

Epoch [2/10]


Epoch [2/10]: 100%|██████████████| 942/942 [02:37<00:00,  5.97it/s, loss=0.0895]



Epoch 2 Summary:
Train Loss: 0.1010 | Val Loss: 0.0953
Train Precision: 0.3152 | Val Precision: 0.4066
Train Recall:    0.4153    | Val Recall:    0.3594
Train F1 Score:  0.3584        | Val F1 Score:  0.3816
Train Subset Acc: 0.0077 | Val Subset Acc: 0.0318

Epoch [3/10]


Epoch [3/10]: 100%|███████████████| 942/942 [02:26<00:00,  6.42it/s, loss=0.088]



Epoch 3 Summary:
Train Loss: 0.0978 | Val Loss: 0.0924
Train Precision: 0.3468 | Val Precision: 0.3816
Train Recall:    0.4517    | Val Recall:    0.4961
Train F1 Score:  0.3923        | Val F1 Score:  0.4314
Train Subset Acc: 0.0181 | Val Subset Acc: 0.0249

Epoch [4/10]


Epoch [4/10]: 100%|███████████████| 942/942 [02:36<00:00,  6.01it/s, loss=0.116]



Epoch 4 Summary:
Train Loss: 0.0944 | Val Loss: 0.0905
Train Precision: 0.3692 | Val Precision: 0.3910
Train Recall:    0.4922    | Val Recall:    0.5169
Train F1 Score:  0.4219        | Val F1 Score:  0.4452
Train Subset Acc: 0.0259 | Val Subset Acc: 0.0292

Epoch [5/10]


Epoch [5/10]: 100%|██████████████| 942/942 [02:37<00:00,  5.99it/s, loss=0.0762]



Epoch 5 Summary:
Train Loss: 0.0912 | Val Loss: 0.0885
Train Precision: 0.3883 | Val Precision: 0.4045
Train Recall:    0.5248    | Val Recall:    0.5413
Train F1 Score:  0.4464        | Val F1 Score:  0.4630
Train Subset Acc: 0.0281 | Val Subset Acc: 0.0287

Epoch [6/10]


Epoch [6/10]: 100%|██████████████| 942/942 [02:42<00:00,  5.81it/s, loss=0.0956]



Epoch 6 Summary:
Train Loss: 0.0884 | Val Loss: 0.0872
Train Precision: 0.4100 | Val Precision: 0.4195
Train Recall:    0.5575    | Val Recall:    0.5603
Train F1 Score:  0.4725        | Val F1 Score:  0.4798
Train Subset Acc: 0.0321 | Val Subset Acc: 0.0318

Epoch [7/10]


Epoch [7/10]: 100%|██████████████| 942/942 [02:46<00:00,  5.64it/s, loss=0.0757]



Epoch 7 Summary:
Train Loss: 0.0855 | Val Loss: 0.0853
Train Precision: 0.4239 | Val Precision: 0.4103
Train Recall:    0.5921    | Val Recall:    0.6068
Train F1 Score:  0.4941        | Val F1 Score:  0.4895
Train Subset Acc: 0.0389 | Val Subset Acc: 0.0303

Epoch [8/10]


Epoch [8/10]: 100%|███████████████| 942/942 [02:43<00:00,  5.75it/s, loss=0.082]



Epoch 8 Summary:
Train Loss: 0.0827 | Val Loss: 0.0842
Train Precision: 0.4392 | Val Precision: 0.4110
Train Recall:    0.6238    | Val Recall:    0.6334
Train F1 Score:  0.5155        | Val F1 Score:  0.4985
Train Subset Acc: 0.0393 | Val Subset Acc: 0.0287

Epoch [9/10]


Epoch [9/10]: 100%|██████████████| 942/942 [02:42<00:00,  5.78it/s, loss=0.0796]



Epoch 9 Summary:
Train Loss: 0.0800 | Val Loss: 0.0826
Train Precision: 0.4531 | Val Precision: 0.4312
Train Recall:    0.6479    | Val Recall:    0.6308
Train F1 Score:  0.5333        | Val F1 Score:  0.5122
Train Subset Acc: 0.0433 | Val Subset Acc: 0.0387

Epoch [10/10]


Epoch [10/10]: 100%|█████████████| 942/942 [02:51<00:00,  5.50it/s, loss=0.0669]



Epoch 10 Summary:
Train Loss: 0.0776 | Val Loss: 0.0814
Train Precision: 0.4633 | Val Precision: 0.4501
Train Recall:    0.6663    | Val Recall:    0.6258
Train F1 Score:  0.5466        | Val F1 Score:  0.5236
Train Subset Acc: 0.0489 | Val Subset Acc: 0.0430


In [23]:
# Persist only the model's state dict (not the whole module object).
trained_state = model.state_dict()
torch.save(trained_state, "movie_genre_model.pth")

In [27]:
def predict_genres(description, model, tokenizer, genre_to_index, threshold=0.5, device=None, max_length=128):
    """Predict the genres of a single movie description.

    Args:
        description: Text to classify.
        model: Classifier called as ``model(input_ids, attention_mask)`` and
            returning logits of shape [1, num_classes]. It is moved to
            ``device`` and switched to eval mode as a side effect.
        tokenizer: Callable returning ``input_ids`` and ``attention_mask``
            tensors when given ``return_tensors='pt'``.
        genre_to_index: Maps a genre name to its logit column.
        threshold: Minimum sigmoid probability for a genre to be returned.
        device: Target device; defaults to CUDA when available, else CPU.
        max_length: Length the description is padded or truncated to. The
            default of 128 matches the length used to build the training
            datasets; the previous hard-coded 512 let long inputs reach
            positional parameters that were never used in training and padded
            every call to four times the training length.

    Returns:
        List of genre names whose probability is >= ``threshold``, ordered by
        logit column.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model.to(device)
    model.eval()

    index_to_genre = {v: k for k, v in genre_to_index.items()}

    # Tokenize the same way the training dataset does.
    encoding = tokenizer(
        description,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        probs = torch.sigmoid(outputs).cpu().numpy()

    # Single-item batch: row 0 holds the per-genre decisions.
    predicted_labels = (probs >= threshold).astype(int)
    predicted_genres = [index_to_genre[i] for i, label in enumerate(predicted_labels[0]) if label == 1]

    return predicted_genres

In [28]:
# Rebuild the architecture with the same hyperparameters used for training, then
# restore the saved weights into it.
model_test = TextClassifier(
    vocab_size=vocab_size,
    emb_dim=emb_dim,
    hidden_dim=hidden_dim,
    num_classes=num_classes,
    num_heads=num_heads,
    max_seq_length=max_seq_length,
    num_attention_layers=num_attention_layers,
    feedforward_dim=feedforward_dim,
    num_dropout_samples=num_dropout_samples
).to(device)

# map_location remaps the stored tensors onto `device`, so a checkpoint written
# on a GPU machine also loads on a CPU-only one.
model_test.load_state_dict(torch.load("movie_genre_model.pth", map_location=device))
model_test.eval()

TextClassifier(
  (emb): Embedding(30522, 128)
  (attention_pooling): SelfAttentionPooling()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=256, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=256, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=128, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_feat

In [29]:
# Hand-written synopsis used as the input text for the prediction demo.
# Adjacent string literals are concatenated into a single paragraph.
description = (
    "A relentless high-speed chase through shadowy, abandoned streets catapults the protagonist "
    "into a nightmarish world where unspeakable horrors await at every turn. Pursued not only by "
    "ruthless mercenaries but also by terrifying supernatural forces, each moment is a desperate "
    "fight for survival. As the line between reality and nightmare blurs, the hero must navigate "
    "crumbling buildings, escape grotesque monsters lurking in the darkness, and confront "
    "blood-soaked secrets that threaten to consume them. The pulse-pounding action is matched "
    "only by the creeping dread that no place is safe and no one can be trusted in this brutal "
    "race against time and terror."
)

In [30]:
# Use the same decision threshold as the train/validation metrics above;
# predict_genres would otherwise fall back to its 0.5 default.
predictions = predict_genres(
    description,
    model_test,
    tokenizer,
    genre_to_index,
    threshold=threshold,
    device='cpu',
)
print(predictions)

['Action', 'Science Fiction']
