# PyTorch Model for Movie Genre Classification

This notebook builds a PyTorch `Dataset` and the train/validation/test `DataLoader`s for multi-label movie genre classification, then defines a feedforward classifier and a training loop on top of them.

## Features:
- PyTorch Dataset class for movie genre data
- DataLoader setup with batching and shuffling
- Integration with existing preprocessing pipeline
- Ready for feedforward neural networks or GRU/LSTM models

In [None]:
# Import required libraries
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root))

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, hamming_loss, jaccard_score

# Project imports
from descriptions.config import INTERIM_DATA_DIR, MODELS_DIR
from descriptions.dataset import load_interim
from descriptions.modeling.train import prepare_features_and_labels

# Pick the compute device: CUDA when a GPU is visible, otherwise CPU
_cuda_ready = torch.cuda.is_available()
device = torch.device("cuda") if _cuda_ready else torch.device("cpu")
print(f"Using device: {device}")

# Seed every RNG the notebook relies on so repeated runs match
for _seed_fn in (torch.manual_seed, np.random.seed):
    _seed_fn(42)
if _cuda_ready:
    torch.cuda.manual_seed(42)

print("✓ All imports successful")

[32m2025-12-14 20:35:08.688[0m | [1mINFO    [0m | [36mdescriptions.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/christianfullerton/Developer/Python Workspace/movie_genre_model[0m


Using device: cpu
✓ All imports successful


## 1. PyTorch Dataset Class

In [None]:
class MovieGenreDataset(Dataset):
    """
    PyTorch Dataset for movie genre classification.

    Stores a feature matrix and a multi-label target matrix as float32
    tensors and serves them one row at a time.
    """

    def __init__(self, features: np.ndarray, labels: np.ndarray):
        """
        Initialize dataset.

        Args:
            features: Feature matrix (n_samples, n_features) - TF-IDF features.
                Besides ``np.ndarray``, anything ``np.asarray`` can convert
                (nested lists, DataFrames) is accepted, and objects exposing
                ``toarray()`` (e.g. SciPy sparse matrices) are densified first.
            labels: Binary label matrix (n_samples, n_labels) - genre labels.
                Accepts the same input kinds as ``features``.

        Raises:
            AssertionError: If features and labels have different row counts.
        """
        self.features = self._as_float_tensor(features)
        self.labels = self._as_float_tensor(labels)

        # Kept as an assert so the failure type seen by existing callers is unchanged.
        assert len(self.features) == len(self.labels), \
            f"Features and labels must have same length. Got {len(self.features)} and {len(self.labels)}"

    @staticmethod
    def _as_float_tensor(array_like) -> torch.Tensor:
        """Convert a dense array-like or ``toarray()``-capable object to a float32 tensor."""
        if hasattr(array_like, "toarray"):
            array_like = array_like.toarray()
        # torch.tensor always copies, so later edits to the caller's array
        # cannot leak into the dataset.
        return torch.tensor(np.asarray(array_like, dtype=np.float32))

    def __len__(self) -> int:
        """Return the number of samples in the dataset."""
        return len(self.features)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Get a single sample from the dataset.

        Args:
            idx: Index of the sample

        Returns:
            Tuple of (features, labels) as PyTorch tensors
        """
        return self.features[idx], self.labels[idx]

    def get_feature_dim(self) -> int:
        """Get the number of features (size of the second feature axis)."""
        return self.features.shape[1]

    def get_num_labels(self) -> int:
        """Get the number of genre labels (size of the second label axis)."""
        return self.labels.shape[1]


# Confirm the class definition ran and summarise its input/output contract
for _line in (
    "✓ MovieGenreDataset class defined",
    "  - Input: features (n_samples, n_features), labels (n_samples, n_labels)",
    "  - Output: PyTorch FloatTensors",
):
    print(_line)

✓ MovieGenreDataset class defined
  - Input: features (n_samples, n_features), labels (n_samples, n_labels)
  - Output: PyTorch FloatTensors


## 2. Load and Prepare Data

In [None]:
# Split settings, defined up front so they are easy to tune
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Read the cleaned interim dataset
print("Loading data...")
data = load_interim(INTERIM_DATA_DIR / "cleaned_movies.csv")
print(f"✓ Loaded {len(data)} samples")

# Hold out the test rows BEFORE fitting any preprocessing (prevents data leakage)
print(f"\nSplitting data (test_size={TEST_SIZE}, random_state={RANDOM_STATE})...")
data_train, data_test = train_test_split(
    data,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True,
)
print(f"✓ Train: {len(data_train)} samples, Test: {len(data_test)} samples")

# Fit the existing preprocessing pipeline on the training rows only...
print("\nPreparing features and labels...")
_fitted = prepare_features_and_labels(
    data_train,
    vectorizer=None,
    mlb=None,
    feature_selector=None,
    k_features=6000,  # Use best config from grid search
)
X_train_df, y_train, vectorizer, mlb, feature_selector = _fitted

# ...then reuse the fitted transformers unchanged on the held-out rows
_transformed = prepare_features_and_labels(
    data_test,
    vectorizer=vectorizer,
    mlb=mlb,
    feature_selector=feature_selector,
)
X_test_df, y_test = _transformed[0], _transformed[1]

# Unwrap DataFrames to plain numpy arrays; anything else passes through as-is
if isinstance(X_train_df, pd.DataFrame):
    X_train = X_train_df.values
else:
    X_train = X_train_df
if isinstance(X_test_df, pd.DataFrame):
    X_test = X_test_df.values
else:
    X_test = X_test_df

print(f"\n✓ Data prepared:")
print(f"  Training: {X_train.shape[0]} samples, {X_train.shape[1]} features, {y_train.shape[1]} labels")
print(f"  Test: {X_test.shape[0]} samples, {X_test.shape[1]} features, {y_test.shape[1]} labels")
print(f"  Feature type: {X_train.dtype}, Label type: {y_train.dtype}")

Loading data...
[32m2025-12-14 20:35:10.646[0m | [1mINFO    [0m | [36mdescriptions.dataset[0m:[36mload_interim[0m:[36m99[0m - [1mLoading interim data from /Users/christianfullerton/Developer/Python Workspace/movie_genre_model/data/interim/cleaned_movies.csv...[0m
[32m2025-12-14 20:35:10.739[0m | [34m[1mDEBUG   [0m | [36mdescriptions.dataset[0m:[36mload_interim[0m:[36m103[0m - [34m[1mLoaded with index column[0m
[32m2025-12-14 20:35:10.740[0m | [32m[1mSUCCESS [0m | [36mdescriptions.dataset[0m:[36mload_interim[0m:[36m108[0m - [32m[1m✓ Data loaded successfully: 9087 rows, 2 columns[0m
✓ Loaded 9087 samples

Splitting data (test_size=0.2, random_state=42)...
✓ Train: 7269 samples, Test: 1818 samples

Preparing features and labels...
[32m2025-12-14 20:35:10.745[0m | [1mINFO    [0m | [36mdescriptions.modeling.train[0m:[36mprepare_features_and_labels[0m:[36m127[0m - [1mGenerating TF-IDF features from descriptions...[0m
[32m2025-12-14 20:35:1

## 3. Create PyTorch Datasets

In [None]:
# Fraction of the training rows set aside for validation
VAL_SIZE = 0.2

# Wrap the full training matrix (used for dimension lookups and the sample check)
train_dataset = MovieGenreDataset(X_train, y_train)
print(f"✓ Training dataset created: {len(train_dataset)} samples")
print(f"  Feature dimension: {train_dataset.get_feature_dim()}")
print(f"  Number of labels: {train_dataset.get_num_labels()}")

# Wrap the held-out test matrix
test_dataset = MovieGenreDataset(X_test, y_test)
print(f"\n✓ Test dataset created: {len(test_dataset)} samples")

# Carve the validation rows out of the training rows
_split = train_test_split(
    X_train,
    y_train,
    test_size=VAL_SIZE,
    random_state=RANDOM_STATE,
)
X_train_final, X_val, y_train_final, y_val = _split

train_dataset_final = MovieGenreDataset(X_train_final, y_train_final)
val_dataset = MovieGenreDataset(X_val, y_val)

print(f"\n✓ Validation split created:")
print(f"  Final training: {len(train_dataset_final)} samples")
print(f"  Validation: {len(val_dataset)} samples")

# Index one row to confirm per-sample shapes and dtypes
sample_features, sample_labels = train_dataset[0]
print(f"\n✓ Sample data shape:")
print(f"  Features: {sample_features.shape}")
print(f"  Labels: {sample_labels.shape}")
print(f"  Feature dtype: {sample_features.dtype}, Label dtype: {sample_labels.dtype}")

✓ Training dataset created: 7253 samples
  Feature dimension: 6000
  Number of labels: 14

✓ Test dataset created: 1807 samples

✓ Validation split created:
  Final training: 5802 samples
  Validation: 1451 samples

✓ Sample data shape:
  Features: torch.Size([6000])
  Labels: torch.Size([14])
  Feature dtype: torch.float32, Label dtype: torch.float32


## 4. Create DataLoaders

In [None]:
# DataLoader parameters
BATCH_SIZE = 32
NUM_WORKERS = 0  # Set to 0 for Windows/Mac, can use 2-4 on Linux

# Arguments shared by all three loaders; pinned memory is requested only
# when CUDA is available (faster GPU transfer)
_shared_loader_args = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=torch.cuda.is_available(),
)

# Only the training loader shuffles; validation and test keep a fixed order
train_loader = DataLoader(train_dataset_final, shuffle=True, **_shared_loader_args)
val_loader = DataLoader(val_dataset, shuffle=False, **_shared_loader_args)
test_loader = DataLoader(test_dataset, shuffle=False, **_shared_loader_args)

print("✓ DataLoaders created:")
print(f"  Training batches: {len(train_loader)} (batch_size={BATCH_SIZE})")
print(f"  Validation batches: {len(val_loader)}")
print(f"  Test batches: {len(test_loader)}")

# Pull a single training batch to confirm batched shapes
_first_batch = next(iter(train_loader))
sample_batch_features, sample_batch_labels = _first_batch
print(f"\n✓ Sample batch:")
print(f"  Features shape: {sample_batch_features.shape} (batch_size, n_features)")
print(f"  Labels shape: {sample_batch_labels.shape} (batch_size, n_labels)")
print(f"  On device: {sample_batch_features.device}")

✓ DataLoaders created:
  Training batches: 182 (batch_size=32)
  Validation batches: 46
  Test batches: 57

✓ Sample batch:
  Features shape: torch.Size([32, 6000]) (batch_size, n_features)
  Labels shape: torch.Size([32, 14]) (batch_size, n_labels)
  On device: cpu


## 5. Example: Simple Feedforward Model

Ready-to-use model architecture for multi-label classification.

In [None]:
class GenreClassifier(nn.Module):
    """
    Multi-label genre classifier using feedforward neural network.

    Architecture:
    - Input: TF-IDF features
    - Hidden layers: Fully connected with ReLU and Dropout
    - Output: Logits for each genre (sigmoid applied in loss function)

    Note: BatchNorm removed to prevent kernel crashes. Dropout provides sufficient regularization.
    """

    def __init__(
        self,
        input_size: int,
        num_labels: int,
        hidden_sizes: "list[int] | tuple[int, ...]" = (512, 256, 128),
        dropout_rate: float = 0.3,
    ):
        """
        Build the layer stack and initialize its weights.

        Args:
            input_size: Number of input features per sample.
            num_labels: Number of output logits (one per genre).
            hidden_sizes: Width of each hidden layer, in order. An empty
                sequence yields a single linear layer. The default is an
                immutable tuple so it cannot be shared and mutated across
                instances.
            dropout_rate: Dropout probability applied after every hidden ReLU.
        """
        super().__init__()

        layers = []
        prev_size = input_size

        # Each hidden stage is Linear -> ReLU -> Dropout
        for hidden_size in hidden_sizes:
            layers.extend(
                [
                    nn.Linear(prev_size, hidden_size),
                    nn.ReLU(),
                    nn.Dropout(dropout_rate),
                ]
            )
            prev_size = hidden_size

        # Output layer (no activation - BCEWithLogitsLoss handles sigmoid)
        layers.append(nn.Linear(prev_size, num_labels))

        self.network = nn.Sequential(*layers)

        # Initialize weights
        self._initialize_weights()

    def _initialize_weights(self):
        """Initialize every Linear layer: Xavier-uniform weights, zero biases."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass.

        Args:
            x: Input features (batch_size, input_size)

        Returns:
            Logits for each label (batch_size, num_labels)
        """
        return self.network(x)


# Size the network from the dataset itself
input_size = train_dataset.get_feature_dim()
num_labels = train_dataset.get_num_labels()

# Build the classifier, then move its parameters to the selected device
model = GenreClassifier(
    input_size,
    num_labels,
    hidden_sizes=[512, 256, 128],
    dropout_rate=0.3,
)
model = model.to(device)

print(f"✓ Model created:")
print(f"  Architecture: {input_size} -> [512, 256, 128] -> {num_labels}")
print(f"  Total parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"  Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
print(f"  Model on device: {next(model.parameters()).device}")

# Smoke-test the forward pass with dropout disabled and autograd off
model.eval()
with torch.no_grad():
    test_output = model(sample_batch_features.to(device))
print(f"\n✓ Forward pass test:")
print(f"  Input shape: {sample_batch_features.shape}")
print(f"  Output shape: {test_output.shape}")
print(f"  Output range: [{test_output.min().item():.2f}, {test_output.max().item():.2f}]")
model.train()  # Set back to training mode

✓ Model created:
  Architecture: 6000 -> [512, 256, 128] -> 14
  Total parameters: 3,240,334
  Trainable parameters: 3,240,334
  Model on device: cpu

✓ Forward pass test:
  Input shape: torch.Size([32, 6000])
  Output shape: torch.Size([32, 14])
  Output range: [-4.48, 4.12]


: 

## 6. Training Setup (Ready to Use)

Loss function, optimizer, and training loop structure.

In [None]:
# Loss function - BCEWithLogitsLoss for multi-label classification
# This combines sigmoid + BCE loss for numerical stability
criterion = nn.BCEWithLogitsLoss()

# Optimizer
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001  # L2 regularization

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY
)

# Learning rate scheduler
# NOTE: the `verbose` argument is intentionally not passed - it is deprecated
# since PyTorch 2.2 (and may be rejected outright by newer releases). Read the
# current rate from optimizer.param_groups[0]['lr'] when it needs reporting.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',  # Minimize validation loss
    factor=0.5,  # Reduce LR by half
    patience=5,  # Wait 5 epochs before reducing
)

print("✓ Training setup complete:")
print(f"  Loss function: BCEWithLogitsLoss (multi-label binary cross-entropy)")
print(f"  Optimizer: Adam (lr={LEARNING_RATE}, weight_decay={WEIGHT_DECAY})")
print(f"  Scheduler: ReduceLROnPlateau (factor=0.5, patience=5)")

# Test loss calculation
# Run in eval mode so dropout does not make this sanity value vary between
# runs, then put the model back in whichever mode it was in.
_was_training = model.training
model.eval()
with torch.no_grad():
    test_output = model(sample_batch_features.to(device))
    test_loss = criterion(test_output, sample_batch_labels.to(device))
model.train(_was_training)
print(f"\n✓ Loss calculation test:")
print(f"  Sample batch loss: {test_loss.item():.4f}")

## 7. Training Loop Template

Ready-to-use training loop with validation and metrics calculation.

In [None]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader`` and return the mean per-batch loss."""
    model.train()
    batch_losses = []

    for inputs, targets in train_loader:
        # Move the batch to wherever the model lives
        inputs, targets = inputs.to(device), targets.to(device)

        # Clear stale gradients, then forward + loss
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)

        # Backpropagate and apply the update
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(batch_losses)


def validate(model, val_loader, criterion, device, threshold=0.5):
    """
    Evaluate the model on a loader and compute multi-label metrics.

    Args:
        model: Module producing one logit per label.
        val_loader: Iterable of (features, labels) batches. It does not need
            to support ``len()``; batches are counted while iterating.
        criterion: Loss callable applied to (logits, labels).
        device: Device the batches are moved to before the forward pass.
        threshold: Probability cut-off at or above which a label is predicted.

    Returns:
        Dict with keys 'loss' (mean per-batch loss), 'f1', 'precision',
        'recall', 'jaccard' (all micro-averaged) and 'hamming_loss'.

    Raises:
        ValueError: If ``val_loader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch_features, batch_labels in val_loader:
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device)

            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            total_loss += loss.item()
            # Count batches here (as train_epoch does) instead of relying on
            # len(val_loader), which is unavailable for some iterables.
            num_batches += 1

            # Convert logits to probabilities and then to binary predictions
            probs = torch.sigmoid(outputs).cpu().numpy()
            preds = (probs >= threshold).astype(int)

            all_preds.append(preds)
            all_labels.append(batch_labels.cpu().numpy())

    # Fail with an explicit message rather than an opaque np.vstack error
    if num_batches == 0:
        raise ValueError("val_loader produced no batches; cannot compute validation metrics")

    # Concatenate all predictions
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)

    # Calculate metrics
    avg_loss = total_loss / num_batches
    f1 = f1_score(all_labels, all_preds, average='micro', zero_division=0)
    precision = precision_score(all_labels, all_preds, average='micro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='micro', zero_division=0)
    hamming = hamming_loss(all_labels, all_preds)
    jaccard = jaccard_score(all_labels, all_preds, average='micro', zero_division=0)

    metrics = {
        'loss': avg_loss,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'hamming_loss': hamming,
        'jaccard': jaccard
    }

    return metrics


# Training loop: early stopping on validation loss, best checkpoint restored at the end
EPOCHS = 50
PATIENCE = 10
best_val_loss = float('inf')
patience_counter = 0
history = {'train_loss': [], 'val_loss': [], 'val_f1': []}

# Where the best weights are checkpointed; create the folder up front so the
# first torch.save cannot fail on a missing directory.
# NOTE(review): assumes MODELS_DIR is a pathlib.Path - confirm in descriptions.config
best_model_path = MODELS_DIR / 'pytorch_best_model.pt'
best_model_path.parent.mkdir(parents=True, exist_ok=True)

print(f"Starting training for {EPOCHS} epochs...")
print("=" * 60)

for epoch in range(EPOCHS):
    # Training
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)

    # Validation
    val_metrics = validate(model, val_loader, criterion, device)

    # Learning rate scheduling
    scheduler.step(val_metrics['loss'])

    # Track history
    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_metrics['loss'])
    history['val_f1'].append(val_metrics['f1'])

    # Early stopping
    if val_metrics['loss'] < best_val_loss:
        best_val_loss = val_metrics['loss']
        patience_counter = 0
        # Save best model
        torch.save(model.state_dict(), best_model_path)
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Print progress
    if (epoch + 1) % 10 == 0 or epoch == 0:
        print(f"Epoch {epoch+1:3d}/{EPOCHS} | "
              f"Train Loss: {train_loss:.4f} | "
              f"Val Loss: {val_metrics['loss']:.4f} | "
              f"Val F1: {val_metrics['f1']:.4f} | "
              f"Val Precision: {val_metrics['precision']:.4f} | "
              f"Val Recall: {val_metrics['recall']:.4f}")

# Restore the best checkpoint however the loop ended, so `model` never keeps
# the (possibly worse) last-epoch weights. The best_val_loss guard skips the
# reload when this run saved nothing, avoiding a stale file from an earlier run.
if best_val_loss < float('inf') and best_model_path.exists():
    model.load_state_dict(torch.load(best_model_path, map_location=device))

print("=" * 60)
print("Training complete!")

print("✓ Training functions defined:")
print("  - train_epoch(): Train for one epoch")
print("  - validate(): Validate and calculate metrics")
print("  - Training loop: early stopping with best-checkpoint restore")

## 8. Quick Test: Single Epoch

Test the training setup with one epoch to ensure everything works.

In [None]:
# Smoke-test the pipeline: a single optimisation pass followed by validation
print("Testing training setup with one epoch...")
print("-" * 60)

# One pass over the training loader
train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
print(f"✓ Training epoch complete: Loss = {train_loss:.4f}")

# Score the updated model on the validation split at the default threshold
val_metrics = validate(model, val_loader, criterion, device, threshold=0.5)
print(f"\n✓ Validation complete:")
print(f"  Loss: {val_metrics['loss']:.4f}")
print(f"  F1: {val_metrics['f1']:.4f} ({val_metrics['f1']*100:.2f}%)")
print(f"  Precision: {val_metrics['precision']:.4f} ({val_metrics['precision']*100:.2f}%)")
print(f"  Recall: {val_metrics['recall']:.4f} ({val_metrics['recall']*100:.2f}%)")
print(f"  Hamming Loss: {val_metrics['hamming_loss']:.4f}")
print(f"  Jaccard: {val_metrics['jaccard']:.4f} ({val_metrics['jaccard']*100:.2f}%)")

# Closing banner and suggested follow-ups
_rule = "=" * 60
print("\n" + _rule)
print("✓ Dataset and DataLoader setup complete and tested!")
print(_rule)
for _msg in (
    "\nNext steps:",
    "1. Uncomment the training loop in cell 15 to train the model",
    "2. Adjust hyperparameters (hidden_sizes, dropout_rate, learning_rate)",
    "3. Experiment with different architectures (GRU, LSTM)",
    "4. Compare with sklearn LinearSVC performance",
):
    print(_msg)