# Model Testing and Comparison Notebook

This notebook provides a framework for testing and comparing different machine learning models for movie genre classification.

## Features:
- Easy model configuration and testing
- Automatic metric calculation and comparison
- Visualization of results
- Support for multiple model types (Logistic Regression, XGBoost, etc.)


In [1]:
# Import required libraries
import sys
from pathlib import Path
import warnings
# NOTE(review): this filter silences every warning category for the rest of the
# kernel session. That includes scikit-learn warnings (e.g. about failed or
# non-converged fits) that later cells could otherwise surface.
warnings.filterwarnings('ignore')

# Add project root to path
# The root is computed as the parent of the current working directory, so the
# `descriptions` imports below presumably only resolve when the kernel's cwd is
# the folder that holds this notebook — TODO confirm.
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score, precision_score, recall_score, 
    hamming_loss, jaccard_score, confusion_matrix
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import time
from typing import Dict, List, Tuple, Any
import json

# Try importing XGBoost
# XGBoost is optional: a missing package only flips XGBOOST_AVAILABLE to False
# and prints an install hint instead of aborting the cell.
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
    print("✓ XGBoost available")
except ImportError:
    XGBOOST_AVAILABLE = False
    print("⚠ XGBoost not available. Install with: pip install xgboost")

# Project imports
# These come from the local `descriptions` package made importable by the
# sys.path insertion above.
from descriptions.config import INTERIM_DATA_DIR, MODELS_DIR
from descriptions.dataset import load_interim
from descriptions.modeling.train import prepare_features_and_labels
from descriptions.modeling.preprocess import load_preprocessors

# Set style
# Plot styling plus pandas display options; `None` removes the column-count
# and width limits so wide frames are printed in full.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("✓ All imports successful")


[32m2025-12-06 22:55:49.607[0m | [1mINFO    [0m | [36mdescriptions.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/christianfullerton/Developer/Python Workspace/movie_genre_model[0m


✓ XGBoost available
✓ All imports successful


## 1. Load and Prepare Data


In [2]:
# Split configuration shared by every cell that needs the same train/test partition
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Read the cleaned interim dataset and show a quick summary of what was loaded
print("Loading data...")
data = load_interim(INTERIM_DATA_DIR / "cleaned_movies.csv")
print(f"✓ Loaded {len(data)} samples")
print(f"Columns: {list(data.columns)}")
print(f"\nFirst few rows:")
print(data.head())

# Raw inputs (description text) and raw targets (comma-separated genre strings)
X = data['description']
y = data['genre']

# Hold out the test set BEFORE fitting any preprocessing so that nothing learned
# from the test rows can leak into the vectorizer or label encoder
print("\nSplitting data...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
print("✓ Data split complete")


Loading data...
[32m2025-12-06 22:55:51.886[0m | [1mINFO    [0m | [36mdescriptions.dataset[0m:[36mload_interim[0m:[36m99[0m - [1mLoading interim data from /Users/christianfullerton/Developer/Python Workspace/movie_genre_model/data/interim/cleaned_movies.csv...[0m
[32m2025-12-06 22:55:52.081[0m | [34m[1mDEBUG   [0m | [36mdescriptions.dataset[0m:[36mload_interim[0m:[36m103[0m - [34m[1mLoaded with index column[0m
[32m2025-12-06 22:55:52.082[0m | [32m[1mSUCCESS [0m | [36mdescriptions.dataset[0m:[36mload_interim[0m:[36m108[0m - [32m[1m✓ Data loaded successfully: 9087 rows, 2 columns[0m
✓ Loaded 9087 samples
Columns: ['genre', 'description']

First few rows:
                                     genre  \
movie_name                                   
he_hawshank_edemption         Drama, Crime   
he_odfather                   Drama, Crime   
he_odfather_art_              Drama, Crime   
chindlers_ist          Drama, History, War   
12_ngry_en             

In [3]:
# Preprocess data: TF-IDF features and multi-label encoding
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert genre strings to lists of genre strings
def preprocess_genres(genre_series):
    """Parse comma-separated genre strings into sorted, de-duplicated lists.

    Args:
        genre_series: pandas Series whose values are genre strings such as
            "Drama, Crime". Missing values are treated as empty strings.

    Returns:
        A Series with the same index where each value is an alphabetically
        sorted list of the distinct, whitespace-trimmed genre names. Rows with
        no genre yield an empty list.
    """
    def _distinct_sorted(tokens):
        # Trim each token, drop blanks, and collapse duplicates before sorting
        names = set()
        for token in tokens:
            trimmed = token.strip()
            if trimmed:
                names.add(trimmed)
        return sorted(names)

    as_text = genre_series.fillna("").astype(str)
    tokens_per_row = as_text.str.split(r"\s*,\s*")
    return tokens_per_row.apply(_distinct_sorted)

# Re-derive the raw text/label split from `data` instead of consuming X_train /
# y_train directly. This cell rebinds X_train, X_test, y_train and y_test to their
# transformed versions, so reading them as inputs made the cell fail on a second
# run (the vectorizer would receive a sparse matrix instead of text). The split
# below uses the same data, TEST_SIZE and RANDOM_STATE as the loading cell, so it
# reproduces exactly the same partition and keeps the raw text available.
X_train_text, X_test_text, y_train_raw, y_test_raw = train_test_split(
    data['description'], data['genre'], test_size=TEST_SIZE, random_state=RANDOM_STATE
)

print("Preprocessing genres...")
y_train_list = preprocess_genres(y_train_raw)
y_test_list = preprocess_genres(y_test_raw)

print(f"Sample y_train: {y_train_list.iloc[0]}")
print(f"Sample y_test: {y_test_list.iloc[0]}")

# Transform text to TF-IDF features
# The vocabulary and IDF weights are fitted on the training text only; the test
# text is projected onto that fitted vocabulary.
print("\nTransforming text to TF-IDF features...")
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words='english'
)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

print(f"✓ TF-IDF features: {X_train.shape[1]} features")

# Transform genres to binary labels
# The label set is learned from the training genres only, then applied to test.
print("\nTransforming genres to binary labels...")
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train_list)
y_test = mlb.transform(y_test_list)

# Features and labels must stay row-aligned for the estimators fitted later
assert X_train.shape[0] == y_train.shape[0], "train features/labels are misaligned"
assert X_test.shape[0] == y_test.shape[0], "test features/labels are misaligned"

print(f"✓ Binary labels: {y_train.shape[1]} genres")
print(f"✓ Training labels shape: {y_train.shape}")
print(f"✓ Test labels shape: {y_test.shape}")



Preprocessing genres...
Sample y_train: ['Horror', 'Mystery']
Sample y_test: ['Adventure', 'Science Fiction', 'Western']

Transforming text to TF-IDF features...
✓ TF-IDF features: 10000 features

Transforming genres to binary labels...
✓ Binary labels: 18 genres
✓ Training labels shape: (7269, 18)
✓ Test labels shape: (1818, 18)


In [4]:
## 2. SGDClassifier Grid Search and Evaluation

# Import required modules
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer
from descriptions.modeling.evaluate import evaluate_model

# Create base model with OneVsRestClassifier for multi-label classification
# (one binary SGD classifier is trained per genre column).
#
# eta0 is set explicitly because the grid below includes learning_rate='adaptive'.
# That schedule needs a strictly positive initial learning rate, while the
# documented SGDClassifier default is eta0=0.0, which makes every 'adaptive'
# candidate fail to fit and score NaN. The 'optimal' schedule does not use eta0,
# so those candidates are unaffected.
model_sgd = OneVsRestClassifier(SGDClassifier(random_state=42, eta0=0.01))
param_grid = {
    'estimator__loss': ['hinge', 'log_loss', 'modified_huber'],
    'estimator__penalty': ['l2', 'elasticnet'],
    'estimator__alpha': [0.0001, 0.001, 0.01, 0.1],
    'estimator__learning_rate': ['optimal', 'adaptive'],
    'estimator__max_iter': [2000, 3000],
    'estimator__tol': [1e-3, 1e-4],
    'estimator__early_stopping': [True],
}

# Calculate total combinations
# The grid is a full Cartesian product, so the candidate count is the product of
# the number of options per hyper-parameter.
total_combinations = np.prod([len(v) for v in param_grid.values()])
print(f"Total parameter combinations: {total_combinations}")
print(f"With 5-fold CV: {total_combinations * 5} model fits")
# Rough estimate only: assumes ~0.5 s per fit and ignores parallelism
print(f"Estimated time: ~{total_combinations * 5 * 0.5 / 60:.1f} minutes")

# Scoring callable used by the grid searches: micro-averaged F1 over all labels
def multi_label_f1_micro(y_true, y_pred):
    """Return the micro-averaged F1 between true and predicted label matrices.

    Args:
        y_true: Ground-truth labels, in any format accepted by `f1_score`.
        y_pred: Predicted labels in the same format as `y_true`.

    Returns:
        The micro-averaged F1 score; undefined divisions are scored as 0.
    """
    micro_f1 = f1_score(y_true, y_pred, zero_division=0, average='micro')
    return micro_f1


# Wrapped so it can be passed as the `scoring` argument of GridSearchCV
scorer = make_scorer(multi_label_f1_micro)

# Exhaustive search over `param_grid`, scored with micro F1 under shuffled 5-fold CV
print("\nStarting Grid Search for SGDClassifier...")
print(f"Testing parameter combinations with 5-fold CV")
print("="*60)

# Fixed seed so the fold assignment is reproducible between runs
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
search_options = dict(
    estimator=model_sgd,
    param_grid=param_grid,
    cv=cv_splitter,
    scoring=scorer,
    n_jobs=-1,               # use every available core
    verbose=2,
    return_train_score=True,
)
grid_search_sgd = GridSearchCV(**search_options)
grid_search_sgd.fit(X_train, y_train)

# Summary of the winning configuration
print("\n" + "="*60)
print("Grid Search Complete!")
print("="*60)
print(f"Best parameters: {grid_search_sgd.best_params_}")
print(f"Best cross-validation score (F1 micro): {grid_search_sgd.best_score_:.4f}")
print("="*60)

# The search's best estimator is what gets evaluated below
best_model_sgd = grid_search_sgd.best_estimator_

# Score the selected model on the held-out test split
print("\nEvaluating Best SGDClassifier Model...")
metrics_sgd = evaluate_model(best_model_sgd, X_test, y_test)

# evaluate_model's result is indexed by metric name; each value is shown both as a
# fraction and as a percentage
print("\n" + "="*60)
print("BEST SGDCLASSIFIER METRICS (Test Set)")
print("="*60)
print(f"  F1 Score:       {metrics_sgd['f1']:.4f} ({metrics_sgd['f1']*100:.2f}%)")
print(f"  Precision:      {metrics_sgd['precision']:.4f} ({metrics_sgd['precision']*100:.2f}%)")
print(f"  Recall:         {metrics_sgd['recall']:.4f} ({metrics_sgd['recall']*100:.2f}%)")
print(f"  Hamming Loss:   {metrics_sgd['hamming_loss']:.4f} ({metrics_sgd['hamming_loss']*100:.2f}%)")
print(f"  Jaccard Score:  {metrics_sgd['jaccard']:.4f} ({metrics_sgd['jaccard']*100:.2f}%)")
print("="*60)

# Keep train and test metrics of this baseline for the later model comparison
train_metrics = evaluate_model(best_model_sgd, X_train, y_train)
test_metrics = metrics_sgd



Total parameter combinations: 192
With 5-fold CV: 960 model fits
Estimated time: ~8.0 minutes

Starting Grid Search for SGDClassifier...
Testing parameter combinations with 5-fold CV
Fitting 5 folds for each of 192 candidates, totalling 960 fits


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


[CV] END estimator__alpha=0.0001, estimator__early_stopping=True, estimator__learning_rate=optimal, estimator__loss=hinge, estimator__max_iter=2000, estimator__penalty=l2, estimator__tol=0.001; total time=   1.2s
[CV] END estimator__alpha=0.0001, estimator__early_stopping=True, estimator__learning_rate=optimal, estimator__loss=hinge, estimator__max_iter=2000, estimator__penalty=l2, estimator__tol=0.001; total time=   1.2s
[CV] END estimator__alpha=0.0001, estimator__early_stopping=True, estimator__learning_rate=optimal, estimator__loss=hinge, estimator__max_iter=2000, estimator__penalty=l2, estimator__tol=0.0001; total time=   1.2s
[CV] END estimator__alpha=0.0001, estimator__early_stopping=True, estimator__learning_rate=optimal, estimator__loss=hinge, estimator__max_iter=2000, estimator__penalty=l2, estimator__tol=0.001; total time=   1.2s
[CV] END estimator__alpha=0.0001, estimator__early_stopping=True, estimator__learning_rate=optimal, estimator__loss=hinge, estimator__max_iter=2000

## 3. Overfitting Analysis and Regularization Techniques


### 3.2. Finding the Middle Ground: Balanced Regularization


In [None]:
# Balanced approach: moderate vocabulary (max_features=5000, unigrams only) combined
# with moderate regularization (alpha between 0.001 and 0.01)
print("=" * 70)
print("FINDING MIDDLE GROUND: BALANCED REGULARIZATION")
print("=" * 70)

# X_train already holds a sparse TF-IDF matrix at this point, so the raw text has to
# be recovered by splitting `data` again. Using the same TEST_SIZE / RANDOM_STATE
# reproduces the original partition; the label halves of the split are discarded.
X_text = data['description']
y_text = data['genre']
X_train_text, X_test_text, _, _ = train_test_split(
    X_text,
    y_text,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Smaller, unigram-only vocabulary with sublinear TF scaling; terms present in more
# than 70% of documents or in fewer than 2 documents are dropped
tfidf_balanced = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 1),
    stop_words='english',
    sublinear_tf=True,
    max_df=0.7,
    min_df=2,
)
X_train_balanced = tfidf_balanced.fit_transform(X_train_text)
X_test_balanced = tfidf_balanced.transform(X_test_text)

# Settings common to every penalty family; only the penalty (and l1_ratio for
# elasticnet) varies between the three sub-grids
model_sgd_balanced = OneVsRestClassifier(SGDClassifier(random_state=42))
shared_grid = {
    'estimator__loss': ['modified_huber', 'log_loss'],
    'estimator__alpha': [0.001, 0.005, 0.01],
    'estimator__learning_rate': ['optimal'],
    'estimator__max_iter': [2000],
    'estimator__tol': [1e-3],
    'estimator__early_stopping': [True],
}
param_grid_balanced = [
    {**shared_grid, 'estimator__penalty': ['l1']},
    {**shared_grid, 'estimator__penalty': ['l2']},
    {**shared_grid, 'estimator__penalty': ['elasticnet'], 'estimator__l1_ratio': [0.3, 0.5, 0.7]},
]

print("Running Balanced Grid Search...")
balanced_cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search_sgd_balanced = GridSearchCV(
    estimator=model_sgd_balanced,
    param_grid=param_grid_balanced,
    cv=balanced_cv,
    scoring=scorer,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
# NOTE(review): y_train_reduced / y_test_reduced are not created in this cell;
# presumably an earlier section defines them, row-aligned with this split — confirm.
grid_search_sgd_balanced.fit(X_train_balanced, y_train_reduced)

# Evaluate the selected model on both splits; the F1 difference is the overfitting gap
best_model_sgd_balanced = grid_search_sgd_balanced.best_estimator_
test_metrics_balanced = evaluate_model(best_model_sgd_balanced, X_test_balanced, y_test_reduced)
train_metrics_balanced = evaluate_model(best_model_sgd_balanced, X_train_balanced, y_train_reduced)
train_test_gap_balanced = train_metrics_balanced['f1'] - test_metrics_balanced['f1']

print(f"\nBalanced Model: Train F1={train_metrics_balanced['f1']:.4f}, Test F1={test_metrics_balanced['f1']:.4f}, Gap={train_test_gap_balanced:.4f}")


FINDING MIDDLE GROUND: BALANCED REGULARIZATION


AttributeError: 'csr_matrix' object has no attribute 'lower'

### 3.3. Model Comparison and Final Recommendation


In [None]:
# Side-by-side comparison of the original, over-regularized and balanced models
print("=" * 70)
print("COMPREHENSIVE MODEL COMPARISON")
print("=" * 70)

# Collect each series once; the table and all three plots read from these lists
train_f1s = [train_metrics['f1'], train_metrics_regularized['f1'], train_metrics_balanced['f1']]
test_f1s = [test_metrics['f1'], test_metrics_regularized['f1'], test_metrics_balanced['f1']]
gaps = [train_metrics['f1'] - test_metrics['f1'], train_test_gap_regularized, train_test_gap_balanced]

comparison_df = pd.DataFrame({
    'Model': ['Original', 'Over-Regularized', 'Balanced'],
    'Train F1': train_f1s,
    'Test F1': test_f1s,
    'Gap': gaps,
})
print(comparison_df.to_string(index=False))


def _label_bars(ax, bar_container, **text_kwargs):
    """Write each bar's height (3 decimals) centred just above the bar."""
    for rect in bar_container:
        bar_height = rect.get_height()
        ax.text(
            rect.get_x() + rect.get_width()/2., bar_height, f'{bar_height:.3f}',
            ha='center', va='bottom', **text_kwargs
        )


def _gap_color(gap):
    """Traffic-light colour for an overfitting gap (smaller is better)."""
    if gap > 0.10:
        return 'red'
    if gap > 0.05:
        return 'orange'
    return 'green'


def _test_f1_color(f1):
    """Traffic-light colour for a test F1 score (larger is better)."""
    if f1 > 0.40:
        return 'green'
    if f1 > 0.30:
        return 'orange'
    return 'red'


# Visualization
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('Model Comparison: Original vs Regularized vs Balanced', fontsize=16, fontweight='bold')
ax1, ax2, ax3 = axes[0], axes[1], axes[2]

models = ['Original\n(Overfitting)', 'Over-\nRegularized', 'Balanced\n(Target)']

# Plot 1: Train vs Test F1 as grouped bars, one pair per model
x = np.arange(len(models))
width = 0.35
bars1 = ax1.bar(x - width/2, train_f1s, width, label='Train F1', alpha=0.8, color='steelblue')
bars2 = ax1.bar(x + width/2, test_f1s, width, label='Test F1', alpha=0.8, color='coral')
ax1.set_ylabel('F1 Score', fontsize=12)
ax1.set_title('Train vs Test F1 Score', fontsize=13, fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(models)
ax1.legend()
ax1.grid(True, alpha=0.3, axis='y')
ax1.set_ylim([0, 1.0])
_label_bars(ax1, bars1, fontsize=9)
_label_bars(ax1, bars2, fontsize=9)

# Plot 2: Overfitting Gap, coloured by the 0.05 / 0.10 reference levels drawn below
colors = [_gap_color(gap) for gap in gaps]
bars = ax2.bar(models, gaps, alpha=0.8, color=colors)
ax2.axhline(0.05, color='green', linestyle='--', linewidth=2, label='Excellent (< 0.05)')
ax2.axhline(0.10, color='orange', linestyle='--', linewidth=2, label='Good (< 0.10)')
ax2.set_ylabel('Overfitting Gap', fontsize=12)
ax2.set_title('Overfitting Gap Comparison', fontsize=13, fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3, axis='y')
_label_bars(ax2, bars, fontsize=10, fontweight='bold')

# Plot 3: Test F1 Performance, coloured by the 0.30 / 0.40 reference levels drawn below
test_f1_colors = [_test_f1_color(f1) for f1 in test_f1s]
bars = ax3.bar(models, test_f1s, alpha=0.8, color=test_f1_colors)
ax3.axhline(0.40, color='green', linestyle='--', linewidth=2, label='Good (> 0.40)')
ax3.axhline(0.30, color='orange', linestyle='--', linewidth=2, label='Acceptable (> 0.30)')
ax3.set_ylabel('Test F1 Score', fontsize=12)
ax3.set_title('Model Performance (Test F1)', fontsize=13, fontweight='bold')
ax3.legend()
ax3.grid(True, alpha=0.3, axis='y')
ax3.set_ylim([0, 1.0])
_label_bars(ax3, bars, fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

# Final Recommendation: judged only on the balanced model's own gap (< 0.10) and
# test F1 (> 0.40), using the same reference levels as the plots
print("\n" + "=" * 70)
print("RECOMMENDATION")
print("=" * 70)
gap_is_acceptable = train_test_gap_balanced < 0.10
if gap_is_acceptable and test_metrics_balanced['f1'] > 0.40:
    print("✓ BALANCED MODEL is the best choice!")
    print(f"  - Acceptable overfitting gap: {train_test_gap_balanced:.4f}")
    print(f"  - Good performance: {test_metrics_balanced['f1']:.4f} F1")
elif gap_is_acceptable:
    print("⚠ BALANCED MODEL reduces overfitting but performance is moderate")
else:
    print("⚠ BALANCED MODEL still shows some overfitting")
print("=" * 70)
