# Model Testing and Comparison Notebook

This notebook provides a framework for testing and comparing different machine learning models for movie genre classification.

## Features:
- Easy model configuration and testing
- Automatic metric calculation and comparison
- Visualization of results
- Support for multiple model types (Logistic Regression, XGBoost, etc.)


In [2]:
# Import required libraries
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score, precision_score, recall_score, 
    hamming_loss, jaccard_score, confusion_matrix
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest
import time
from typing import Dict, List, Tuple, Any
import json

# Try importing XGBoost
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
    print("✓ XGBoost available")
except ImportError:
    XGBOOST_AVAILABLE = False
    print("⚠ XGBoost not available. Install with: pip install xgboost")

# Project imports
from descriptions.config import INTERIM_DATA_DIR, MODELS_DIR
from descriptions.dataset import load_interim
from descriptions.modeling.train import prepare_features_and_labels
from descriptions.modeling.preprocess import load_preprocessors

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("✓ All imports successful")


[32m2025-12-14 12:32:51.834[0m | [1mINFO    [0m | [36mdescriptions.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/christianfullerton/Developer/Python Workspace/movie_genre_model[0m


✓ XGBoost available
✓ All imports successful


## 1. Load and Prepare Data


In [3]:
# Load and prepare data
print("Loading data...")
data = load_interim(INTERIM_DATA_DIR / "cleaned_movies.csv")
print(f"✓ Loaded {len(data)} samples")
print(f"Columns: {list(data.columns)}")
print(f"\nFirst few rows:")
print(data.head())

# Split data into train and test sets BEFORE preprocessing (prevents data leakage)
RANDOM_STATE = 42
TEST_SIZE = 0.2
X, y = data['description'], data['genre']

print("\nSplitting data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
print("✓ Data split complete")


Loading data...
[32m2025-12-14 12:32:53.413[0m | [1mINFO    [0m | [36mdescriptions.dataset[0m:[36mload_interim[0m:[36m99[0m - [1mLoading interim data from /Users/christianfullerton/Developer/Python Workspace/movie_genre_model/data/interim/cleaned_movies.csv...[0m
[32m2025-12-14 12:32:53.510[0m | [34m[1mDEBUG   [0m | [36mdescriptions.dataset[0m:[36mload_interim[0m:[36m103[0m - [34m[1mLoaded with index column[0m
[32m2025-12-14 12:32:53.510[0m | [32m[1mSUCCESS [0m | [36mdescriptions.dataset[0m:[36mload_interim[0m:[36m108[0m - [32m[1m✓ Data loaded successfully: 9087 rows, 2 columns[0m
✓ Loaded 9087 samples
Columns: ['genre', 'description']

First few rows:
                                     genre  \
movie_name                                   
he_hawshank_edemption         Drama, Crime   
he_odfather                   Drama, Crime   
he_odfather_art_              Drama, Crime   
chindlers_ist          Drama, History, War   
12_ngry_en             

In [4]:
# Preprocess data: TF-IDF features and multi-label encoding
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import TruncatedSVD

# Convert genre strings to lists of genre strings
def preprocess_genres(genre_series):
    """Convert genre strings to lists of genre strings."""
    return genre_series.fillna("").astype(str).str.split(r"\s*,\s*").apply(
        lambda genres: sorted({g.strip() for g in genres if g.strip()})
    )

print("Preprocessing genres...")
y_train_list = preprocess_genres(y_train)
y_test_list = preprocess_genres(y_test)

print(f"Sample y_train: {y_train_list.iloc[0]}")
print(f"Sample y_test: {y_test_list.iloc[0]}")

# Transform text to TF-IDF features
print("\nTransforming text to TF-IDF features...")
tfidf = TfidfVectorizer(
    max_features=10000, 
    ngram_range=(1, 2), 
    stop_words='english'
)
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

print(f"✓ TF-IDF features: {X_train.shape[1]} features")

# Transform genres to binary labels
print("\nTransforming genres to binary labels...")
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train_list)
y_test = mlb.transform(y_test_list)

print(f"✓ Binary labels: {y_train.shape[1]} genres")
print(f"✓ Training labels shape: {y_train.shape}")
print(f"✓ Test labels shape: {y_test.shape}")


kbest = SelectKBest(score_func=chi2, k=4500)
X_train = kbest.fit_transform(X_train, y_train)
X_test = kbest.transform(X_test)

print(f"✓ KBest selected {X_train.shape[1]} features")
print(f"✓ Training features shape: {X_train.shape}")
print(f"✓ Test features shape: {X_test.shape}")


svd = TruncatedSVD(n_components=2000, random_state=42)
X_train = svd.fit_transform(X_train)
X_test = svd.transform(X_test)

print(f"✓ SVD reduced to {X_train.shape[1]} features")
print(f"✓ Training features shape: {X_train.shape}")
print(f"✓ Test features shape: {X_test.shape}")



Preprocessing genres...
Sample y_train: ['Horror', 'Mystery']
Sample y_test: ['Adventure', 'Science Fiction', 'Western']

Transforming text to TF-IDF features...
✓ TF-IDF features: 10000 features

Transforming genres to binary labels...
✓ Binary labels: 18 genres
✓ Training labels shape: (7269, 18)
✓ Test labels shape: (1818, 18)
✓ KBest selected 4500 features
✓ Training features shape: (7269, 4500)
✓ Test features shape: (1818, 4500)
✓ SVD reduced to 2000 features
✓ Training features shape: (7269, 2000)
✓ Test features shape: (1818, 2000)


In [5]:
## 2. Preprocessing Parameters Grid Search (Fixed LinearSVC)

# Import required modules
from sklearn.svm import LinearSVC
from sklearn.model_selection import ParameterGrid
from descriptions.modeling.evaluate import evaluate_model
from scipy.special import expit
from tqdm import tqdm

# Fixed LinearSVC parameters (as specified)
FIXED_MODEL_PARAMS = {
    "C": 0.1,
    "penalty": "l2",
    "loss": "squared_hinge",
    "max_iter": 1000,
    "tol": 0.001,
    "class_weight": "balanced",
    "dual": False,
    "random_state": 42,
}

# Preprocessing parameter grid to search over
preprocess_param_grid = {
    'max_features': [10000, 15000, 20000],
    'ngram_range': [(1, 2), (1, 3)],
    'max_df': [0.6, 0.7, 0.8],
    'min_df': [2, 3, 4],
    'k_features': [4500, 5000, 6000],
}

# Calculate total combinations
total_combinations = np.prod([len(v) for v in preprocess_param_grid.values()])
print(f"Total preprocessing parameter combinations: {total_combinations}")
print(f"Fixed LinearSVC parameters: {FIXED_MODEL_PARAMS}")
print(f"Estimated time: ~{total_combinations * 2 / 60:.1f} minutes (assuming ~2 min per config)")
print("="*60)

# Reload original text data (before preprocessing overwrote variables)
# We need the original text series from the train_test_split
print("\nReloading original text data for grid search...")
data = load_interim(INTERIM_DATA_DIR / "cleaned_movies.csv")
X, y = data['description'], data['genre']
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
print(f"✓ Loaded original text data: {len(X_train_text)} train, {len(X_test_text)} test")

# Preprocess genres function
def preprocess_genres(genre_series):
    """Convert genre strings to lists of genre strings."""
    return genre_series.fillna("").astype(str).str.split(r"\s*,\s*").apply(
        lambda genres: sorted({g.strip() for g in genres if g.strip()})
    )

# Grid search results storage
results = []

print("\nStarting Grid Search over Preprocessing Parameters...")
print("="*60)

# Generate all parameter combinations
grid = ParameterGrid(preprocess_param_grid)

# Iterate over each preprocessing configuration
for idx, preprocess_params in enumerate(tqdm(grid, total=total_combinations, desc="Grid Search"), 1):
    start_time = time.time()
    
    try:
        # Create TF-IDF vectorizer with current parameters
        tfidf = TfidfVectorizer(
            max_features=preprocess_params['max_features'],
            ngram_range=preprocess_params['ngram_range'],
            max_df=preprocess_params['max_df'],
            min_df=preprocess_params['min_df'],
            stop_words='english',
            sublinear_tf=True,
            use_idf=True
        )
        
        # Transform text to TF-IDF features
        X_train_tfidf = tfidf.fit_transform(X_train_text)
        X_test_tfidf = tfidf.transform(X_test_text)
        
        # Transform genres to binary labels
        y_train_list = preprocess_genres(y_train_text)
        y_test_list = preprocess_genres(y_test_text)
        
        mlb = MultiLabelBinarizer()
        y_train_binary = mlb.fit_transform(y_train_list)
        y_test_binary = mlb.transform(y_test_list)
        
        # Apply feature selection
        kbest = SelectKBest(score_func=chi2, k=preprocess_params['k_features'])
        X_train_selected = kbest.fit_transform(X_train_tfidf, y_train_binary)
        X_test_selected = kbest.transform(X_test_tfidf)
        
        # Convert to dense arrays for LinearSVC
        X_train_dense = X_train_selected.toarray()
        X_test_dense = X_test_selected.toarray()
        
        # Create and train model with FIXED parameters
        model = OneVsRestClassifier(
            LinearSVC(**FIXED_MODEL_PARAMS)  # Uses your fixed params!
        )
        model.fit(X_train_dense, y_train_binary)
        
        # Evaluate on test set
        # LinearSVC doesn't have predict_proba, use decision_function + sigmoid
        y_scores = model.decision_function(X_test_dense)
        y_proba = expit(y_scores)
        y_pred = (y_proba >= 0.5).astype(int)
        
        # Calculate metrics
        test_f1 = f1_score(y_test_binary, y_pred, average='micro', zero_division=0)
        test_precision = precision_score(y_test_binary, y_pred, average='micro', zero_division=0)
        test_recall = recall_score(y_test_binary, y_pred, average='micro', zero_division=0)
        test_hamming = hamming_loss(y_test_binary, y_pred)
        test_jaccard = jaccard_score(y_test_binary, y_pred, average='micro', zero_division=0)
        
        elapsed_time = time.time() - start_time
        
        # Store results
        result = {
            **preprocess_params,
            'f1': test_f1,
            'precision': test_precision,
            'recall': test_recall,
            'hamming_loss': test_hamming,
            'jaccard': test_jaccard,
            'time_seconds': elapsed_time,
        }
        results.append(result)
        
        # Print progress every 5 configurations
        if idx % 5 == 0 or idx == total_combinations:
            print(f"\n[{idx}/{total_combinations}] "
                  f"max_feat={preprocess_params['max_features']}, "
                  f"ngram={preprocess_params['ngram_range']}, "
                  f"k={preprocess_params['k_features']}: "
                  f"F1={test_f1:.4f}, Time={elapsed_time:.1f}s")
    
    except Exception as e:
        print(f"\nError in configuration {idx}: {e}")
        import traceback
        traceback.print_exc()
        continue

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Find best configuration (highest F1 score)
best_idx = results_df['f1'].idxmax()
best_config = results_df.loc[best_idx]

print("\n" + "="*60)
print("Grid Search Complete!")
print("="*60)
print(f"Best F1 Score: {best_config['f1']:.4f} ({best_config['f1']*100:.2f}%)")
print(f"\nBest Preprocessing Configuration:")
print(f"  max_features: {best_config['max_features']}")
print(f"  ngram_range: {best_config['ngram_range']}")
print(f"  max_df: {best_config['max_df']}")
print(f"  min_df: {best_config['min_df']}")
print(f"  k_features: {best_config['k_features']}")
print(f"\nBest Metrics:")
print(f"  F1 Score:       {best_config['f1']:.4f} ({best_config['f1']*100:.2f}%)")
print(f"  Precision:      {best_config['precision']:.4f} ({best_config['precision']*100:.2f}%)")
print(f"  Recall:         {best_config['recall']:.4f} ({best_config['recall']*100:.2f}%)")
print(f"  Hamming Loss:   {best_config['hamming_loss']:.4f} ({best_config['hamming_loss']*100:.2f}%)")
print(f"  Jaccard Score:  {best_config['jaccard']:.4f} ({best_config['jaccard']*100:.2f}%)")
print("="*60)

# Display top 5 configurations
print("\nTop 5 Configurations by F1 Score:")
print("="*60)
top_5 = results_df.nlargest(5, 'f1')[['max_features', 'ngram_range', 'max_df', 'min_df', 'k_features', 'f1', 'precision', 'recall']]
print(top_5.to_string(index=False))
print("="*60)

# Save results
results_df.to_csv(MODELS_DIR / 'preprocessing_grid_search_results.csv', index=False)
print(f"\n✓ Results saved to {MODELS_DIR / 'preprocessing_grid_search_results.csv'}")

# Retrain best model for final evaluation
print("\nRetraining best model with optimal preprocessing parameters...")
tfidf_best = TfidfVectorizer(
    max_features=int(best_config['max_features']),
    ngram_range=best_config['ngram_range'],
    max_df=best_config['max_df'],
    min_df=int(best_config['min_df']),
    stop_words='english',
    sublinear_tf=True,
    use_idf=True
)

X_train_best = tfidf_best.fit_transform(X_train_text)
X_test_best = tfidf_best.transform(X_test_text)

y_train_list = preprocess_genres(y_train_text)
y_test_list = preprocess_genres(y_test_text)

mlb_best = MultiLabelBinarizer()
y_train_best = mlb_best.fit_transform(y_train_list)
y_test_best = mlb_best.transform(y_test_list)

kbest_best = SelectKBest(score_func=chi2, k=int(best_config['k_features']))
X_train_best = kbest_best.fit_transform(X_train_best, y_train_best)
X_test_best = kbest_best.transform(X_test_best)

best_model_svc = OneVsRestClassifier(LinearSVC(**FIXED_MODEL_PARAMS))
best_model_svc.fit(X_train_best.toarray(), y_train_best)

# Evaluate best model
metrics_svc = evaluate_model(best_model_svc, X_test_best.toarray(), y_test_best)

print("\n" + "="*60)
print("FINAL BEST MODEL METRICS (Test Set)")
print("="*60)
print(f"  F1 Score:       {metrics_svc['f1']:.4f} ({metrics_svc['f1']*100:.2f}%)")
print(f"  Precision:      {metrics_svc['precision']:.4f} ({metrics_svc['precision']*100:.2f}%)")
print(f"  Recall:         {metrics_svc['recall']:.4f} ({metrics_svc['recall']*100:.2f}%)")
print(f"  Hamming Loss:   {metrics_svc['hamming_loss']:.4f} ({metrics_svc['hamming_loss']*100:.2f}%)")
print(f"  Jaccard Score:  {metrics_svc['jaccard']:.4f} ({metrics_svc['jaccard']*100:.2f}%)")
print("="*60)

# Store for comparison
test_metrics = metrics_svc
train_metrics = evaluate_model(best_model_svc, X_train_best.toarray(), y_train_best)



Total preprocessing parameter combinations: 162
Fixed LinearSVC parameters: {'C': 0.1, 'penalty': 'l2', 'loss': 'squared_hinge', 'max_iter': 1000, 'tol': 0.001, 'class_weight': 'balanced', 'dual': False, 'random_state': 42}
Estimated time: ~5.4 minutes (assuming ~2 min per config)

Reloading original text data for grid search...
[32m2025-12-14 12:33:18.108[0m | [1mINFO    [0m | [36mdescriptions.dataset[0m:[36mload_interim[0m:[36m99[0m - [1mLoading interim data from /Users/christianfullerton/Developer/Python Workspace/movie_genre_model/data/interim/cleaned_movies.csv...[0m
[32m2025-12-14 12:33:18.217[0m | [34m[1mDEBUG   [0m | [36mdescriptions.dataset[0m:[36mload_interim[0m:[36m103[0m - [34m[1mLoaded with index column[0m
[32m2025-12-14 12:33:18.218[0m | [32m[1mSUCCESS [0m | [36mdescriptions.dataset[0m:[36mload_interim[0m:[36m108[0m - [32m[1m✓ Data loaded successfully: 9087 rows, 2 columns[0m
✓ Loaded original text data: 7269 train, 1818 test

Start

Grid Search:   3%|▎         | 5/162 [00:27<14:07,  5.40s/it]


[5/162] max_feat=10000, ngram=(1, 2), k=4500: F1=0.6028, Time=5.1s


Grid Search:   6%|▌         | 10/162 [00:55<13:58,  5.52s/it]


[10/162] max_feat=15000, ngram=(1, 3), k=4500: F1=0.6029, Time=5.8s


Grid Search:   9%|▉         | 15/162 [01:21<13:03,  5.33s/it]


[15/162] max_feat=20000, ngram=(1, 2), k=4500: F1=0.6030, Time=4.9s


Grid Search:  12%|█▏        | 20/162 [01:50<13:21,  5.64s/it]


[20/162] max_feat=10000, ngram=(1, 3), k=4500: F1=0.6040, Time=6.0s


Grid Search:  15%|█▌        | 25/162 [02:16<12:13,  5.35s/it]


[25/162] max_feat=15000, ngram=(1, 2), k=4500: F1=0.5995, Time=4.9s


Grid Search:  19%|█▊        | 30/162 [02:44<11:58,  5.44s/it]


[30/162] max_feat=15000, ngram=(1, 3), k=4500: F1=0.6029, Time=5.7s


Grid Search:  22%|██▏       | 35/162 [03:10<11:17,  5.34s/it]


[35/162] max_feat=20000, ngram=(1, 2), k=4500: F1=0.6028, Time=5.1s


Grid Search:  25%|██▍       | 40/162 [03:38<11:12,  5.52s/it]


[40/162] max_feat=10000, ngram=(1, 3), k=4500: F1=0.6030, Time=5.9s


Grid Search:  28%|██▊       | 45/162 [04:05<10:33,  5.42s/it]


[45/162] max_feat=15000, ngram=(1, 2), k=4500: F1=0.6030, Time=5.0s


Grid Search:  31%|███       | 50/162 [04:33<10:14,  5.48s/it]


[50/162] max_feat=20000, ngram=(1, 3), k=4500: F1=0.5980, Time=5.7s


Grid Search:  34%|███▍      | 55/162 [05:00<09:41,  5.43s/it]


[55/162] max_feat=10000, ngram=(1, 2), k=5000: F1=0.6074, Time=5.4s


Grid Search:  37%|███▋      | 60/162 [05:29<09:52,  5.81s/it]


[60/162] max_feat=10000, ngram=(1, 3), k=5000: F1=0.6019, Time=6.1s


Grid Search:  40%|████      | 65/162 [05:58<09:17,  5.75s/it]


[65/162] max_feat=15000, ngram=(1, 2), k=5000: F1=0.6017, Time=5.5s


Grid Search:  43%|████▎     | 70/162 [06:27<08:58,  5.86s/it]


[70/162] max_feat=20000, ngram=(1, 3), k=5000: F1=0.6056, Time=6.1s


Grid Search:  46%|████▋     | 75/162 [06:56<08:21,  5.77s/it]


[75/162] max_feat=10000, ngram=(1, 2), k=5000: F1=0.6067, Time=5.5s


Grid Search:  49%|████▉     | 80/162 [07:25<08:01,  5.88s/it]


[80/162] max_feat=15000, ngram=(1, 3), k=5000: F1=0.6049, Time=6.2s


Grid Search:  52%|█████▏    | 85/162 [07:54<07:25,  5.78s/it]


[85/162] max_feat=20000, ngram=(1, 2), k=5000: F1=0.5998, Time=5.5s


Grid Search:  56%|█████▌    | 90/162 [08:24<07:04,  5.90s/it]


[90/162] max_feat=20000, ngram=(1, 3), k=5000: F1=0.6019, Time=6.2s


Grid Search:  59%|█████▊    | 95/162 [08:53<06:28,  5.79s/it]


[95/162] max_feat=10000, ngram=(1, 2), k=5000: F1=0.6017, Time=5.4s


Grid Search:  62%|██████▏   | 100/162 [09:23<06:07,  5.94s/it]


[100/162] max_feat=15000, ngram=(1, 3), k=5000: F1=0.6056, Time=6.3s


Grid Search:  65%|██████▍   | 105/162 [09:52<05:30,  5.79s/it]


[105/162] max_feat=20000, ngram=(1, 2), k=5000: F1=0.6050, Time=5.4s


Grid Search:  68%|██████▊   | 110/162 [10:23<05:33,  6.42s/it]


[110/162] max_feat=10000, ngram=(1, 3), k=6000: F1=0.6070, Time=7.1s


Grid Search:  71%|███████   | 115/162 [10:57<05:12,  6.66s/it]


[115/162] max_feat=15000, ngram=(1, 2), k=6000: F1=0.6091, Time=6.5s


Grid Search:  74%|███████▍  | 120/162 [11:31<04:44,  6.77s/it]


[120/162] max_feat=15000, ngram=(1, 3), k=6000: F1=0.6028, Time=7.0s


Grid Search:  77%|███████▋  | 125/162 [12:04<04:05,  6.64s/it]


[125/162] max_feat=20000, ngram=(1, 2), k=6000: F1=0.6039, Time=6.3s


Grid Search:  80%|████████  | 130/162 [12:38<03:37,  6.79s/it]


[130/162] max_feat=10000, ngram=(1, 3), k=6000: F1=0.6076, Time=7.0s


Grid Search:  83%|████████▎ | 135/162 [13:12<03:00,  6.67s/it]


[135/162] max_feat=15000, ngram=(1, 2), k=6000: F1=0.6072, Time=6.3s


Grid Search:  86%|████████▋ | 140/162 [13:46<02:29,  6.80s/it]


[140/162] max_feat=20000, ngram=(1, 3), k=6000: F1=0.6048, Time=7.2s


Grid Search:  90%|████████▉ | 145/162 [14:20<01:54,  6.76s/it]


[145/162] max_feat=10000, ngram=(1, 2), k=6000: F1=0.6074, Time=6.3s


Grid Search:  93%|█████████▎| 150/162 [14:54<01:21,  6.76s/it]


[150/162] max_feat=10000, ngram=(1, 3), k=6000: F1=0.6028, Time=7.0s


Grid Search:  96%|█████████▌| 155/162 [15:27<00:46,  6.62s/it]


[155/162] max_feat=15000, ngram=(1, 2), k=6000: F1=0.6039, Time=6.3s


Grid Search:  99%|█████████▉| 160/162 [16:01<00:13,  6.79s/it]


[160/162] max_feat=20000, ngram=(1, 3), k=6000: F1=0.6070, Time=7.0s


Grid Search: 100%|██████████| 162/162 [16:15<00:00,  6.02s/it]


[162/162] max_feat=20000, ngram=(1, 3), k=6000: F1=0.6028, Time=7.3s

Grid Search Complete!
Best F1 Score: 0.6091 (60.91%)

Best Preprocessing Configuration:
  max_features: 15000
  ngram_range: (1, 2)
  max_df: 0.6
  min_df: 2
  k_features: 6000

Best Metrics:
  F1 Score:       0.6091 (60.91%)
  Precision:      0.5469 (54.69%)
  Recall:         0.6873 (68.73%)
  Hamming Loss:   0.1314 (13.14%)
  Jaccard Score:  0.4379 (43.79%)

Top 5 Configurations by F1 Score:
 max_features ngram_range  max_df  min_df  k_features       f1  precision   recall
        15000      (1, 2)     0.6       2        6000 0.609075   0.546865 0.687256
        15000      (1, 2)     0.7       2        6000 0.609075   0.546865 0.687256
        15000      (1, 2)     0.8       2        6000 0.609075   0.546865 0.687256
        15000      (1, 3)     0.6       2        6000 0.608656   0.546449 0.686846
        15000      (1, 3)     0.7       2        6000 0.608656   0.546449 0.686846

✓ Results saved to /Users/christi




[32m2025-12-14 12:49:39.771[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.evaluate[0m:[36mevaluate_model[0m:[36m119[0m - [34m[1mEvaluating model: X shape (1818, 6000), y shape (1818, 18), threshold=0.55[0m
[32m2025-12-14 12:49:39.772[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.evaluate[0m:[36mevaluate_model[0m:[36m128[0m - [34m[1mGenerating predictions with threshold 0.55...[0m
[32m2025-12-14 12:49:39.975[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.evaluate[0m:[36mevaluate_model[0m:[36m132[0m - [34m[1mDecision scores generated: shape (1818, 18)[0m
[32m2025-12-14 12:49:39.976[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.evaluate[0m:[36mevaluate_model[0m:[36m136[0m - [34m[1mProbabilities generated: shape (1818, 18)[0m
[32m2025-12-14 12:49:39.976[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.evaluate[0m:[36mevaluate_model[0m:[36m140[0m - [34m[1mBinary predictions generated: shape (1818,

## 3. Overfitting Analysis: Cross-Validation, Learning Curves, and Validation Curves


### 3.3. Validation Curve

In [6]:
# Validation Curve: How model performance changes with regularization parameter C
print("=" * 70)
print("VALIDATION CURVE ANALYSIS")
print("=" * 70)

from sklearn.model_selection import validation_curve

# Test different C values
C_range = np.logspace(-2, 1, 10)  # From 0.01 to 10
print(f"Testing C values: {C_range}")

# Create base model for validation curve
base_model = ClassifierChain(LinearSVC(random_state=42, dual=False, max_iter=1000, class_weight='balanced'))

print("\nComputing validation curve (this may take a few minutes)...")

# Compute validation curve
train_scores_vc, val_scores_vc = validation_curve(
    base_model,
    X_train,
    y_train,
    param_name='estimator__C',
    param_range=C_range,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring=make_scorer(f1_score, average='micro', zero_division=0),
    n_jobs=-1
)

# Calculate mean and std
train_scores_mean_vc = train_scores_vc.mean(axis=1)
train_scores_std_vc = train_scores_vc.std(axis=1)
val_scores_mean_vc = val_scores_vc.mean(axis=1)
val_scores_std_vc = val_scores_vc.std(axis=1)

# Plot validation curve
fig, ax = plt.subplots(figsize=(10, 6))
ax.semilogx(C_range, train_scores_mean_vc, 'o-', color='steelblue', label='Training Score', linewidth=2)
ax.fill_between(C_range, train_scores_mean_vc - train_scores_std_vc, train_scores_mean_vc + train_scores_std_vc, alpha=0.2, color='steelblue')
ax.semilogx(C_range, val_scores_mean_vc, 'o-', color='coral', label='Cross-Validation Score', linewidth=2)
ax.fill_between(C_range, val_scores_mean_vc - val_scores_std_vc, val_scores_mean_vc + val_scores_std_vc, alpha=0.2, color='coral')
ax.set_xlabel('C (Regularization Parameter)', fontsize=12)
ax.set_ylabel('F1 Score (Micro)', fontsize=12)
ax.set_title('Validation Curve: Effect of Regularization (C)', fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_ylim([0, 1.0])
plt.tight_layout()
plt.show()

# Find optimal C
optimal_idx = np.argmax(val_scores_mean_vc)
optimal_C = C_range[optimal_idx]
optimal_score = val_scores_mean_vc[optimal_idx]

print(f"\nValidation Curve Analysis:")
print(f"  Optimal C: {optimal_C:.4f}")
print(f"  Optimal CV Score: {optimal_score:.4f}")
print(f"  Current C (from grid search): {grid_search_svc.best_params_['estimator__C']}")
if abs(optimal_C - grid_search_svc.best_params_['estimator__C']) > 0.5:
    print(f"  ⚠ Consider retraining with C={optimal_C:.4f} for potentially better performance")
else:
    print(f"  ✓ Current C is close to optimal")

VALIDATION CURVE ANALYSIS
Testing C values: [ 0.01        0.02154435  0.04641589  0.1         0.21544347  0.46415888
  1.          2.15443469  4.64158883 10.        ]


NameError: name 'ClassifierChain' is not defined

### 3.4. Final Metrics Summary DataFrame

In [None]:
# Compile all metrics into a comprehensive DataFrame
print("=" * 70)
print("FINAL METRICS SUMMARY")
print("=" * 70)

# Check if cv_summary exists (from section 3.1), if not, compute it
if 'cv_summary' not in globals():
    print("Note: Computing cross-validation metrics (cv_summary not found)...")
    from sklearn.model_selection import cross_validate
    
    scoring = {
        'f1_micro': make_scorer(f1_score, average='micro', zero_division=0),
        'f1_macro': make_scorer(f1_score, average='macro', zero_division=0),
        'precision_micro': make_scorer(precision_score, average='micro', zero_division=0),
        'recall_micro': make_scorer(recall_score, average='micro', zero_division=0),
        'hamming_loss': make_scorer(hamming_loss),
    }
    
    cv_results = cross_validate(
        best_model_svc,
        X_train,
        y_train,
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1
    )
    
    cv_summary = {}
    for metric in ['f1_micro', 'f1_macro', 'precision_micro', 'recall_micro', 'hamming_loss']:
        train_scores = cv_results[f'train_{metric}']
        test_scores = cv_results[f'test_{metric}']
        cv_summary[metric] = {
            'train_mean': train_scores.mean(),
            'train_std': train_scores.std(),
            'test_mean': test_scores.mean(),
            'test_std': test_scores.std(),
            'gap': train_scores.mean() - test_scores.mean(),
        }

# Get test set metrics
test_metrics_final = evaluate_model(best_model_svc, X_test, y_test)
train_metrics_final = evaluate_model(best_model_svc, X_train, y_train)

# Create comprehensive metrics DataFrame
metrics_data = {
    'Metric': [
        'F1 Score (Micro)',
        'F1 Score (Macro)',
        'Precision (Micro)',
        'Recall (Micro)',
        'Hamming Loss',
        'Jaccard Score',
    ],
    'Train': [
        train_metrics_final['f1'],
        f1_score(y_train, best_model_svc.predict(X_train), average='macro', zero_division=0),
        train_metrics_final['precision'],
        train_metrics_final['recall'],
        train_metrics_final['hamming_loss'],
        train_metrics_final['jaccard'],
    ],
    'Test': [
        test_metrics_final['f1'],
        f1_score(y_test, best_model_svc.predict(X_test), average='macro', zero_division=0),
        test_metrics_final['precision'],
        test_metrics_final['recall'],
        test_metrics_final['hamming_loss'],
        test_metrics_final['jaccard'],
    ],
    'CV Mean': [
        cv_summary['f1_micro']['test_mean'],
        cv_summary['f1_macro']['test_mean'],
        cv_summary['precision_micro']['test_mean'],
        cv_summary['recall_micro']['test_mean'],
        cv_summary['hamming_loss']['test_mean'],
        None,  # Jaccard not in CV summary
    ],
    'CV Std': [
        cv_summary['f1_micro']['test_std'],
        cv_summary['f1_macro']['test_std'],
        cv_summary['precision_micro']['test_std'],
        cv_summary['recall_micro']['test_std'],
        cv_summary['hamming_loss']['test_std'],
        None,
    ],
    'Overfitting Gap': [
        train_metrics_final['f1'] - test_metrics_final['f1'],
        f1_score(y_train, best_model_svc.predict(X_train), average='macro', zero_division=0) - 
        f1_score(y_test, best_model_svc.predict(X_test), average='macro', zero_division=0),
        train_metrics_final['precision'] - test_metrics_final['precision'],
        train_metrics_final['recall'] - test_metrics_final['recall'],
        train_metrics_final['hamming_loss'] - test_metrics_final['hamming_loss'],
        train_metrics_final['jaccard'] - test_metrics_final['jaccard'],
    ],
}

final_metrics_df = pd.DataFrame(metrics_data)

# Format the DataFrame for better readability
pd.set_option('display.float_format', lambda x: f'{x:.4f}' if pd.notna(x) else 'N/A')
print("\nFinal Metrics Summary:")
print(final_metrics_df.to_string(index=False))

# Add model parameters summary
print("\n" + "=" * 70)
print("MODEL PARAMETERS")
print("=" * 70)
print(f"Best Parameters from Grid Search:")
for param, value in grid_search_svc.best_params_.items():
    print(f"  {param}: {value}")

print(f"\nModel Performance Summary:")
print(f"  Train F1: {train_metrics_final['f1']:.4f}")
print(f"  Test F1: {test_metrics_final['f1']:.4f}")
print(f"  CV F1 Mean: {cv_summary['f1_micro']['test_mean']:.4f} ± {cv_summary['f1_micro']['test_std']:.4f}")
print(f"  Overfitting Gap: {train_metrics_final['f1'] - test_metrics_final['f1']:.4f}")
print("=" * 70)

FINAL METRICS SUMMARY
Note: Computing cross-validation metrics (cv_summary not found)...
[32m2025-12-11 00:26:07.312[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.evaluate[0m:[36mevaluate_model[0m:[36m116[0m - [34m[1mEvaluating model: X shape (1818, 2000), y shape (1818, 18)[0m
[32m2025-12-11 00:26:07.314[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.evaluate[0m:[36mevaluate_model[0m:[36m125[0m - [34m[1mGenerating predictions from model...[0m
[32m2025-12-11 00:26:07.402[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.evaluate[0m:[36mevaluate_model[0m:[36m127[0m - [34m[1mPredictions generated: shape (1818, 18)[0m
[32m2025-12-11 00:26:07.402[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.evaluate[0m:[36mevaluate_model[0m:[36m130[0m - [34m[1mCalculating evaluation metrics (micro-averaged)...[0m
[32m2025-12-11 00:26:07.439[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.evaluate[0m:[36mevaluate_model[

### 3.1. Cross-Validation for Overfitting Analysis


In [None]:
# Cross-Validation for Overfitting Analysis
print("=" * 70)
print("CROSS-VALIDATION OVERFITTING ANALYSIS")
print("=" * 70)

from sklearn.model_selection import cross_validate

# Use the best model from grid search
print(f"Using best model with parameters: {grid_search_svc.best_params_}")

# Define scoring metrics for cross-validation
scoring = {
    'f1_micro': make_scorer(f1_score, average='micro', zero_division=0),
    'f1_macro': make_scorer(f1_score, average='macro', zero_division=0),
    'precision_micro': make_scorer(precision_score, average='micro', zero_division=0),
    'recall_micro': make_scorer(recall_score, average='micro', zero_division=0),
    'hamming_loss': make_scorer(hamming_loss),
}

# Perform cross-validation with return_train_score=True to detect overfitting
print("\nPerforming 5-fold cross-validation...")
cv_results = cross_validate(
    best_model_svc,
    X_train,
    y_train,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring=scoring,
    return_train_score=True,
    n_jobs=-1
)

# Calculate mean and std for each metric
cv_summary = {}
for metric in ['f1_micro', 'f1_macro', 'precision_micro', 'recall_micro', 'hamming_loss']:
    train_scores = cv_results[f'train_{metric}']
    test_scores = cv_results[f'test_{metric}']
    
    cv_summary[metric] = {
        'train_mean': train_scores.mean(),
        'train_std': train_scores.std(),
        'test_mean': test_scores.mean(),
        'test_std': test_scores.std(),
        'gap': train_scores.mean() - test_scores.mean(),
    }

# Display cross-validation results
print("\n" + "=" * 70)
print("CROSS-VALIDATION RESULTS (5-fold)")
print("=" * 70)
print(f"{'Metric':<20} {'Train Mean':<12} {'Train Std':<12} {'Test Mean':<12} {'Test Std':<12} {'Gap':<10}")
print("-" * 70)
for metric, stats in cv_summary.items():
    print(f"{metric:<20} {stats['train_mean']:>10.4f}   {stats['train_std']:>10.4f}   "
          f"{stats['test_mean']:>10.4f}   {stats['test_std']:>10.4f}   {stats['gap']:>8.4f}")

# Overfitting assessment
overfitting_gap = cv_summary['f1_micro']['gap']
print("\n" + "=" * 70)
print("OVERFITTING ASSESSMENT")
print("=" * 70)
if overfitting_gap < 0.05:
    print(f"✓ EXCELLENT: Overfitting gap is {overfitting_gap:.4f} (< 0.05)")
elif overfitting_gap < 0.10:
    print(f"✓ GOOD: Overfitting gap is {overfitting_gap:.4f} (< 0.10)")
elif overfitting_gap < 0.15:
    print(f"⚠ MODERATE: Overfitting gap is {overfitting_gap:.4f} (0.10-0.15)")
else:
    print(f"⚠ HIGH: Overfitting gap is {overfitting_gap:.4f} (> 0.15)")
print("=" * 70)


CROSS-VALIDATION OVERFITTING ANALYSIS
Using best model with parameters: {'estimator__C': 0.5, 'estimator__class_weight': 'balanced', 'estimator__loss': 'squared_hinge', 'estimator__max_iter': 1000, 'estimator__penalty': 'l2', 'estimator__tol': 0.001}

Performing 5-fold cross-validation...


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)



CROSS-VALIDATION RESULTS (5-fold)
Metric               Train Mean   Train Std    Test Mean    Test Std     Gap       
----------------------------------------------------------------------
f1_micro                 0.7750       0.0036       0.5831       0.0053     0.1920
f1_macro                 0.7883       0.0028       0.5201       0.0132     0.2682
precision_micro          0.6927       0.0037       0.5280       0.0045     0.1646
recall_micro             0.8797       0.0033       0.6510       0.0117     0.2286
hamming_loss             0.0750       0.0013       0.1367       0.0008    -0.0617

OVERFITTING ASSESSMENT
⚠ HIGH: Overfitting gap is 0.1920 (> 0.15)


### 3.2. Learning Curve


In [None]:
# Learning Curve: How model performance changes with training set size
print("=" * 70)
print("LEARNING CURVE ANALYSIS")
print("=" * 70)

from sklearn.model_selection import learning_curve

# Define training sizes (percentages of training data)
train_sizes = np.linspace(0.1, 1.0, 10)
train_sizes_abs = (train_sizes * X_train.shape[0]).astype(int)

print(f"Training sizes: {train_sizes_abs}")
print("\nComputing learning curve (this may take a few minutes)...")

# Compute learning curve
train_sizes_abs, train_scores, val_scores = learning_curve(
    best_model_svc,
    X_train,
    y_train,
    train_sizes=train_sizes_abs,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring=make_scorer(f1_score, average='micro', zero_division=0),
    n_jobs=-1,
    random_state=42
)

# Calculate mean and std
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
val_scores_mean = val_scores.mean(axis=1)
val_scores_std = val_scores.std(axis=1)

# Plot learning curve
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_sizes_abs, train_scores_mean, 'o-', color='steelblue', label='Training Score', linewidth=2)
ax.fill_between(train_sizes_abs, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.2, color='steelblue')
ax.plot(train_sizes_abs, val_scores_mean, 'o-', color='coral', label='Cross-Validation Score', linewidth=2)
ax.fill_between(train_sizes_abs, val_scores_mean - val_scores_std, val_scores_mean + val_scores_std, alpha=0.2, color='coral')
ax.set_xlabel('Training Set Size', fontsize=12)
ax.set_ylabel('F1 Score (Micro)', fontsize=12)
ax.set_title('Learning Curve: LinearSVC', fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_ylim([0, 1.0])
plt.tight_layout()
plt.show()

# Analyze learning curve
final_gap = train_scores_mean[-1] - val_scores_mean[-1]
print(f"\nLearning Curve Analysis:")
print(f"  Final training score: {train_scores_mean[-1]:.4f}")
print(f"  Final CV score: {val_scores_mean[-1]:.4f}")
print(f"  Gap: {final_gap:.4f}")
if val_scores_mean[-1] < val_scores_mean[-2]:
    print("  ⚠ Model may be overfitting (CV score decreasing)")
elif final_gap > 0.10:
    print(f"  ⚠ Large gap ({final_gap:.4f}) suggests overfitting")
else:
    print(f"  ✓ Good generalization (gap: {final_gap:.4f})")


LEARNING CURVE ANALYSIS
Training sizes: [ 726 1453 2180 2907 3634 4361 5088 5815 6542 7269]

Computing learning curve (this may take a few minutes)...


ValueError: train_sizes has been interpreted as absolute numbers of training samples and must be within (0, 5815], but is within [726, 7269].