# Model Testing: VotingClassifier Ensemble

This notebook tests whether a VotingClassifier ensemble (combining LinearSVC and LogisticRegression) improves metrics compared to a baseline LinearSVC model.

## Approach
1. Load and prepare data (train/test split)
2. Feature engineering: TF-IDF + Normalizer + SelectKBest
3. Train baseline LinearSVC model
4. Train VotingClassifier ensemble (LinearSVC + LogisticRegression)
5. Compare metrics and performance

In [1]:
import time
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.special import expit  # Sigmoid function for converting scores to probabilities
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score,
    hamming_loss,
    jaccard_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.svm import LinearSVC

from descriptions.config import MODELS_DIR, RAW_DATA_DIR
from descriptions.dataset import load_data
from descriptions.modeling.preprocess import _generate_descriptions, _generate_targets

# Reload module to pick up any code changes
import importlib
import descriptions.modeling.preprocess
importlib.reload(descriptions.modeling.preprocess)
from descriptions.modeling.preprocess import _generate_descriptions, _generate_targets

print("✓ Imports complete (module reloaded)")

[32m2025-12-19 22:48:29.179[0m | [1mINFO    [0m | [36mdescriptions.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/christianfullerton/Developer/Python Workspace/movie_genre_model[0m
  import pkg_resources  # noqa: TID251


✓ Imports complete (module reloaded)


In [4]:
# Load raw data
print("Loading raw data...")
data = load_data()
print(f"✓ Loaded {len(data)} samples")
print(f"  Columns: {list(data.columns)}")
print(f"  Sample description: {data['description'].iloc[0][:100]}...")

Loading raw data...
[32m2025-12-19 22:48:45.345[0m | [1mINFO    [0m | [36mdescriptions.dataset[0m:[36mload_data[0m:[36m68[0m - [1mLoading raw data from /Users/christianfullerton/Developer/Python Workspace/movie_genre_model/data/raw/top_movies.csv...[0m
[32m2025-12-19 22:48:45.452[0m | [32m[1mSUCCESS [0m | [36mdescriptions.dataset[0m:[36mload_data[0m:[36m70[0m - [32m[1m✓ Data loaded successfully: 9420 rows, 3 columns[0m
[32m2025-12-19 22:48:45.453[0m | [34m[1mDEBUG   [0m | [36mdescriptions.dataset[0m:[36mload_data[0m:[36m73[0m - [34m[1mColumns: ['movie_name', 'genre', 'description'][0m
✓ Loaded 9420 samples
  Columns: ['movie_name', 'genre', 'description']
  Sample description: Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufr...


In [8]:
# Split data into train and test sets BEFORE preprocessing (prevents data leakage)
RANDOM_STATE = 42
TEST_SIZE = 0.2

print(f"\nSplitting data (test_size={TEST_SIZE}, random_state={RANDOM_STATE})...")
data_train, data_test = train_test_split(
    data, test_size=TEST_SIZE, random_state=RANDOM_STATE, shuffle=True
)
# Reset index to ensure clean integer indexing
data_train = data_train.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)
print(f"✓ Train: {len(data_train)} samples, Test: {len(data_test)} samples")


Splitting data (test_size=0.2, random_state=42)...
✓ Train: 7536 samples, Test: 1884 samples


In [9]:
# Generate targets first (to get filtered indices and MultiLabelBinarizer)
print("=" * 70)
print("STEP 1: Generating Multi-Label Targets")
print("=" * 70)

min_genre_percentage = 5.0
# Note: _generate_targets returns (y, mlb, filtered_index) - order matters!
y_train, mlb, filtered_index_train = _generate_targets(
    data_train, min_genre_percentage=min_genre_percentage
)

# Manually handle test targets to avoid closure issues with mlb.classes_
print("\nGenerating test targets manually...")
from descriptions.modeling.preprocess import _preprocess_genres
df_test = _preprocess_genres(data_test)
genres_list_test = df_test["genre"]

# Filter genres to only those in mlb.classes_ (extract classes to avoid closure issue)
mlb_classes_set = set(mlb.classes_)
genres_list_filtered_test = [sorted({g for g in genres if g in mlb_classes_set}) for genres in genres_list_test]

# Remove samples that lost all genres
keep_mask_test = [len(genres) > 0 for genres in genres_list_filtered_test]
genres_list_filtered_test = [g for g, keep in zip(genres_list_filtered_test, keep_mask_test) if keep]
filtered_index_test = data_test.index[keep_mask_test].tolist()

# Transform using mlb
y_test = mlb.transform(genres_list_filtered_test)
print(f"  Removed {sum(not k for k in keep_mask_test)} samples with no genres after filtering")

print(f"\n✓ Generated targets")
print(f"  Train labels: {y_train.shape}")
print(f"  Test labels: {y_test.shape}")
print(f"  Number of genres: {len(mlb.classes_)}")
print(f"  Genres: {list(mlb.classes_)[:10]}..." if len(mlb.classes_) > 10 else f"  Genres: {list(mlb.classes_)}")

STEP 1: Generating Multi-Label Targets
[32m2025-12-16 12:36:59.163[0m | [1mINFO    [0m | [36mdescriptions.modeling.preprocess[0m:[36m_generate_targets[0m:[36m83[0m - [1mGenerating multi-label targets from 7536 samples...[0m
[32m2025-12-16 12:36:59.163[0m | [1mINFO    [0m | [36mdescriptions.modeling.preprocess[0m:[36m_preprocess_genres[0m:[36m45[0m - [1mStarting genre preprocessing: cleaning and splitting genre strings[0m
[32m2025-12-16 12:36:59.163[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.preprocess[0m:[36m_preprocess_genres[0m:[36m48[0m - [34m[1mFilling missing genres with empty strings[0m
[32m2025-12-16 12:36:59.169[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.preprocess[0m:[36m_preprocess_genres[0m:[36m54[0m - [34m[1mSplitting genre strings by comma and cleaning[0m
[32m2025-12-16 12:36:59.211[0m | [32m[1mSUCCESS [0m | [36mdescriptions.modeling.preprocess[0m:[36m_preprocess_genres[0m:[36m60[0m - [32m[1

In [10]:
# Generate TF-IDF features for filtered data
print("=" * 70)
print("STEP 2: Generating TF-IDF Features")
print("=" * 70)

# Filter data to match filtered indices
data_train_filtered = data_train.loc[filtered_index_train].reset_index(drop=True)
data_test_filtered = data_test.loc[filtered_index_test].reset_index(drop=True)

print(f"  Train samples (after filtering): {len(data_train_filtered)}")
print(f"  Test samples (after filtering): {len(data_test_filtered)}")

# Generate TF-IDF features - fit on train
X_train_combined, vectorizer = _generate_descriptions(data_train_filtered)

# Transform test data using the fitted vectorizer
X_test_combined = vectorizer.transform(data_test_filtered['description'].fillna("").astype(str))

print(f"  ✓ TF-IDF features generated")
print(f"  Train shape: {X_train_combined.shape}")
print(f"  Test shape: {X_test_combined.shape}")

STEP 2: Generating TF-IDF Features
  Train samples (after filtering): 7512
  Test samples (after filtering): 1878
[32m2025-12-16 12:36:59.300[0m | [1mINFO    [0m | [36mdescriptions.modeling.preprocess[0m:[36m_generate_descriptions[0m:[36m171[0m - [1mGenerating TF-IDF features from 7512 movie descriptions...[0m
[32m2025-12-16 12:36:59.309[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.preprocess[0m:[36m_generate_descriptions[0m:[36m179[0m - [34m[1mCreating and fitting new TfidfVectorizer with default parameters[0m
[32m2025-12-16 12:36:59.309[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.preprocess[0m:[36mbuild_preprocessor[0m:[36m218[0m - [34m[1mBuilding preprocessing components: TfidfVectorizer, MultiLabelBinarizer, Normalizer, and SelectKBest[0m
[32m2025-12-16 12:36:59.310[0m | [34m[1mDEBUG   [0m | [36mdescriptions.modeling.preprocess[0m:[36mbuild_preprocessor[0m:[36m229[0m - [34m[1mTfidfVectorizer configured: max_feature

In [11]:
# Apply L2 normalization
print("\n" + "=" * 70)
print("STEP 3: Applying L2 Normalization")
print("=" * 70)

normalizer = Normalizer(norm='l2')
X_train_combined = normalizer.fit_transform(X_train_combined)
X_test_combined = normalizer.transform(X_test_combined)
print(f"✓ Normalization applied (L2 norm per sample)")
print(f"  Train shape: {X_train_combined.shape}")
print(f"  Test shape: {X_test_combined.shape}")


STEP 3: Applying L2 Normalization
✓ Normalization applied (L2 norm per sample)
  Train shape: (7512, 10000)
  Test shape: (1878, 10000)


In [12]:
# Feature Selection: SelectKBest with chi2 for Multi-Label Classification
print("\n" + "=" * 70)
print("STEP 4: Feature Selection (SelectKBest with chi2)")
print("=" * 70)

K_FEATURES = 8000
print(f"\nSelecting top {K_FEATURES} features using chi2...")
print(f"  Input shape: {X_train_combined.shape}")
print(f"  Target shape: {y_train.shape} (multi-label)")
print("  (This may take a few minutes)")

def chi2_multilabel(X, y):
    """Chi2 scoring function for multi-label classification"""
    scores_list = []
    pvalues_list = []
    for label_idx in range(y.shape[1]):
        y_single = y[:, label_idx].ravel()
        chi2_scores, chi2_pvalues = chi2(X, y_single)
        scores_list.append(chi2_scores)
        pvalues_list.append(chi2_pvalues)
    scores_array = np.array(scores_list).T
    pvalues_array = np.array(pvalues_list).T
    max_scores = np.max(scores_array, axis=1)
    min_pvalues = np.min(pvalues_array, axis=1)
    return max_scores, min_pvalues

print("  Creating feature selector...")
feature_selector = SelectKBest(score_func=chi2_multilabel, k=K_FEATURES)
print("  Fitting feature selector on training data...")
start_time = time.time()
feature_selector.fit(X_train_combined, y_train)
fit_time = time.time() - start_time
print(f"  ✓ Feature selector fitted in {fit_time:.2f} seconds")
print("  Transforming training and test sets...")
X_train_final = feature_selector.transform(X_train_combined)
X_test_final = feature_selector.transform(X_test_combined)
print(f"\n✓ Feature selection complete!")
print(f"  Train shape: {X_train_final.shape}")
print(f"  Test shape: {X_test_final.shape}")


STEP 4: Feature Selection (SelectKBest with chi2)

Selecting top 8000 features using chi2...
  Input shape: (7512, 10000)
  Target shape: (7512, 14) (multi-label)
  (This may take a few minutes)
  Creating feature selector...
  Fitting feature selector on training data...
  ✓ Feature selector fitted in 0.12 seconds
  Transforming training and test sets...

✓ Feature selection complete!
  Train shape: (7512, 8000)
  Test shape: (1878, 8000)


In [13]:
# Model parameters (from your existing model)
MODEL_PARAMS = {
    'C': 0.1,
    'penalty': 'l2',
    'loss': 'squared_hinge',
    'max_iter': 1000,
    'tol': 1e-3,
    'class_weight': 'balanced',
    'dual': False,
    'random_state': 42,
}

# Threshold for converting probabilities to binary predictions
THRESHOLD = 0.55

print("Model Configuration:")
print(f"  Base parameters: {MODEL_PARAMS}")
print(f"  Prediction threshold: {THRESHOLD}")
print(f"  Number of labels: {len(mlb.classes_)}")

Model Configuration:
  Base parameters: {'C': 0.1, 'penalty': 'l2', 'loss': 'squared_hinge', 'max_iter': 1000, 'tol': 0.001, 'class_weight': 'balanced', 'dual': False, 'random_state': 42}
  Prediction threshold: 0.55
  Number of labels: 14


In [14]:
# Train Baseline LinearSVC Model
print("\n" + "=" * 70)
print("STEP 5: Training Baseline LinearSVC Model")
print("=" * 70)

baseline_svc = LinearSVC(**MODEL_PARAMS)
baseline_model = OneVsRestClassifier(baseline_svc)

print(f"Training baseline model on {X_train_final.shape[0]} samples...")
start_time = time.time()
baseline_model.fit(X_train_final, y_train)
training_time = time.time() - start_time
print(f"✓ Baseline model trained in {training_time:.2f} seconds")

# Get decision function scores and convert to probabilities using sigmoid
print("Generating predictions...")
y_scores_baseline = baseline_model.decision_function(X_test_final)
# Convert scores to probabilities using sigmoid (expit)
y_proba_baseline = expit(y_scores_baseline)
y_pred_baseline = (y_proba_baseline >= THRESHOLD).astype(int)

print("✓ Baseline predictions generated")


STEP 5: Training Baseline LinearSVC Model
Training baseline model on 7512 samples...
✓ Baseline model trained in 0.25 seconds
Generating predictions...
✓ Baseline predictions generated


In [15]:
# Train VotingClassifier Ensemble (LinearSVC + LogisticRegression)
print("\n" + "=" * 70)
print("STEP 6: Training VotingClassifier Ensemble")
print("=" * 70)

# Create individual estimators
# Note: LinearSVC doesn't have predict_proba, so we wrap it with CalibratedClassifierCV
# to enable soft voting (probability-based voting)
base_svc = LinearSVC(**MODEL_PARAMS)
svc_estimator = CalibratedClassifierCV(base_svc, method='sigmoid', cv=3)

lr_estimator = LogisticRegression(
    C=0.1,
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
    solver='lbfgs',  # Good default for multi-class
)

# Create voting classifier with soft voting (uses probabilities)
voting_clf = VotingClassifier(
    estimators=[
        ('svc', svc_estimator),
        ('lr', lr_estimator),
    ],
    voting='soft'  # Use soft voting for probabilities
)

# Wrap in OneVsRestClassifier for multi-label classification
ensemble_model = OneVsRestClassifier(voting_clf)

print(f"Training ensemble model on {X_train_final.shape[0]} samples...")
print("  This will take longer as it trains multiple models...")
print("  (CalibratedClassifierCV adds cross-validation, making it slower)")
start_time = time.time()
ensemble_model.fit(X_train_final, y_train)
training_time = time.time() - start_time
print(f"✓ Ensemble model trained in {training_time:.2f} seconds")

# Generate predictions (voting classifier returns probabilities directly)
print("Generating predictions...")
y_proba_ensemble = ensemble_model.predict_proba(X_test_final)
y_pred_ensemble = (y_proba_ensemble >= THRESHOLD).astype(int)

print("✓ Ensemble predictions generated")


STEP 6: Training VotingClassifier Ensemble
Training ensemble model on 7512 samples...
  This will take longer as it trains multiple models...
  (CalibratedClassifierCV adds cross-validation, making it slower)
✓ Ensemble model trained in 1.21 seconds
Generating predictions...
✓ Ensemble predictions generated


In [16]:
# Evaluate Baseline Model
print("\n" + "=" * 70)
print("BASELINE LINEARSVC MODEL METRICS")
print("=" * 70)

baseline_metrics = {
    'f1_micro': f1_score(y_test, y_pred_baseline, average='micro'),
    'f1_macro': f1_score(y_test, y_pred_baseline, average='macro'),
    'precision_micro': precision_score(y_test, y_pred_baseline, average='micro', zero_division=0),
    'recall_micro': recall_score(y_test, y_pred_baseline, average='micro', zero_division=0),
    'hamming_loss': hamming_loss(y_test, y_pred_baseline),
    'jaccard_score': jaccard_score(y_test, y_pred_baseline, average='micro', zero_division=0),
}

for metric, value in baseline_metrics.items():
    print(f"  {metric:20s}: {value:.4f} ({value*100:.2f}%)")

baseline_metrics['model'] = 'LinearSVC_Baseline'


BASELINE LINEARSVC MODEL METRICS
  f1_micro            : 0.5519 (55.19%)
  f1_macro            : 0.5376 (53.76%)
  precision_micro     : 0.6764 (67.64%)
  recall_micro        : 0.4661 (46.61%)
  hamming_loss        : 0.1375 (13.75%)
  jaccard_score       : 0.3811 (38.11%)


In [17]:
# Evaluate Ensemble Model
print("\n" + "=" * 70)
print("VOTINGCLASSIFIER ENSEMBLE MODEL METRICS")
print("=" * 70)

ensemble_metrics = {
    'f1_micro': f1_score(y_test, y_pred_ensemble, average='micro'),
    'f1_macro': f1_score(y_test, y_pred_ensemble, average='macro'),
    'precision_micro': precision_score(y_test, y_pred_ensemble, average='micro', zero_division=0),
    'recall_micro': recall_score(y_test, y_pred_ensemble, average='micro', zero_division=0),
    'hamming_loss': hamming_loss(y_test, y_pred_ensemble),
    'jaccard_score': jaccard_score(y_test, y_pred_ensemble, average='micro', zero_division=0),
}

for metric, value in ensemble_metrics.items():
    print(f"  {metric:20s}: {value:.4f} ({value*100:.2f}%)")

ensemble_metrics['model'] = 'VotingClassifier_Ensemble'


VOTINGCLASSIFIER ENSEMBLE MODEL METRICS
  f1_micro            : 0.5334 (53.34%)
  f1_macro            : 0.4801 (48.01%)
  precision_micro     : 0.7294 (72.94%)
  recall_micro        : 0.4204 (42.04%)
  hamming_loss        : 0.1336 (13.36%)
  jaccard_score       : 0.3637 (36.37%)


In [18]:
# Compare Models
print("\n" + "=" * 70)
print("MODEL COMPARISON")
print("=" * 70)

comparison_df = pd.DataFrame({
    'Baseline (LinearSVC)': [baseline_metrics[k] for k in ['f1_micro', 'f1_macro', 'precision_micro', 'recall_micro', 'hamming_loss', 'jaccard_score']],
    'Ensemble (VotingClassifier)': [ensemble_metrics[k] for k in ['f1_micro', 'f1_macro', 'precision_micro', 'recall_micro', 'hamming_loss', 'jaccard_score']],
}, index=['F1 (Micro)', 'F1 (Macro)', 'Precision (Micro)', 'Recall (Micro)', 'Hamming Loss', 'Jaccard Score'])

# Calculate improvement
comparison_df['Improvement'] = comparison_df['Ensemble (VotingClassifier)'] - comparison_df['Baseline (LinearSVC)']
# For hamming_loss, lower is better, so flip the sign
comparison_df.loc['Hamming Loss', 'Improvement'] = -comparison_df.loc['Hamming Loss', 'Improvement']

print(comparison_df.round(4))
print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"F1 (Micro) Improvement: {comparison_df.loc['F1 (Micro)', 'Improvement']*100:+.2f} percentage points")
print(f"F1 (Macro) Improvement: {comparison_df.loc['F1 (Macro)', 'Improvement']*100:+.2f} percentage points")
print(f"Precision Improvement: {comparison_df.loc['Precision (Micro)', 'Improvement']*100:+.2f} percentage points")
print(f"Recall Improvement: {comparison_df.loc['Recall (Micro)', 'Improvement']*100:+.2f} percentage points")

if comparison_df.loc['F1 (Micro)', 'Improvement'] > 0:
    print("\n✓ VotingClassifier shows improvement!")
else:
    print("\n⚠ VotingClassifier does not improve over baseline")


MODEL COMPARISON
                   Baseline (LinearSVC)  Ensemble (VotingClassifier)  \
F1 (Micro)                       0.5519                       0.5334   
F1 (Macro)                       0.5376                       0.4801   
Precision (Micro)                0.6764                       0.7294   
Recall (Micro)                   0.4661                       0.4204   
Hamming Loss                     0.1375                       0.1336   
Jaccard Score                    0.3811                       0.3637   

                   Improvement  
F1 (Micro)             -0.0185  
F1 (Macro)             -0.0575  
Precision (Micro)       0.0530  
Recall (Micro)         -0.0456  
Hamming Loss            0.0039  
Jaccard Score          -0.0174  

SUMMARY
F1 (Micro) Improvement: -1.85 percentage points
F1 (Macro) Improvement: -5.75 percentage points
Precision Improvement: +5.30 percentage points
Recall Improvement: -4.56 percentage points

⚠ VotingClassifier does not improve over baseline

In [19]:
# Save results to JSON
results = {
    'baseline_metrics': baseline_metrics,
    'ensemble_metrics': ensemble_metrics,
    'comparison': comparison_df.to_dict(),
    'configuration': {
        'model_params': MODEL_PARAMS,
        'threshold': THRESHOLD,
        'k_features': K_FEATURES,
        'train_samples': len(X_train_final),
        'test_samples': len(X_test_final),
        'num_genres': len(mlb.classes_),
    }
}

output_path = MODELS_DIR / "metrics_votingclassifier.json"
with open(output_path, 'w') as f:
    json.dump(results, f, indent=2, default=str)

print(f"\n✓ Results saved to {output_path}")

TypeError: sparse array length is ambiguous; use getnnz() or shape[0]