# Ranking Model Training with XGBoost and MLflow

This notebook trains an XGBoost model for ranking movie recommendations.

**Goal**: Predict user ratings (1-5 stars) and optimize for ranking quality (NDCG@K)

**Approach**:
- Start with simple baseline XGBoost model
- Track experiments with MLflow
- Iterate on hyperparameters
- Understand feature importance
- Evaluate with NDCG@K and RMSE

**Philosophy**: Simple first, then improve. Understand every part.

## 1. Setup & Imports

Load libraries and configure MLflow for experiment tracking.

In [None]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# ML libraries
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# MLflow for experiment tracking
import mlflow
import mlflow.xgboost

# Seed NumPy's global RNG so NumPy-driven randomness is repeatable across runs
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Plot defaults shared by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

# Report library versions so a run can be reproduced later
for status_line in (
    "Libraries loaded successfully!",
    f"XGBoost version: {xgb.__version__}",
    f"MLflow version: {mlflow.__version__}",
):
    print(status_line)

In [None]:
# Configure MLflow tracking
import os

# Keep the tracking store inside the project instead of one user's home directory.
# NOTE(review): assumes the notebook runs from a direct subdirectory of the project
# root - the same assumption the '../features' and '../models' paths make. Confirm.
# An MLFLOW_TRACKING_URI environment variable, when set, takes precedence.
default_tracking_uri = (Path('..') / 'mlruns').resolve().as_uri()
mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI', default_tracking_uri))
mlflow.set_experiment("ranking_model")

print("MLflow configured!")
print(f"Tracking URI: {mlflow.get_tracking_uri()}")
print(f"Experiment: {mlflow.get_experiment_by_name('ranking_model')}")
print("\nTo view experiments, run: mlflow ui")
print("Then open: http://localhost:5000")

## 2. Load Materialized Features

Load the pre-computed features from parquet files.

In [None]:
# Load features
features_dir = Path('../features')

# One parquet file per split; read them in train / val / test order
train_df, val_df, test_df = (
    pd.read_parquet(features_dir / file_name)
    for file_name in (
        'train_features.parquet',
        'val_features.parquet',
        'test_features.parquet',
    )
)

# Load metadata (this notebook reads 'num_features' and 'feature_groups' from it)
metadata_path = features_dir / 'feature_metadata.json'
metadata = json.loads(metadata_path.read_text())

print("Data loaded successfully!")
print(f"\nTrain: {train_df.shape}")
print(f"Val:   {val_df.shape}")
print(f"Test:  {test_df.shape}")
print(f"\nTotal features: {metadata['num_features']}")

In [None]:
# Separate features (X) from target (y)
# Identifier columns, the target and the timestamp are never used as model inputs
exclude_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
feature_cols = []
for column in train_df.columns:
    if column in exclude_cols:
        continue
    feature_cols.append(column)

# Apply the same column selection to every split
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (frame[feature_cols], frame['rating'])
    for frame in (train_df, val_df, test_df)
)

print(f"Feature columns ({len(feature_cols)}):")
feature_groups = metadata['feature_groups']
for group_name in feature_groups:
    print(f"  {group_name:15} {len(feature_groups[group_name]):2} features")

print(f"\nX_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")

In [None]:
# Quick sanity check: count missing cells in each feature matrix
missing_train, missing_val, missing_test = (
    features.isnull().sum().sum() for features in (X_train, X_val, X_test)
)

print("Missing value check:")
print(f"Train: {missing_train} missing values")
print(f"Val:   {missing_val} missing values")
print(f"Test:  {missing_test} missing values")

# Any non-zero count in any split is worth a warning
has_missing = any(count != 0 for count in (missing_train, missing_val, missing_test))
if has_missing:
    print("\n⚠ Warning: Missing values detected!")
else:
    print("\n✓ No missing values - data is clean!")

## 3. Evaluation Metrics Implementation

Implement metrics for evaluating our ranking model.

**Two perspectives**:
1. **Rating Prediction**: RMSE/MAE - how accurate are our predicted ratings?
2. **Ranking Quality**: NDCG@K - how good is our ranking of items for each user?

For recommendation systems, **ranking quality (NDCG@K) is the primary metric**.

In [None]:
def compute_ndcg_at_k(df, predictions, k=10):
    """
    Compute the mean per-user NDCG@K of a set of predicted scores.

    NDCG (Normalized Discounted Cumulative Gain) measures ranking quality.
    - Higher is better; 1.0 = perfect ranking of the top K
    - Accounts for position: gain at position i is discounted by log2(i + 1)
    - Handles graded relevance: the actual rating is used directly as the
      (linear) gain, so with strictly positive ratings the score is always > 0

    Algorithm:
    1. For each user, sort items by predicted score (descending)
    2. Take top K items
    3. Compute DCG using actual ratings as relevance
    4. Compute ideal DCG (best possible ranking)
    5. NDCG = DCG / IDCG
    6. Average across all users

    Users with fewer than 2 rows are skipped, as are users whose ideal DCG
    is not positive.

    Args:
        df: DataFrame with 'user_id' and 'rating' (actual) columns
        predictions: Predicted scores, one per row of df, in the same row
            order as df. They are matched to rows by position, so a pandas
            Series is accepted regardless of its index.
        k: Number of top items to consider

    Returns:
        Average NDCG@K across all evaluated users, or 0.0 if no user qualifies

    Raises:
        ValueError: If the number of predictions differs from len(df)
    """
    # Coerce to a flat array so scores are paired with rows by POSITION.
    # Assigning a pandas Series directly would align on index labels and could
    # silently pair scores with the wrong rows (or introduce NaNs).
    scores = np.asarray(predictions).ravel()
    if len(scores) != len(df):
        raise ValueError(
            f"predictions has {len(scores)} values but df has {len(df)} rows"
        )

    df_eval = df[['user_id', 'rating']].copy()
    df_eval['prediction'] = scores

    ndcg_scores = []

    # Compute NDCG for each user
    for _, user_df in df_eval.groupby('user_id'):
        # Skip users with too few items (less than 2)
        if len(user_df) < 2:
            continue

        # Rank by predicted score; a stable sort keeps tie-breaking deterministic
        ranked = user_df.sort_values('prediction', ascending=False, kind='mergesort')

        # DCG@K = sum(rel_i / log2(i + 1)) for i in 1..K, rel_i = actual rating
        relevances = ranked['rating'].values[:k]
        positions = np.arange(1, len(relevances) + 1)
        dcg = np.sum(relevances / np.log2(positions + 1))

        # Ideal DCG: the same sum with items ordered by actual rating (descending)
        ideal_relevances = np.sort(user_df['rating'].values)[::-1][:k]
        ideal_positions = np.arange(1, len(ideal_relevances) + 1)
        idcg = np.sum(ideal_relevances / np.log2(ideal_positions + 1))

        # NDCG = DCG / IDCG (users with a non-positive IDCG are left out)
        if idcg > 0:
            ndcg_scores.append(dcg / idcg)

    # Return average NDCG across all users
    return np.mean(ndcg_scores) if ndcg_scores else 0.0

# Short explainer printed after the metric is defined.
# The original "0.0 = worst ranking" line was incorrect for this metric: gains are
# the raw 1-5 ratings, so every ranking has a positive DCG and the score stays > 0.
print("NDCG@K function defined!")
print("\nWhat is NDCG@K?")
print("- Measures how well we rank items for each user")
print("- 1.0 = perfect ranking (best items at top)")
print("- Lower = highly rated items ranked too low (with 1-5 star gains the score never reaches 0.0)")
print("- Position matters: top-K items weighted more heavily")
print("- Standard metric for recommendation systems")

In [None]:
def compute_rmse(y_true, y_pred):
    """Return the root of the mean squared error between actual and predicted values."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def compute_mae(y_true, y_pred):
    """Return the mean absolute error between actual and predicted values."""
    mae = mean_absolute_error(y_true, y_pred)
    return mae

def evaluate_model(model, X, y, df, prefix=""):
    """
    Score a fitted model on one dataset.

    Args:
        model: Object exposing predict(X)
        X: Feature matrix passed to model.predict
        y: Actual ratings, used for RMSE and MAE
        df: DataFrame handed to compute_ndcg_at_k together with the predictions
        prefix: String prepended to every metric name (e.g. 'val_')

    Returns:
        Tuple (metrics, y_pred): metrics is a dict with the keys
        '<prefix>rmse', '<prefix>mae', '<prefix>ndcg_10', '<prefix>ndcg_20';
        y_pred is the raw output of model.predict(X)
    """
    y_pred = model.predict(X)

    # Pointwise rating errors first, then ranking quality at each cutoff
    metrics = {
        f'{prefix}rmse': compute_rmse(y, y_pred),
        f'{prefix}mae': compute_mae(y, y_pred),
    }
    for cutoff in (10, 20):
        metrics[f'{prefix}ndcg_{cutoff}'] = compute_ndcg_at_k(df, y_pred, k=cutoff)

    return metrics, y_pred

# Summarize the metrics tracked by evaluate_model
for message in (
    "Evaluation functions defined!",
    "\nMetrics we'll track:",
    "- RMSE: Rating prediction error (lower is better)",
    "- MAE: Mean absolute rating error (lower is better)",
    "- NDCG@10: Ranking quality for top 10 items (higher is better)",
    "- NDCG@20: Ranking quality for top 20 items (higher is better)",
):
    print(message)

In [None]:
def make_ndcg_evaluator(df, k=10):
    """
    Factory that builds a custom NDCG@K metric bound to one evaluation dataframe.

    The returned closure captures df (user_ids and actual ratings) and ignores
    the DMatrix it is handed, because NDCG needs the per-user grouping that
    this notebook keeps in the dataframe.

    Because df is captured, the evaluator is only valid for the evaluation set
    whose rows match df one-to-one and in the same order. Do not register any
    other evaluation set alongside it: the callback would be invoked with
    predictions of a different length and raise ValueError.

    Args:
        df: DataFrame with 'user_id' and 'rating' columns
        k: Cutoff for NDCG computation (default: 10)

    Returns:
        Function with signature (y_pred, dtrain) -> (metric_name, metric_value)

    Usage (native training API; the callback signature is the native one.
    NOTE(review): the keyword is `custom_metric` in recent XGBoost releases and
    `feval` in older ones - confirm against the installed version):
        dval = xgb.DMatrix(X_val, label=y_val)
        booster = xgb.train(params, dtrain,
                            num_boost_round=5000,
                            evals=[(dval, 'val')],
                            custom_metric=make_ndcg_evaluator(val_df, k=10),
                            maximize=True,              # NDCG: higher is better
                            early_stopping_rounds=50)
    """
    def ndcg_eval(y_pred, dtrain):
        """
        Custom evaluation metric called during training.

        Args:
            y_pred: Predicted values (1D numpy array), one per row of the captured df
            dtrain: XGBoost DMatrix object (not used, we use captured df instead)

        Returns:
            Tuple of (metric_name, metric_value); higher is better, so request
            maximization explicitly (maximize=True) when early stopping on it
        """
        # Fail with an actionable message when called for a different eval set
        if len(y_pred) != len(df):
            raise ValueError(
                f"ndcg@{k} evaluator was built for {len(df)} rows but received "
                f"{len(y_pred)} predictions; use it only with the evaluation set "
                "that matches the captured dataframe"
            )

        # Compute NDCG@K using the captured dataframe
        ndcg = compute_ndcg_at_k(df, y_pred, k=k)

        return f'ndcg@{k}', ndcg

    return ndcg_eval

# Explain what the evaluator factory is for
for message in (
    "Custom NDCG evaluator factory function defined!",
    "\nThis enables NDCG@10-based early stopping:",
    "- XGBoost will stop training when validation NDCG@10 stops improving",
    "- More principled than RMSE-based early stopping for ranking tasks",
    "- Aligns optimization directly with our primary metric",
):
    print(message)

### Custom NDCG Metric for XGBoost Early Stopping

For hyperparameter tuning, we want XGBoost to stop training based on **NDCG@10** improvement (not RMSE).

**Challenge**: XGBoost's default metrics don't include NDCG@K for regression objectives.

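**Solution**: Create a custom evaluation metric using a factory function that captures the validation dataframe. Note that the tuning runs in Section 5 still use RMSE-based early stopping and compare configurations on NDCG@10 after training; the evaluator above is what you would plug in to early-stop on NDCG@10 directly.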

## 4. Baseline XGBoost Model

Train a simple baseline model with default parameters.

**Goal**: Establish a baseline to compare future experiments against.

In [None]:
# Define baseline parameters
baseline_params = dict(
    objective='reg:squarederror',  # Regression task (predict ratings)
    max_depth=6,                   # Tree depth
    learning_rate=0.1,             # Step size (eta)
    n_estimators=100,              # Number of trees
    random_state=RANDOM_SEED,
    n_jobs=-1,                     # Use all CPU cores
)

print("Baseline parameters:")
for name in baseline_params:
    print(f"  {name:20} {baseline_params[name]}")

In [None]:
# Train the baseline model inside a single MLflow run
with mlflow.start_run(run_name="baseline_xgboost"):
    # Record the configuration and the dataset size for this run
    mlflow.log_params(baseline_params)
    mlflow.log_param('num_features', len(feature_cols))
    mlflow.log_param('train_size', len(X_train))

    # Fit on train; the validation set is passed as the eval set
    print("Training baseline model...")
    baseline_model = xgb.XGBRegressor(**baseline_params)
    baseline_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    print("✓ Training complete!")

    # Score both splits with the shared evaluation helper
    print("\nEvaluating on train set...")
    train_metrics, _ = evaluate_model(baseline_model, X_train, y_train, train_df, prefix='train_')

    print("Evaluating on validation set...")
    val_metrics, _ = evaluate_model(baseline_model, X_val, y_val, val_df, prefix='val_')

    # Train metrics first, then validation metrics, logged in one call
    all_metrics = dict(train_metrics)
    all_metrics.update(val_metrics)
    mlflow.log_metrics(all_metrics)

    # Persist the fitted model as a run artifact
    mlflow.xgboost.log_model(baseline_model, "model")

    banner = "="*60
    print("\n" + banner)
    print("BASELINE MODEL RESULTS")
    print(banner)
    for heading, split_metrics, split in (
        ("\nTrain metrics:", train_metrics, 'train'),
        ("\nValidation metrics:", val_metrics, 'val'),
    ):
        print(heading)
        print(f"  RMSE:     {split_metrics[f'{split}_rmse']:.4f}")
        print(f"  MAE:      {split_metrics[f'{split}_mae']:.4f}")
        print(f"  NDCG@10:  {split_metrics[f'{split}_ndcg_10']:.4f}")
        print(f"  NDCG@20:  {split_metrics[f'{split}_ndcg_20']:.4f}")
    print(banner)

    # Keep the run ID so this run can be looked up later
    baseline_run_id = mlflow.active_run().info.run_id
    print(f"\nMLflow Run ID: {baseline_run_id}")

### Interpret Baseline Results

**Expected performance** for MovieLens 1M:
- NDCG@10: 0.60-0.75 (good baseline)
- RMSE: 0.85-1.0 (on 1-5 scale)

**What to check**:
- Train vs Val metrics: Large gap suggests overfitting
- NDCG@10 vs NDCG@20: Should be similar
- RMSE: Should be < 1.5 (if higher, model not learning)

**Red flags**:
- NDCG@10 < 0.4: Check the metric implementation and that predictions are aligned with the right rows (data leakage inflates metrics rather than lowering them)
- Perfect metrics (1.0): Data leakage
- RMSE > 1.5: Model not learning

## 5. Hyperparameter Tuning (Staged Approach)

**Philosophy**: Tune systematically, not exhaustively.

Instead of grid-searching all parameters (expensive and confusing), we use a staged approach:
1. **Primary tuning**: Find best learning_rate + tree count (most impactful)
2. **Diagnostics**: Check if model is over/underfitting
3. **Secondary tuning**: Only adjust max_depth/colsample if needed

**Why this approach?**
- Efficient: ≤10 total runs vs. hundreds in full grid search
- Principled: Reflects how GBDT models are tuned in production
- Interpretable: Understand impact of each parameter
- Robust: Configurations are compared on validation NDCG@10, which aligns with our ranking goal (early stopping itself monitors validation RMSE)

**Parameters we're tuning**:
- `learning_rate` (primary): Controls step size (0.02 = slow/stable, 0.2 = fast/aggressive)
- `n_estimators` (primary): Number of trees (selected via early stopping on validation RMSE; the resulting model is then scored with NDCG@10)
- `max_depth` (secondary): Tree complexity (only if diagnostics indicate need)
- `colsample_bytree` (secondary): Feature sampling (only if overfitting)

In [None]:
# Stage 1: Primary Tuning - Learning Rate Grid Search
# One MLflow run per learning rate. Each model early-stops on validation RMSE and
# is then scored with the ranking metrics used for model selection.
print("STAGE 1: PRIMARY TUNING (Learning Rate + Early Stopping)")
print("="*60)
print("\nNote: Using RMSE for early stopping, then evaluating with NDCG@10")
print("(early stopping monitors validation RMSE; NDCG@10 is computed after training)")

learning_rates = [0.02, 0.05, 0.1, 0.2]
stage1_results = []

for lr in learning_rates:
    print(f"\nTraining with learning_rate={lr}...")

    with mlflow.start_run(run_name=f"stage1_lr_{lr}"):
        params = {
            'max_depth': 6,
            'learning_rate': lr,
            'n_estimators': 5000,  # Large upper bound; early stopping picks the actual count
            'colsample_bytree': 0.8,
            'early_stopping_rounds': 50,  # Stop if no improvement for 50 rounds
            'eval_metric': 'rmse',  # Make the early-stopping metric explicit
            'objective': 'reg:squarederror',
            'random_state': RANDOM_SEED,
            'n_jobs': -1
        }

        # Log parameters
        mlflow.log_params(params)
        mlflow.log_param('stage', 'primary_tuning')

        # Train with RMSE-based early stopping on the validation set
        model = xgb.XGBRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        # Evaluate with NDCG@10 (our primary metric)
        val_metrics, _ = evaluate_model(model, X_val, y_val, val_df, prefix='val_')

        # Log metrics
        mlflow.log_metrics(val_metrics)
        mlflow.log_param('best_iteration', model.best_iteration)

        # Store results (the fitted model is kept in memory for Stage 2)
        stage1_results.append({
            'lr': lr,
            'best_iteration': model.best_iteration,
            'val_ndcg_10': val_metrics['val_ndcg_10'],
            'val_ndcg_20': val_metrics['val_ndcg_20'],
            'val_rmse': val_metrics['val_rmse'],
            'model': model  # Save model for later
        })

        print(f"  Best iteration: {model.best_iteration}")
        print(f"  Val NDCG@10:    {val_metrics['val_ndcg_10']:.4f}")
        print(f"  Val RMSE:       {val_metrics['val_rmse']:.4f}")

print("\n" + "="*60)
print("Stage 1 complete!")

In [None]:
# Stage 1 Results: Select best learning rate
print("STAGE 1 RESULTS: Learning Rate Comparison")
print("="*60)

# Display results table
results_df = pd.DataFrame([{
    'learning_rate': r['lr'],
    'best_iteration': r['best_iteration'],
    'val_ndcg_10': r['val_ndcg_10'],
    'val_rmse': r['val_rmse']
} for r in stage1_results])

print("\nResults by Learning Rate:")
print(results_df.to_string(index=False))

# Select best configuration based on NDCG@10
best_config = max(stage1_results, key=lambda x: x['val_ndcg_10'])
best_lr = best_config['lr']
# best_iteration is a 0-based round index, so the number of trees needed to
# reproduce the early-stopped model is best_iteration + 1.
best_n_estimators = best_config['best_iteration'] + 1
best_model_stage1 = best_config['model']

print(f"\nBest Configuration:")
print(f"  Learning Rate:     {best_lr}")
print(f"  Best Iteration:    {best_config['best_iteration']}")
print(f"  Val NDCG@10:       {best_config['val_ndcg_10']:.4f}")
print(f"  Val RMSE:          {best_config['val_rmse']:.4f}")

# Visualize results
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
lr_labels = [str(r['lr']) for r in stage1_results]

# Plot NDCG@10 by learning rate; the dashed line marks the best score
axes[0].bar(lr_labels, [r['val_ndcg_10'] for r in stage1_results])
axes[0].set_xlabel('Learning Rate')
axes[0].set_ylabel('Validation NDCG@10')
axes[0].set_title('NDCG@10 by Learning Rate')
axes[0].axhline(y=best_config['val_ndcg_10'], color='r', linestyle='--', alpha=0.5)

# Plot iterations by learning rate
axes[1].bar(lr_labels, [r['best_iteration'] for r in stage1_results])
axes[1].set_xlabel('Learning Rate')
axes[1].set_ylabel('Best Iteration (# Trees)')
axes[1].set_title('Optimal Tree Count by Learning Rate')

plt.tight_layout()
plt.show()

print("\nProceeding to Stage 2: Diagnostics...")

### Interpret Stage 1 Results

**What to look for**:
- Lower learning rates typically need more trees but often achieve better NDCG@10
- Higher learning rates train faster but may plateau earlier
- Best iteration should be well before n_estimators limit (5000)
  - If best_iteration is near 5000: Model hasn't converged, increase upper bound
  - If best_iteration < 100: Model converged very quickly

**Expected outcomes for MovieLens**:
- Best learning_rate: Likely 0.05 or 0.1
- Best iteration: 200-1000 trees (depends on learning_rate)
- Validation NDCG@10: 0.70-0.90 (if lower, check feature quality)

**Next**: Proceed to Stage 2 diagnostics to check for over/underfitting.

In [None]:
# Stage 2: Diagnostics - Train vs Val Gap Analysis
print("STAGE 2: DIAGNOSTICS (Train vs Val Gap)")
print("="*60)

# Evaluate best model on BOTH train and val
print("\nEvaluating best model from Stage 1...")
train_metrics_diag, _ = evaluate_model(best_model_stage1, X_train, y_train, train_df, prefix='train_')
val_metrics_diag, _ = evaluate_model(best_model_stage1, X_val, y_val, val_df, prefix='val_')

# Compute gaps, both oriented so that a positive value means "train looks better"
ndcg_gap = train_metrics_diag['train_ndcg_10'] - val_metrics_diag['val_ndcg_10']
rmse_gap = val_metrics_diag['val_rmse'] - train_metrics_diag['train_rmse']

print(f"\nTrain Metrics:")
print(f"  NDCG@10: {train_metrics_diag['train_ndcg_10']:.4f}")
print(f"  RMSE:    {train_metrics_diag['train_rmse']:.4f}")

print(f"\nValidation Metrics:")
print(f"  NDCG@10: {val_metrics_diag['val_ndcg_10']:.4f}")
print(f"  RMSE:    {val_metrics_diag['val_rmse']:.4f}")

print(f"\nGap Analysis:")
print(f"  NDCG Gap (train - val): {ndcg_gap:.4f}")
print(f"  RMSE Gap (val - train): {rmse_gap:.4f}")

# Visualize train vs val
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# NDCG comparison
ndcg_values = [train_metrics_diag['train_ndcg_10'], val_metrics_diag['val_ndcg_10']]
axes[0].bar(['Train', 'Val'], ndcg_values)
axes[0].set_ylabel('NDCG@10')
axes[0].set_title(f'NDCG@10 Comparison (Gap: {ndcg_gap:.4f})')
# Zoom in to make small gaps visible, but never start the axis above the data:
# a fixed 0.8 floor would hide the bars entirely when NDCG@10 is below 0.8.
ndcg_floor = min(0.8, max(0.0, min(ndcg_values) - 0.05))
axes[0].set_ylim(ndcg_floor, 1.0)

# RMSE comparison
axes[1].bar(['Train', 'Val'],
            [train_metrics_diag['train_rmse'], val_metrics_diag['val_rmse']])
axes[1].set_ylabel('RMSE')
axes[1].set_title(f'RMSE Comparison (Gap: {rmse_gap:.4f})')

plt.tight_layout()
plt.show()

In [None]:
# Stage 2 Decision: turn the train/val gap into a go / no-go for Stage 3
print("STAGE 2 DECISION: Should we proceed to Stage 3?")
print("="*60)

val_ndcg_10 = val_metrics_diag['val_ndcg_10']

# Classify the fit first; the overfitting rule takes precedence
if ndcg_gap > 0.05:
    stage3_action = "REDUCE_COMPLEXITY"
elif ndcg_gap < 0.02 and val_ndcg_10 < 0.70:
    stage3_action = "INCREASE_COMPLEXITY"
else:
    stage3_action = None
proceed_to_stage3 = stage3_action is not None

# Report the verdict and the matching recommendation
if stage3_action == "REDUCE_COMPLEXITY":
    print(f"\n⚠ NDCG gap ({ndcg_gap:.4f}) > 0.05: Model is OVERFITTING")
    print("  Recommendation: Reduce complexity (lower max_depth or colsample_bytree)")
elif stage3_action == "INCREASE_COMPLEXITY":
    print(f"\n⚠ Small gap but low NDCG@10 ({val_ndcg_10:.4f}): Model is UNDERFITTING")
    print("  Recommendation: Increase complexity (higher max_depth)")
else:
    print(f"\n✓ GOOD FIT detected!")
    print(f"  NDCG gap ({ndcg_gap:.4f}) is acceptable")
    print(f"  Val NDCG@10 ({val_ndcg_10:.4f}) is satisfactory")
    print("  No further tuning needed - skipping Stage 3")

print(f"\nProceed to Stage 3: {proceed_to_stage3}")
if stage3_action is not None:
    print(f"Stage 3 Action: {stage3_action}")

### Interpret Diagnostics

**NDCG Gap Analysis**:
- **Gap < 0.02**: Excellent generalization, model is well-regularized
- **Gap 0.02-0.05**: Normal, acceptable
- **Gap > 0.05**: Overfitting, model memorizing training data

**RMSE Gap Analysis**:
- **Gap < 0.05**: Good
- **Gap 0.05-0.10**: Acceptable
- **Gap > 0.10**: May be overfitting

**Decision**:
Based on the gaps above, we either:
- **Good fit**: Stop here, use current model
- **Overfitting**: Proceed to Stage 3, reduce complexity (lower max_depth or colsample_bytree)
- **Underfitting**: Proceed to Stage 3, increase complexity (higher max_depth)

**Note**: If diagnostics indicate "GOOD FIT", Stage 3 cells will not run (proceed_to_stage3=False).

In [None]:
# Stage 3: Secondary Tuning (Only runs if needed)
# Re-tunes max_depth in the direction chosen by the Stage 2 decision, then keeps
# whichever of the Stage 1 / Stage 3 models has the higher validation NDCG@10.
# Always defined so later cells can inspect it even when Stage 3 is skipped.
stage3_results = []

if proceed_to_stage3:
    print("STAGE 3: SECONDARY TUNING")
    print("="*60)

    if stage3_action == "REDUCE_COMPLEXITY":
        # Try reducing max_depth
        print("\nTrying reduced max_depth values...")
        depths_to_try = [3, 4, 5]
    else:  # INCREASE_COMPLEXITY
        # Try increasing max_depth
        print("\nTrying increased max_depth values...")
        depths_to_try = [8, 9, 12]

    for depth in depths_to_try:
        print(f"\nTraining with max_depth={depth}...")

        with mlflow.start_run(run_name=f"stage3_depth_{depth}"):
            params = {
                'max_depth': depth,
                'learning_rate': best_lr,
                'n_estimators': 5000,
                'colsample_bytree': 0.8,
                'early_stopping_rounds': 50,
                'objective': 'reg:squarederror',
                'random_state': RANDOM_SEED,
                'n_jobs': -1
            }

            mlflow.log_params(params)
            mlflow.log_param('stage', 'secondary_tuning')
            mlflow.log_param('action', stage3_action)

            model = xgb.XGBRegressor(**params)
            model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                verbose=False
            )

            val_metrics_s3, _ = evaluate_model(model, X_val, y_val, val_df, prefix='val_')
            train_metrics_s3, _ = evaluate_model(model, X_train, y_train, train_df, prefix='train_')

            mlflow.log_metrics(val_metrics_s3)
            mlflow.log_param('best_iteration', model.best_iteration)

            # Same orientation as Stage 2: positive gap = train ranks better than val
            gap = train_metrics_s3['train_ndcg_10'] - val_metrics_s3['val_ndcg_10']

            stage3_results.append({
                'depth': depth,
                'best_iteration': model.best_iteration,
                'val_ndcg_10': val_metrics_s3['val_ndcg_10'],
                'val_rmse': val_metrics_s3['val_rmse'],
                'ndcg_gap': gap,
                'model': model
            })

            print(f"  Best iteration: {model.best_iteration}")
            print(f"  Val NDCG@10:    {val_metrics_s3['val_ndcg_10']:.4f}")
            print(f"  NDCG Gap:       {gap:.4f}")

    # Select best from Stage 3
    best_stage3 = max(stage3_results, key=lambda x: x['val_ndcg_10'])
    print(f"\nBest Stage 3 Configuration:")
    print(f"  max_depth:   {best_stage3['depth']}")
    print(f"  Val NDCG@10: {best_stage3['val_ndcg_10']:.4f}")
    print(f"  NDCG Gap:    {best_stage3['ndcg_gap']:.4f}")

    # Compare Stage 1 vs Stage 3
    if best_stage3['val_ndcg_10'] > best_config['val_ndcg_10']:
        print(f"\n✓ Stage 3 improved NDCG@10!")
        final_best_model = best_stage3['model']
        # best_iteration is a 0-based round index; the tree count is one more
        final_best_params = {
            'lr': best_lr,
            'depth': best_stage3['depth'],
            'n_estimators': best_stage3['best_iteration'] + 1,
        }
        final_best_ndcg = best_stage3['val_ndcg_10']
    else:
        print(f"\n→ Stage 1 model remains best")
        final_best_model = best_model_stage1
        final_best_params = {'lr': best_lr, 'depth': 6, 'n_estimators': best_n_estimators}
        final_best_ndcg = best_config['val_ndcg_10']
else:
    print("Stage 3 SKIPPED (good fit detected in Stage 2)")
    final_best_model = best_model_stage1
    final_best_params = {'lr': best_lr, 'depth': 6, 'n_estimators': best_n_estimators}
    final_best_ndcg = best_config['val_ndcg_10']

print("\n" + "="*60)
print("Hyperparameter tuning complete!")

In [None]:
# Final Tuning Summary
print("HYPERPARAMETER TUNING SUMMARY")
print("="*60)

# Count tuning runs from the recorded results instead of hard-coding them
tuning_run_count = len(stage1_results) + (len(stage3_results) if proceed_to_stage3 else 0)
print(f"\nTotal MLflow runs: {tuning_run_count} (excluding baseline)")

print(f"\nFinal Best Model Configuration:")
print(f"  Learning Rate: {final_best_params['lr']}")
print(f"  Max Depth:     {final_best_params['depth']}")
print(f"  N Estimators:  {final_best_params['n_estimators']}")
print(f"  Colsample:     0.8")

print(f"\nFinal Validation NDCG@10: {final_best_ndcg:.4f}")

# Compare to baseline.
# `val_metrics` is reassigned inside the Stage 1 loop, so by now it holds the
# metrics of the LAST learning-rate run, not the baseline. Re-score the baseline
# model from Section 4 to get the true reference value.
baseline_val_metrics, _ = evaluate_model(baseline_model, X_val, y_val, val_df, prefix='val_')
baseline_ndcg = baseline_val_metrics['val_ndcg_10']
improvement = final_best_ndcg - baseline_ndcg
# Avoid dividing by zero if the baseline score is degenerate
pct_improvement = (improvement / baseline_ndcg) * 100 if baseline_ndcg else float('nan')

print(f"\nComparison to Baseline:")
print(f"  Baseline NDCG@10:  {baseline_ndcg:.4f}")
print(f"  Final NDCG@10:     {final_best_ndcg:.4f}")
print(f"  Improvement:       {improvement:+.4f} ({pct_improvement:+.1f}%)")

print("\n" + "="*60)
print("Ready for Section 6: Feature Importance Analysis")

## 6. Feature Importance Analysis

Analyze which features are most important for the ranking model.

In [None]:
# Get feature importance from best model, most important first
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': final_best_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 15 Most Important Features:")
print("="*60)
for i, row in feature_importance.head(15).iterrows():
    print(f"{row['feature']:30} {row['importance']:.4f}")

# Plot top 20 features (or all of them when the model has fewer than 20;
# a fixed range(20) would not match the data length in that case)
top_n = min(20, len(feature_importance))
top_features = feature_importance.head(top_n)

plt.figure(figsize=(10, 8))
# Reverse so the most important feature is drawn at the top of the chart
plt.barh(range(top_n), top_features['importance'].values[::-1])
plt.yticks(range(top_n), top_features['feature'].values[::-1])
plt.xlabel('Feature Importance')
plt.title(f'Top {top_n} Feature Importances')
plt.tight_layout()
plt.show()

In [None]:
# Total importance per feature group, using the groups declared in the metadata
group_importance = {
    group_name: feature_importance.loc[
        feature_importance['feature'].isin(group_features), 'importance'
    ].sum()
    for group_name, group_features in metadata['feature_groups'].items()
}

# Largest group first
group_importance_sorted = dict(
    sorted(group_importance.items(), key=lambda item: item[1], reverse=True)
)

print("Feature Group Importance:")
print("="*60)
importance_total = sum(group_importance.values())
for group, importance in group_importance_sorted.items():
    pct = importance / importance_total * 100
    print(f"{group:15} {importance:8.4f} ({pct:5.1f}%)")

# Bar chart of the same totals
plt.figure(figsize=(10, 6))
plt.bar(group_importance_sorted.keys(), group_importance_sorted.values())
plt.xlabel('Feature Group')
plt.ylabel('Total Importance')
plt.title('Feature Importance by Group')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print("\n✓ Feature importance analysis complete!")

## 7. Model Evaluation & Analysis

Deep dive into model performance and error patterns.

In [None]:
# Final evaluation on validation set
print("Final Model Validation Performance:")
print("="*60)
# evaluate_model already returns the predictions it scored, so reuse them
# instead of running a second prediction pass over the validation set.
final_val_metrics, val_predictions = evaluate_model(final_best_model, X_val, y_val, val_df, prefix='val_')
errors = val_predictions - y_val  # signed error: positive = over-prediction

print(f"NDCG@10: {final_val_metrics['val_ndcg_10']:.4f}")
print(f"NDCG@20: {final_val_metrics['val_ndcg_20']:.4f}")
print(f"RMSE:    {final_val_metrics['val_rmse']:.4f}")
print(f"MAE:     {final_val_metrics['val_mae']:.4f}")

# Plot: Actual vs Predicted
plt.figure(figsize=(10, 6))
plt.scatter(y_val, val_predictions, alpha=0.1, s=1)
plt.plot([1, 5], [1, 5], 'r--', label='Perfect predictions')
plt.xlabel('Actual Rating')
plt.ylabel('Predicted Rating')
plt.title('Actual vs Predicted Ratings (Validation Set)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nInterpretation:")
print("- Points on diagonal: Perfect predictions")
print("- Scatter around diagonal: Prediction error")
print("- Check for systematic bias (above/below diagonal)")

In [None]:
# Absolute prediction error, summarized per actual rating value
abs_error_frame = pd.DataFrame({
    'actual': y_val,
    'error': np.abs(errors)
})
error_by_rating = (
    abs_error_frame
    .groupby('actual')['error']
    .agg(['mean', 'std', 'count'])
)

print("Absolute Error by Actual Rating:")
print("="*60)
print(error_by_rating)

# Mean absolute error per rating, with the standard deviation as error bars
plt.figure(figsize=(10, 6))
plt.bar(
    error_by_rating.index,
    error_by_rating['mean'],
    yerr=error_by_rating['std'],
    capsize=5,
)
plt.xlabel('Actual Rating')
plt.ylabel('Mean Absolute Error')
plt.title('Prediction Error by Actual Rating')
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

for message in (
    "\nInterpretation:",
    "- Check if model struggles with extreme ratings (1 or 5)",
    "- Larger errors at extremes are common (regression to mean)",
):
    print(message)

## 8. Save Best Model

Save the best model and its configuration for later use.

In [None]:
# Save best model, the feature column order it expects, and a JSON summary
model_dir = Path('../models')
model_dir.mkdir(parents=True, exist_ok=True)

# Save model
model_path = model_dir / 'xgboost_tuned.json'
final_best_model.save_model(model_path)

# Save feature columns (order matters when rebuilding the feature matrix)
feature_cols_path = model_dir / 'feature_columns.json'
with open(feature_cols_path, 'w') as f:
    json.dump(feature_cols, f, indent=2)

# Derive run counts from the recorded results so the summary cannot drift
# from the learning-rate / depth grids actually used
stage1_run_count = len(stage1_results)
stage3_run_count = len(stage3_results) if proceed_to_stage3 else 0

# Save model info
model_info = {
    'model_name': 'xgboost_tuned',
    'created_at': pd.Timestamp.now().isoformat(),
    'params': {
        'learning_rate': final_best_params['lr'],
        'max_depth': final_best_params['depth'],
        'n_estimators': final_best_params['n_estimators'],
        'colsample_bytree': 0.8,
        'objective': 'reg:squarederror'
    },
    'metrics': {
        # Cast to plain floats so the values are JSON-serializable
        'val_ndcg_10': float(final_val_metrics['val_ndcg_10']),
        'val_ndcg_20': float(final_val_metrics['val_ndcg_20']),
        'val_rmse': float(final_val_metrics['val_rmse']),
        'val_mae': float(final_val_metrics['val_mae'])
    },
    'num_features': len(feature_cols),
    'tuning_summary': {
        'stage1_runs': stage1_run_count,
        'stage3_runs': stage3_run_count,
        'total_runs': stage1_run_count + stage3_run_count
    }
}

model_info_path = model_dir / 'model_info.json'
with open(model_info_path, 'w') as f:
    json.dump(model_info, f, indent=2)

print("Model saved successfully!")
print("="*60)
print(f"Model:     {model_path}")
print(f"Features:  {feature_cols_path}")
print(f"Info:      {model_info_path}")
print("\nTo load model:")
print("  model = xgb.XGBRegressor()")
print(f"  model.load_model('{model_path}')")
print("\nView experiments in MLflow UI:")
print("  mlflow ui")
print("  Open: http://127.0.0.1:5000")