# 7. Model Training with Time-Series Cross-Validation

Train XGBoost models for:
1. **Regression**: Next-day return prediction
2. **Classification**: Up/down direction prediction

**Key Features**:
- Time-series train/validation/test split with purge gaps
- Point-in-time correct features (no look-ahead bias)
- Proper backtesting methodology
- Model registry integration

**Pipeline**: Feature View → Train/Val/Test Split → XGBoost Training → Evaluation → Model Registry

In [1]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    roc_auc_score, accuracy_score, classification_report, confusion_matrix
)
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

from utils.hopsworks_helpers import get_feature_store, get_model_registry
from utils.time_series_splits import get_train_val_test_split
import yaml
import joblib
import os
from datetime import datetime

# Load the project configuration; the training cells read the XGBoost
# hyperparameters from config['model']['xgboost'][...].
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default (which is not UTF-8 on every OS).
with open('../config/config.yaml', 'r', encoding='utf-8') as f:
    # safe_load only builds plain Python objects (no arbitrary object construction)
    config = yaml.safe_load(f)

print("✓ Imports successful")

✓ Imports successful


## Connect to Hopsworks and Load Feature View

In [None]:
# Connect to Hopsworks
# get_feature_store() is a project helper (utils.hopsworks_helpers); how it
# authenticates is not visible from this notebook.
print("Connecting to Hopsworks...")
fs = get_feature_store()
print(f"✓ Connected to feature store: {fs.name}")

# Read directly from combined feature group (has both features AND labels)
# Note: feature_view.get_batch_data() excludes labels, so we read from the feature group
print("\nLoading combined feature group...")
combined_fg = fs.get_feature_group('qqq_combined_features', version=1)
print(f"✓ Feature group loaded: {combined_fg.name} v{combined_fg.version}")

# Read all data
# `df` is the single source for every later cell (split, training, plots).
print("\nReading data...")
df = combined_fg.read()
print(f"✓ Data loaded: {df.shape}")

# Verify we have the required columns
# Fail fast: the split and target-preparation cells index these columns directly.
required_cols = ['date', 'target_return', 'target_direction']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing required columns: {missing_cols}")

print(f"  Date range: {df['date'].min()} to {df['date'].max()}")
# Both flags printed below are always True here, because a missing column raises above.
print(f"  Has targets: target_return={'target_return' in df.columns}, target_direction={'target_direction' in df.columns}")

## Time-Series Train/Validation/Test Split with Purge Gap

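Split the data chronologically, leaving a purge gap between consecutive splits so that lagged features and next-day targets computed near a boundary cannot leak information from one split into the next.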

In [None]:
# Reset index to get date column (feature view may use date as index)
# NOTE(review): the loading cell raises when 'date' is not a column, so in a
# linear run this branch is never taken; it only matters if `df` was produced
# some other way.
if 'date' not in df.columns and df.index.name == 'date':
    df = df.reset_index()

# Convert date to datetime and remove timezone
# Dropping tz makes the column naive so it compares cleanly with the
# 'YYYY-MM-DD' boundary strings passed to the split helper below.
df['date'] = pd.to_datetime(df['date'])
if hasattr(df['date'].dtype, 'tz') and df['date'].dtype.tz is not None:
    df['date'] = df['date'].dt.tz_localize(None)

# Sort by date to ensure chronological order
# The positional iloc lookups below are only meaningful on a sorted, 0..n-1 index.
df = df.sort_values('date').reset_index(drop=True)

# Define split dates
# Use 70% train, 15% validation, 15% test (approximately)
n_samples = len(df)
train_pct = 0.70
val_pct = 0.15

# Row positions of the two boundaries (floor of the cumulative fraction)
train_end_idx = int(n_samples * train_pct)
val_end_idx = int(n_samples * (train_pct + val_pct))

# Translate the positional boundaries into dates, because the split helper
# is driven by dates rather than row positions.
train_end_date = df.iloc[train_end_idx]['date']
val_end_date = df.iloc[val_end_idx]['date']

# The index ranges printed here are the nominal percentage-based plan; the
# actual per-split sizes (after the purge gap) are printed after the split.
print(f"Total samples: {n_samples}")
print(f"Split configuration:")
print(f"  Train: 0 to {train_end_idx} ({train_end_idx} samples, {train_pct*100:.0f}%)")
print(f"  Validation: {train_end_idx} to {val_end_idx} ({val_end_idx - train_end_idx} samples, {val_pct*100:.0f}%)")
print(f"  Test: {val_end_idx} to {n_samples} ({n_samples - val_end_idx} samples, {(1-train_pct-val_pct)*100:.0f}%)")
print(f"\nSplit dates:")
print(f"  Train end date: {train_end_date}")
print(f"  Validation end date: {val_end_date}")

# Perform split with purge gap of 1 day
purge_gap = 1
print(f"\nPerforming time-series split with purge_gap={purge_gap}...")

# get_train_val_test_split is a project helper (utils.time_series_splits).
# NOTE(review): whether the boundary dates are inclusive, and whether
# purge_gap counts rows or calendar days, is decided inside the helper —
# confirm there.
train_df, val_df, test_df = get_train_val_test_split(
    df,
    train_end_date=train_end_date.strftime('%Y-%m-%d'),
    val_end_date=val_end_date.strftime('%Y-%m-%d'),
    date_col='date',
    purge_gap=purge_gap
)

# Report the realised split sizes and date ranges
print(f"\n✓ Split completed:")
print(f"  Train: {len(train_df)} samples ({train_df['date'].min()} to {train_df['date'].max()})")
print(f"  Validation: {len(val_df)} samples ({val_df['date'].min()} to {val_df['date'].max()})")
print(f"  Test: {len(test_df)} samples ({test_df['date'].min()} to {test_df['date'].max()})")

## Prepare Features and Targets

In [None]:
# Define feature columns (exclude date, qqq_close, and targets).
# Keeping both target columns out of the list is what prevents the labels
# from being fed to the models as inputs.
feature_cols = [col for col in train_df.columns
                if col not in ['date', 'qqq_close', 'target_return', 'target_direction']]

print(f"Feature columns ({len(feature_cols)}):")
for i, col in enumerate(feature_cols, 1):
    print(f"  {i:2d}. {col}")

# Split into features and targets.
# The same feature_cols list (derived from train_df) is applied to every
# split so that all three matrices share column order.
X_train = train_df[feature_cols]
y_train_return = train_df['target_return']
y_train_direction = train_df['target_direction']

X_val = val_df[feature_cols]
y_val_return = val_df['target_return']
y_val_direction = val_df['target_direction']

X_test = test_df[feature_cols]
y_test_return = test_df['target_return']
y_test_direction = test_df['target_direction']

print(f"\n✓ Data prepared:")
print(f"  Train: X={X_train.shape}, y_return={y_train_return.shape}, y_direction={y_train_direction.shape}")
print(f"  Val:   X={X_val.shape}, y_return={y_val_return.shape}, y_direction={y_val_direction.shape}")
print(f"  Test:  X={X_test.shape}, y_return={y_test_return.shape}, y_direction={y_test_direction.shape}")

# Check for missing values.
# Raised explicitly instead of using `assert`, because assert statements are
# removed when Python runs with -O and the check would silently vanish.
for split_name, X_split in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    if X_split.isnull().values.any():
        raise ValueError(f"{split_name} features have missing values!")
print("\n✓ No missing values in features")

## Train Regression Model (Next-Day Return Prediction)

In [None]:
# Initialize XGBoost regressor
# Hyperparameters come from config['model']['xgboost']['regression'];
# objective, tree method and eval metric are fixed here.
print("Training XGBoost regression model...")
xgb_regressor = xgb.XGBRegressor(
    n_estimators=config['model']['xgboost']['regression']['n_estimators'],
    max_depth=config['model']['xgboost']['regression']['max_depth'],
    learning_rate=config['model']['xgboost']['regression']['learning_rate'],
    random_state=config['model']['xgboost']['regression']['random_state'],
    objective='reg:squarederror',
    tree_method='hist',  # Faster training
    # eval_metric is set on the estimator rather than passed to fit().
    # NOTE(review): newer XGBoost releases no longer accept it in fit() —
    # confirm against the pinned version.
    eval_metric='rmse'  # Moved here from fit()
)

# Train with validation set monitoring
# No early stopping is configured, so eval_set is used for reporting only:
# RMSE on train and validation is logged every 20 boosting rounds.
xgb_regressor.fit(
    X_train, y_train_return,
    eval_set=[(X_train, y_train_return), (X_val, y_val_return)],
    verbose=20
)

print("\n✓ Regression model trained!")

# Get predictions on all sets
# These arrays are reused by the evaluation and plotting cells.
y_train_pred_return = xgb_regressor.predict(X_train)
y_val_pred_return = xgb_regressor.predict(X_val)
y_test_pred_return = xgb_regressor.predict(X_test)

print(f"\nPredictions generated:")
print(f"  Train: {len(y_train_pred_return)} predictions")
print(f"  Validation: {len(y_val_pred_return)} predictions")
print(f"  Test: {len(y_test_pred_return)} predictions")

## Evaluate Regression Model

In [None]:
# Calculate metrics for all sets
def calculate_regression_metrics(y_true, y_pred, set_name):
    """Compute regression metrics for one data split and print a short report.

    Args:
        y_true: Realised target values. Must support ``> 0`` producing an
            object with ``.astype`` (e.g. a pandas Series or NumPy array).
        y_pred: Predicted values aligned with ``y_true``; same requirement.
        set_name: Label used in the printed header (e.g. "Training").

    Returns:
        dict with keys 'mae', 'rmse', 'r2' and 'directional_accuracy'.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    # Sign agreement: a value counts as "up" only when strictly positive,
    # so an exact zero is grouped with the negatives.
    actual_up = (y_true > 0).astype(int)
    predicted_up = (y_pred > 0).astype(int)
    directional_acc = accuracy_score(actual_up, predicted_up)

    # Emit the report in one write; the text is the same five lines as before.
    report_lines = [
        f"\n{set_name} Metrics:",
        f"  MAE:  {mae:.6f}",
        f"  RMSE: {rmse:.6f}",
        f"  R²:   {r2:.6f}",
        f"  Directional Accuracy: {directional_acc:.4f} ({directional_acc*100:.2f}%)",
    ]
    print("\n".join(report_lines))

    return {
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'directional_accuracy': directional_acc,
    }

# Evaluate on all sets
# Each call prints its own per-split report and returns a metrics dict;
# the dicts are reused by the plotting, metadata and registry cells.
print("="*60)
print("REGRESSION MODEL EVALUATION")
print("="*60)

train_metrics = calculate_regression_metrics(y_train_return, y_train_pred_return, "Training")
val_metrics = calculate_regression_metrics(y_val_return, y_val_pred_return, "Validation")
test_metrics = calculate_regression_metrics(y_test_return, y_test_pred_return, "Test")

# Repeat the held-out test numbers in a dedicated banner so the headline
# result is easy to find in the cell output.
print("\n" + "="*60)
print("FINAL TEST SET PERFORMANCE")
print("="*60)
print(f"MAE:  {test_metrics['mae']:.6f}")
print(f"RMSE: {test_metrics['rmse']:.6f}")
print(f"R²:   {test_metrics['r2']:.6f}")
print(f"Directional Accuracy: {test_metrics['directional_accuracy']:.4f}")
print("="*60)

## Visualize Regression Performance

In [None]:
# Plot predictions vs actuals
# 2x2 diagnostic grid, all panels on the held-out test set:
#   [0,0] predicted vs actual scatter   [0,1] actual and predicted over time
#   [1,0] residual histogram            [1,1] residuals over time
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Test set: Predicted vs Actual scatter
ax = axes[0, 0]
ax.scatter(y_test_return, y_test_pred_return, alpha=0.5, s=30)
# Dashed red diagonal is the y = x reference (a perfect prediction lies on it)
ax.plot([y_test_return.min(), y_test_return.max()], 
        [y_test_return.min(), y_test_return.max()], 'r--', lw=2)
ax.set_xlabel('Actual Return')
ax.set_ylabel('Predicted Return')
ax.set_title('Test Set: Predicted vs Actual Returns')
ax.grid(True, alpha=0.3)
# Annotate with the test metrics computed in the evaluation cell
ax.text(0.05, 0.95, f'R² = {test_metrics["r2"]:.4f}\nRMSE = {test_metrics["rmse"]:.6f}', 
        transform=ax.transAxes, verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# Test set: Time series plot
ax = axes[0, 1]
test_dates = test_df['date'].values
ax.plot(test_dates, y_test_return.values, label='Actual', alpha=0.7, linewidth=2)
ax.plot(test_dates, y_test_pred_return, label='Predicted', alpha=0.7, linewidth=2)
ax.set_xlabel('Date')
ax.set_ylabel('Return')
ax.set_title('Test Set: Time Series of Returns')
ax.legend()
ax.grid(True, alpha=0.3)
ax.tick_params(axis='x', rotation=45)

# Residuals distribution
ax = axes[1, 0]
# Residual = actual - predicted, so a positive value means the model under-predicted
residuals = y_test_return - y_test_pred_return
ax.hist(residuals, bins=30, edgecolor='black', alpha=0.7)
ax.axvline(0, color='r', linestyle='--', linewidth=2)
ax.set_xlabel('Prediction Error (Actual - Predicted)')
ax.set_ylabel('Frequency')
ax.set_title(f'Residuals Distribution (Mean={residuals.mean():.6f})')
ax.grid(True, alpha=0.3)

# Residuals over time
# Useful for spotting periods where the error drifts away from zero
ax = axes[1, 1]
ax.scatter(test_dates, residuals, alpha=0.5, s=30)
ax.axhline(0, color='r', linestyle='--', linewidth=2)
ax.set_xlabel('Date')
ax.set_ylabel('Prediction Error')
ax.set_title('Residuals Over Time')
ax.grid(True, alpha=0.3)
ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("✓ Regression visualizations complete")

## Train Classification Model (Up/Down Direction Prediction)

In [None]:
# Initialize XGBoost classifier
# Hyperparameters come from config['model']['xgboost']['classification'];
# objective, tree method and eval metric are fixed here.
print("Training XGBoost classification model...")
xgb_classifier = xgb.XGBClassifier(
    n_estimators=config['model']['xgboost']['classification']['n_estimators'],
    max_depth=config['model']['xgboost']['classification']['max_depth'],
    learning_rate=config['model']['xgboost']['classification']['learning_rate'],
    random_state=config['model']['xgboost']['classification']['random_state'],
    objective='binary:logistic',
    tree_method='hist',  # Faster training
    # eval_metric is set on the estimator rather than passed to fit().
    # NOTE(review): newer XGBoost releases no longer accept it in fit() —
    # confirm against the pinned version.
    eval_metric='logloss'  # Moved here from fit()
)

# Train with validation set monitoring
# No early stopping is configured, so eval_set is used for reporting only:
# log-loss on train and validation is logged every 20 boosting rounds.
xgb_classifier.fit(
    X_train, y_train_direction,
    eval_set=[(X_train, y_train_direction), (X_val, y_val_direction)],
    verbose=20
)

print("\n✓ Classification model trained!")

# Get predictions on all sets
# predict() gives the hard class label; column 1 of predict_proba() is kept
# as the score for the positive ("up") class and is used for AUC and plots.
y_train_pred_direction = xgb_classifier.predict(X_train)
y_train_pred_proba = xgb_classifier.predict_proba(X_train)[:, 1]

y_val_pred_direction = xgb_classifier.predict(X_val)
y_val_pred_proba = xgb_classifier.predict_proba(X_val)[:, 1]

y_test_pred_direction = xgb_classifier.predict(X_test)
y_test_pred_proba = xgb_classifier.predict_proba(X_test)[:, 1]

print(f"\nPredictions generated:")
print(f"  Train: {len(y_train_pred_direction)} predictions")
print(f"  Validation: {len(y_val_pred_direction)} predictions")
print(f"  Test: {len(y_test_pred_direction)} predictions")

## Evaluate Classification Model

In [None]:
# Calculate metrics for all sets
def calculate_classification_metrics(y_true, y_pred, y_proba, set_name):
    """Compute classification metrics for one data split and print a report.

    Labels are expected to be 0 (down) and 1 (up), as used for the
    'binary:logistic' classifier in this notebook.

    Args:
        y_true: True class labels (0/1).
        y_pred: Predicted class labels (0/1).
        y_proba: Predicted probability of class 1, used for ROC AUC.
        set_name: Label used in the printed header (e.g. "Training").

    Returns:
        dict with keys 'accuracy', 'auc' and 'confusion_matrix'. The matrix is
        always 2x2 in [down, up] order. 'auc' is NaN when ``y_true`` contains
        fewer than two classes, because ROC AUC is undefined in that case.
    """
    accuracy = accuracy_score(y_true, y_pred)

    # A short split can contain only up days or only down days. ROC AUC is
    # undefined there, so report NaN instead of aborting the whole evaluation.
    single_class = np.unique(y_true).size < 2
    auc = float('nan') if single_class else roc_auc_score(y_true, y_proba)

    print(f"\n{set_name} Metrics:")
    print(f"  Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"  AUC-ROC:  {auc:.4f}")
    if single_class:
        print("  (AUC-ROC undefined: only one class present in y_true)")

    # Confusion matrix
    # Pin the label order so the matrix stays 2x2 even when a class is absent
    # from both y_true and y_pred; otherwise the cm[0,1] / cm[1,*] lookups
    # below would fail on a 1x1 matrix.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(f"\n  Confusion Matrix:")
    print(f"                Predicted")
    print(f"                Down  Up")
    print(f"  Actual Down   {cm[0,0]:4d}  {cm[0,1]:4d}")
    print(f"         Up     {cm[1,0]:4d}  {cm[1,1]:4d}")

    return {'accuracy': accuracy, 'auc': auc, 'confusion_matrix': cm}

# Evaluate on all sets
# Each call prints its own per-split report and returns a metrics dict;
# the dicts are reused by the plotting, metadata and registry cells.
print("="*60)
print("CLASSIFICATION MODEL EVALUATION")
print("="*60)

train_class_metrics = calculate_classification_metrics(
    y_train_direction, y_train_pred_direction, y_train_pred_proba, "Training"
)
val_class_metrics = calculate_classification_metrics(
    y_val_direction, y_val_pred_direction, y_val_pred_proba, "Validation"
)
test_class_metrics = calculate_classification_metrics(
    y_test_direction, y_test_pred_direction, y_test_pred_proba, "Test"
)

# Repeat the held-out test numbers in a dedicated banner, followed by the
# per-class precision/recall/F1 table.
print("\n" + "="*60)
print("FINAL TEST SET PERFORMANCE")
print("="*60)
print(f"Accuracy: {test_class_metrics['accuracy']:.4f}")
print(f"AUC-ROC:  {test_class_metrics['auc']:.4f}")
print("\nClassification Report:")
# target_names are given in label order: 0 = Down, 1 = Up.
# NOTE(review): this presumably fails if the test split contains only one
# class, since two names are supplied — confirm before running on short windows.
print(classification_report(
    y_test_direction, y_test_pred_direction, 
    target_names=['Down (0)', 'Up (1)'],
    digits=4
))
print("="*60)

## Visualize Classification Performance

In [None]:
# Visualize classification results
# 2x2 diagnostic grid, all panels on the held-out test set:
#   [0,0] ROC curve                       [0,1] confusion-matrix heatmap
#   [1,0] predicted-probability histograms [1,1] predicted probability over time
from sklearn.metrics import roc_curve

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# ROC Curve
ax = axes[0, 0]
fpr, tpr, _ = roc_curve(y_test_direction, y_test_pred_proba)
ax.plot(fpr, tpr, linewidth=2, label=f'ROC (AUC = {test_class_metrics["auc"]:.4f})')
# Diagonal = performance of a random classifier
ax.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve (Test Set)')
ax.legend()
ax.grid(True, alpha=0.3)

# Confusion Matrix Heatmap
ax = axes[0, 1]
cm = test_class_metrics['confusion_matrix']
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
            xticklabels=['Down', 'Up'], yticklabels=['Down', 'Up'])
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix (Test Set)')

# Prediction probabilities distribution
# One histogram per actual class; well-separated humps indicate a
# discriminative model, overlapping humps indicate a weak one.
ax = axes[1, 0]
ax.hist(y_test_pred_proba[y_test_direction == 0], bins=30, alpha=0.5, label='Actual Down', edgecolor='black')
ax.hist(y_test_pred_proba[y_test_direction == 1], bins=30, alpha=0.5, label='Actual Up', edgecolor='black')
ax.axvline(0.5, color='r', linestyle='--', linewidth=2, label='Decision Threshold')
ax.set_xlabel('Predicted Probability (Up)')
ax.set_ylabel('Frequency')
ax.set_title('Predicted Probabilities Distribution')
ax.legend()
ax.grid(True, alpha=0.3)

# Predictions over time
ax = axes[1, 1]
test_dates = test_df['date'].values
# Conventional encoding: green = prediction matched the actual direction,
# red = it did not. The title spells out the same mapping; the previous
# red-for-correct scheme was easy to misread at a glance.
colors = ['green' if actual == pred else 'red' for actual, pred in zip(y_test_direction, y_test_pred_direction)]
ax.scatter(test_dates, y_test_pred_proba, c=colors, alpha=0.6, s=50)
ax.axhline(0.5, color='black', linestyle='--', linewidth=2, label='Decision Threshold')
ax.set_xlabel('Date')
ax.set_ylabel('Predicted Probability (Up)')
ax.set_title('Predictions Over Time (Green=Correct, Red=Incorrect)')
ax.legend()
ax.grid(True, alpha=0.3)
ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("✓ Classification visualizations complete")

## Save Models Locally (Must run before Model Registry upload)

In [None]:
# Create models directory
# exist_ok=True makes the cell safe to re-run.
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# Save models
# regressor_path / classifier_path are also read by the model-registry cell,
# so this cell has to run before that one.
regressor_path = os.path.join(models_dir, 'qqq_regressor.pkl')
classifier_path = os.path.join(models_dir, 'qqq_classifier.pkl')

# joblib.dump serialises the fitted estimator objects as-is.
# NOTE(review): pickled estimators are presumably tied to the library
# versions used here — confirm the inference environment matches.
joblib.dump(xgb_regressor, regressor_path)
joblib.dump(xgb_classifier, classifier_path)

print(f"✓ Models saved locally:")
print(f"  Regressor: {regressor_path}")
print(f"  Classifier: {classifier_path}")

# Also save model metadata
# Only test-set metrics are stored; values are cast to float so that
# json.dump receives plain Python numbers.
# 'trained_at' uses datetime.now() without a timezone, i.e. local naive time.
metadata = {
    'trained_at': datetime.now().isoformat(),
    'feature_count': len(feature_cols),
    'training_samples': len(train_df),
    'validation_samples': len(val_df),
    'test_samples': len(test_df),
    'regression_metrics': {
        'test_mae': float(test_metrics['mae']),
        'test_rmse': float(test_metrics['rmse']),
        'test_r2': float(test_metrics['r2']),
        'test_directional_accuracy': float(test_metrics['directional_accuracy'])
    },
    'classification_metrics': {
        'test_accuracy': float(test_class_metrics['accuracy']),
        'test_auc': float(test_class_metrics['auc'])
    }
}

import json
metadata_path = os.path.join(models_dir, 'model_metadata.json')
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"  Metadata: {metadata_path}")
print("\n✓ Local save complete")

## Feature Importance Analysis

In [None]:
# Analyze feature importance from both models
# Side-by-side bar charts (regressor left, classifier right), then a
# plain-text top-10 listing for each model.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

# Regression model feature importance
# feature_importances_ is paired with feature_cols; both models were fit on
# X_train = train_df[feature_cols], so the order lines up.
ax = axes[0]
feature_importance_reg = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb_regressor.feature_importances_
}).sort_values('importance', ascending=False)

# Number of features shown per chart
top_n = 15
# NOTE(review): recent seaborn versions presumably warn when `palette` is
# given without `hue` — confirm against the installed version.
sns.barplot(
    data=feature_importance_reg.head(top_n), 
    y='feature', 
    x='importance',
    ax=ax,
    palette='viridis'
)
ax.set_title(f'Top {top_n} Features - Regression Model', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.grid(True, alpha=0.3, axis='x')

# Classification model feature importance
ax = axes[1]
feature_importance_cls = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb_classifier.feature_importances_
}).sort_values('importance', ascending=False)

sns.barplot(
    data=feature_importance_cls.head(top_n), 
    y='feature', 
    x='importance',
    ax=ax,
    palette='plasma'
)
ax.set_title(f'Top {top_n} Features - Classification Model', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

# Text listing of the ten highest-ranked features per model
# (the iterrows() index `i` is not used).
print("\n" + "="*60)
print("TOP 10 MOST IMPORTANT FEATURES")
print("="*60)
print("\nRegression Model:")
for i, row in feature_importance_reg.head(10).iterrows():
    print(f"  {row['feature']:25s} : {row['importance']:.4f}")

print("\nClassification Model:")
for i, row in feature_importance_cls.head(10).iterrows():
    print(f"  {row['feature']:25s} : {row['importance']:.4f}")
print("="*60)

## Summary

**✅ Model Training Complete**

### Models Trained:
1. **XGBoost Regressor** - Predicts next-day QQQ return
2. **XGBoost Classifier** - Predicts up/down direction

### Key Features:
- **Time-Series Split**: 70% train, 15% validation, 15% test with purge gaps
- **Point-in-Time Correct**: No look-ahead bias in features
- **Feature Count**: 27 engineered features
- **Training Method**: XGBoost with validation set monitoring

### Model Performance:
**Regression Model:**
- Test MAE: Check output above
- Test RMSE: Check output above  
- Test R²: Check output above
- Directional Accuracy: Check output above

**Classification Model:**
- Test Accuracy: Check output above
- Test AUC-ROC: Check output above

### Saved Artifacts:
- Local models: `../models/qqq_regressor.pkl`, `../models/qqq_classifier.pkl`
- Model metadata: `../models/model_metadata.json`
- Hopsworks registry: Both models registered (if successful)

### Next Steps:
- **Notebook 8**: Implement daily inference pipeline
- Use saved models to generate daily predictions
- Create Gradio dashboard for visualization

## Upload Models to Hopsworks Model Registry

In [None]:
# Connect to model registry
# get_model_registry() is a project helper (utils.hopsworks_helpers).
# This cell reads regressor_path / classifier_path, so the local-save cell
# must have been run first.
print("Connecting to Hopsworks Model Registry...")
mr = get_model_registry()
print("✓ Connected to model registry")

# Names of models that could not be registered. Drives the final status line
# so the cell no longer reports success after a failed upload.
failed_registrations = []

# Register regression model
# Registration is best-effort: any failure is reported and the notebook
# continues, because the models already exist on local disk.
# NOTE(review): model_schema is passed as a plain dict of pandas dtypes; the
# registry client may expect its own schema object — confirm against the
# Hopsworks documentation.
print("\nRegistering regression model...")
try:
    reg_model = mr.python.create_model(
        name="qqq_return_regressor",
        description="XGBoost regression model for predicting next-day QQQ return. Trained with time-series split and point-in-time correct features.",
        metrics={
            "test_mae": float(test_metrics['mae']),
            "test_rmse": float(test_metrics['rmse']),
            "test_r2": float(test_metrics['r2']),
            "test_directional_accuracy": float(test_metrics['directional_accuracy']),
            "val_mae": float(val_metrics['mae']),
            "val_rmse": float(val_metrics['rmse']),
        },
        input_example=X_train.head(1),
        model_schema={
            "input_schema": X_train.dtypes.to_dict(),
            "output_schema": {"predicted_return": "float64"}
        }
    )
    reg_model.save(regressor_path)
    print(f"✓ Regression model registered: {reg_model.name} v{reg_model.version}")
except Exception as e:
    failed_registrations.append("qqq_return_regressor")
    print(f"⚠️  Could not register regression model: {e}")
    print("   Model saved locally but not in registry")

# Register classification model
# Same best-effort handling as for the regressor above.
print("\nRegistering classification model...")
try:
    cls_model = mr.python.create_model(
        name="qqq_direction_classifier",
        description="XGBoost classification model for predicting QQQ up/down direction. Trained with time-series split and point-in-time correct features.",
        metrics={
            "test_accuracy": float(test_class_metrics['accuracy']),
            "test_auc": float(test_class_metrics['auc']),
            "val_accuracy": float(val_class_metrics['accuracy']),
            "val_auc": float(val_class_metrics['auc']),
        },
        input_example=X_train.head(1),
        model_schema={
            "input_schema": X_train.dtypes.to_dict(),
            "output_schema": {
                "predicted_direction": "int64",
                "predicted_probability": "float64"
            }
        }
    )
    cls_model.save(classifier_path)
    print(f"✓ Classification model registered: {cls_model.name} v{cls_model.version}")
except Exception as e:
    failed_registrations.append("qqq_direction_classifier")
    print(f"⚠️  Could not register classification model: {e}")
    print("   Model saved locally but not in registry")

# Report the overall outcome honestly: the success line is printed only when
# every registration went through.
if failed_registrations:
    print(f"\n⚠️  Model registration finished with failures: {', '.join(failed_registrations)}")
else:
    print("\n✓ Model registration complete")

## Save Models Locally

In [None]:
# NOTE(review): this cell repeats the earlier local-save cell line for line;
# re-running it overwrites the same three files under ../models.
# Create models directory
# exist_ok=True makes the cell safe to re-run.
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# Save models
regressor_path = os.path.join(models_dir, 'qqq_regressor.pkl')
classifier_path = os.path.join(models_dir, 'qqq_classifier.pkl')

# joblib.dump serialises the fitted estimator objects as-is.
joblib.dump(xgb_regressor, regressor_path)
joblib.dump(xgb_classifier, classifier_path)

print(f"✓ Models saved locally:")
print(f"  Regressor: {regressor_path}")
print(f"  Classifier: {classifier_path}")

# Also save model metadata
# Only test-set metrics are stored; values are cast to float so that
# json.dump receives plain Python numbers.
# 'trained_at' uses datetime.now() without a timezone, i.e. local naive time.
metadata = {
    'trained_at': datetime.now().isoformat(),
    'feature_count': len(feature_cols),
    'training_samples': len(train_df),
    'validation_samples': len(val_df),
    'test_samples': len(test_df),
    'regression_metrics': {
        'test_mae': float(test_metrics['mae']),
        'test_rmse': float(test_metrics['rmse']),
        'test_r2': float(test_metrics['r2']),
        'test_directional_accuracy': float(test_metrics['directional_accuracy'])
    },
    'classification_metrics': {
        'test_accuracy': float(test_class_metrics['accuracy']),
        'test_auc': float(test_class_metrics['auc'])
    }
}

import json
metadata_path = os.path.join(models_dir, 'model_metadata.json')
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"  Metadata: {metadata_path}")
print("\n✓ Local save complete")

## Feature Importance Analysis

In [None]:
# NOTE(review): this cell repeats the earlier feature-importance cell line
# for line; it redraws the same charts and reprints the same listing.
# Analyze feature importance from both models
# Side-by-side bar charts (regressor left, classifier right), then a
# plain-text top-10 listing for each model.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

# Regression model feature importance
# feature_importances_ is paired with feature_cols; both models were fit on
# X_train = train_df[feature_cols], so the order lines up.
ax = axes[0]
feature_importance_reg = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb_regressor.feature_importances_
}).sort_values('importance', ascending=False)

# Number of features shown per chart
top_n = 15
# NOTE(review): recent seaborn versions presumably warn when `palette` is
# given without `hue` — confirm against the installed version.
sns.barplot(
    data=feature_importance_reg.head(top_n), 
    y='feature', 
    x='importance',
    ax=ax,
    palette='viridis'
)
ax.set_title(f'Top {top_n} Features - Regression Model', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.grid(True, alpha=0.3, axis='x')

# Classification model feature importance
ax = axes[1]
feature_importance_cls = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb_classifier.feature_importances_
}).sort_values('importance', ascending=False)

sns.barplot(
    data=feature_importance_cls.head(top_n), 
    y='feature', 
    x='importance',
    ax=ax,
    palette='plasma'
)
ax.set_title(f'Top {top_n} Features - Classification Model', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

# Text listing of the ten highest-ranked features per model
# (the iterrows() index `i` is not used).
print("\n" + "="*60)
print("TOP 10 MOST IMPORTANT FEATURES")
print("="*60)
print("\nRegression Model:")
for i, row in feature_importance_reg.head(10).iterrows():
    print(f"  {row['feature']:25s} : {row['importance']:.4f}")

print("\nClassification Model:")
for i, row in feature_importance_cls.head(10).iterrows():
    print(f"  {row['feature']:25s} : {row['importance']:.4f}")
print("="*60)