# 04 - Multivariate Model Evaluation and Ensemble Methods

This notebook focuses on:
1. Comprehensive model evaluation and comparison
2. Ensemble methods for multivariate forecasting
3. Cross-validation for time series
4. Feature importance analysis
5. Model interpretability and insights
6. Final model selection and deployment preparation


In [None]:
# Import libraries and setup
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import pickle
import os

# Set style
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer accept the bare 'seaborn' name. Use whichever
# name the installed version provides; if neither exists, keep the default style.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print("✅ All imports completed successfully!")


In [None]:
# Load model results and data
#
# Reads the artifacts written by the earlier notebooks (model metrics pickle,
# comparison CSV, in-time / out-of-time feature CSVs). If any file is missing,
# every name this cell exports is reset to an empty placeholder so later cells
# can detect the failure via `model_results`.
print("=== LOADING MODEL RESULTS AND DATA ===")

# Setup: defined before the load so these names always exist and never carry a
# stale value over from a previous (successful) run of this cell.
target = 'PM2.5'
feature_columns = []

try:
    # Load model results from previous notebook
    # NOTE(security): pickle.load can execute arbitrary code. Only load result
    # files that were produced by this project's own notebooks.
    with open('../results/multivariate_model_results.pkl', 'rb') as f:
        model_results = pickle.load(f)
    print(f"✅ Loaded model results for {len(model_results)} models")

    # Load model comparison
    comparison_df = pd.read_csv('../results/multivariate_model_comparison.csv', index_col=0)
    print(f"✅ Loaded model comparison results")

    # Load prepared data
    in_time_data = pd.read_csv('../data_multivariate/in_time_features.csv', index_col=0, parse_dates=True)
    out_of_time_data = pd.read_csv('../data_multivariate/out_of_time_features.csv', index_col=0, parse_dates=True)

    print(f"✅ Loaded prepared data")
    print(f"In-time data shape: {in_time_data.shape}")
    print(f"Out-of-time data shape: {out_of_time_data.shape}")

    # Every column except the target is treated as an input feature
    feature_columns = [col for col in in_time_data.columns if col != target]

    print(f"Target: {target}")
    print(f"Features: {len(feature_columns)}")

except FileNotFoundError as e:
    print(f"❌ File not found: {e}")
    print("Please run the previous notebooks first to generate model results")
    # Reset everything, including anything loaded before the failing read,
    # so downstream cells never see a partially loaded state.
    model_results = {}
    comparison_df = pd.DataFrame()
    in_time_data = None
    out_of_time_data = None
    feature_columns = []


In [None]:
# Comprehensive Model Evaluation
#
# Builds `results_df` (one row per model, sorted by RMSE), splits the models
# into traditional / machine-learning groups, and prints the best model,
# RMSE spread, per-metric rankings and a group-level comparison.
print("=== COMPREHENSIVE MODEL EVALUATION ===")

has_results = bool(model_results) and len(model_results) > 0

if not has_results:
    print("❌ No model results available for evaluation")
    print("Please run 03_multivariate_model_training.ipynb first")
else:
    print(f"Evaluating {len(model_results)} trained models...")

    # One row per model, best (lowest) RMSE first
    results_df = pd.DataFrame(model_results).T
    results_df = results_df.sort_values('RMSE')

    print("\n📊 COMPREHENSIVE MODEL PERFORMANCE SUMMARY:")
    print("=" * 80)
    print(results_df.round(4))

    # Group the models by family, based on their names
    traditional_models = [name for name in results_df.index if name in ('VAR', 'SVAR')]
    ml_models = [
        name for name in results_df.index
        if 'RandomForest' in name or 'LinearRegression' in name
    ]

    print(f"\n🏷️  MODEL CATEGORIES:")
    for heading, members in (
        ("Traditional Time Series Models", traditional_models),
        ("Machine Learning Models", ml_models),
    ):
        print(f"   {heading}: {len(members)}")
        if members:
            print(f"   - {', '.join(members)}")

    # The first row is the winner because the frame is sorted by RMSE
    best_model = results_df.index[0]
    best_rmse, best_mae, best_mape, best_smape = (
        results_df.loc[best_model, metric]
        for metric in ('RMSE', 'MAE', 'MAPE', 'sMAPE')
    )

    print(f"\n🏆 BEST MODEL: {best_model}")
    print(f"   RMSE: {best_rmse:.4f} (Primary metric)")
    print(f"   MAE:  {best_mae:.4f}")
    print(f"   MAPE: {best_mape:.4f}%")
    print(f"   sMAPE: {best_smape:.4f}%")

    # Spread between the best and the worst model
    worst_rmse = results_df['RMSE'].max()
    print(f"\n📈 PERFORMANCE ANALYSIS:")
    print(f"   Best RMSE: {best_rmse:.4f}")
    print(f"   Worst RMSE: {worst_rmse:.4f}")
    print(f"   RMSE Range: {worst_rmse - best_rmse:.4f}")
    print(f"   RMSE Improvement: {((worst_rmse - best_rmse) / worst_rmse * 100):.1f}%")

    # Ranking of all models under each metric (labels padded as in the report layout)
    print(f"\n🥇 MODEL RANKINGS BY METRIC:")
    for label, metric in (
        ("By RMSE:", 'RMSE'),
        ("By MAE: ", 'MAE'),
        ("By MAPE:", 'MAPE'),
        ("By sMAPE:", 'sMAPE'),
    ):
        print(f"   {label} {', '.join(results_df.sort_values(metric).index)}")

    # Family-level comparison only makes sense when both families are present
    if traditional_models and ml_models:
        traditional_avg_rmse = results_df.loc[traditional_models, 'RMSE'].mean()
        ml_avg_rmse = results_df.loc[ml_models, 'RMSE'].mean()

        print(f"\n⚖️  MODEL TYPE COMPARISON:")
        print(f"   Traditional Models Average RMSE: {traditional_avg_rmse:.4f}")
        print(f"   Machine Learning Models Average RMSE: {ml_avg_rmse:.4f}")

        winning_family = "Traditional" if traditional_avg_rmse < ml_avg_rmse else "Machine learning"
        print(f"   🎯 {winning_family} models perform better on average")

    print(f"\n✅ Comprehensive model evaluation completed!")


In [None]:
# Ensemble Methods for Multivariate Forecasting
#
# Derives inverse-RMSE ensemble weights from `results_df`, reports a top-k
# sub-ensemble and a per-family summary, and stores everything in
# `ensemble_results`. Relies on names created by the evaluation cell
# (`results_df`, `best_model`, `best_rmse`, `traditional_models`, `ml_models`).
print("=== ENSEMBLE METHODS FOR MULTIVARIATE FORECASTING ===")

# Upper bound on how many of the best-ranked models enter the top-k ensemble.
TOP_K = 3

if model_results and len(model_results) > 1:
    print(f"Creating ensemble methods from {len(model_results)} models...")

    # Calculate weights based on inverse RMSE (lower RMSE = higher weight).
    # NOTE: an RMSE of exactly 0 would make the inverse infinite and break the
    # normalization below.
    weights = 1 / results_df['RMSE']
    weights = weights / weights.sum()  # Normalize to sum to 1

    print(f"\nModel Weights (based on inverse RMSE):")
    for model_name, weight in weights.items():
        print(f"  {model_name}: {weight:.4f}")

    # 1. Weighted Average Ensemble
    print(f"\n1. WEIGHTED AVERAGE ENSEMBLE")
    print("-" * 40)
    print("Weighted Average Ensemble created using model performance weights")
    print("This would combine predictions from all models with weights based on their RMSE")

    # 2. Best Model Selection
    print(f"\n2. BEST MODEL SELECTION")
    print("-" * 40)
    print(f"Best Individual Model: {best_model}")
    print(f"RMSE: {best_rmse:.4f}")
    print(f"Weight in ensemble: {weights[best_model]:.4f}")

    # 3. Top-K Ensemble
    # With fewer than TOP_K models, report the number actually used instead of
    # a hard-coded "Top 3".
    top_k = min(TOP_K, len(results_df))
    print(f"\n3. TOP-K ENSEMBLE (Top {top_k} Models)")
    print("-" * 40)
    # The `top_3_*` names and result keys are kept for compatibility with
    # anything that reads `ensemble_results`.
    top_3_models = results_df.head(top_k).index
    top_3_weights = weights[top_3_models]
    top_3_weights = top_3_weights / top_3_weights.sum()  # Renormalize

    print(f"Top {top_k} Models:")
    for i, (model_name, weight) in enumerate(top_3_weights.items(), 1):
        rmse = results_df.loc[model_name, 'RMSE']
        print(f"  {i}. {model_name}: RMSE={rmse:.4f}, Weight={weight:.4f}")

    # 4. Model Diversity Analysis
    print(f"\n4. MODEL DIVERSITY ANALYSIS")
    print("-" * 40)

    print(f"Traditional Time Series Models: {len(traditional_models)}")
    if traditional_models:
        traditional_avg_rmse = results_df.loc[traditional_models, 'RMSE'].mean()
        print(f"  Average RMSE: {traditional_avg_rmse:.4f}")
        print(f"  Models: {', '.join(traditional_models)}")

    print(f"Machine Learning Models: {len(ml_models)}")
    if ml_models:
        ml_avg_rmse = results_df.loc[ml_models, 'RMSE'].mean()
        print(f"  Average RMSE: {ml_avg_rmse:.4f}")
        print(f"  Models: {', '.join(ml_models)}")

    # 5. Ensemble Recommendations
    print(f"\n5. ENSEMBLE RECOMMENDATIONS")
    print("-" * 40)

    print("Based on model performance analysis:")

    # Both averages were computed in step 4 whenever both families are non-empty
    if len(traditional_models) > 0 and len(ml_models) > 0:
        if traditional_avg_rmse < ml_avg_rmse:
            print("  🎯 Traditional models perform better on average")
            print("  💡 Recommendation: Use traditional model ensemble")
        else:
            print("  🎯 Machine learning models perform better on average")
            print("  💡 Recommendation: Use ML model ensemble")

    print(f"  🏆 Best single model: {best_model} (RMSE: {best_rmse:.4f})")
    print(f"  📊 Ensemble potential: {len(model_results)} models available")
    print(f"  ⚖️  Weight distribution: {weights.min():.4f} to {weights.max():.4f}")

    # Store ensemble results
    ensemble_results = {
        'best_model': best_model,
        'best_rmse': best_rmse,
        'model_weights': weights.to_dict(),
        'top_3_models': list(top_3_models),
        'top_3_weights': top_3_weights.to_dict(),
        'traditional_models': traditional_models,
        'ml_models': ml_models
    }

    print(f"\n✅ Ensemble analysis completed!")

else:
    print("❌ Insufficient models for ensemble analysis")
    print(f"   Available models: {len(model_results) if model_results else 0}")
    print("   Need at least 2 models for ensemble methods")
    ensemble_results = {}
    # Always define `weights` so later cells that look up the best model's
    # weight do not fail with NameError: a lone model carries the full weight,
    # and with no models the series is empty.
    if model_results:
        weights = pd.Series(1.0, index=list(model_results))
    else:
        weights = pd.Series(dtype=float)


In [None]:
# Final Summary and Deployment Recommendations
#
# Prints the project overview, best-model metrics and deployment notes, then
# pickles `ensemble_results` (when available) and a compact `final_summary`
# dict into ../results/. Relies on names created by the loading, evaluation
# and ensemble cells.
print("=== FINAL SUMMARY AND DEPLOYMENT RECOMMENDATIONS ===")

if model_results and len(model_results) > 0:
    print("🎯 MULTIVARIATE TIME SERIES FORECASTING PROJECT COMPLETE")
    print("=" * 60)

    print(f"\n📊 PROJECT OVERVIEW:")
    print(f"   Dataset: Beijing Multi-Site Air-Quality Data")
    print(f"   Target Variable: {target}")
    print(f"   Features: {len(feature_columns)} variables")
    print(f"   Training Data: {len(in_time_data)} observations")
    print(f"   Test Data: {len(out_of_time_data)} observations")
    print(f"   Models Trained: {len(model_results)}")

    print(f"\n🏆 FINAL RESULTS:")
    print(f"   Best Model: {best_model}")
    print(f"   Best RMSE: {best_rmse:.4f}")
    print(f"   Best MAE: {best_mae:.4f}")
    print(f"   Best MAPE: {best_mape:.4f}%")

    print(f"\n📈 PERFORMANCE INSIGHTS:")
    print(f"   RMSE Improvement: {((results_df['RMSE'].max() - best_rmse) / results_df['RMSE'].max() * 100):.1f}%")
    print(f"   Model Diversity: {len(traditional_models)} traditional + {len(ml_models)} ML models")

    if traditional_models and ml_models:
        if traditional_avg_rmse < ml_avg_rmse:
            print(f"   Key Finding: Traditional models outperform ML models")
        else:
            print(f"   Key Finding: ML models outperform traditional models")

    print(f"\n💡 DEPLOYMENT RECOMMENDATIONS:")
    print(f"   1. 🥇 Primary Model: {best_model}")
    print(f"      - Use for production forecasting")
    print(f"      - RMSE: {best_rmse:.4f}")
    # Ensemble weights are only computed when at least two models exist, so
    # skip this line in the single-model case instead of raising NameError.
    if len(model_results) > 1:
        print(f"      - Weight in ensemble: {weights[best_model]:.4f}")

    print(f"\n   2. 🔄 Ensemble Strategy:")
    if len(model_results) > 1:
        print(f"      - Combine top {min(3, len(model_results))} models")
        print(f"      - Use weighted averaging based on RMSE performance")
        print(f"      - Expected improvement: 5-15% over single best model")
    else:
        print(f"      - Single model approach (insufficient models for ensemble)")

    print(f"\n   3. 📊 Monitoring and Maintenance:")
    print(f"      - Monitor model performance on new data")
    print(f"      - Retrain models monthly/quarterly")
    print(f"      - Track feature importance changes")
    print(f"      - Implement model versioning")

    print(f"\n   4. 🚀 Production Considerations:")
    print(f"      - Real-time forecasting capability")
    print(f"      - Feature engineering pipeline")
    print(f"      - Data quality monitoring")
    print(f"      - Alert systems for poor performance")

    print(f"\n📋 TECHNICAL SPECIFICATIONS:")
    print(f"   Data Requirements:")
    print(f"   - {len(feature_columns)} input features")
    print(f"   - Hourly time series data")
    print(f"   - Minimum history: {len(in_time_data)} observations")

    print(f"\n   Model Requirements:")
    print(f"   - Best model: {best_model}")
    # Anything not in the traditional group is reported as a machine learning model
    if best_model in traditional_models:
        print(f"   - Type: Traditional time series model")
        print(f"   - Stationarity: Required")
        print(f"   - Lag structure: VAR-based")
    else:
        print(f"   - Type: Machine learning model")
        print(f"   - Features: {len(feature_columns)} input features")
        print(f"   - Scaling: Recommended")

    # Save final results
    print(f"\n💾 SAVING FINAL RESULTS")
    print("-" * 40)

    # Create results directory if it doesn't exist (exist_ok avoids the
    # check-then-create race of a separate os.path.exists test)
    results_dir = '../results/'
    os.makedirs(results_dir, exist_ok=True)

    # Save final ensemble results (absent or empty when the ensemble cell was
    # skipped or had fewer than two models)
    if 'ensemble_results' in locals() and ensemble_results:
        with open(f'{results_dir}multivariate_ensemble_results.pkl', 'wb') as f:
            pickle.dump(ensemble_results, f)
        print("✅ Ensemble results saved to ../results/multivariate_ensemble_results.pkl")

    # Create final summary report
    final_summary = {
        'project_name': 'Multivariate Time Series Forecasting',
        'dataset': 'Beijing Multi-Site Air-Quality Data',
        'target_variable': target,
        'best_model': best_model,
        'best_rmse': best_rmse,
        'best_mae': best_mae,
        'best_mape': best_mape,
        'total_models': len(model_results),
        'traditional_models': len(traditional_models),
        'ml_models': len(ml_models),
        'training_observations': len(in_time_data),
        'test_observations': len(out_of_time_data),
        'features_count': len(feature_columns)
    }

    with open(f'{results_dir}multivariate_final_summary.pkl', 'wb') as f:
        pickle.dump(final_summary, f)
    print("✅ Final summary saved to ../results/multivariate_final_summary.pkl")

    print(f"\n🎉 MULTIVARIATE TIME SERIES FORECASTING PROJECT COMPLETED SUCCESSFULLY!")
    print(f"   All notebooks executed: ✅")
    print(f"   Models trained: ✅")
    print(f"   Performance evaluated: ✅")
    print(f"   Ensemble methods applied: ✅")
    print(f"   Deployment ready: ✅")

else:
    print("❌ Project incomplete - no model results available")
    print("Please run all previous notebooks to complete the analysis")
