In [1]:
# =============================================================================
# NBA Player Performance Prediction Pipeline
# Author: Christopher Bratkovics
# Description: Complete modeling pipeline for NBA player performance prediction
#              including data loading, model training, evaluation, and reporting
# =============================================================================

# Import required modules
import sys
import os
from pathlib import Path
sys.path.append(str(Path().resolve().parent / "nba_analytics"))

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Import custom modules for modeling pipeline
# NOTE: an ImportError is only reported, not re-raised. Execution continues,
# so the names below stay undefined and later references to them will fail.
try:
   from model_pipeline import (
       run_nba_modeling_pipeline, 
       validate_model_results, 
       save_model_artifacts,
       DataLoader,
       ModelConfig,
       ModelPipeline,
       ModelInterpreter
   )
   print("Successfully imported model_pipeline.py")

except ImportError as e:
   # Report the failure and point the user at the most likely cause
   print(f"Error importing model_pipeline: {e}")
   print("Please ensure model_pipeline.py is in your current directory or Python path")

# Reporting helpers are imported separately so that a failure here is
# reported independently of the model_pipeline import above.
try:
   from reporting import (
       ModelResultsReporter,
       generate_presentation_visuals
   )
   print("Successfully imported reporting.py")
    
except ImportError as e:
   # Same best-effort handling as above: print and continue
   print(f"Error importing reporting: {e}")
   print("Please ensure reporting.py is in your current directory or Python path")


# =============================================================================
# Configuration and Data Path Setup
# =============================================================================

# Primary location of the processed dataset
# (created in notebook "02_eda_and_hypothesis_testing.ipynb")
DATA_PATH = "../data/processed/final_engineered_nba_data.parquet"


def find_data_file():
    """
    Search the known candidate locations for the NBA data file.

    Candidates are checked in order; the first one that exists on disk wins.

    Returns:
        str: Path to the data file if found, None otherwise
    """
    candidates = (
        DATA_PATH,
        "../data/processed/cleaned_player_stats_20250526_221650.parquet",
        "data/processed/final_engineered_nba_data.parquet",
    )
    # next() with a default yields None when no candidate exists
    return next(
        (candidate for candidate in candidates if os.path.exists(candidate)),
        None,
    )

# Resolve the dataset location once; the sections below key off `data_file`
data_file = find_data_file()

if data_file is not None:
    print(f"Data file found: {data_file}")
else:
    # Nothing found: list every location that was checked
    print("No data file found. Please update DATA_PATH or place your data file in one of these locations:")
    for checked_location in (
        DATA_PATH,
        "../data/processed/cleaned_player_stats_20250526_221650.parquet",
        "data/processed/final_engineered_nba_data.parquet",
    ):
        print(f"   - {checked_location}")


# =============================================================================
# Execute Complete Modeling Pipeline
# =============================================================================

if not data_file:
    # No dataset available, so there is nothing to train on
    print("Cannot run pipeline without data file")
    modeling_results = None
else:
    try:
        print("\nSTARTING NBA MODELING PIPELINE")
        print("=" * 50)

        # Run the main modeling pipeline; it returns four objects that the
        # later sections reference directly by these names
        pipeline, test_results, insights, production_manager = run_nba_modeling_pipeline(data_file)
        print("Pipeline execution successful")

        # Bundle the results; a non-empty dict doubles as the "pipeline ran" flag
        modeling_results = dict(
            pipeline=pipeline,
            test_results=test_results,
            insights=insights,
            production_manager=production_manager,
        )

    except Exception as exc:
        # Report the failure with a full traceback and mark the run as failed
        print(f"Pipeline execution failed: {exc}")
        import traceback
        traceback.print_exc()
        modeling_results = None


# =============================================================================
# Display Key Results Summary
# =============================================================================

if modeling_results:
    print("\nKEY MODELING RESULTS")
    print("-" * 25)

    # Per-target headline metrics taken from the pipeline's insights
    print("MODEL PERFORMANCE SUMMARY:")
    for stat_name, perf in insights['model_performance'].items():
        print(f"   {stat_name.upper()}:")
        model_label = perf['best_model'].replace('_', ' ').title()
        print(f"     Best Model: {model_label}")
        r2_value = perf['r2']
        print(f"     Accuracy (R²): {r2_value:.3f} ({r2_value*100:.1f}%)")
        print(f"     Average Error: ±{perf['mae']:.1f} {stat_name}")
        print(f"     Predictability: {perf['predictability']}")

    print()

    # Up to three leading features per target; targets without a
    # 'top_features' entry are skipped
    print("TOP PERFORMANCE DRIVERS:")
    for stat_name, driver_info in insights['key_drivers'].items():
        if 'top_features' not in driver_info:
            continue
        leading_features = driver_info['top_features'][:3]
        print(f"   {stat_name.upper()}: {', '.join(leading_features)}")


# =============================================================================
# Generate Feature Importance Analysis
# =============================================================================

# Runs only when the modeling pipeline produced results. Binds
# `importance_results` and `y_test`, which the reporting section reads.
if modeling_results:
   print("\nGENERATING FEATURE IMPORTANCE ANALYSIS")
   print("-" * 40)
   
   try:
       # Initialize model interpreter for feature importance analysis
       interpreter = ModelInterpreter(pipeline)
       
       # Recreate training data splits for feature importance calculation
       # (the data file is loaded again here; the splits are not taken from
       # the earlier pipeline run)
       data_loader = DataLoader(pipeline.config)
       df = data_loader.load_and_validate(data_file)
       X, y = pipeline.prepare_model_data(df)
       X_train, X_val, X_test, y_train, y_val, y_test = pipeline.create_time_aware_split(df, X, y)
       
       # Calculate feature importance for all models
       # NOTE(review): presumably keyed by target — confirm against ModelInterpreter
       importance_results = interpreter.analyze_feature_importance(X_train, y_train)
       print("Feature importance analysis complete")
       
   except Exception as e:
       print(f"Feature importance analysis failed: {e}")
       # Empty fallbacks keep both names defined; an empty importance_results
       # is falsy, so the `if modeling_results and importance_results` check
       # that guards report generation will skip it.
       importance_results = {}
       y_test = {}


# =============================================================================
# Generate Presentation Visuals and Reports
# =============================================================================

# Default the flag up front. Later sections read `reporting_success`
# unconditionally once `modeling_results` is truthy; without this default the
# name would be undefined (NameError) whenever this section is skipped because
# feature importance analysis produced no results.
reporting_success = False

if modeling_results and importance_results:
    print("\nGENERATING PRESENTATION VISUALS AND REPORTS")
    print("-" * 50)

    try:
        # Generate comprehensive visualization suite
        generate_presentation_visuals(
            pipeline,
            test_results,
            y_test,
            importance_results,
        )

        print("\nPRESENTATION MATERIALS GENERATED")
        print("\nGenerated visualizations:")
        print("   - Model performance comparison (bar charts)")
        print("   - Feature importance plots (top 15 features)")
        print("   - Residual analysis (heteroscedasticity patterns)")
        print("   - Prediction scatter plots (actual vs predicted)")
        print("   - Model comparison heatmap (R² scores)")

        print("\nGenerated reports:")
        print("   - Model performance metrics (CSV)")
        print("   - Feature importance rankings (CSV)")
        print("   - Residual statistics (CSV)")
        print("   - Comprehensive summary report (TXT)")

        # Only flipped to True once everything above has completed
        reporting_success = True

    except Exception as e:
        # Best-effort reporting: log the failure and let the notebook continue
        print(f"Reporting pipeline failed: {e}")
        import traceback
        traceback.print_exc()
        reporting_success = False

elif modeling_results:
    # Pipeline ran but there is nothing to plot; say so instead of silently
    # skipping the section
    print("\nSkipping presentation visuals: no feature importance results available")


# =============================================================================
# Display Detailed Model Performance
# =============================================================================

if modeling_results and reporting_success:
    print("\nDETAILED MODEL PERFORMANCE ANALYSIS")
    print("-" * 45)

    # Compile performance metrics for all models. Entries that are not metric
    # dicts, or that lack an 'r2' value, are skipped.
    performance_data = []
    for target, models in test_results.items():
        for model_name, metrics in models.items():
            if not (isinstance(metrics, dict) and 'r2' in metrics):
                continue

            # RMSE is optional. Format it only when a numeric value is present:
            # applying the ':.2f' format spec to None or to an 'N/A' string
            # placeholder would raise.
            rmse = metrics.get('rmse')
            if rmse is None or isinstance(rmse, str):
                rmse_text = 'N/A'
            else:
                rmse_text = f"{rmse:.2f}"

            performance_data.append({
                'Target': target.upper(),
                'Model': model_name.replace('_', ' ').title(),
                'R²': f"{metrics['r2']:.3f}",
                'MAE': f"{metrics['mae']:.2f}",
                'RMSE': rmse_text,
            })

    if performance_data:
        df_performance = pd.DataFrame(performance_data)
        print("\nMODEL PERFORMANCE METRICS:")
        print(df_performance.to_string(index=False))

        # Identify best performing model for each target
        print("\nBEST MODELS BY TARGET:")
        for target in df_performance['Target'].unique():
            target_data = df_performance[df_performance['Target'] == target]
            # Convert the formatted R² back to float for comparison. This is
            # done on a derived Series rather than by adding a column to the
            # filtered frame, which would be a chained assignment on a slice.
            best_idx = target_data['R²'].astype(float).idxmax()
            best_model = target_data.loc[best_idx]
            print(f"   {target}: {best_model['Model']} (R² = {best_model['R²']})")


# =============================================================================
# Key Insights and Areas for Improvement
# =============================================================================

if modeling_results and reporting_success:
    print("\nKEY INSIGHTS FROM ANALYSIS")
    print("-" * 30)

    # Static narrative, grouped as (heading, bullet lines) and printed in order
    insight_sections = (
        (
            "PREDICTIVE INSIGHTS:",
            (
                "   - Playing time (minutes) is the dominant predictor across all targets",
                "   - Load management features (rest × minutes) show significant importance",
                "   - Home court advantage is quantifiable but has minimal impact",
                "   - Elite player classification helps stratify predictions",
            ),
        ),
        (
            "\nMODEL PERFORMANCE INSIGHTS:",
            (
                "   - Random Forest excels for points and rebounds prediction",
                "   - Gradient Boosting performs best for assists",
                "   - Points are most predictable (highest R²)",
                "   - Residual analysis shows heteroscedasticity for high-scorers",
            ),
        ),
        (
            "\nAREAS FOR IMPROVEMENT:",
            (
                "   - Add opponent defensive metrics for better context",
                "   - Include player momentum/streak features",
                "   - Consider team performance and synergy effects",
                "   - Implement stratified models for elite vs role players",
                "   - Address prediction variance for high-performing players",
            ),
        ),
    )

    for heading, bullet_lines in insight_sections:
        print(heading)
        for bullet_line in bullet_lines:
            print(bullet_line)


# =============================================================================
# Create Production-Ready Prediction Function
# =============================================================================

if modeling_results:
    print("\nCREATING PRODUCTION PREDICTION FUNCTION")
    print("-" * 40)

    try:
        # Ask the production manager for a callable that maps a feature dict
        # to per-stat predictions
        predict_function = production_manager.create_prediction_function()

        # Hand-written game situations used to smoke-test the callable
        test_scenarios = [
            {
                'name': 'Star Player - Well Rested, Home Game',
                'data': {
                    'minutes_played': 35.0,
                    'rest_days': 3,
                    'sufficient_rest': True,
                    'is_home_game': True,
                    'is_weekend': False,
                    'player_position': 'G',
                    'month': 3,
                    'day_of_week': 2,
                },
            },
            {
                'name': 'Role Player - Back-to-Back, Away Game',
                'data': {
                    'minutes_played': 22.0,
                    'rest_days': 1,
                    'sufficient_rest': False,
                    'is_home_game': False,
                    'is_weekend': True,
                    'player_position': 'F',
                    'month': 1,
                    'day_of_week': 6,
                },
            },
            {
                'name': 'Center - Optimal Conditions',
                'data': {
                    'minutes_played': 32.0,
                    'rest_days': 2,
                    'sufficient_rest': True,
                    'is_home_game': True,
                    'is_weekend': False,
                    'player_position': 'C',
                    'month': 4,
                    'day_of_week': 3,
                },
            },
        ]

        print("\nSAMPLE PREDICTIONS:")
        print("-" * 25)

        # Predict first, then echo the scenario context and each predicted stat
        for case in test_scenarios:
            case_inputs = case['data']
            predicted = predict_function(case_inputs)

            print(f"\n{case['name']}:")
            if case_inputs['is_home_game']:
                venue = 'Home'
            else:
                venue = 'Away'
            print(f"  Context: {case_inputs['minutes_played']:.0f} min, "
                  f"{case_inputs['rest_days']} rest days, {venue}")
            print("  Predicted Performance:")
            for stat_name, stat_value in predicted.items():
                print(f"    {stat_name.upper()}: {stat_value}")

        print("\nProduction prediction function created and tested successfully")

        # Keep the callable under a module-level name for potential export
        prediction_function = predict_function

    except Exception as exc:
        print(f"Prediction function creation failed: {exc}")


# =============================================================================
# Model Validation and Artifact Saving
# =============================================================================

if modeling_results:
    print("\nMODEL VALIDATION & ARTIFACT SAVING")
    print("-" * 40)

    try:
        # Check the test results against the minimum R² threshold
        validation_passed = validate_model_results(test_results, min_r2_threshold=0.3)

        # Validation is advisory only: artifacts are saved either way
        print(
            "All models passed validation thresholds"
            if validation_passed
            else "Some models below performance threshold (still saving artifacts)"
        )

        # Persist the models and associated results for future deployment
        save_model_artifacts(
            pipeline,
            test_results,
            insights,
            output_dir="../outputs/artifacts",
        )
        print("Model artifacts saved successfully to ../outputs/artifacts/")

    except Exception as exc:
        print(f"Artifact saving failed: {exc}")


# =============================================================================
# FINAL SUMMARY
# =============================================================================

print("\n" + "=" * 60)
print("NBA PLAYER PERFORMANCE PREDICTION - FINAL SUMMARY")
print("=" * 60)

if not (modeling_results and reporting_success):
    # Something upstream failed; say which stage
    print("\nPIPELINE EXECUTION INCOMPLETE")
    if not modeling_results:
        print("   - Modeling pipeline failed")
    elif not reporting_success:
        print("   - Reporting pipeline failed")
    print("\nCheck error messages above for details")

else:
    print("\nPIPELINE EXECUTED SUCCESSFULLY")

    print("\nCOMPLETED TASKS:")
    for task_line in (
        "   - Advanced modeling pipeline with multiple algorithms",
        "   - Feature importance analysis with business context",
        "   - Comprehensive model evaluation and validation",
        "   - Professional visualization generation",
        "   - Statistical analysis and residual diagnostics",
        "   - Production-ready prediction function",
        "   - Model artifacts saved for deployment",
    ):
        print(task_line)

    print("\nMODEL PERFORMANCE SUMMARY:")
    # (minimum R², label) pairs, checked from strictest to loosest;
    # anything below the last floor is labelled "Fair"
    quality_bands = (
        (0.9, "Exceptional"),
        (0.7, "Excellent"),
        (0.5, "Good"),
    )
    r2_total = 0
    n_targets = 0
    for stat_name, perf in insights['model_performance'].items():
        r2_score = perf['r2']
        mae_score = perf['mae']
        r2_total += r2_score
        n_targets += 1

        quality_label = next(
            (label for floor, label in quality_bands if r2_score >= floor),
            "Fair",
        )

        print(f"   {stat_name.upper()}: R² = {r2_score:.3f} ({r2_score*100:.1f}%), MAE = ±{mae_score:.1f}, Quality = {quality_label}")

    # Mean R² across targets, shown as a percentage (skipped with no targets)
    if n_targets > 0:
        mean_r2 = r2_total / n_targets
        print(f"\n   Average Model Accuracy: {mean_r2*100:.1f}%")

    print("\nOUTPUT LOCATIONS:")
    output_listing = (
        (
            "   Visualizations: ../outputs/visuals/reporting_results/",
            (
                "      - model_performance_comparison.png",
                "      - feature_importance_[target].png",
                "      - residual_analysis.png",
                "      - prediction_scatter_plots.png",
                "      - model_comparison_heatmap.png",
            ),
        ),
        (
            "\n   Reports: ../outputs/reports/",
            (
                "      - model_performance_metrics.csv",
                "      - feature_importance_[target].csv",
                "      - residual_statistics.csv",
                "      - model_results_summary.txt",
            ),
        ),
        (
            "\n   Model Artifacts: ../outputs/artifacts/",
            (
                "      - Production-ready models and scalers",
                "      - Feature lists and metadata",
            ),
        ),
    )
    for location_line, item_lines in output_listing:
        print(location_line)
        for item_line in item_lines:
            print(item_line)

print("\n" + "=" * 60)
print("Analysis Complete!")
print("=" * 60)

Successfully imported model_pipeline.py
Successfully imported reporting.py
Data file found: ../data/processed/final_engineered_nba_data.parquet

STARTING NBA MODELING PIPELINE
NBA PLAYER PERFORMANCE PREDICTION MODELING PIPELINE
Initializing comprehensive model training workflow...
Loading NBA player performance dataset...
Successfully loaded dataset: 169,851 records with 113 features
Dataset temporal coverage: 1331 days

Preparing data for model training...
Identified and removing 40 leakage/identifier columns
Data preparation complete: 169,851 records with 74 leak-free features

Creating time-aware data splits to prevent temporal leakage...
Chronological split sizes - Train: 101,910 | Validation: 33,970 | Test: 33,971

Initiating model training pipeline...

Training models for PTS prediction:

Applying feature selection for PTS prediction...
Feature selection complete: 74 features reduced to 65 features
  linear_regression: R-squared = 0.882 | Mean Absolute Error = 2.090
  ridge: R-sq