In [2]:
pip install catboost xgboost nba_api

Collecting catboost
  Downloading catboost-1.2.10-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting nba_api
  Downloading nba_api-1.11.4-py3-none-any.whl.metadata (5.8 kB)
Downloading catboost-1.2.10-cp312-cp312-manylinux2014_x86_64.whl (97.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m97.1/97.1 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nba_api-1.11.4-py3-none-any.whl (322 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m322.6/322.6 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nba_api, catboost
Successfully installed catboost-1.2.10 nba_api-1.11.4


In [3]:
import pandas as pd
import numpy as np
from nba_api.stats.endpoints import leaguegamelog, leaguedashteamstats
from datetime import datetime
import time

# Season configuration shared by both nba_api requests below
SEASON = "2025-26"
SEASON_TYPE = "Regular Season"

# Query arguments that are identical for the two endpoints
_season_query = dict(season=SEASON, season_type_all_star=SEASON_TYPE, timeout=120)

# Game log: the first returned frame holds one row per team per game
print("Fetching game log data...")
_game_log_frames = leaguegamelog.LeagueGameLog(**_season_query).get_data_frames()
df_games = _game_log_frames[0]

print(f"Raw data shape: {df_games.shape}")
print(f"Games in dataset: {len(df_games) // 2} (each game appears twice)")

# Team-level advanced stats for the same season / season type
print("\nFetching team advanced stats...")
_advanced_frames = leaguedashteamstats.LeagueDashTeamStats(
    measure_type_detailed_defense='Advanced',
    per_mode_detailed='PerGame',
    **_season_query,
).get_data_frames()
team_advanced = _advanced_frames[0]

print(f"Team advanced stats shape: {team_advanced.shape}")

# Report which of the stats the model relies on actually came back
_expected_adv_cols = [
    'OFF_RATING', 'DEF_RATING', 'NET_RATING', 'PACE', 'TS_PCT', 'EFG_PCT',
    'OREB_PCT', 'DREB_PCT', 'TM_TOV_PCT', 'AST_RATIO', 'PIE',
]
_present_adv_cols = [col for col in team_advanced.columns if col in _expected_adv_cols]
print("Advanced stats columns:", _present_adv_cols)

def process_games_to_model_format(df_games, team_advanced, season=None):
    """
    Transform the per-team game log into one row per game in the model's format.

    Args:
        df_games: Game log with one row per team per game. Reads the columns
            GAME_ID, TEAM_ID, TEAM_NAME, PTS, GAME_DATE and either
            IS_HOME_GAME or (as a fallback) MATCHUP, where 'vs.' marks the
            home side.
        team_advanced: Team-level advanced stats with a TEAM_ID column. Any
            stat or rank column that is absent is filled with NaN.
        season: Label written to the SEASON column. Defaults to the
            module-level SEASON constant when omitted.

    Returns:
        DataFrame with one row per game: basic game info, HOME_ADV_* /
        AWAY_ADV_* stats and their *_RANK columns. Games that do not have
        exactly two log rows, or whose teams have no advanced-stats row, are
        skipped with a printed warning.

    NOTE(review): the same per-team advanced stats are attached to every game
    of that team regardless of game date - confirm this is acceptable for
    training (it may leak later results into earlier games).
    """
    if season is None:
        season = SEASON

    print("\nProcessing games into model format...")

    core_stats = ('OFF_RATING', 'DEF_RATING', 'NET_RATING', 'PACE', 'TS_PCT',
                  'EFG_PCT', 'OREB_PCT', 'DREB_PCT', 'TM_TOV_PCT', 'AST_RATIO', 'PIE')
    rank_stats = ('OFF_RATING_RANK', 'DEF_RATING_RANK', 'NET_RATING_RANK', 'PACE_RANK')

    # Group once instead of re-filtering the whole log for every game;
    # sort=False keeps games in order of first appearance.
    games = df_games.groupby('GAME_ID', sort=False)
    total_games = games.ngroups
    print(f"Found {total_games} unique games")

    has_home_flag = 'IS_HOME_GAME' in df_games.columns

    # One lookup table keyed by TEAM_ID (first row wins if a team is duplicated)
    adv_by_team = team_advanced.drop_duplicates(subset='TEAM_ID').set_index('TEAM_ID')

    processed_games = []

    for i, (game_id, game_entries) in enumerate(games):
        if i % 50 == 0:
            print(f"  Processing game {i}/{total_games}...")

        if len(game_entries) != 2:
            print(f"    Warning: Game {game_id} has {len(game_entries)} entries, skipping")
            continue

        # Determine home and away
        team1 = game_entries.iloc[0]
        team2 = game_entries.iloc[1]

        if has_home_flag:
            team1_is_home = bool(team1['IS_HOME_GAME'])
        else:
            # Fallback to MATCHUP column logic
            team1_is_home = 'vs.' in str(team1['MATCHUP'])

        home_entry, away_entry = (team1, team2) if team1_is_home else (team2, team1)

        # Get team advanced stats
        home_team_id = home_entry['TEAM_ID']
        away_team_id = away_entry['TEAM_ID']

        if home_team_id not in adv_by_team.index or away_team_id not in adv_by_team.index:
            print(f"    Warning: Missing advanced stats for game {game_id}")
            continue

        home_adv = adv_by_team.loc[home_team_id]
        away_adv = adv_by_team.loc[away_team_id]

        # Parse game date. An unparseable date becomes NaT rather than "now":
        # a fabricated timestamp would silently move the game to the end of
        # the timeline used for sorting and the time-based split.
        game_date = pd.to_datetime(home_entry['GAME_DATE'], errors='coerce')
        if pd.isna(game_date):
            print(f"    Warning: Could not parse GAME_DATE for game {game_id}; storing NaT")
            game_date = pd.NaT

        # Create the game record in your model's format
        game_record = {
            'GAME_ID': game_id,
            'GAME_DATE': game_date,
            'SEASON': season,
            'HOME_TEAM': home_entry['TEAM_NAME'],
            'AWAY_TEAM': away_entry['TEAM_NAME'],
            'HOME_PTS': int(home_entry['PTS']),
            'AWAY_PTS': int(away_entry['PTS']),
            'TOTAL_PTS': int(home_entry['PTS'] + away_entry['PTS']),
        }

        sides = (('HOME', home_adv), ('AWAY', away_adv))

        # Advanced stats first (home block, then away block), then rankings,
        # which keeps the column order of the original hand-written record.
        for stat_group in (core_stats, rank_stats):
            for side, adv in sides:
                for stat in stat_group:
                    game_record[f'{side}_ADV_{stat}'] = adv.get(stat, np.nan)

        processed_games.append(game_record)

    return pd.DataFrame(processed_games)

def add_derived_features(df):
    """
    Append the model's derived columns to ``df`` and return it.

    The frame is modified in place (the returned object is the same frame
    that was passed in). Requires the HOME_ADV_* / AWAY_ADV_* columns for
    NET_RATING, PACE, OFF_RATING, DEF_RATING, TS_PCT, OREB_PCT, DREB_PCT and
    TM_TOV_PCT.
    """
    print("\nAdding derived features...")

    # Column accessors, resolved lazily at each use
    def home(stat):
        return df[f'HOME_ADV_{stat}']

    def away(stat):
        return df[f'AWAY_ADV_{stat}']

    # Synthetic LINE: 220 shifted by half the net-rating gap, kept inside
    # [200, 250] and rounded to one decimal. Replace with real line data if available.
    synthetic_line = 220 + (home('NET_RATING') - away('NET_RATING')) * 0.5
    df['LINE'] = synthetic_line.clip(200, 250).round(1)

    # Pace / offensive rating averages and their interaction
    df['AVG_PACE'] = (home('PACE') + away('PACE')) / 2
    df['AVG_OFF_RTG'] = (home('OFF_RATING') + away('OFF_RATING')) / 2
    df['PACE_X_OFF'] = df['AVG_PACE'] * df['AVG_OFF_RTG'] / 100

    # Matchup contrasts
    df['PACE_DIFF'] = (home('PACE') - away('PACE')).abs()
    df['OFF_VS_DEF'] = home('OFF_RATING') - away('DEF_RATING')
    df['DEF_VS_OFF'] = home('DEF_RATING') - away('OFF_RATING')

    # Net rating scaled by 1/100, per side and as a difference
    df['HOME_STRENGTH'] = home('NET_RATING') / 100
    df['AWAY_STRENGTH'] = away('NET_RATING') / 100
    df['STRENGTH_DIFF'] = df['HOME_STRENGTH'] - df['AWAY_STRENGTH']

    # Shooting efficiency gap
    df['TS_PCT_DIFF'] = home('TS_PCT') - away('TS_PCT')

    # Rebounding: home (OREB + DREB) minus away (OREB + DREB)
    df['REB_BATTLE'] = (home('OREB_PCT') + home('DREB_PCT') -
                        away('OREB_PCT') - away('DREB_PCT'))

    # Turnovers: away rate minus home rate
    df['TOV_BATTLE'] = away('TM_TOV_PCT') - home('TM_TOV_PCT')

    # Home-side edges on offence and defence
    df['HOME_OFF_ADV'] = home('OFF_RATING') - away('OFF_RATING')
    df['HOME_DEF_ADV'] = away('DEF_RATING') - home('DEF_RATING')

    return df

# Driver: build the per-game model table from the fetched frames, persist it,
# and print diagnostics (summary, null report, sample rows, feature check).
# Process the data
print("\n" + "="*60)
print("PROCESSING DATA FOR YOUR MODEL")
print("="*60)

# Transform to model format (one row per game, home/away advanced stats attached)
model_df = process_games_to_model_format(df_games, team_advanced)

# Add derived features (LINE, pace/offence interactions, matchup contrasts)
model_df = add_derived_features(model_df)

# Sort by date and renumber rows so positional order equals chronological order
model_df = model_df.sort_values('GAME_DATE').reset_index(drop=True)

# Save the processed data
# NOTE: the CSV is written here, before the optional rolling-feature step later
# in this cell, so the saved file does not contain any ROLL_* columns.
output_file = f"nba_model_data_{SEASON.replace('-', '_')}.csv"
model_df.to_csv(output_file, index=False)
print(f"\n‚úÖ Saved processed data to {output_file}")

# Display data summary
print("\n" + "="*60)
print("DATA SUMMARY")
print("="*60)
print(f"Total games: {len(model_df)}")
print(f"Date range: {model_df['GAME_DATE'].min()} to {model_df['GAME_DATE'].max()}")
print(f"Number of columns: {len(model_df.columns)}")

# Check for missing values: percentage of nulls per column, worst first
missing_pct = model_df.isnull().sum() / len(model_df) * 100
missing_cols = missing_pct[missing_pct > 0].sort_values(ascending=False)
if len(missing_cols) > 0:
    print(f"\nColumns with missing values:")
    # Only the 10 columns with the highest null percentage are listed
    for col, pct in missing_cols.head(10).items():
        print(f"  {col}: {pct:.1f}% missing")

# Show sample of key columns (restricted to those actually present)
print("\nSample data (first 5 games):")
key_cols = ['GAME_DATE', 'HOME_TEAM', 'AWAY_TEAM', 'HOME_PTS', 'AWAY_PTS',
            'HOME_ADV_OFF_RATING', 'AWAY_ADV_OFF_RATING', 'LINE']
available_cols = [col for col in key_cols if col in model_df.columns]
print(model_df[available_cols].head(5).to_string())

# Verify your model's required features
print("\n" + "="*60)
print("FEATURE AVAILABILITY CHECK")
print("="*60)

# Your model's feature list from your code
# NOTE(review): this list repeats the names in FEATURES_REG defined in the
# modelling cell; the two must be kept in sync by hand.
required_features = [
    'HOME_ADV_OFF_RATING', 'AWAY_ADV_OFF_RATING',
    'HOME_ADV_DEF_RATING', 'AWAY_ADV_DEF_RATING',
    'HOME_ADV_NET_RATING', 'AWAY_ADV_NET_RATING',
    'HOME_ADV_PACE', 'AWAY_ADV_PACE',
    'HOME_ADV_TS_PCT', 'AWAY_ADV_TS_PCT',
    'HOME_ADV_EFG_PCT', 'AWAY_ADV_EFG_PCT',
    'HOME_ADV_OREB_PCT', 'AWAY_ADV_OREB_PCT',
    'HOME_ADV_DREB_PCT', 'AWAY_ADV_DREB_PCT',
    'HOME_ADV_TM_TOV_PCT', 'AWAY_ADV_TM_TOV_PCT',
    'HOME_ADV_AST_RATIO', 'AWAY_ADV_AST_RATIO',
    'HOME_ADV_PIE', 'AWAY_ADV_PIE',
    'HOME_ADV_OFF_RATING_RANK', 'AWAY_ADV_OFF_RATING_RANK',
    'HOME_ADV_DEF_RATING_RANK', 'AWAY_ADV_DEF_RATING_RANK',
    'HOME_ADV_NET_RATING_RANK', 'AWAY_ADV_NET_RATING_RANK',
    'HOME_ADV_PACE_RANK', 'AWAY_ADV_PACE_RANK',
    'LINE', 'AVG_PACE', 'AVG_OFF_RTG', 'PACE_X_OFF'
]

# Partition the required names by whether the column exists in model_df.
# This checks column presence only; a present column may still be all-NaN.
available = []
missing = []

for feat in required_features:
    if feat in model_df.columns:
        available.append(feat)
    else:
        missing.append(feat)

print(f"‚úÖ Available features: {len(available)}/{len(required_features)}")
if missing:
    # The trailing "..." is printed even when fewer than 10 names are missing
    print(f"‚ùå Missing features: {missing[:10]}...")  # Show first 10 missing
else:
    print("‚úÖ All required features are present!")

# Now you can run your model
# (the lines below only print usage instructions; nothing is imported or run here)
print("\n" + "="*60)
print("READY TO RUN YOUR MODEL")
print("="*60)
print("\nTo run your model with this data:")
print("  from your_model_file import main_multioutput")
print("  model, features, train_data, X_test, y_test, test_df, results = main_multioutput(model_df)")

# Optionally, create rolling features if you want them now
def create_rolling_features_for_model(df, window_sizes=[5, 10, 20]):
    """
    Add ROLL_<window>_<metric> columns holding the mean of a team's stat over
    its previous <window> games.

    Works on a copy of ``df`` and does not re-sort it: rows are assumed to be
    in chronological order already. A value is only written once a team has
    at least <window> earlier games; other cells stay NaN.

    NOTE(review): both teams of a game write into the same ROLL_* cell, so the
    value that survives comes from whichever team is processed last - confirm
    this is intended.
    """
    print("\nCreating rolling features...")
    df = df.copy()

    tracked_metrics = ('OFF_RATING', 'DEF_RATING', 'NET_RATING', 'PACE', 'TS_PCT')
    team_names = pd.concat([df['HOME_TEAM'], df['AWAY_TEAM']]).unique()

    for team in team_names:
        involved = (df['HOME_TEAM'] == team) | (df['AWAY_TEAM'] == team)
        game_rows = df.loc[involved].index.tolist()

        # A team with fewer than two games has no history to average
        if len(game_rows) < 2:
            continue

        for window in window_sizes:
            for metric in tracked_metrics:
                target_col = f'ROLL_{window}_{metric}'
                if target_col not in df.columns:
                    df[target_col] = np.nan

                home_col = f'HOME_ADV_{metric}'
                away_col = f'AWAY_ADV_{metric}'

                # Positions before `window` lack a full look-back and are skipped
                for pos in range(max(window, 0), len(game_rows)):
                    history = game_rows[max(0, pos - window):pos]

                    samples = []
                    for past_row in history:
                        # Read the stat from the side this team played on
                        was_home = df.loc[past_row, 'HOME_TEAM'] == team
                        source_col = home_col if was_home else away_col
                        if source_col in df.columns:
                            samples.append(df.loc[past_row, source_col])

                    if samples:
                        df.loc[game_rows[pos], target_col] = np.mean(samples)

    return df

# Create rolling features if you want
# Ask interactively, but fall back to "no" when the kernel has no stdin
# (e.g. a headless "Run All" / nbconvert execution) instead of aborting the cell.
# NOTE(review): relies on the frontend's "stdin not supported" error deriving
# from NotImplementedError (plain Python raises EOFError) - confirm for your runner.
try:
    _rolling_answer = input("\nCreate rolling features? (y/n): ")
except (EOFError, NotImplementedError):
    print("No interactive input available; skipping rolling features.")
    _rolling_answer = 'n'

# Tolerate surrounding whitespace and upper-case answers such as " Y "
create_rollings = _rolling_answer.strip().lower() == 'y'
if create_rollings:
    model_df = create_rolling_features_for_model(model_df)
    print(f"Added rolling features. Total columns now: {len(model_df.columns)}")

print("\nDone! Your data is ready for the model.")

Fetching game log data...
Raw data shape: (1738, 29)
Games in dataset: 869 (each game appears twice)

Fetching team advanced stats...
Team advanced stats shape: (30, 46)
Advanced stats columns: ['OFF_RATING', 'DEF_RATING', 'NET_RATING', 'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT', 'PACE', 'PIE']

PROCESSING DATA FOR YOUR MODEL

Processing games into model format...
Found 869 unique games
  Processing game 0/869...
  Processing game 50/869...
  Processing game 100/869...
  Processing game 150/869...
  Processing game 200/869...
  Processing game 250/869...
  Processing game 300/869...
  Processing game 350/869...
  Processing game 400/869...
  Processing game 450/869...
  Processing game 500/869...
  Processing game 550/869...
  Processing game 600/869...
  Processing game 650/869...
  Processing game 700/869...
  Processing game 750/869...
  Processing game 800/869...
  Processing game 850/869...

Adding derived features...

‚úÖ Saved processed data to nba_mo

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import json
from datetime import datetime

# ============================================================================
# PART 1: YOUR ORIGINAL MODEL CODE (copy this entire section)
# ============================================================================

# 1. Enhanced Feature Set using your actual columns
# Building blocks for the feature list: every stat below appears once for the
# home side and once for the away side, as <SIDE>_ADV_<STAT>.
_FEATURE_SIDES = ('HOME', 'AWAY')

# Core advanced stats (already calculated in your data)
_CORE_ADV_STATS = (
    'OFF_RATING', 'DEF_RATING', 'NET_RATING', 'PACE',
    'TS_PCT', 'EFG_PCT', 'OREB_PCT', 'DREB_PCT',
    'TM_TOV_PCT', 'AST_RATIO', 'PIE',
)

# Rankings (captures relative strength)
_RANKED_ADV_STATS = ('OFF_RATING_RANK', 'DEF_RATING_RANK', 'NET_RATING_RANK', 'PACE_RANK')

# Interaction features (pace-of-play matchup)
_INTERACTION_FEATURES = ['AVG_PACE', 'AVG_OFF_RTG', 'PACE_X_OFF']

# Game context: the betting line - very important!
_CONTEXT_FEATURES = ['LINE']

# Final order: HOME/AWAY pair per core stat, HOME/AWAY pair per rank,
# interaction features, then game context.
FEATURES_REG = (
    [f'{side}_ADV_{stat}' for stat in _CORE_ADV_STATS for side in _FEATURE_SIDES]
    + [f'{side}_ADV_{stat}' for stat in _RANKED_ADV_STATS for side in _FEATURE_SIDES]
    + _INTERACTION_FEATURES
    + _CONTEXT_FEATURES
)

def create_rolling_features(df, window_sizes=[5, 10, 20]):
    """
    Add ROLL_<window>_<metric> recent-form columns to a copy of ``df``.

    GAME_DATE is converted to datetime and the copy is sorted by it (with a
    fresh 0..n-1 index) before any averaging. For each team, each window and
    each tracked metric, a game receives the mean of that team's stat over
    its previous <window> games; games without a full look-back stay NaN.

    NOTE(review): both teams of a game write into the same ROLL_* cell, so the
    value that survives comes from whichever team is processed last - confirm
    this is intended.
    """
    tracked_metrics = ('OFF_RATING', 'DEF_RATING', 'NET_RATING', 'PACE', 'TS_PCT')

    df = df.copy()

    # Chronological order is what makes "previous games" meaningful
    df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])
    df = df.sort_values('GAME_DATE').reset_index(drop=True)

    # Every team that appears on either side of a game
    team_names = pd.concat([df['HOME_TEAM'], df['AWAY_TEAM']]).unique()

    for team in team_names:
        involved = (df['HOME_TEAM'] == team) | (df['AWAY_TEAM'] == team)
        game_rows = df.loc[involved].index.tolist()

        # Fewer than two games: nothing to look back on
        if len(game_rows) < 2:
            continue

        for window in window_sizes:
            for metric in tracked_metrics:
                target_col = f'ROLL_{window}_{metric}'
                if target_col not in df.columns:
                    df[target_col] = np.nan

                home_col = f'HOME_ADV_{metric}'
                away_col = f'AWAY_ADV_{metric}'

                # Start at the first game that has `window` predecessors
                for pos in range(max(window, 0), len(game_rows)):
                    history = game_rows[max(0, pos - window):pos]

                    # Collect the stat from whichever side the team played on
                    samples = []
                    for past_row in history:
                        was_home = df.loc[past_row, 'HOME_TEAM'] == team
                        source_col = home_col if was_home else away_col
                        if source_col in df.columns:
                            samples.append(df.loc[past_row, source_col])

                    if samples:
                        df.loc[game_rows[pos], target_col] = np.mean(samples)

    return df

def create_matchup_features(df):
    """
    Return a copy of ``df`` with home-vs-away matchup columns added.

    Adds PACE_DIFF, OFF_VS_DEF, DEF_VS_OFF, HOME_STRENGTH, AWAY_STRENGTH,
    STRENGTH_DIFF, TS_PCT_DIFF, REB_BATTLE, TOV_BATTLE, HOME_OFF_ADV and
    HOME_DEF_ADV, all derived from the HOME_ADV_* / AWAY_ADV_* columns.
    The input frame is left untouched.
    """
    out = df.copy()

    # Column accessors, resolved lazily at each use
    def home(stat):
        return out[f'HOME_ADV_{stat}']

    def away(stat):
        return out[f'AWAY_ADV_{stat}']

    # How far apart the two teams' tempos are
    out['PACE_DIFF'] = (home('PACE') - away('PACE')).abs()

    # Home offence/defence measured against the away defence/offence
    out['OFF_VS_DEF'] = home('OFF_RATING') - away('DEF_RATING')
    out['DEF_VS_OFF'] = home('DEF_RATING') - away('OFF_RATING')

    # Net rating scaled by 1/100, per side and as a difference
    out['HOME_STRENGTH'] = home('NET_RATING') / 100
    out['AWAY_STRENGTH'] = away('NET_RATING') / 100
    out['STRENGTH_DIFF'] = out['HOME_STRENGTH'] - out['AWAY_STRENGTH']

    # True-shooting gap
    out['TS_PCT_DIFF'] = home('TS_PCT') - away('TS_PCT')

    # Combined rebounding share: home total minus away total
    home_boards = home('OREB_PCT') + home('DREB_PCT')
    away_boards = away('OREB_PCT') + away('DREB_PCT')
    out['REB_BATTLE'] = home_boards - away_boards

    # Away turnover rate minus home turnover rate
    out['TOV_BATTLE'] = away('TM_TOV_PCT') - home('TM_TOV_PCT')

    # Home-side edges; for defence a lower rating is better, hence away - home
    out['HOME_OFF_ADV'] = home('OFF_RATING') - away('OFF_RATING')
    out['HOME_DEF_ADV'] = away('DEF_RATING') - home('DEF_RATING')

    return out

def prepare_multioutput_data(df, features, test_size=0.2):
    """
    Build chronological train/test sets for predicting HOME_PTS, AWAY_PTS and TOTAL_PTS.

    Steps: sort by GAME_DATE, drop games without targets, add rolling and
    matchup features, drop rows with any missing feature, then split by
    position so the last ``test_size`` fraction of games forms the test set.

    Args:
        df: Game-level frame containing GAME_DATE, the three target columns
            and the raw HOME_ADV_* / AWAY_ADV_* stats.
        features: Requested feature names; names absent from ``df`` are
            reported and left out.
        test_size: Fraction of the cleaned games held out for testing.

    Returns:
        (X_train, X_test, y_train, y_test, final_features, train_df, test_df)
    """
    target_cols = ['HOME_PTS', 'AWAY_PTS', 'TOTAL_PTS']
    matchup_cols = [
        'PACE_DIFF', 'OFF_VS_DEF', 'DEF_VS_OFF',
        'HOME_STRENGTH', 'AWAY_STRENGTH', 'STRENGTH_DIFF',
        'TS_PCT_DIFF', 'REB_BATTLE', 'TOV_BATTLE',
        'HOME_OFF_ADV', 'HOME_DEF_ADV'
    ]

    # Chronological order first, then discard games that have no targets
    games = df.sort_values('GAME_DATE').reset_index(drop=True)
    games = games.dropna(subset=target_cols)

    # Requested features that are already columns of the input
    available_features = [name for name in features if name in games.columns]
    not_found = set(features) - set(available_features)
    if not_found:
        print(f"Note: Missing features that will be created: {not_found}")

    # Engineered features
    print("Creating rolling features...")
    games = create_rolling_features(games)

    print("Creating matchup features...")
    games = create_matchup_features(games)

    # Requested + matchup + every ROLL_* column, restricted to what exists
    rolling_cols = [col for col in games.columns if col.startswith('ROLL_')]
    candidates = available_features + matchup_cols + rolling_cols
    final_features = [name for name in candidates if name in games.columns]

    # Keep only games with a complete feature vector and all targets
    clean_df = games.dropna(subset=final_features + target_cols).copy()

    print(f"Total games with complete data: {len(clean_df)}")

    # Time-based split: earliest games train, latest games test
    split_idx = int(len(clean_df) * (1 - test_size))
    train_df, test_df = clean_df.iloc[:split_idx], clean_df.iloc[split_idx:]

    X_train, X_test = train_df[final_features], test_df[final_features]
    y_train, y_test = train_df[target_cols], test_df[target_cols]

    train_start, train_end = train_df['GAME_DATE'].min().date(), train_df['GAME_DATE'].max().date()
    print(f"\nTraining on {len(X_train)} games ({train_start} to {train_end})")
    test_start, test_end = test_df['GAME_DATE'].min().date(), test_df['GAME_DATE'].max().date()
    print(f"Testing on {len(X_test)} games ({test_start} to {test_end})")
    print(f"Using {len(final_features)} features")
    print(f"Predicting: {', '.join(target_cols)}")

    return X_train, X_test, y_train, y_test, final_features, train_df, test_df

def train_multioutput_ensemble(X_train, X_test, y_train, y_test, feature_names, test_df):
    """
    Fit XGBoost, CatBoost and Random Forest multi-output regressors, wrap them
    in one voting ensemble per target, and evaluate on the test split.

    Returns:
        (combined_model, results_df) where ``combined_model`` is a dict with
        the per-target ensembles ('ensembles'), the three fitted multi-output
        wrappers ('xgb_multi', 'cat_multi', 'rf_multi'), 'feature_names' and
        'target_names', and ``results_df`` is the frame produced by
        ``evaluate_multioutput_models``.
    """
    target_names = ['HOME_PTS', 'AWAY_PTS', 'TOTAL_PTS']

    # (ensemble key, label used in progress output, unfitted base estimator)
    base_learners = [
        ('xgb', 'XGBoost', XGBRegressor(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            eval_metric='mae'
        )),
        ('cat', 'CatBoost', CatBoostRegressor(
            iterations=300,
            learning_rate=0.05,
            depth=6,
            l2_leaf_reg=3,
            random_seed=42,
            verbose=0,
            loss_function='MAE'
        )),
        ('rf', 'Random Forest', RandomForestRegressor(
            n_estimators=200,
            max_depth=12,
            min_samples_split=10,
            min_samples_leaf=5,
            random_state=42,
            n_jobs=-1
        )),
    ]

    # One multi-output wrapper per algorithm, fitted on all three targets
    print("\nTraining Multi-Output Models...")
    multi_models = {}
    for key, label, estimator in base_learners:
        print(f"Training {label} Multi-Output...")
        wrapper = MultiOutputRegressor(estimator, n_jobs=-1)
        wrapper.fit(X_train, y_train)
        multi_models[key] = wrapper

    # One voting ensemble per target, built from that target's estimators.
    # NOTE(review): the ensemble is fitted again on the training data here, so
    # the base learners are presumably trained a second time - confirm.
    print("\nCreating Ensembles for each target...")
    ensembles = {}
    for position, target in enumerate(target_names):
        members = [
            (key, multi_models[key].estimators_[position])
            for key, _label, _estimator in base_learners
        ]
        voter = VotingRegressor(members)
        voter.fit(X_train, y_train.iloc[:, position])
        ensembles[target] = voter

    # Bundle everything needed for later predictions
    combined_model = {
        'ensembles': ensembles,
        'xgb_multi': multi_models['xgb'],
        'cat_multi': multi_models['cat'],
        'rf_multi': multi_models['rf'],
        'feature_names': feature_names,
        'target_names': target_names
    }

    results_df = evaluate_multioutput_models(combined_model, X_test, y_test, test_df)

    return combined_model, results_df

def evaluate_multioutput_models(model, X_test, y_test, test_df):
    """
    Print an evaluation report for the per-target ensembles and return the
    per-game results.

    Args:
        model: Dict whose 'ensembles' entry maps HOME_PTS / AWAY_PTS /
            TOTAL_PTS to objects exposing ``predict(X)``.
        X_test: Feature matrix passed to each ensemble's ``predict``.
        y_test: Frame with the actual HOME_PTS, AWAY_PTS and TOTAL_PTS.
        test_df: Frame aligned with ``y_test`` providing GAME_DATE, HOME_TEAM,
            AWAY_TEAM and, optionally, LINE.

    Returns:
        DataFrame with actuals, predictions, signed/absolute errors, the
        over/under columns (only when LINE is present) and the
        home+away-vs-total consistency columns.
    """
    rule = '=' * 60

    def banner(title):
        # Blank line, rule, title, rule
        print(f"\n{rule}")
        print(title)
        print(rule)

    banner("MULTI-OUTPUT MODEL PERFORMANCE")

    target_names = ['HOME_PTS', 'AWAY_PTS', 'TOTAL_PTS']
    sides = ['home', 'away', 'total']  # result-column prefix for each target

    # Predictions from the per-target ensembles
    predictions = {
        target: model['ensembles'][target].predict(X_test)
        for target in target_names
    }

    results_df = pd.DataFrame({
        'date': test_df['GAME_DATE'].values,
        'home_team': test_df['HOME_TEAM'].values,
        'away_team': test_df['AWAY_TEAM'].values,
        'home_actual': y_test['HOME_PTS'].values,
        'away_actual': y_test['AWAY_PTS'].values,
        'total_actual': y_test['TOTAL_PTS'].values,
        'home_pred': predictions['HOME_PTS'],
        'away_pred': predictions['AWAY_PTS'],
        'total_pred': predictions['TOTAL_PTS'],
    })

    # Carry the line along when the test frame has one
    if 'LINE' in test_df.columns:
        results_df['line'] = test_df['LINE'].values

    # Signed (prediction - actual) and absolute errors
    for side in sides:
        signed_error = results_df[f'{side}_pred'] - results_df[f'{side}_actual']
        results_df[f'{side}_error'] = signed_error
        results_df[f'{side}_abs_error'] = signed_error.abs()

    # Headline metrics per target
    for target, side in zip(target_names, sides):
        errors = results_df[f'{side}_error']
        mae = results_df[f'{side}_abs_error'].mean()
        rmse = np.sqrt((errors**2).mean())
        bias = errors.mean()

        print(f"\n{target}:")
        print(f"  MAE: {mae:.2f} points")
        print(f"  RMSE: {rmse:.2f} points")
        print(f"  Bias: {bias:.2f} points")

    banner("TOTAL POINTS BETTING METRICS")

    if 'line' in results_df.columns:
        line = results_df['line']
        results_df['total_vs_line'] = results_df['total_pred'] - line
        results_df['actual_vs_line'] = results_df['total_actual'] - line

        # A pick is correct when prediction and outcome fall on the same side of the line
        predicted_over = results_df['total_pred'] > line
        actual_over = results_df['total_actual'] > line
        results_df['correct_side'] = (predicted_over == actual_over)

        accuracy = results_df['correct_side'].mean() * 100
        print(f"Over/Under Accuracy: {accuracy:.1f}%")

        # ROI assuming -110 odds: a win pays 0.91 units, a loss costs 1 unit
        n_games = len(results_df)
        wins = results_df['correct_side'].sum()
        losses = n_games - wins
        roi = (wins * 0.91 - losses) / n_games * 100
        print(f"ROI (at -110): {roi:.1f}%")

    # Share of games whose total was predicted within each tolerance
    for threshold in [5, 10, 15, 20]:
        pct_within = (results_df['total_abs_error'] <= threshold).mean() * 100
        print(f"Total within {threshold} points: {pct_within:.1f}%")

    # Mean signed error per target
    print("\nError Distribution:")
    for label, side in zip(['Home', 'Away', 'Total'], sides):
        print(f"  {label} Mean Error: {results_df[f'{side}_error'].mean():.2f}")

    banner("PREDICTION CORRELATIONS")

    correlations = [
        np.corrcoef(results_df[f'{side}_actual'], results_df[f'{side}_pred'])[0, 1]
        for side in sides
    ]
    for label, value in zip(['Home', 'Away', 'Total'], correlations):
        print(f"{label} Score Correlation: {value:.3f}")

    # Consistency: predicted home + away should match the predicted total
    results_df['calculated_total'] = results_df['home_pred'] + results_df['away_pred']
    results_df['total_diff'] = (results_df['calculated_total'] - results_df['total_pred']).abs()
    avg_diff = results_df['total_diff'].mean()
    max_diff = results_df['total_diff'].max()

    banner("PREDICTION CONSISTENCY CHECK")
    print(f"Average difference (home+away vs total): {avg_diff:.4f} points")
    print(f"Maximum difference: {max_diff:.4f} points")
    if avg_diff < 0.1:
        verdict = 'consistent'
    else:
        verdict = 'slightly inconsistent'
    print(f"Models are {verdict}")

    return results_df

def predict_game_multioutput(model, home_team, away_team, line, recent_games_df, feature_names):
    """
    Make multi-output prediction for a new game (home, away, and total)

    Each team's current advanced stats are taken from the last row of
    ``recent_games_df`` in which the team appears. That row may list the team
    on either side, so the values are read from the column prefix matching the
    side the team actually played on (``HOME_ADV_`` or ``AWAY_ADV_``) and then
    stored under the prefix for its role in the game being predicted.

    Args:
        model: Dict with an ``'ensembles'`` mapping keyed by ``'HOME_PTS'``,
            ``'AWAY_PTS'`` and ``'TOTAL_PTS'``. Each ensemble must expose
            ``predict`` and ``named_estimators_`` with ``'xgb'``, ``'cat'``
            and ``'rf'`` members.
        home_team: Home team name as stored in ``HOME_TEAM`` / ``AWAY_TEAM``.
        away_team: Away team name as stored in ``HOME_TEAM`` / ``AWAY_TEAM``.
        line: Total-points line the predicted total is compared against.
        recent_games_df: Game-level frame with ``HOME_TEAM`` and ``AWAY_TEAM``
            columns. The last matching row is treated as the most recent game,
            so rows are assumed to be in chronological order.
        feature_names: Feature columns, in order, passed to the ensembles.

    Returns:
        dict: ``{"error": ...}`` when either team has no rows; otherwise the
        matchup, the line, per-target predictions with a 0-100 agreement
        score and the individual model outputs, the implied total
        (home + away) and a flag telling whether it matches the total model.
    """
    adv_stats = ['OFF_RATING', 'DEF_RATING', 'NET_RATING', 'PACE',
                 'TS_PCT', 'EFG_PCT', 'OREB_PCT', 'DREB_PCT',
                 'TM_TOV_PCT', 'AST_RATIO', 'PIE']
    rank_stats = ['OFF_RATING_RANK', 'DEF_RATING_RANK', 'NET_RATING_RANK', 'PACE_RANK']

    def _latest_game(team):
        """Return (one-row frame of the team's last game, prefix of the side it played)."""
        team_games = recent_games_df[
            (recent_games_df['HOME_TEAM'] == team) |
            (recent_games_df['AWAY_TEAM'] == team)
        ]
        if len(team_games) == 0:
            return None, None
        latest = team_games.iloc[-1:]
        played_home = latest['HOME_TEAM'].values[0] == team
        return latest, ('HOME_ADV_' if played_home else 'AWAY_ADV_')

    def _fallback(stat):
        """Neutral value used when a stat column is absent from the frame."""
        if 'OFF_RATING' in stat or 'DEF_RATING' in stat:
            return 110.0
        if 'PACE' in stat:
            return 100.0
        if 'PCT' in stat:
            return 0.5
        return 0.0

    def _confidence(preds):
        """Map the spread of the individual model outputs to a 0-100 score."""
        values = list(preds.values())
        agreement = 1 - (np.std(values) / (np.mean(values) + 1e-10))
        return min(100, max(0, agreement * 100))

    home_recent, home_side = _latest_game(home_team)
    away_recent, away_side = _latest_game(away_team)

    if home_recent is None or away_recent is None:
        return {"error": f"Team not found in database: {home_team if home_recent is None else away_team}"}

    # Build feature dictionary
    features = {}
    roles = (
        ('HOME_ADV_', home_recent, home_side),
        ('AWAY_ADV_', away_recent, away_side),
    )

    # Basic stats: read from the side the team played, store under its new role
    for role_prefix, row, source_prefix in roles:
        for stat in adv_stats:
            source_col = f"{source_prefix}{stat}"
            if source_col in row.columns:
                features[f"{role_prefix}{stat}"] = row[source_col].values[0]
            else:
                features[f"{role_prefix}{stat}"] = _fallback(stat)

    # Rankings
    for role_prefix, row, source_prefix in roles:
        for stat in rank_stats:
            source_col = f"{source_prefix}{stat}"
            if source_col in row.columns:
                features[f"{role_prefix}{stat}"] = row[source_col].values[0]
            else:
                features[f"{role_prefix}{stat}"] = 15  # Middle rank

    # Game context
    features['LINE'] = line
    features['AVG_PACE'] = (features.get('HOME_ADV_PACE', 100) + features.get('AWAY_ADV_PACE', 100)) / 2
    features['AVG_OFF_RTG'] = (features.get('HOME_ADV_OFF_RATING', 110) + features.get('AWAY_ADV_OFF_RATING', 110)) / 2
    features['PACE_X_OFF'] = features['AVG_PACE'] * features['AVG_OFF_RTG'] / 100

    # Create matchup features
    features['PACE_DIFF'] = abs(features.get('HOME_ADV_PACE', 100) - features.get('AWAY_ADV_PACE', 100))
    features['OFF_VS_DEF'] = features.get('HOME_ADV_OFF_RATING', 110) - features.get('AWAY_ADV_DEF_RATING', 110)
    features['DEF_VS_OFF'] = features.get('HOME_ADV_DEF_RATING', 110) - features.get('AWAY_ADV_OFF_RATING', 110)
    features['HOME_STRENGTH'] = features.get('HOME_ADV_NET_RATING', 0) / 100
    features['AWAY_STRENGTH'] = features.get('AWAY_ADV_NET_RATING', 0) / 100
    features['STRENGTH_DIFF'] = features['HOME_STRENGTH'] - features['AWAY_STRENGTH']
    features['TS_PCT_DIFF'] = features.get('HOME_ADV_TS_PCT', 0.55) - features.get('AWAY_ADV_TS_PCT', 0.55)
    features['HOME_OFF_ADV'] = features.get('HOME_ADV_OFF_RATING', 110) - features.get('AWAY_ADV_OFF_RATING', 110)
    features['HOME_DEF_ADV'] = features.get('AWAY_ADV_DEF_RATING', 110) - features.get('HOME_ADV_DEF_RATING', 110)

    # Rolling features are copied as-is from the home team's latest row.
    # NOTE(review): whether these need the same side-aware handling as the
    # ADV stats depends on how the ROLL_ columns are named upstream - confirm.
    for col in recent_games_df.columns:
        if col.startswith('ROLL_') and col in feature_names:
            features[col] = home_recent[col].values[0]

    # Keep exactly the training columns, in training order; absent ones become 0
    X_pred = pd.DataFrame([features]).reindex(columns=feature_names, fill_value=0)

    # Make predictions using each ensemble
    ensembles = model['ensembles']
    home_pred = ensembles['HOME_PTS'].predict(X_pred)[0]
    away_pred = ensembles['AWAY_PTS'].predict(X_pred)[0]
    total_pred = ensembles['TOTAL_PTS'].predict(X_pred)[0]

    # Also get individual model predictions for confidence
    targets = {'home': 'HOME_PTS', 'away': 'AWAY_PTS', 'total': 'TOTAL_PTS'}
    individual_preds = {
        key: {
            name: ensembles[target].named_estimators_[name].predict(X_pred)[0]
            for name in ['xgb', 'cat', 'rf']
        }
        for key, target in targets.items()
    }

    # Calculate confidence based on ensemble agreement
    home_confidence = _confidence(individual_preds['home'])
    away_confidence = _confidence(individual_preds['away'])
    total_confidence = _confidence(individual_preds['total'])

    return {
        'matchup': f"{away_team} @ {home_team}",
        'line': line,
        'predictions': {
            'home': {
                'team': home_team,
                'predicted': round(home_pred, 2),
                'confidence': round(home_confidence, 1),
                'individual': {k: round(v, 2) for k, v in individual_preds['home'].items()}
            },
            'away': {
                'team': away_team,
                'predicted': round(away_pred, 2),
                'confidence': round(away_confidence, 1),
                'individual': {k: round(v, 2) for k, v in individual_preds['away'].items()}
            },
            'total': {
                'predicted': round(total_pred, 2),
                'difference': round(total_pred - line, 2),
                'recommendation': 'OVER' if total_pred > line else 'UNDER',
                'confidence': round(total_confidence, 1),
                'individual': {k: round(v, 2) for k, v in individual_preds['total'].items()}
            }
        },
        'implied_total': round(home_pred + away_pred, 2),
        'verification': abs(round(home_pred + away_pred, 2) - round(total_pred, 2)) < 0.1  # Should be True
    }

def main_multioutput(df):
    """
    Run the multi-output pipeline end to end: prepare the data, then train.

    Args:
        df: Game-level frame handed unchanged to ``prepare_multioutput_data``.

    Returns:
        tuple: ``(model, features, train_df, X_test, y_test, test_df, results_df)``
        where ``model`` and ``results_df`` come from
        ``train_multioutput_ensemble`` and the rest from
        ``prepare_multioutput_data``.
    """
    banner = "=" * 60
    print(banner)
    print("NBA MULTI-OUTPUT PREDICTION MODEL")
    print("(Home Points, Away Points, Total Points)")
    print(banner)

    # Split into train/test, holding out 20% of the games
    prepared = prepare_multioutput_data(df, FEATURES_REG, test_size=0.2)
    X_train, X_test, y_train, y_test, features, train_df, test_df = prepared

    # The trainer also receives test_df alongside the test matrices
    model, results_df = train_multioutput_ensemble(
        X_train, X_test, y_train, y_test, features, test_df
    )

    return model, features, train_df, X_test, y_test, test_df, results_df

# ============================================================================
# PART 2: LOAD YOUR DATA (from the previous step)
# ============================================================================

# Load the game-level modelling table from CSV and parse GAME_DATE into
# datetimes.
model_df = pd.read_csv('nba_model_data_2025_26.csv')
model_df['GAME_DATE'] = pd.to_datetime(model_df['GAME_DATE'])

print(f"Loaded {len(model_df)} games from 2025-26 season")

# ============================================================================
# PART 3: RUN THE MODEL
# ============================================================================

# Prepare the data and train the ensembles. The names bound here (model,
# features, train_data, results, ...) are module-level and are reused by the
# prediction code below and by later cells.
model, features, train_data, X_test, y_test, test_df, results = main_multioutput(model_df)

# ============================================================================
# PART 4: MAKE PREDICTIONS
# ============================================================================

# Predict one game. train_data is passed as the lookup table from which each
# team's latest stats row is taken.
result = predict_game_multioutput(
    model,
    "Los Angeles Lakers",  # home_team
    "Boston Celtics",  # away_team
    225.5,  # line
    train_data,
    features
)

# Print the result
def json_serializable(obj):
    """Fallback encoder for ``json.dumps(default=...)``.

    NumPy floating/complex-floating scalars go through ``float``, NumPy
    integers through ``int``, NumPy or built-in booleans through ``bool``;
    anything else is returned as ``str(obj)``.
    """
    converters = (
        ((np.floating, np.complexfloating), float),
        (np.integer, int),
        ((np.bool_, bool), bool),
    )
    for accepted_types, convert in converters:
        if isinstance(obj, accepted_types):
            return convert(obj)
    return str(obj)

# Dump the single-game prediction as JSON; json_serializable converts the
# NumPy scalars that json cannot encode natively.
# NOTE(review): no `import json` is visible before this point in the cell -
# confirm an earlier cell has imported it before this line runs.
print(json.dumps(result, indent=2, default=json_serializable))

# Show the first 10 rows of the test-set results table (row order as
# returned by training; not ranked by any metric)
print("\nTop 10 Test Results:")
print(results[['date', 'home_team', 'away_team', 'home_actual', 'home_pred',
               'away_actual', 'away_pred', 'total_actual', 'total_pred', 'correct_side']].head(10))

Loaded 869 games from 2025-26 season
NBA MULTI-OUTPUT PREDICTION MODEL
(Home Points, Away Points, Total Points)
Creating rolling features...
Creating matchup features...
Total games with complete data: 578

Training on 462 games (2025-11-29 to 2026-02-03)
Testing on 116 games (2026-02-04 to 2026-02-24)
Using 60 features
Predicting: HOME_PTS, AWAY_PTS, TOTAL_PTS

Training Multi-Output Models...
Training XGBoost Multi-Output...
Training CatBoost Multi-Output...
Training Random Forest Multi-Output...

Creating Ensembles for each target...

MULTI-OUTPUT MODEL PERFORMANCE

HOME_PTS:
  MAE: 10.00 points
  RMSE: 12.67 points
  Bias: 1.58 points

AWAY_PTS:
  MAE: 9.42 points
  RMSE: 12.29 points
  Bias: -1.75 points

TOTAL_PTS:
  MAE: 15.12 points
  RMSE: 19.58 points
  Bias: 0.02 points

TOTAL POINTS BETTING METRICS
Over/Under Accuracy: 72.4%
ROI (at -110): 38.3%
Total within 5 points: 25.9%
Total within 10 points: 43.1%
Total within 15 points: 56.0%
Total within 20 points: 72.4%

Error Distr

In [7]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from nba_api.stats.endpoints import leaguegamelog, leaguedashteamstats
from nba_api.stats.static import teams
import time
import json

class NBARealTimePredictor:
    """
    Real-time NBA game predictor using your trained model
    """

    def __init__(self, model, feature_names, trained_model_data):
        """
        Initialize with your trained model

        Args:
            model: Your trained model from main_multioutput
            feature_names: List of feature names used by the model
            trained_model_data: The training dataframe (for reference)
        """
        self.model = model
        self.feature_names = feature_names
        self.trained_model_data = trained_model_data
        self.team_name_map = self._create_team_name_map()

    def _create_team_name_map(self):
        """Create mapping from various team name formats to standard names"""
        # Get all NBA teams
        nba_teams = teams.get_teams()

        # Create mapping dictionary
        team_map = {}
        for team in nba_teams:
            # Map full name
            team_map[team['full_name'].lower()] = team['full_name']
            # Map abbreviation
            team_map[team['abbreviation'].lower()] = team['full_name']
            # Map nickname
            team_map[team['nickname'].lower()] = team['full_name']
            # Map city
            city = team['full_name'].replace(' ' + team['nickname'], '')
            team_map[city.lower()] = team['full_name']

        # Add common variations
        common_variations = {
            'lakers': 'Los Angeles Lakers',
            'celtics': 'Boston Celtics',
            'warriors': 'Golden State Warriors',
            'bulls': 'Chicago Bulls',
            'heat': 'Miami Heat',
            'sixers': 'Philadelphia 76ers',
            '76ers': 'Philadelphia 76ers',
            'knicks': 'New York Knicks',
            'mavericks': 'Dallas Mavericks',
            'mavs': 'Dallas Mavericks',
            'blazers': 'Portland Trail Blazers',
            'trail blazers': 'Portland Trail Blazers',
            'wolves': 'Minnesota Timberwolves',
            'timberwolves': 'Minnesota Timberwolves',
            'thunder': 'Oklahoma City Thunder',
            'okc': 'Oklahoma City Thunder',
            'spurs': 'San Antonio Spurs',
            'rockets': 'Houston Rockets',
            'jazz': 'Utah Jazz',
            'kings': 'Sacramento Kings',
            'suns': 'Phoenix Suns',
            'clippers': 'LA Clippers',
            'lac': 'LA Clippers',
            'nuggets': 'Denver Nuggets',
            'bucks': 'Milwaukee Bucks',
            'pacers': 'Indiana Pacers',
            'magic': 'Orlando Magic',
            'hawks': 'Atlanta Hawks',
            'hornets': 'Charlotte Hornets',
            'pistons': 'Detroit Pistons',
            'cavaliers': 'Cleveland Cavaliers',
            'cavs': 'Cleveland Cavaliers',
            'raptors': 'Toronto Raptors',
            'wizards': 'Washington Wizards',
            'grizzlies': 'Memphis Grizzlies',
            'pelicans': 'New Orleans Pelicans',
            'nola': 'New Orleans Pelicans'
        }
        team_map.update(common_variations)

        return team_map

    def _standardize_team_name(self, team_input):
        """Convert various team name formats to standard name"""
        if not team_input:
            return None

        # If already in correct format and exists in training data
        if team_input in self.trained_model_data['HOME_TEAM'].values:
            return team_input

        # Try to find in mapping
        team_lower = team_input.lower().strip()
        if team_lower in self.team_name_map:
            return self.team_name_map[team_lower]

        # Try partial matching
        for key, value in self.team_name_map.items():
            if key in team_lower or team_lower in key:
                return value

        return team_input  # Return original if no match found

    def fetch_live_team_stats(self, team_name, season="2025-26"):
        """
        Fetch current season stats for a team from NBA API
        """
        try:
            # Get team ID
            nba_teams = teams.get_teams()
            team_id = None
            for team in nba_teams:
                if team['full_name'].lower() == team_name.lower():
                    team_id = team['id']
                    break

            if not team_id:
                print(f"Could not find team ID for {team_name}")
                return None

            # Get team advanced stats
            team_stats = leaguedashteamstats.LeagueDashTeamStats(
                season=season,
                season_type_all_star='Regular Season',
                measure_type_detailed_defense='Advanced',
                per_mode_detailed='PerGame',
                timeout=30
            ).get_data_frames()[0]

            # Get team's stats
            team_row = team_stats[team_stats['TEAM_ID'] == team_id]

            if len(team_row) == 0:
                print(f"No stats found for {team_name}")
                return None

            team_row = team_row.iloc[0]

            # Extract advanced stats
            stats = {}
            advanced_metrics = [
                'OFF_RATING', 'DEF_RATING', 'NET_RATING', 'PACE',
                'TS_PCT', 'EFG_PCT', 'OREB_PCT', 'DREB_PCT',
                'TM_TOV_PCT', 'AST_RATIO', 'PIE'
            ]

            for metric in advanced_metrics:
                stats[metric] = team_row.get(metric, np.nan)

            # Get rankings
            ranking_metrics = ['OFF_RATING', 'DEF_RATING', 'NET_RATING', 'PACE']
            for metric in ranking_metrics:
                stats[f'{metric}_RANK'] = team_row.get(f'{metric}_RANK', np.nan)

            return stats

        except Exception as e:
            print(f"Error fetching stats for {team_name}: {e}")
            return None

    def fetch_recent_games(self, team_name, num_games=10, season="2025-26"):
        """
        Fetch recent games for a team to calculate rolling averages
        """
        try:
            # Get team ID
            nba_teams = teams.get_teams()
            team_id = None
            for team in nba_teams:
                if team['full_name'].lower() == team_name.lower():
                    team_id = team['id']
                    break

            if not team_id:
                print(f"Could not find team ID for {team_name}")
                return pd.DataFrame()

            # Get team game log
            team_games = leaguegamelog.LeagueGameLog(
                season=season,
                season_type_all_star='Regular Season',
                team_id_nullable=team_id,
                timeout=30
            ).get_data_frames()[0]

            # Sort by date (most recent first)
            team_games['GAME_DATE'] = pd.to_datetime(team_games['GAME_DATE'])
            team_games = team_games.sort_values('GAME_DATE', ascending=False)

            # Get last N games
            recent = team_games.head(num_games).copy()

            # Add home/away indicator
            recent['IS_HOME'] = recent['MATCHUP'].str.contains('vs.')

            return recent

        except Exception as e:
            print(f"Error fetching recent games for {team_name}: {e}")
            return pd.DataFrame()

    def calculate_rolling_averages(self, team_name, recent_games_df, window_sizes=[5, 10, 20]):
        """
        Calculate rolling averages from recent games data
        """
        rolling_stats = {}

        # We need advanced stats for these games
        # This is a simplified version - in practice you'd want to fetch
        # advanced stats for each game date

        # For now, we'll use season averages as a fallback
        season_stats = self.fetch_live_team_stats(team_name)
        if not season_stats:
            return rolling_stats

        # Create rolling averages (simplified - using season stats as base)
        # In production, you'd want actual per-game advanced stats
        for window in window_sizes:
            for metric in ['OFF_RATING', 'DEF_RATING', 'NET_RATING', 'PACE', 'TS_PCT']:
                rolling_col = f'ROLL_{window}_{metric}'
                # For simplicity, we'll use season averages
                # In reality, you'd calculate from actual game data
                if metric in season_stats:
                    rolling_stats[rolling_col] = season_stats[metric]

        return rolling_stats

    def prepare_game_features(self, home_team, away_team, line=None):
        """
        Prepare all features needed for prediction
        """
        print(f"\nPreparing features for {away_team} @ {home_team}...")

        # Standardize team names
        home_team = self._standardize_team_name(home_team)
        away_team = self._standardize_team_name(away_team)

        print(f"Standardized: {away_team} @ {home_team}")

        # Fetch current stats for both teams
        home_stats = self.fetch_live_team_stats(home_team)
        away_stats = self.fetch_live_team_stats(away_team)

        if not home_stats or not away_stats:
            print("Failed to fetch team stats")
            return None

        # Fetch recent games for rolling averages
        home_recent = self.fetch_recent_games(home_team, num_games=20)
        away_recent = self.fetch_recent_games(away_team, num_games=20)

        # Calculate rolling averages
        home_rolling = self.calculate_rolling_averages(home_team, home_recent)
        away_rolling = self.calculate_rolling_averages(away_team, away_recent)

        # Build feature dictionary
        features = {}

        # Add basic stats
        for stat, value in home_stats.items():
            features[f'HOME_ADV_{stat}'] = value

        for stat, value in away_stats.items():
            features[f'AWAY_ADV_{stat}'] = value

        # Add rolling features
        features.update(home_rolling)
        features.update(away_rolling)

        # Set line (default if not provided)
        if line is None:
            # Simple line estimate based on net rating difference
            net_diff = home_stats.get('NET_RATING', 0) - away_stats.get('NET_RATING', 0)
            line = 225 + (net_diff * 0.3)  # 225 is average NBA total
            line = round(line, 1)
            print(f"Estimated line: {line}")

        features['LINE'] = line

        # Calculate derived features
        features['AVG_PACE'] = (features.get('HOME_ADV_PACE', 100) + features.get('AWAY_ADV_PACE', 100)) / 2
        features['AVG_OFF_RTG'] = (features.get('HOME_ADV_OFF_RATING', 110) + features.get('AWAY_ADV_OFF_RATING', 110)) / 2
        features['PACE_X_OFF'] = features['AVG_PACE'] * features['AVG_OFF_RTG'] / 100

        # Matchup features
        features['PACE_DIFF'] = abs(features.get('HOME_ADV_PACE', 100) - features.get('AWAY_ADV_PACE', 100))
        features['OFF_VS_DEF'] = features.get('HOME_ADV_OFF_RATING', 110) - features.get('AWAY_ADV_DEF_RATING', 110)
        features['DEF_VS_OFF'] = features.get('HOME_ADV_DEF_RATING', 110) - features.get('AWAY_ADV_OFF_RATING', 110)
        features['HOME_STRENGTH'] = features.get('HOME_ADV_NET_RATING', 0) / 100
        features['AWAY_STRENGTH'] = features.get('AWAY_ADV_NET_RATING', 0) / 100
        features['STRENGTH_DIFF'] = features['HOME_STRENGTH'] - features['AWAY_STRENGTH']
        features['TS_PCT_DIFF'] = features.get('HOME_ADV_TS_PCT', 0.55) - features.get('AWAY_ADV_TS_PCT', 0.55)
        features['HOME_OFF_ADV'] = features.get('HOME_ADV_OFF_RATING', 110) - features.get('AWAY_ADV_OFF_RATING', 110)
        features['HOME_DEF_ADV'] = features.get('AWAY_ADV_DEF_RATING', 110) - features.get('HOME_ADV_DEF_RATING', 110)

        # Rebounding and turnover battles
        features['REB_BATTLE'] = (features.get('HOME_ADV_OREB_PCT', 0.25) + features.get('HOME_ADV_DREB_PCT', 0.75) -
                                  features.get('AWAY_ADV_OREB_PCT', 0.25) - features.get('AWAY_ADV_DREB_PCT', 0.75))
        features['TOV_BATTLE'] = features.get('AWAY_ADV_TM_TOV_PCT', 0.13) - features.get('HOME_ADV_TM_TOV_PCT', 0.13)

        return features

    def predict_game(self, home_team, away_team, line=None):
        """
        Make real-time prediction for a game
        """
        # Prepare features
        features = self.prepare_game_features(home_team, away_team, line)

        if features is None:
            return {"error": "Failed to prepare features"}

        # Create dataframe with features
        X_pred = pd.DataFrame([features])

        # Ensure all feature columns exist
        for col in self.feature_names:
            if col not in X_pred.columns:
                X_pred[col] = 0

        # Select only features used in training
        X_pred = X_pred[self.feature_names]

        # Make prediction using your model
        home_pred = self.model['ensembles']['HOME_PTS'].predict(X_pred)[0]
        away_pred = self.model['ensembles']['AWAY_PTS'].predict(X_pred)[0]
        total_pred = self.model['ensembles']['TOTAL_PTS'].predict(X_pred)[0]

        # Get individual model predictions for confidence
        individual_preds = {'home': {}, 'away': {}, 'total': {}}

        for target in ['HOME_PTS', 'AWAY_PTS', 'TOTAL_PTS']:
            for name in ['xgb', 'cat', 'rf']:
                pred = self.model['ensembles'][target].named_estimators_[name].predict(X_pred)[0]
                if target == 'HOME_PTS':
                    individual_preds['home'][name] = pred
                elif target == 'AWAY_PTS':
                    individual_preds['away'][name] = pred
                else:
                    individual_preds['total'][name] = pred

        # Calculate confidence based on agreement
        home_conf = 1 - (np.std(list(individual_preds['home'].values())) / (np.mean(list(individual_preds['home'].values())) + 1e-10))
        away_conf = 1 - (np.std(list(individual_preds['away'].values())) / (np.mean(list(individual_preds['away'].values())) + 1e-10))
        total_conf = 1 - (np.std(list(individual_preds['total'].values())) / (np.mean(list(individual_preds['total'].values())) + 1e-10))

        # Prepare result
        result = {
            'game_info': {
                'matchup': f"{away_team} @ {home_team}",
                'date': datetime.now().strftime('%Y-%m-%d'),
                'line': features.get('LINE', line),
                'data_source': 'Live NBA API (2025-26 season)'
            },
            'team_stats': {
                'home': {
                    'name': home_team,
                    'off_rating': round(features.get('HOME_ADV_OFF_RATING', 0), 2),
                    'def_rating': round(features.get('HOME_ADV_DEF_RATING', 0), 2),
                    'net_rating': round(features.get('HOME_ADV_NET_RATING', 0), 2),
                    'pace': round(features.get('HOME_ADV_PACE', 0), 2)
                },
                'away': {
                    'name': away_team,
                    'off_rating': round(features.get('AWAY_ADV_OFF_RATING', 0), 2),
                    'def_rating': round(features.get('AWAY_ADV_DEF_RATING', 0), 2),
                    'net_rating': round(features.get('AWAY_ADV_NET_RATING', 0), 2),
                    'pace': round(features.get('AWAY_ADV_PACE', 0), 2)
                }
            },
            'predictions': {
                'home': {
                    'team': home_team,
                    'predicted': round(home_pred, 1),
                    'confidence': round(home_conf * 100, 1),
                    'individual': {k: round(v, 1) for k, v in individual_preds['home'].items()}
                },
                'away': {
                    'team': away_team,
                    'predicted': round(away_pred, 1),
                    'confidence': round(away_conf * 100, 1),
                    'individual': {k: round(v, 1) for k, v in individual_preds['away'].items()}
                },
                'total': {
                    'predicted': round(total_pred, 1),
                    'line': features.get('LINE', line),
                    'difference': round(total_pred - features.get('LINE', line), 1),
                    'recommendation': 'OVER' if total_pred > features.get('LINE', line) else 'UNDER',
                    'confidence': round(total_conf * 100, 1),
                    'individual': {k: round(v, 1) for k, v in individual_preds['total'].items()}
                }
            },
            'implied_total': round(home_pred + away_pred, 1),
            'verification': abs(round(home_pred + away_pred, 1) - round(total_pred, 1)) < 0.1
        }

        return result

    def predict_multiple_games(self, games_list):
        """
        Predict multiple games at once

        Args:
            games_list: List of dicts with keys ['home', 'away', 'line'] (line optional)
        """
        results = []

        for i, game in enumerate(games_list):
            print(f"\n{'='*60}")
            print(f"Game {i+1}/{len(games_list)}")

            result = self.predict_game(
                game['home'],
                game['away'],
                game.get('line')
            )

            results.append(result)

            # Small delay to avoid rate limiting
            time.sleep(1)

        return results

    def print_prediction(self, result):
        """Pretty print a prediction result"""
        if 'error' in result:
            print(f"‚ùå Error: {result['error']}")
            return

        print("\n" + "="*70)
        print(f"üèÄ {result['game_info']['matchup']}")
        print(f"üìÖ {result['game_info']['date']}")
        print("="*70)

        print("\nüìä TEAM STATS:")
        print(f"  Home - {result['team_stats']['home']['name']}:")
        print(f"    Off: {result['team_stats']['home']['off_rating']} | Def: {result['team_stats']['home']['def_rating']} | Net: {result['team_stats']['home']['net_rating']} | Pace: {result['team_stats']['home']['pace']}")
        print(f"  Away - {result['team_stats']['away']['name']}:")
        print(f"    Off: {result['team_stats']['away']['off_rating']} | Def: {result['team_stats']['away']['def_rating']} | Net: {result['team_stats']['away']['net_rating']} | Pace: {result['team_stats']['away']['pace']}")

        print("\nüéØ PREDICTIONS:")
        print(f"  Home: {result['predictions']['home']['team']} - {result['predictions']['home']['predicted']} pts (Conf: {result['predictions']['home']['confidence']}%)")
        print(f"  Away: {result['predictions']['away']['team']} - {result['predictions']['away']['predicted']} pts (Conf: {result['predictions']['away']['confidence']}%)")
        print(f"  Total: {result['predictions']['total']['predicted']} pts | Line: {result['predictions']['total']['line']}")

        # Color coded recommendation
        diff = result['predictions']['total']['difference']
        rec = result['predictions']['total']['recommendation']
        if diff > 3:
            rec_display = f"üî¥ {rec} (Strong)"
        elif diff > 1:
            rec_display = f"üü† {rec}"
        elif diff > 0:
            rec_display = f"üü° {rec}"
        elif diff > -1:
            rec_display = f"üü¢ UNDER (Lean)"
        else:
            rec_display = f"üîµ UNDER (Strong)"

        print(f"  Recommendation: {rec_display} (Diff: {result['predictions']['total']['difference']:+})")
        print(f"  Confidence: {result['predictions']['total']['confidence']}%")

        print("\nü§ñ Model Agreement:")
        print(f"  Home: XGB={result['predictions']['home']['individual']['xgb']} | CAT={result['predictions']['home']['individual']['cat']} | RF={result['predictions']['home']['individual']['rf']}")
        print(f"  Away: XGB={result['predictions']['away']['individual']['xgb']} | CAT={result['predictions']['away']['individual']['cat']} | RF={result['predictions']['away']['individual']['rf']}")
        print(f"  Total: XGB={result['predictions']['total']['individual']['xgb']} | CAT={result['predictions']['total']['individual']['cat']} | RF={result['predictions']['total']['individual']['rf']}")

        print("\n" + "="*70)


# ============================================================================
# USAGE EXAMPLE
# ============================================================================

"""
Assuming you already have your trained model from earlier:

model, features, train_data, X_test, y_test, test_df, results = main_multioutput(model_df)

Now you can create the real-time predictor:
"""

# Initialize the predictor with the model, feature list and training frame
# produced by main_multioutput in the earlier cell
predictor = NBARealTimePredictor(model, features, train_data)

# Example 1: Predict a single game
print("\n" + "="*70)
print("PREDICTING SINGLE GAME")
print("="*70)

result = predictor.predict_game(
    home_team="Los Angeles Lakers",
    away_team="Boston Celtics",
    line=228.5  # Optional - if not provided, will estimate
)

predictor.print_prediction(result)

# Example 2: Predict multiple games (tomorrow's slate)
print("\n" + "="*70)
print("PREDICTING MULTIPLE GAMES")
print("="*70)

# Every game below supplies an explicit line; leave the "line" key out of a
# dict to have predict_game estimate one instead.
tomorrow_games = [
    {"home": "Los Angeles Lakers", "away": "Orlando Magic", "line": 235.5},
    {"home": "Portland Trail Blazers", "away": "Minnesota Timberwolves","line": 230},  # explicit line of 230 (not estimated)
    {"home": "Phoenix Suns", "away": "Boston Celtics", "line": 225.5},
    {"home": "Dallas Mavericks", "away": "Brooklyn Nets","line": 240}
]

# NOTE: this rebinds `results`, which the earlier cell bound to the value
# returned by main_multioutput; re-run that cell if the earlier value is
# needed again.
results = predictor.predict_multiple_games(tomorrow_games)

# Print all results
for i, result in enumerate(results):
    predictor.print_prediction(result)

# Example 3: Interactive prediction
def interactive_prediction(predictor):
    """Interactively prompt for matchups and print a prediction for each.

    Loops until the user types 'quit' (case-insensitive) at any prompt.
    A blank betting line is forwarded as ``None``. A non-numeric line is
    rejected with a message and asked for again, instead of raising
    ``ValueError`` and ending the session.

    Args:
        predictor: Object exposing ``predict_game(home, away, line)`` and
            ``print_prediction(result)``.

    Returns:
        None.
    """
    print("\n" + "="*70)
    print("INTERACTIVE PREDICTION MODE")
    print("="*70)
    print("Enter 'quit' to exit")

    while True:
        print("\n" + "-"*40)
        home = input("Home team: ").strip()
        if home.lower() == 'quit':
            break

        away = input("Away team: ").strip()
        if away.lower() == 'quit':
            break

        # Keep asking until the entry is blank, numeric, or 'quit'.
        while True:
            line_input = input("Line (optional, press Enter to skip): ").strip()
            if line_input.lower() == 'quit':
                # Honour the advertised exit word at this prompt as well.
                return
            if not line_input:
                line = None
                break
            try:
                line = float(line_input)
            except ValueError:
                print(f"Invalid line '{line_input}' - enter a number, or press Enter to skip")
                continue
            break

        result = predictor.predict_game(home, away, line)
        predictor.print_prediction(result)

# Uncomment to run interactive mode
# interactive_prediction(predictor)

# Example 4: Export predictions to JSON
def export_predictions(predictor, games_list, filename="predictions.json"):
    """Predict every game in ``games_list`` and write the results to a JSON file.

    Args:
        predictor: Object exposing ``predict_multiple_games(games_list)``.
        games_list: Forwarded unchanged to ``predictor.predict_multiple_games``.
        filename: Destination path; it is opened in write mode, so an
            existing file is overwritten.

    Returns:
        The object returned by ``predict_multiple_games`` (the original
        values, not the JSON-converted ones).
    """
    results = predictor.predict_multiple_games(games_list)

    def json_serializable(obj):
        """Fallback used by ``json.dump`` for values it cannot encode natively."""
        # Arrays become (nested) lists instead of a lossy str() repr.
        if isinstance(obj, np.ndarray): return obj.tolist()
        if isinstance(obj, (np.floating, np.complexfloating)): return float(obj)
        if isinstance(obj, np.integer): return int(obj)
        if isinstance(obj, (np.bool_, bool)): return bool(obj)
        # Last resort: anything else is stored as its string form.
        return str(obj)

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, default=json_serializable)

    print(f"\n‚úÖ Predictions exported to {filename}")
    return results

# Export tomorrow's games
# export_predictions(predictor, tomorrow_games, "tomorrow_predictions.json")


PREDICTING SINGLE GAME

Preparing features for Boston Celtics @ Los Angeles Lakers...
Standardized: Boston Celtics @ Los Angeles Lakers
Error fetching recent games for Los Angeles Lakers: LeagueGameLog.__init__() got an unexpected keyword argument 'team_id_nullable'
Error fetching recent games for Boston Celtics: LeagueGameLog.__init__() got an unexpected keyword argument 'team_id_nullable'

üèÄ Boston Celtics @ Los Angeles Lakers
üìÖ 2026-02-25

üìä TEAM STATS:
  Home - Los Angeles Lakers:
    Off: 116.0 | Def: 116.6 | Net: -0.7 | Pace: 99.54
  Away - Boston Celtics:
    Off: 120.0 | Def: 112.0 | Net: 8.0 | Pace: 95.55

üéØ PREDICTIONS:
  Home: Los Angeles Lakers - 111.7 pts (Conf: 99.5%)
  Away: Boston Celtics - 112.6 pts (Conf: 98.5%)
  Total: 224.4 pts | Line: 228.5
  Recommendation: üîµ UNDER (Strong) (Diff: -4.1)
  Confidence: 99.1%

ü§ñ Model Agreement:
  Home: XGB=112.4000015258789 | CAT=110.9 | RF=111.7
  Away: XGB=110.5999984741211 | CAT=114.7 | RF=112.5
  Total: XGB=2