# Player-Enhanced Features Experiment

**Testing historical momentum + blowout + Haslametrics + player features**

Changes from 02_modeling.ipynb:
- ✅ Enhanced team stats (36 features vs 10 baseline)
- ✅ Momentum features (win streak, recent form, avg margin)
- ✅ Blowout tendency (large margin win/loss rates)
- ✅ Haslametrics offensive efficiency
- ✅ Team-specific home court advantage
- ✅ **Player features (14 features):**
  - Star player power (top 3 scorers PPG, efficiency)
  - Offensive balance (scoring distribution)
  - Bench depth (non-starter production)
  - Key player efficiency (AST/TO, rebounds, usage)

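Goal: test whether player-based features improve MAE. Note that the model in this notebook is still trained on the baseline features only; the player-enhanced stats file feeds only the 2026 prediction inputs (see the final summary cell).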

In [None]:
# Make the parent directory importable so the `src` package below resolves
# when the notebook is run from its own folder.
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from datetime import datetime
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including any raised by pandas / scikit-learn further down.
warnings.filterwarnings('ignore')

# Project-local modules: configuration, Elo ratings, the spread model and the
# Barttorvik fetch helper.
from src import config
from src.elo import EloRatingSystem
from src.models import ImprovedSpreadModel
from src.utils import fetch_barttorvik_year
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

print("Libraries loaded!")

## 1. Load Real Historical Games

In [None]:
# Read the historical results file; the 'date' column is parsed on load.
games = pd.read_csv(config.HISTORICAL_GAMES_FILE, parse_dates=['date'])

# Coverage overview: row count, date span and the seasons present.
first_game, last_game = games['date'].min(), games['date'].max()
season_labels = sorted(games['season'].unique())

print(f"Loaded {len(games)} real games")
print(f"Date range: {first_game} to {last_game}")
print(f"Seasons: {season_labels}")
print(f"\nGames per season:")
print(games['season'].value_counts().sort_index())

# Summary statistics of the 'margin' column.
margin = games['margin']
print(f"\nMargin stats:")
for label, value in (('Mean', margin.mean()), ('Std', margin.std()), ('Median', margin.median())):
    print(f"  {label}: {value:.2f}")

games.head()

## 2. Initialize and Process Elo Ratings Chronologically

In [None]:
# Build the Elo rating system from the values stored in config.ELO_CONFIG.
elo_settings = config.ELO_CONFIG
elo = EloRatingSystem(
    k_factor=elo_settings['k_factor'],
    hca=elo_settings['home_court_advantage'],
    carryover=elo_settings['season_carryover'],
)

# Hand the conference mappings held in config to the rating system.
elo.load_conference_mappings(config.CONFERENCE_MAPPINGS)

print("Elo system initialized with conference mappings from config")

In [None]:
# Run every historical game through the Elo system and keep the per-game
# snapshots it returns.
print("Processing games chronologically to build Elo history...")
print("This may take a minute...\n")

# Names of the columns in `games` that process_games should read.
game_columns = {
    'date_col': 'date',
    'home_col': 'home_team',
    'away_col': 'away_team',
    'home_score_col': 'home_score',
    'away_score_col': 'away_score',
    'neutral_col': 'neutral_site',
    'season_col': 'season',
}
elo_snapshots = elo.process_games(games, save_snapshots=True, **game_columns)

print(f"\n✓ Processed {len(elo_snapshots)} games")
print(f"✓ Tracked {len(elo.ratings)} team Elo ratings")

# Show the current leaders as the cell output.
print("\nTop 15 teams by current Elo:")
elo.get_rankings(top_n=15)

## 3. Fetch Team Efficiency Stats

In [None]:
# Pull one stats table per training year and keep only the team name and the
# two adjusted-efficiency columns, tagged with the year they were fetched for.
all_stats = []
for year in config.TRAINING_YEARS:
    print(f"Fetching {year}...")
    df = fetch_barttorvik_year(year)
    df['season'] = year
    all_stats.append(df[['team', 'adjoe', 'adjde', 'season']])

# Stack the yearly tables and switch to the adj_* names used downstream.
team_stats = pd.concat(all_stats, ignore_index=True).rename(
    columns={'adjoe': 'adj_oe', 'adjde': 'adj_de'}
)
# Efficiency margin = offensive minus defensive efficiency.
team_stats['adj_em'] = team_stats['adj_oe'] - team_stats['adj_de']

print(f"\nLoaded efficiency stats for {len(team_stats)} team-seasons")
team_stats.head()

## 4. Create Training Features from Real Games

In [None]:
# Join each game's Elo snapshot with both teams' season efficiency stats and
# derive the difference features used for training.
print("Creating training features from real game data...")

# A season spans two calendar years (autumn start, spring finish). Using the
# bare calendar year would join Nov/Dec games to the *previous* season's
# stats, so games played from this month onward are assigned to the next year.
# NOTE(review): assumes the years passed to fetch_barttorvik_year are
# season-ending years (e.g. 2024 == the 2023-24 season) - confirm.
SEASON_ROLLOVER_MONTH = 7
game_dates = elo_snapshots['date']
elo_snapshots['season'] = (
    game_dates.dt.year + (game_dates.dt.month >= SEASON_ROLLOVER_MONTH).astype(int)
)


def _attach_team_stats(frame, side):
    """Left-join `team_stats` for one side ('home' or 'away') of each game.

    The adj_oe / adj_de / adj_em columns are prefixed with `side` and the
    redundant 'team' join key is dropped.
    """
    merged = frame.merge(
        team_stats,
        left_on=[f'{side}_team', 'season'],
        right_on=['team', 'season'],
        how='left',
        suffixes=('', f'_{side}'),
    )
    merged = merged.rename(
        columns={col: f'{side}_{col}' for col in ('adj_oe', 'adj_de', 'adj_em')}
    )
    return merged.drop(columns=['team'], errors='ignore')


train_data = _attach_team_stats(elo_snapshots, 'home')
train_data = _attach_team_stats(train_data, 'away')

# Calculate derived features (home minus away)
train_data['eff_diff'] = train_data['home_adj_em'] - train_data['away_adj_em']
train_data['elo_diff'] = train_data['home_elo_before'] - train_data['away_elo_before']

# Drop games where either team has no efficiency data for that season
train_data = train_data.dropna(subset=['home_adj_oe', 'away_adj_oe'])

print(f"✓ Created {len(train_data)} training samples from real games")
print(f"\nFeature columns:")
print([c for c in train_data.columns if 'adj' in c or 'elo' in c or 'diff' in c])

In [None]:
# The training feature list comes from config; the target is the realised
# margin stored in the 'actual_margin' column.
feature_cols = config.BASELINE_FEATURES

X, y = train_data[feature_cols], train_data['actual_margin']

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Features: {feature_cols}")

# Summary statistics of the target.
print(f"\nTarget (actual_margin) stats:")
for label, value in (('Mean', y.mean()), ('Std', y.std()), ('Median', y.median())):
    print(f"  {label}: {value:.2f}")

## 5. Train Model on Real Data

In [None]:
# Fit the Ridge + LightGBM ensemble on the full training set, with every
# hyper-parameter read from config.MODEL_CONFIG.
print("Training ImprovedSpreadModel on REAL game data...\n")

model_cfg = config.MODEL_CONFIG
lgbm_settings = {
    key: model_cfg[key] for key in ('n_estimators', 'max_depth', 'learning_rate')
}
model = ImprovedSpreadModel(
    ridge_alpha=model_cfg['ridge_alpha'],
    lgbm_params=lgbm_settings,
    weights=(model_cfg['ridge_weight'], model_cfg['lgbm_weight']),
    use_lgbm=True,
)

model.fit(X, y)
print("✓ Model trained!\n")

# Per-component error. These are in-sample figures: they are scored on the
# same X, y the model was just fitted on.
components = model.predict_components(X)
for name, preds in components.items():
    residual = preds - y
    mae = np.abs(residual).mean()
    rmse = np.sqrt((residual ** 2).mean())
    print(f"{name:12} MAE={mae:.3f}, RMSE={rmse:.3f}")

In [None]:
# Time-series cross-validation: each fold trains on an earlier slice of rows
# and validates on the slice that follows, so X / y must be in game order.
n_splits = config.CV_CONFIG['n_splits']
# Report the configured fold count rather than a hard-coded "5".
print(f"\nRunning {n_splits}-fold time-series cross-validation...\n")

tscv = TimeSeriesSplit(n_splits=n_splits)
cv_results = {'ridge': [], 'ensemble': []}

for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Build each fold model exactly like the final model (including
    # use_lgbm=True) so the CV score describes the model that is deployed.
    fold_model = ImprovedSpreadModel(
        ridge_alpha=config.MODEL_CONFIG['ridge_alpha'],
        lgbm_params={
            'n_estimators': config.MODEL_CONFIG['n_estimators'],
            'max_depth': config.MODEL_CONFIG['max_depth'],
            'learning_rate': config.MODEL_CONFIG['learning_rate'],
        },
        weights=(
            config.MODEL_CONFIG['ridge_weight'],
            config.MODEL_CONFIG['lgbm_weight']
        ),
        use_lgbm=True
    )
    fold_model.fit(X_train, y_train)

    preds = fold_model.predict(X_val)
    components = fold_model.predict_components(X_val)

    # Out-of-sample MAE for the Ridge component alone and for the blend.
    ridge_mae = np.abs(components['ridge'] - y_val).mean()
    ensemble_mae = np.abs(preds - y_val).mean()

    cv_results['ridge'].append(ridge_mae)
    cv_results['ensemble'].append(ensemble_mae)

    print(f"Fold {fold+1}: Ridge MAE={ridge_mae:.3f}, Ensemble MAE={ensemble_mae:.3f}")

print(f"\n{'='*60}")
print(f"Ridge CV MAE:    {np.mean(cv_results['ridge']):.3f} ± {np.std(cv_results['ridge']):.3f}")
print(f"Ensemble CV MAE: {np.mean(cv_results['ensemble']):.3f} ± {np.std(cv_results['ensemble']):.3f}")
print(f"{'='*60}")

## 6. Generate 2026 Predictions with Player-Enhanced Features

In [None]:
# Read the player-enhanced 2026 team stats and the submission template.
stats_path = config.PROCESSED_DATA_DIR / 'team_stats_2025_26_player_enhanced.csv'
template_path = config.DATA_DIR.parent / config.SUBMISSION_TEMPLATE

team_stats_2026 = pd.read_csv(stats_path)
# Template rows without both a home and an away team cannot be predicted.
template = pd.read_csv(template_path).dropna(subset=['Home', 'Away'])

print(f"Teams for 2026: {len(team_stats_2026)}")
print(f"Games to predict: {len(template)}")
print(f"\nPlayer-enhanced features available ({len(team_stats_2026.columns)} total):")

# Static description of the feature groups in the stats file.
feature_group_notes = (
    "Baseline features (10): off_efficiency, def_efficiency, elo_rating, etc.",
    "Historical features (11): win_streak, recent_form, avg_margin, blowout tendency",
    "Haslametrics (2): haslametrics_off_eff, haslametrics_rank",
    "Player features (14): star PPG, bench depth, offensive balance, efficiency",
)
for note in feature_group_notes:
    print(note)
print(f"\nSample columns: {team_stats_2026.columns[:5].tolist()}...")

In [None]:
# Build one feature row per template game for which both teams appear in the
# 2026 stats file. `valid_indices` records the template row label of each
# feature row so predictions can be written back to the right game.
team_dict = team_stats_2026.set_index('team').to_dict('index')

# The .get(..., 100) fallbacks below only fire when a column is absent from
# the stats file altogether, which would flatten every matchup - say so
# instead of failing silently.
missing_cols = [
    col for col in ('off_efficiency', 'def_efficiency')
    if col not in team_stats_2026.columns
]
if missing_cols:
    print(f"WARNING: 2026 team stats lack {missing_cols}; defaulting those values to 100")

pred_features = []
valid_indices = []
skipped_games = []

for idx, row in template.iterrows():
    home = row['Home']
    away = row['Away']

    # No stats for one of the teams: the game gets no prediction. Keep track
    # of it so the gap is reported below.
    if home not in team_dict or away not in team_dict:
        skipped_games.append((away, home))
        continue

    home_stats = team_dict[home]
    away_stats = team_dict[away]

    home_oe = home_stats.get('off_efficiency', 100)
    home_de = home_stats.get('def_efficiency', 100)
    away_oe = away_stats.get('off_efficiency', 100)
    away_de = away_stats.get('def_efficiency', 100)

    home_em = home_oe - home_de
    away_em = away_oe - away_de

    # Look each rating up once and reuse it for the difference feature.
    home_elo = elo.get_rating(home)
    away_elo = elo.get_rating(away)

    features = {
        'home_adj_oe': home_oe,
        'home_adj_de': home_de,
        'home_adj_em': home_em,
        'away_adj_oe': away_oe,
        'away_adj_de': away_de,
        'away_adj_em': away_em,
        'eff_diff': home_em - away_em,
        'home_elo_before': home_elo,
        'away_elo_before': away_elo,
        'elo_diff': home_elo - away_elo,
        'predicted_spread': elo.predict_spread(home, away),
    }

    pred_features.append(features)
    valid_indices.append(idx)

X_pred = pd.DataFrame(pred_features)
print(f"✓ Created features for {len(X_pred)} games")

if skipped_games:
    unknown_teams = sorted(
        {team for game in skipped_games for team in game if team not in team_dict},
        key=str,
    )
    print(f"WARNING: skipped {len(skipped_games)} games; teams missing from 2026 stats: {unknown_teams}")

In [None]:
# Generate predictions.
# X_pred's columns come from the per-game feature dict, so they are not
# guaranteed to match the set or order of columns the model was fitted on.
# Select exactly the training features; a missing one raises KeyError here
# instead of producing silently misaligned predictions.
model_inputs = X_pred[list(feature_cols)]
predictions = model.predict(model_inputs)
components = model.predict_components(model_inputs)

# One row per predicted game, labelled with the template row it belongs to.
pred_frame = pd.DataFrame(
    {
        'pt_spread': np.asarray(predictions),
        'ridge_pred': np.asarray(components['ridge']),
        'lgbm_pred': np.asarray(components['lgbm']),
        'elo_spread': X_pred['predicted_spread'].to_numpy(),
    },
    index=valid_indices,
)

# Write the values back onto a copy of the template. Games that were skipped
# during feature creation keep whatever the template held (NaN for columns
# created here).
results = template.copy()
for col in pred_frame.columns:
    if col not in results.columns:
        results[col] = np.nan
    results.loc[pred_frame.index, col] = pred_frame[col].to_numpy()

print("✓ Predictions generated!")
results[['Date', 'Away', 'Home', 'pt_spread', 'ridge_pred', 'lgbm_pred']].head(15)

## 7. Save Player-Enhanced Predictions

In [None]:
# Prepare the submission: keep only games that received a prediction.
submission = results[['Date', 'Away', 'Home', 'pt_spread']].copy()
submission = submission.dropna(subset=['pt_spread'])

# Without at least one row there is nowhere to put the team name; fail with a
# clear message rather than an IndexError from submission.index[0].
if submission.empty:
    raise ValueError("No predictions to submit: every game is missing a pt_spread value")

for column in ('team_name', 'team_member', 'team_email'):
    submission[column] = ''

# Team info from config: the team name goes on the first row, then one member
# per row from the top. zip() stops at the shorter sequence, so members beyond
# the number of rows are left out (same as the previous bound check).
team_members = config.TEAM_INFO['members']
submission.loc[submission.index[0], 'team_name'] = config.TEAM_INFO['team_name']
for row_label, member in zip(submission.index, team_members):
    submission.loc[row_label, 'team_member'] = member['name']
    submission.loc[row_label, 'team_email'] = member['email']

# Save to player-enhanced path for comparison
player_enhanced_output = config.PREDICTIONS_DIR / 'tsa_pt_spread_CMMT_2026_player_enhanced.csv'
submission.to_csv(player_enhanced_output, index=False)
print(f"✓ Saved: {player_enhanced_output}")

In [None]:
# Print a recap of the run: data sizes, CV error and the feature caveat.
banner = "=" * 60

print("\n" + banner)
print("PLAYER-ENHANCED MODEL SUMMARY")
print(banner)
print(f"Training: {len(train_data)} real games (2020-2025)")
print(f"Features: {len(feature_cols)} baseline features (for training)")
print(f"Predictions: {len(submission)} games\n")

# Mean ± std of the per-fold MAE collected during cross-validation.
print(f"Cross-Validation Results:")
for label, key in (("Ridge MAE:", 'ridge'), ("Ensemble MAE:", 'ensemble')):
    fold_maes = cv_results[key]
    print(f"  {label:<14}{np.mean(fold_maes):.3f} ± {np.std(fold_maes):.3f}")

# Reminder that the extra feature groups never reach the training data.
caveat_lines = (
    "\nNote: Player-enhanced features (36 total) only used in 2026 team stats:",
    "      - 10 baseline (off/def efficiency, elo)",
    "      - 11 historical (momentum, blowout tendency)",
    "      - 2 Haslametrics (offensive efficiency, rank)",
    "      - 14 player (star power, bench depth, balance, efficiency)",
    "      Training data still uses baseline features only.",
)
for line in caveat_lines:
    print(line)
print(banner)

## 8. Compare Baseline vs Player-Enhanced Predictions

In [None]:
# Load the baseline and player-enhanced prediction files and line them up
# game by game.
baseline_pred = pd.read_csv(config.PREDICTION_OUTPUT_FILE)
player_enhanced_pred = pd.read_csv(player_enhanced_output)

# Inner join on the game key: only games present in both files are compared.
comparison = baseline_pred[['Date', 'Away', 'Home', 'pt_spread']].copy()
comparison = comparison.rename(columns={'pt_spread': 'baseline_spread'})
comparison = comparison.merge(
    player_enhanced_pred[['Date', 'Away', 'Home', 'pt_spread']],
    on=['Date', 'Away', 'Home'],
    how='inner'
)
comparison = comparison.rename(columns={'pt_spread': 'player_enhanced_spread'})
# Signed change: positive means the player-enhanced spread is higher.
comparison['difference'] = comparison['player_enhanced_spread'] - comparison['baseline_spread']

print(f"Comparing {len(comparison)} predictions:\n")
print(f"Mean difference: {comparison['difference'].mean():.3f}")
print(f"Std difference:  {comparison['difference'].std():.3f}")
print(f"Max difference:  {comparison['difference'].abs().max():.3f}")
print(f"\nTop 10 games with biggest prediction changes:")

# Rank by the magnitude of the change. Ranking the signed difference would
# show only the largest increases and hide equally large decreases.
biggest_changes = comparison['difference'].abs().nlargest(10).index
comparison.loc[biggest_changes, ['Date', 'Away', 'Home', 'baseline_spread', 'player_enhanced_spread', 'difference']]