# Real Historical Data Training

**Upgrade from synthetic to real game outcomes**

Changes from 05_improved_modeling.ipynb:
- ✅ Real game results (33,746 games from 2020-2025)
- ✅ Chronological Elo rating updates
- ✅ More accurate margin variance

Expected: **MAE 8.818 → ~8.5-8.6**

In [1]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import our modules
from src import config
from src.elo import EloRatingSystem
from src.models import ImprovedSpreadModel
from src.utils import fetch_barttorvik_year
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

print("Libraries loaded!")

Libraries loaded!


## 1. Load Real Historical Games

In [2]:
# Load historical game results
games = pd.read_csv(config.HISTORICAL_GAMES_FILE, parse_dates=['date'])

print(f"Loaded {len(games)} real games")
print(f"Date range: {games['date'].min()} to {games['date'].max()}")
print(f"Seasons: {sorted(games['season'].unique())}")
print(f"\nGames per season:")
print(games['season'].value_counts().sort_index())
print(f"\nMargin stats:")
print(f"  Mean: {games['margin'].mean():.2f}")
print(f"  Std: {games['margin'].std():.2f}")
print(f"  Median: {games['margin'].median():.2f}")

games.head()

Loaded 33746 real games
Date range: 2019-11-05 00:00:00 to 2025-03-08 00:00:00
Seasons: [np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]

Games per season:
season
2020    5747
2021    4338
2022    5661
2023    6250
2024    5798
2025    5952
Name: count, dtype: int64

Margin stats:
  Mean: 2.83
  Std: 18.70
  Median: 2.00


Unnamed: 0,date,home_team,away_team,home_score,away_score,neutral_site,season,margin
0,2019-11-05,Abilene Christian,Arlington Baptist,90.0,39.0,False,2020,51.0
1,2019-11-05,N.C. A&T,UNC Greensboro,50.0,83.0,True,2020,-33.0
2,2019-11-05,Nebraska,UC Riverside,47.0,66.0,False,2020,-19.0
3,2019-11-05,Nevada,Utah,74.0,79.0,False,2020,-5.0
4,2019-11-05,New Hampshire,Curry,93.0,29.0,False,2020,64.0


## 2. Initialize and Process Elo Ratings Chronologically

In [3]:
# Initialize Elo system using config
elo = EloRatingSystem(
    k_factor=config.ELO_CONFIG['k_factor'],
    hca=config.ELO_CONFIG['home_court_advantage'],
    carryover=config.ELO_CONFIG['season_carryover']
)

# Load conference mappings from config
elo.load_conference_mappings(config.CONFERENCE_MAPPINGS)

print("Elo system initialized with conference mappings from config")

Elo system initialized with conference mappings from config


In [4]:
# Process games chronologically to build Elo ratings
print("Processing games chronologically to build Elo history...")
print("This may take a minute...\n")

elo_snapshots = elo.process_games(
    games,
    date_col='date',
    home_col='home_team',
    away_col='away_team',
    home_score_col='home_score',
    away_score_col='away_score',
    neutral_col='neutral_site',
    season_col='season',
    save_snapshots=True
)

print(f"\n✓ Processed {len(elo_snapshots)} games")
print(f"✓ Tracked {len(elo.ratings)} team Elo ratings")

# Display top teams
print("\nTop 15 teams by current Elo:")
elo.get_rankings(top_n=15)

Processing games chronologically to build Elo history...
This may take a minute...


✓ Processed 33746 games
✓ Tracked 1213 team Elo ratings

Top 15 teams by current Elo:


Unnamed: 0,rank,team,elo,conference
0,1,Houston,2347.620519,Big 12
1,2,Florida,2282.359698,SEC
2,3,Michigan St.,2280.357765,Big Ten
3,4,Duke,2272.182835,ACC
4,5,St. John's (NY),2257.560185,Big East
5,6,Tennessee,2233.59624,SEC
6,7,Auburn,2227.623663,SEC
7,8,Saint Mary's (CA),2203.262061,WCC
8,9,Maryland,2191.847523,Big Ten
9,10,Alabama,2182.195751,SEC


## 3. Merge Elo with Team Stats

In [5]:
# Fetch team stats for training years using config and utils
all_stats = []
for year in config.TRAINING_YEARS:
    print(f"Fetching {year}...")
    df = fetch_barttorvik_year(year)
    df['season'] = year
    all_stats.append(df[['team', 'adjoe', 'adjde', 'season']])

team_stats = pd.concat(all_stats, ignore_index=True)
team_stats.columns = ['team', 'adj_oe', 'adj_de', 'season']
team_stats['adj_em'] = team_stats['adj_oe'] - team_stats['adj_de']

print(f"\nLoaded efficiency stats for {len(team_stats)} team-seasons")
team_stats.head()

Fetching 2020...
Fetching 2021...
Fetching 2022...
Fetching 2023...
Fetching 2024...
Fetching 2025...

Loaded efficiency stats for 2147 team-seasons


Unnamed: 0,team,adj_oe,adj_de,season,adj_em
0,Kansas,116.0808,87.722046,2020,28.358754
1,Baylor,114.452832,88.367926,2020,26.084906
2,Gonzaga,121.264452,94.330489,2020,26.933963
3,Dayton,119.50183,93.396664,2020,26.105165
4,Michigan St.,114.77197,91.316497,2020,23.455473


## 4. Create Training Features from Real Games

In [6]:
# Merge Elo snapshots with team efficiency stats
print("Creating training features from real game data...")

# Add efficiency stats to elo_snapshots based on team and date
# Match by season (extract from date)
elo_snapshots['season'] = elo_snapshots['date'].dt.year
initial_count = len(elo_snapshots)

# Merge home team stats
train_data = elo_snapshots.merge(
    team_stats,
    left_on=['home_team', 'season'],
    right_on=['team', 'season'],
    how='left',
    suffixes=('', '_home')
)
train_data = train_data.rename(columns={'adj_oe': 'home_adj_oe', 'adj_de': 'home_adj_de', 'adj_em': 'home_adj_em'})
train_data = train_data.drop(columns=['team'], errors='ignore')

# Merge away team stats
train_data = train_data.merge(
    team_stats,
    left_on=['away_team', 'season'],
    right_on=['team', 'season'],
    how='left',
    suffixes=('', '_away')
)
train_data = train_data.rename(columns={'adj_oe': 'away_adj_oe', 'adj_de': 'away_adj_de', 'adj_em': 'away_adj_em'})
train_data = train_data.drop(columns=['team'], errors='ignore')

# Calculate derived features
train_data['eff_diff'] = train_data['home_adj_em'] - train_data['away_adj_em']
train_data['elo_diff'] = train_data['home_elo_before'] - train_data['away_elo_before']

# Data Loss Analysis
print(f"\nData Loss Analysis:")
print(f"  Initial games: {initial_count:,}")

# Count missing efficiency data
missing_home = train_data['home_adj_oe'].isna().sum()
missing_away = train_data['away_adj_oe'].isna().sum()
missing_either = train_data[['home_adj_oe', 'away_adj_oe']].isna().any(axis=1).sum()

print(f"  Missing home stats: {missing_home:,} games ({missing_home/initial_count*100:.1f}%)")
print(f"  Missing away stats: {missing_away:,} games ({missing_away/initial_count*100:.1f}%)")
print(f"  Missing either stat: {missing_either:,} games ({missing_either/initial_count*100:.1f}%)")

# Drop rows with missing efficiency data
train_data = train_data.dropna(subset=['home_adj_oe', 'away_adj_oe'])
final_count = len(train_data)
retention_rate = final_count / initial_count * 100

print(f"  Final training games: {final_count:,} ({retention_rate:.1f}% retained)")
print(f"  Dropped: {initial_count - final_count:,} games ({100-retention_rate:.1f}%)")

print(f"\n✓ Created {len(train_data)} training samples from real games")
print(f"\nFeature columns:")
print([c for c in train_data.columns if 'adj' in c or 'elo' in c or 'diff' in c])

Creating training features from real game data...

Data Loss Analysis:
  Initial games: 33,746
  Missing home stats: 9,256 games (27.4%)
  Missing away stats: 11,406 games (33.8%)
  Missing either stat: 15,722 games (46.6%)
  Final training games: 18,024 (53.4% retained)
  Dropped: 15,722 games (46.6%)

✓ Created 18024 training samples from real games

Feature columns:
['home_elo_before', 'away_elo_before', 'home_elo_after', 'away_elo_after', 'home_adj_oe', 'home_adj_de', 'home_adj_em', 'away_adj_oe', 'away_adj_de', 'away_adj_em', 'eff_diff', 'elo_diff']


In [7]:
# Define features using config
feature_cols = config.BASELINE_FEATURES

X = train_data[feature_cols]
y = train_data['actual_margin']

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Features: {feature_cols}")
print(f"\nTarget (actual_margin) stats:")
print(f"  Mean: {y.mean():.2f}")
print(f"  Std: {y.std():.2f}")
print(f"  Median: {y.median():.2f}")

X shape: (18024, 11)
y shape: (18024,)
Features: ['home_adj_oe', 'home_adj_de', 'home_adj_em', 'away_adj_oe', 'away_adj_de', 'away_adj_em', 'eff_diff', 'home_elo_before', 'away_elo_before', 'elo_diff', 'predicted_spread']

Target (actual_margin) stats:
  Mean: -0.38
  Std: 14.88
  Median: -1.00


## 5. Train Model on Real Data

In [8]:
# Train improved model using config parameters
print("Training ImprovedSpreadModel on REAL game data...\n")

model = ImprovedSpreadModel(
    ridge_alpha=config.MODEL_CONFIG['ridge_alpha'],
    lgbm_params={
        'n_estimators': config.MODEL_CONFIG['n_estimators'],
        'max_depth': config.MODEL_CONFIG['max_depth'],
        'learning_rate': config.MODEL_CONFIG['learning_rate'],
    },
    weights=(
        config.MODEL_CONFIG['ridge_weight'],
        config.MODEL_CONFIG['lgbm_weight']
    ),
    use_lgbm=True
)

model.fit(X, y)
print("✓ Model trained!\n")

# Component performance
components = model.predict_components(X)
for name, preds in components.items():
    mae = np.abs(preds - y).mean()
    rmse = np.sqrt(((preds - y) ** 2).mean())
    print(f"{name:12} MAE={mae:.3f}, RMSE={rmse:.3f}")

Training ImprovedSpreadModel on REAL game data...

✓ Model trained!

ridge        MAE=5.911, RMSE=7.633
lgbm         MAE=4.144, RMSE=5.752
ensemble     MAE=4.601, RMSE=6.209


In [9]:
# Feature Importance Analysis
print("\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*60)

if hasattr(model, 'lgbm') and hasattr(model.lgbm, 'feature_importances_'):
    import matplotlib.pyplot as plt
    
    # Get feature importances from LightGBM component
    importances = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.lgbm.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nLightGBM Feature Importances (gain):")
    print(importances.to_string(index=False))
    
    # Plot top 10 features
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(importances)), importances['importance'])
    plt.yticks(range(len(importances)), importances['feature'])
    plt.xlabel('Importance (Gain)')
    plt.title('Feature Importance - LightGBM Component')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
    
    # Ridge coefficients (absolute values)
    if hasattr(model, 'ridge') and hasattr(model.ridge, 'coef_'):
        ridge_importance = pd.DataFrame({
            'feature': feature_cols,
            'coefficient': model.ridge.coef_,
            'abs_coefficient': np.abs(model.ridge.coef_)
        }).sort_values('abs_coefficient', ascending=False)
        
        print("\nRidge Regression Coefficients:")
        print(ridge_importance.to_string(index=False))
else:
    print("Feature importance not available for this model type")


FEATURE IMPORTANCE ANALYSIS
Feature importance not available for this model type


In [10]:
# Cross-validation on real data using config
print("\nRunning 5-fold time-series cross-validation...\n")

tscv = TimeSeriesSplit(n_splits=config.CV_CONFIG['n_splits'])
cv_results = {'ridge': [], 'ensemble': []}

for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    fold_model = ImprovedSpreadModel(
        ridge_alpha=config.MODEL_CONFIG['ridge_alpha'],
        lgbm_params={
            'n_estimators': config.MODEL_CONFIG['n_estimators'],
            'max_depth': config.MODEL_CONFIG['max_depth'],
            'learning_rate': config.MODEL_CONFIG['learning_rate'],
        },
        weights=(
            config.MODEL_CONFIG['ridge_weight'],
            config.MODEL_CONFIG['lgbm_weight']
        )
    )
    fold_model.fit(X_train, y_train)
    
    preds = fold_model.predict(X_val)
    components = fold_model.predict_components(X_val)
    
    ridge_mae = np.abs(components['ridge'] - y_val).mean()
    ensemble_mae = np.abs(preds - y_val).mean()
    
    cv_results['ridge'].append(ridge_mae)
    cv_results['ensemble'].append(ensemble_mae)
    
    print(f"Fold {fold+1}: Ridge MAE={ridge_mae:.3f}, Ensemble MAE={ensemble_mae:.3f}")

print(f"\n{'='*60}")
print(f"Ridge CV MAE:    {np.mean(cv_results['ridge']):.3f} ± {np.std(cv_results['ridge']):.3f}")
print(f"Ensemble CV MAE: {np.mean(cv_results['ensemble']):.3f} ± {np.std(cv_results['ensemble']):.3f}")
print(f"{'='*60}")


Running 5-fold time-series cross-validation...

Fold 1: Ridge MAE=6.070, Ensemble MAE=5.529
Fold 2: Ridge MAE=6.108, Ensemble MAE=5.301
Fold 3: Ridge MAE=5.674, Ensemble MAE=4.805
Fold 4: Ridge MAE=6.076, Ensemble MAE=5.209
Fold 5: Ridge MAE=5.946, Ensemble MAE=5.010

Ridge CV MAE:    5.975 ± 0.160
Ensemble CV MAE: 5.171 ± 0.248


## 6. Generate 2026 Predictions with Real-Data-Trained Model

In [11]:
# Load 2026 prediction template and team stats using config paths
team_stats_2026 = pd.read_csv(config.PROCESSED_DATA_DIR / 'team_stats_2025_26.csv')
template = pd.read_csv(config.DATA_DIR.parent / config.SUBMISSION_TEMPLATE)
template = template.dropna(subset=['Home', 'Away'])

print(f"Teams for 2026: {len(team_stats_2026)}")
print(f"Games to predict: {len(template)}")

Teams for 2026: 21
Games to predict: 78


In [12]:
# Create prediction features
team_dict = team_stats_2026.set_index('team').to_dict('index')

pred_features = []
valid_indices = []

for idx, row in template.iterrows():
    home = row['Home']
    away = row['Away']
    
    if home not in team_dict or away not in team_dict:
        continue
    
    home_stats = team_dict[home]
    away_stats = team_dict[away]
    
    home_oe = home_stats.get('off_efficiency', 100)
    home_de = home_stats.get('def_efficiency', 100)
    away_oe = away_stats.get('off_efficiency', 100)
    away_de = away_stats.get('def_efficiency', 100)
    
    features = {
        'home_adj_oe': home_oe,
        'home_adj_de': home_de,
        'home_adj_em': home_oe - home_de,
        'away_adj_oe': away_oe,
        'away_adj_de': away_de,
        'away_adj_em': away_oe - away_de,
        'eff_diff': (home_oe - home_de) - (away_oe - away_de),
        'home_elo_before': elo.get_rating(home),
        'away_elo_before': elo.get_rating(away),
        'elo_diff': elo.get_rating(home) - elo.get_rating(away),
        'predicted_spread': elo.predict_spread(home, away),
    }
    
    pred_features.append(features)
    valid_indices.append(idx)

X_pred = pd.DataFrame(pred_features)
print(f"✓ Created features for {len(X_pred)} games")

✓ Created features for 78 games


In [13]:
# Generate predictions
predictions = model.predict(X_pred)
components = model.predict_components(X_pred)

results = template.copy()
for i, idx in enumerate(valid_indices):
    results.loc[idx, 'pt_spread'] = predictions[i]
    results.loc[idx, 'ridge_pred'] = components['ridge'][i]
    results.loc[idx, 'lgbm_pred'] = components['lgbm'][i]
    results.loc[idx, 'elo_spread'] = X_pred.iloc[i]['predicted_spread']

print("✓ Predictions generated!")
results[['Date', 'Away', 'Home', 'pt_spread', 'ridge_pred', 'lgbm_pred']].head(15)

✓ Predictions generated!


Unnamed: 0,Date,Away,Home,pt_spread,ridge_pred,lgbm_pred
0,2/7/2026,Syracuse,Virginia,14.274158,11.17579,15.602029
1,2/7/2026,Louisville,Wake Forest,2.036893,-1.64449,3.614629
2,2/7/2026,Virginia Tech,NC State,9.857153,8.771754,10.322324
3,2/7/2026,Miami,Boston College,7.865321,3.555695,9.712304
4,2/7/2026,SMU,Pitt,-6.016245,-5.741065,-6.134179
5,2/7/2026,Florida State,Notre Dame,10.963625,11.989109,10.524132
6,2/7/2026,Duke,North Carolina,-0.678596,-1.927966,-0.143151
7,2/7/2026,Clemson,California,-9.267491,-7.231119,-10.140222
8,2/7/2026,Georgia Tech,Stanford,10.327767,9.898498,10.511739
9,2/9/2026,NC State,Louisville,18.625434,18.235914,18.792372


## 7. Save Real-Data Predictions

In [14]:
# Prepare submission using config team info
submission = results[['Date', 'Away', 'Home', 'pt_spread']].copy()
submission = submission.dropna(subset=['pt_spread'])

submission['team_name'] = ''
submission['team_member'] = ''
submission['team_email'] = ''

# Use team info from config
team_members = config.TEAM_INFO['members']
submission.loc[submission.index[0], 'team_name'] = config.TEAM_INFO['team_name']
for i, member in enumerate(team_members):
    if i < len(submission):
        submission.loc[submission.index[i], 'team_member'] = member['name']
        submission.loc[submission.index[i], 'team_email'] = member['email']

# Save to config path
submission.to_csv(config.PREDICTION_OUTPUT_FILE, index=False)
print(f"✓ Saved: {config.PREDICTION_OUTPUT_FILE}")

✓ Saved: /Users/calebhan/Documents/Coding/Personal/triangle-sports-analytics-26/notebooks/../data/predictions/tsa_pt_spread_CMMT_2026.csv


In [15]:
# Final Summary
print("\n" + "="*60)
print("REAL DATA MODEL SUMMARY")
print("="*60)
print(f"Training: {len(train_data)} real games (2020-2025)")
print(f"Features: {len(feature_cols)}")
print(f"Predictions: {len(submission)} games\n")
print(f"Cross-Validation Results:")
print(f"  Ridge MAE:    {np.mean(cv_results['ridge']):.3f} ± {np.std(cv_results['ridge']):.3f}")
print(f"  Ensemble MAE: {np.mean(cv_results['ensemble']):.3f} ± {np.std(cv_results['ensemble']):.3f}")
print("="*60)


REAL DATA MODEL SUMMARY
Training: 18024 real games (2020-2025)
Features: 11
Predictions: 78 games

Cross-Validation Results:
  Ridge MAE:    5.975 ± 0.160
  Ensemble MAE: 5.171 ± 0.248
