# Real Historical Data Training

**Upgrade from synthetic to real game outcomes**

Changes from 05_improved_modeling.ipynb:
- ✅ Real game results (33,746 games from 2020-2025)
- ✅ Chronological Elo rating updates
- ✅ More accurate margin variance

Expected: **MAE 8.818 → ~8.5-8.6**

In [1]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import our modules
from src.elo import EloRatingSystem
from src.features import FeatureEngine
from src.models import ImprovedSpreadModel
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

print("Libraries loaded!")

Libraries loaded!


## 1. Load Real Historical Games

In [2]:
# Load historical game results
games = pd.read_csv('../data/raw/games/historical_games_2019_2025.csv', parse_dates=['date'])

print(f"Loaded {len(games)} real games")
print(f"Date range: {games['date'].min()} to {games['date'].max()}")
print(f"Seasons: {sorted(games['season'].unique())}")
print(f"\nGames per season:")
print(games['season'].value_counts().sort_index())
print(f"\nMargin stats:")
print(f"  Mean: {games['margin'].mean():.2f}")
print(f"  Std: {games['margin'].std():.2f}")
print(f"  Median: {games['margin'].median():.2f}")

games.head()

Loaded 33746 real games
Date range: 2019-11-05 00:00:00 to 2025-03-08 00:00:00
Seasons: [np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]

Games per season:
season
2020    5747
2021    4338
2022    5661
2023    6250
2024    5798
2025    5952
Name: count, dtype: int64

Margin stats:
  Mean: 2.83
  Std: 18.70
  Median: 2.00


Unnamed: 0,date,home_team,away_team,home_score,away_score,neutral_site,season,margin
0,2019-11-05,Abilene Christian,Arlington Baptist,90.0,39.0,False,2020,51.0
1,2019-11-05,N.C. A&T,UNC Greensboro,50.0,83.0,True,2020,-33.0
2,2019-11-05,Nebraska,UC Riverside,47.0,66.0,False,2020,-19.0
3,2019-11-05,Nevada,Utah,74.0,79.0,False,2020,-5.0
4,2019-11-05,New Hampshire,Curry,93.0,29.0,False,2020,64.0


## 2. Initialize and Process Elo Ratings Chronologically

In [3]:
# Initialize Elo system
elo = EloRatingSystem(
    k_factor=38,
    hca=4.0,
    carryover=0.64
)

# Load conference mappings
conferences = {
    'ACC': ['Duke', 'North Carolina', 'NC State', 'Virginia', 'Virginia Tech',
           'Clemson', 'Florida State', 'Miami', 'Pitt', 'Syracuse', 'Louisville',
           'Wake Forest', 'Georgia Tech', 'Boston College', 'Notre Dame',
           'California', 'Stanford', 'SMU'],
    'SEC': ['Kentucky', 'Tennessee', 'Alabama', 'Auburn', 'Florida', 'Texas A&M',
           'Arkansas', 'LSU', 'Mississippi State', 'Ole Miss', 'Missouri',
           'South Carolina', 'Vanderbilt', 'Georgia'],
    'Big Ten': ['Purdue', 'Michigan', 'Michigan State', 'Ohio State', 'Illinois',
               'Indiana', 'Iowa', 'Wisconsin', 'Minnesota', 'Northwestern'],
    'Big 12': ['Houston', 'Kansas', 'Baylor', 'Iowa State', 'BYU'],
    'Big East': ['UConn', 'Creighton', 'Marquette', 'Villanova', 'Xavier'],
}
elo.load_conference_mappings(conferences)

print("Elo system initialized with conference mappings")

Elo system initialized with conference mappings


In [4]:
# Process games chronologically to build Elo ratings
print("Processing games chronologically to build Elo history...")
print("This may take a minute...\n")

elo_snapshots = elo.process_games(
    games,
    date_col='date',
    home_col='home_team',
    away_col='away_team',
    home_score_col='home_score',
    away_score_col='away_score',
    neutral_col='neutral_site',
    season_col='season',
    save_snapshots=True
)

print(f"\n✓ Processed {len(elo_snapshots)} games")
print(f"✓ Tracked {len(elo.ratings)} team Elo ratings")

# Display top teams
print("\nTop 15 teams by current Elo:")
elo.get_rankings(top_n=15)

Processing games chronologically to build Elo history...
This may take a minute...

Season change: 2020 -> 2021
Season change: 2021 -> 2022
Season change: 2022 -> 2023
Season change: 2023 -> 2024
Season change: 2024 -> 2025

✓ Processed 33746 games
✓ Tracked 1213 team Elo ratings

Top 15 teams by current Elo:


Unnamed: 0,rank,team,elo,conference
0,1,Houston,2286.642203,Big 12
1,2,Florida,2234.461474,SEC
2,3,Duke,2225.008279,ACC
3,4,Michigan St.,2211.211028,Other
4,5,St. John's (NY),2200.431425,Other
5,6,Tennessee,2185.045891,SEC
6,7,Auburn,2177.676491,SEC
7,8,Saint Mary's (CA),2149.232812,Other
8,9,Alabama,2132.505891,SEC
9,10,Clemson,2125.095711,ACC


## 3. Merge Elo with Team Stats

In [5]:
# Load current season efficiency stats from Barttorvik
import ssl
import urllib.request
from io import StringIO

def fetch_barttorvik_year(year):
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE
    
    url = f"https://barttorvik.com/{year}_team_results.csv"
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    
    with urllib.request.urlopen(req, context=ssl_context, timeout=30) as response:
        content = response.read().decode('utf-8')
    
    df = pd.read_csv(StringIO(content))
    return df

# Fetch team stats for training years
all_stats = []
for year in [2020, 2021, 2022, 2023, 2024, 2025]:
    df = fetch_barttorvik_year(year)
    df['season'] = year
    all_stats.append(df[['team', 'adjoe', 'adjde', 'season']])

team_stats = pd.concat(all_stats, ignore_index=True)
team_stats.columns = ['team', 'adj_oe', 'adj_de', 'season']
team_stats['adj_em'] = team_stats['adj_oe'] - team_stats['adj_de']

print(f"Loaded efficiency stats for {len(team_stats)} team-seasons")
team_stats.head()

Loaded efficiency stats for 2147 team-seasons


Unnamed: 0,team,adj_oe,adj_de,season,adj_em
0,B12,7.0,2.0,2020,5.0
1,B12,13.0,3.0,2020,10.0
2,WCC,1.0,41.0,2020,-40.0
3,A10,3.0,29.0,2020,-26.0
4,B10,11.0,11.0,2020,0.0


## 4. Create Training Features from Real Games

In [6]:
# Merge Elo snapshots with team efficiency stats
print("Creating training features from real game data...")

# Add efficiency stats to elo_snapshots based on team and date
# Match by season (extract from date)
elo_snapshots['season'] = elo_snapshots['date'].dt.year

# Merge home team stats
train_data = elo_snapshots.merge(
    team_stats,
    left_on=['home_team', 'season'],
    right_on=['team', 'season'],
    how='left',
    suffixes=('', '_home')
)
train_data = train_data.rename(columns={'adj_oe': 'home_adj_oe', 'adj_de': 'home_adj_de', 'adj_em': 'home_adj_em'})
train_data = train_data.drop(columns=['team'], errors='ignore')

# Merge away team stats
train_data = train_data.merge(
    team_stats,
    left_on=['away_team', 'season'],
    right_on=['team', 'season'],
    how='left',
    suffixes=('', '_away')
)
train_data = train_data.rename(columns={'adj_oe': 'away_adj_oe', 'adj_de': 'away_adj_de', 'adj_em': 'away_adj_em'})
train_data = train_data.drop(columns=['team'], errors='ignore')

# Calculate derived features
train_data['eff_diff'] = train_data['home_adj_em'] - train_data['away_adj_em']
train_data['elo_diff'] = train_data['home_elo_before'] - train_data['away_elo_before']

# Drop rows with missing efficiency data
train_data = train_data.dropna(subset=['home_adj_oe', 'away_adj_oe'])

print(f"✓ Created {len(train_data)} training samples from real games")
print(f"\nFeature columns:")
print([c for c in train_data.columns if 'adj' in c or 'elo' in c or 'diff' in c])

Creating training features from real game data...
✓ Created 8850 training samples from real games

Feature columns:
['home_elo_before', 'away_elo_before', 'home_elo_after', 'away_elo_after', 'home_adj_oe', 'home_adj_de', 'home_adj_em', 'away_adj_oe', 'away_adj_de', 'away_adj_em', 'eff_diff', 'elo_diff']


In [7]:
# Define features and target
feature_cols = [
    'home_adj_oe', 'home_adj_de', 'home_adj_em',
    'away_adj_oe', 'away_adj_de', 'away_adj_em',
    'eff_diff',
    'home_elo_before', 'away_elo_before', 'elo_diff', 'predicted_spread'
]

X = train_data[feature_cols]
y = train_data['actual_margin']

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"\nTarget (actual_margin) stats:")
print(f"  Mean: {y.mean():.2f}")
print(f"  Std: {y.std():.2f}")
print(f"  Median: {y.median():.2f}")

X shape: (8850, 11)
y shape: (8850,)

Target (actual_margin) stats:
  Mean: -0.21
  Std: 15.08
  Median: -1.00


## 5. Train Model on Real Data

In [8]:
# Train improved model
print("Training ImprovedSpreadModel on REAL game data...\n")

model = ImprovedSpreadModel(
    ridge_alpha=1.0,
    lgbm_params={
        'n_estimators': 100,
        'max_depth': 6,
        'learning_rate': 0.1,
    },
    weights=(0.4, 0.6),
    use_lgbm=True
)

model.fit(X, y)
print("✓ Model trained!\n")

# Component performance
components = model.predict_components(X)
for name, preds in components.items():
    mae = np.abs(preds - y).mean()
    rmse = np.sqrt(((preds - y) ** 2).mean())
    print(f"{name:12} MAE={mae:.3f}, RMSE={rmse:.3f}")

Training ImprovedSpreadModel on REAL game data...

✓ Model trained!

ridge        MAE=5.908, RMSE=7.666
gbm          MAE=3.616, RMSE=4.826
ensemble     MAE=4.444, RMSE=5.815


In [9]:
# Cross-validation on real data
print("\nRunning 5-fold time-series cross-validation...\n")

tscv = TimeSeriesSplit(n_splits=5)
cv_results = {'ridge': [], 'ensemble': []}

for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    fold_model = ImprovedSpreadModel(weights=(0.4, 0.6))
    fold_model.fit(X_train, y_train)
    
    preds = fold_model.predict(X_val)
    components = fold_model.predict_components(X_val)
    
    ridge_mae = np.abs(components['ridge'] - y_val).mean()
    ensemble_mae = np.abs(preds - y_val).mean()
    
    cv_results['ridge'].append(ridge_mae)
    cv_results['ensemble'].append(ensemble_mae)
    
    print(f"Fold {fold+1}: Ridge MAE={ridge_mae:.3f}, Ensemble MAE={ensemble_mae:.3f}")

print(f"\n{'='*60}")
print(f"Ridge CV MAE:    {np.mean(cv_results['ridge']):.3f} ± {np.std(cv_results['ridge']):.3f}")
print(f"Ensemble CV MAE: {np.mean(cv_results['ensemble']):.3f} ± {np.std(cv_results['ensemble']):.3f}")
print(f"{'='*60}")


Running 5-fold time-series cross-validation...

Fold 1: Ridge MAE=5.965, Ensemble MAE=5.604
Fold 2: Ridge MAE=6.006, Ensemble MAE=5.375
Fold 3: Ridge MAE=6.213, Ensemble MAE=5.622
Fold 4: Ridge MAE=6.198, Ensemble MAE=5.463
Fold 5: Ridge MAE=5.713, Ensemble MAE=4.975

Ridge CV MAE:    6.019 ± 0.182
Ensemble CV MAE: 5.408 ± 0.235


## 6. Generate 2026 Predictions with Real-Data-Trained Model

In [10]:
# Load 2026 prediction template and team stats
team_stats_2026 = pd.read_csv('../data/processed/team_stats_2025_26.csv')
template = pd.read_csv('../tsa_pt_spread_template_2026 - Sheet1.csv')
template = template.dropna(subset=['Home', 'Away'])

print(f"Teams for 2026: {len(team_stats_2026)}")
print(f"Games to predict: {len(template)}")

Teams for 2026: 21
Games to predict: 78


In [11]:
# Create prediction features
team_dict = team_stats_2026.set_index('team').to_dict('index')

pred_features = []
valid_indices = []

for idx, row in template.iterrows():
    home = row['Home']
    away = row['Away']
    
    if home not in team_dict or away not in team_dict:
        continue
    
    home_stats = team_dict[home]
    away_stats = team_dict[away]
    
    home_oe = home_stats.get('off_efficiency', 100)
    home_de = home_stats.get('def_efficiency', 100)
    away_oe = away_stats.get('off_efficiency', 100)
    away_de = away_stats.get('def_efficiency', 100)
    
    features = {
        'home_adj_oe': home_oe,
        'home_adj_de': home_de,
        'home_adj_em': home_oe - home_de,
        'away_adj_oe': away_oe,
        'away_adj_de': away_de,
        'away_adj_em': away_oe - away_de,
        'eff_diff': (home_oe - home_de) - (away_oe - away_de),
        'home_elo_before': elo.get_rating(home),
        'away_elo_before': elo.get_rating(away),
        'elo_diff': elo.get_rating(home) - elo.get_rating(away),
        'predicted_spread': elo.predict_spread(home, away),
    }
    
    pred_features.append(features)
    valid_indices.append(idx)

X_pred = pd.DataFrame(pred_features)
print(f"✓ Created features for {len(X_pred)} games")

✓ Created features for 78 games


In [12]:
# Generate predictions
predictions = model.predict(X_pred)
components = model.predict_components(X_pred)

results = template.copy()
for i, idx in enumerate(valid_indices):
    results.loc[idx, 'pt_spread'] = predictions[i]
    results.loc[idx, 'ridge_pred'] = components['ridge'][i]
    results.loc[idx, 'gbm_pred'] = components['gbm'][i]
    results.loc[idx, 'elo_spread'] = X_pred.iloc[i]['predicted_spread']

print("✓ Predictions generated!")
results[['Date', 'Away', 'Home', 'pt_spread', 'ridge_pred', 'gbm_pred']].head(15)

✓ Predictions generated!


Unnamed: 0,Date,Away,Home,pt_spread,ridge_pred,gbm_pred
0,2/7/2026,Syracuse,Virginia,15.809205,11.791718,18.487529
1,2/7/2026,Louisville,Wake Forest,1.06315,-0.791455,2.299554
2,2/7/2026,Virginia Tech,NC State,9.00409,8.613828,9.264265
3,2/7/2026,Miami,Boston College,5.524618,1.218076,8.395646
4,2/7/2026,SMU,Pitt,-3.542347,-3.817733,-3.358757
5,2/7/2026,Florida State,Notre Dame,10.447435,12.691405,8.951455
6,2/7/2026,Duke,North Carolina,0.866825,-1.215819,2.255254
7,2/7/2026,Clemson,California,-12.258296,-8.26743,-14.918873
8,2/7/2026,Georgia Tech,Stanford,11.658607,10.864811,12.187805
9,2/9/2026,NC State,Louisville,19.77836,18.042397,20.935669


## 7. Save Real-Data Predictions

In [None]:
# Prepare submission
submission = results[['Date', 'Away', 'Home', 'pt_spread']].copy()
submission = submission.dropna(subset=['pt_spread'])

submission['team_name'] = ''
submission['team_member'] = ''
submission['team_email'] = ''

submission.loc[submission.index[0], 'team_name'] = 'CMMT'
submission.loc[submission.index[0], 'team_member'] = 'Caleb Han'
submission.loc[submission.index[0], 'team_email'] = 'calebhan@unc.edu'
submission.loc[submission.index[1], 'team_member'] = 'Mason Mines'
submission.loc[submission.index[1], 'team_email'] = 'mmines@unc.edu'
submission.loc[submission.index[2], 'team_member'] = 'Mason Wang'
submission.loc[submission.index[2], 'team_email'] = 'masonw@unc.edu'
submission.loc[submission.index[3], 'team_member'] = 'Tony Wang'
submission.loc[submission.index[3], 'team_email'] = 'tonyw@unc.edu'

# Save with real data suffix
output_path = '../data/predictions/tsa_pt_spread_CMM_2026_real_data.csv'
submission.to_csv(output_path, index=False)
print(f"✓ Saved: {output_path}")

# Update main submission
main_path = '../data/predictions/tsa_pt_spread_CMM_2026.csv'
submission.to_csv(main_path, index=False)
print(f"✓ Updated: {main_path}")

✓ Saved: ../data/predictions/tsa_pt_spread_CMM_2026_real_data.csv
✓ Updated: ../data/predictions/tsa_pt_spread_CMM_2026.csv


In [14]:
# Final Summary
print("\n" + "="*60)
print("REAL DATA MODEL SUMMARY")
print("="*60)
print(f"Training: {len(train_data)} real games (2020-2025)")
print(f"Features: {len(feature_cols)}")
print(f"Predictions: {len(submission)} games\n")
print(f"Cross-Validation Results:")
print(f"  Ridge MAE:    {np.mean(cv_results['ridge']):.3f} ± {np.std(cv_results['ridge']):.3f}")
print(f"  Ensemble MAE: {np.mean(cv_results['ensemble']):.3f} ± {np.std(cv_results['ensemble']):.3f}")
print(f"\nComparison to synthetic data (8.818 MAE):")
improvement = 8.818 - np.mean(cv_results['ridge'])
print(f"  Improvement: {improvement:.3f} points ({improvement/8.818*100:.1f}%)")
print("="*60)


REAL DATA MODEL SUMMARY
Training: 8850 real games (2020-2025)
Features: 11
Predictions: 78 games

Cross-Validation Results:
  Ridge MAE:    6.019 ± 0.182
  Ensemble MAE: 5.408 ± 0.235

Comparison to synthetic data (8.818 MAE):
  Improvement: 2.799 points (31.7%)
