In [1]:
from pathlib import Path
import re
import pandas as pd
import numpy as np
import json
import pickle

# directory containing game parquet files (relative to this notebook)
path = Path('..') / 'data' / 'processed' / 'structured'
files = sorted(path.glob('*.parquet'))
# load exactly one DataFrame per season (keep the first file encountered for each year)
games = {}  # maps YYYY -> DataFrame
for f in files:
    # read parquet first so any I/O/parquet-engine errors surface immediately
    df = pd.read_parquet(f)
    fname = f.name
    m = re.search(r'(19|20)\d{2}', fname)
    if m:
        year = m.group(0)
    else:
        # fallback: use filename without suffix
        year = fname.rsplit('.parquet', 1)[0]
    # normalize to a 4-digit year string when possible
    year_str = year if (isinstance(year, str) and len(str(year)) == 4 and str(year).isdigit()) else str(year)
    # if we've already loaded a DataFrame for this season, skip further files
    if year_str in games:
        # skip duplicates for the same season
        continue
    # register the DataFrame for this season
    games[year_str] = df
    # expose short-name globals: games_YY (e.g. games_16) and games_YYYY
    short = year_str[-2:] if year_str.isdigit() and len(year_str) == 4 else year_str
    globals()[f'games_{year_str}'] = df
    globals()[f'games_{short}'] = df
    print(f'Loaded {f} -> games_{year_str} (alias games_{short}), shape={df.shape}')
# convenience alias: dfs points to the per-season mapping we just built
dfs = games
# use dfs['2017'] or globals()['games_17'] as needed

Loaded ../data/processed/structured/games_2016.parquet -> games_2016 (alias games_16), shape=(832, 166)
Loaded ../data/processed/structured/games_2017.parquet -> games_2017 (alias games_17), shape=(834, 166)
Loaded ../data/processed/structured/games_2018.parquet -> games_2018 (alias games_18), shape=(845, 166)
Loaded ../data/processed/structured/games_2019.parquet -> games_2019 (alias games_19), shape=(848, 166)
Loaded ../data/processed/structured/games_2020.parquet -> games_2020 (alias games_20), shape=(542, 166)
Loaded ../data/processed/structured/games_2021.parquet -> games_2021 (alias games_21), shape=(849, 166)
Loaded ../data/processed/structured/games_2022.parquet -> games_2022 (alias games_22), shape=(854, 166)
Loaded ../data/processed/structured/games_2023.parquet -> games_2023 (alias games_23), shape=(868, 166)
Loaded ../data/processed/structured/games_2024.parquet -> games_2024 (alias games_24), shape=(874, 166)


In [2]:
games_2016['total'] = games_2016['home_points'] + games_2016['away_points']
games_2017['total'] = games_2017['home_points'] + games_2017['away_points']
games_2018['total'] = games_2018['home_points'] + games_2018['away_points']
games_2019['total'] = games_2019['home_points'] + games_2019['away_points']
games_2020['total'] = games_2020['home_points'] + games_2020['away_points']
games_2021['total'] = games_2021['home_points'] + games_2021['away_points']
games_2022['total'] = games_2022['home_points'] + games_2022['away_points']
games_2023['total'] = games_2023['home_points'] + games_2023['away_points']
games_2024['total'] = games_2024['home_points'] + games_2024['away_points'] 

In [7]:
# Mutate original season DataFrames: add team ids, winner, favorite (as team_id), point diff, normalize spreads/totals
# Return a dict mapping season global-name -> mutated DataFrame with requested column order

import re
from collections import OrderedDict

# discover season DataFrames in globals (games_YYYY or df_YYYY)
season_dfs = {}
for name, val in list(globals().items()):
    if re.match(r'^(games|df)_(19|20)\d{2}$', name) and isinstance(val, pd.DataFrame):
        season_dfs[name] = val

if not season_dfs:
    raise RuntimeError('No season DataFrames found in the notebook globals to operate on. Ensure you have games_YYYY or df_YYYY loaded.')

# collect unique team names across all season DataFrames
teams = OrderedDict()
for df in season_dfs.values():
    for col in ('home_team', 'away_team'):
        if col in df.columns:
            for t in df[col].dropna().astype(str).unique():
                if t not in teams:
                    teams[t] = None

# assign stable numeric IDs starting from 1
team_list = list(teams.keys())
team_map = {name: i for i, name in enumerate(team_list, start=1)}
team_lookup = pd.DataFrame({'team_name': team_list, 'team_id': [team_map[n] for n in team_list]})
team_lookup = team_lookup.set_index('team_name')

# helper to map team name -> id (case-sensitive fallback tries case-insensitive)
def _team_id_from_name(name):
    if pd.isna(name):
        return None
    s = str(name)
    if s in team_map:
        return team_map[s]
    lower = s.lower()
    for k,v in team_map.items():
        if k.lower() == lower:
            return v
    return None

# mapping to return
mutated = {}

for name, df in season_dfs.items():
    # operate in-place on the original DataFrame object
    # create id columns for home and away
    if 'home_team' in df.columns:
        df['home_team_id'] = df['home_team'].map(lambda x: _team_id_from_name(x))
    else:
        df['home_team_id'] = None
    if 'away_team' in df.columns:
        df['away_team_id'] = df['away_team'].map(lambda x: _team_id_from_name(x))
    else:
        df['away_team_id'] = None

    # favorite: preserve original name then map to team_id
    if 'favorite' in df.columns:
        df['favorite_name'] = df['favorite']

        def _favorite_to_id(row):
            f = row['favorite_name']
            if pd.isna(f):
                return None
            fs = str(f).strip()
            if fs.lower() in ('home', 'h'):
                return row.get('home_team_id')
            if fs.lower() in ('away', 'a'):
                return row.get('away_team_id')
            tid = _team_id_from_name(fs)
            if tid is not None:
                return tid
            # try to parse formats like 'Team Name -3.5' or 'Team Name (-3.5)'
            for tname in team_map.keys():
                if fs.lower().startswith(tname.lower()):
                    return team_map[tname]
            return None

        df['favorite'] = df.apply(_favorite_to_id, axis=1)
    else:
        df['favorite'] = None

    # ensure numeric points and compute winner_team_id
    df['home_points'] = pd.to_numeric(df.get('home_points'), errors='coerce')
    df['away_points'] = pd.to_numeric(df.get('away_points'), errors='coerce')

    def _winner_from_points(row):
        hp = row.get('home_points')
        ap = row.get('away_points')
        if pd.isna(hp) or pd.isna(ap):
            return None
        if hp > ap:
            return row.get('home_team_id')
        if ap > hp:
            return row.get('away_team_id')
        return None

    df['winner_team_id'] = df.apply(_winner_from_points, axis=1)

    # point differential (home - away)
    df['point_differential'] = df['home_points'] - df['away_points']

    # normalize spread/total/o/u
    if 'spread' not in df.columns and 'lspread' in df.columns:
        df['spread'] = df['lspread']
    if 'total' not in df.columns and 'o/u' in df.columns:
        df['total'] = df['o/u']
    if 'o/u' not in df.columns and 'total' in df.columns:
        df['o/u'] = df['total']

    # desired order per your request
    desired = ['game_id', 'home_team_id', 'away_team_id', 'home_points', 'away_points',
               'winner_team_id', 'favorite', 'point_differential', 'spread', 'total', 'o/u']
    front = [c for c in desired if c in df.columns]
    rest = [c for c in df.columns if c not in front]
    new_order = front + rest

    # reindex columns in-place by assignment to the same global name
    if new_order != list(df.columns):
        df_reordered = df.reindex(columns=new_order)
        # update the global reference to the mutated DataFrame
        globals()[name] = df_reordered
        mutated[name] = df_reordered
    else:
        mutated[name] = df
        globals()[name] = df

# expose team_lookup and mutated mapping
globals()['team_lookup'] = team_lookup.reset_index()
globals()['mutated_season_dfs'] = mutated

print(f'Mutated {len(mutated)} DataFrames and exposed mapping as `mutated_season_dfs`.')

# return the mutated mapping as the cell output
mutated

Mutated 9 DataFrames and exposed mapping as `mutated_season_dfs`.


{'games_2016':        game_id  home_team_id  away_team_id  home_points  away_points  \
 0    400869090             1           118           51           31   
 1    400869503             2           130           24           21   
 2    400869257             3           131           49            3   
 3    400869502             4           132           28            7   
 4    400869341             5           102           70           14   
 ..         ...           ...           ...          ...          ...   
 827  400869855           129            27           14           36   
 828  400926949            82           116           24           27   
 829  400926944            40            91           35           42   
 830  400926946            48            46           38           31   
 831  400868921            87            39           21           17   
 
      winner_team_id  favorite  point_differential  spread  total  ...  \
 0                 1         1    

In [9]:
games_2017

Unnamed: 0,game_id,home_team_id,away_team_id,home_points,away_points,winner_team_id,favorite,point_differential,spread,total,...,home_def_totalPPA,away_def_totalPPA,home_def_ppa,away_def_ppa,home_def_drives,away_def_drives,home_def_plays,away_def_plays,point_diff,favorite_name
0,400935282,97,126,58,27,97,97,31,-3.0,85,...,5.053376,32.460304,0.064787,0.381886,15.0,12.0,78.0,85.0,31,home
1,400945031,128,187,20,6,128,80,14,0.0,26,...,-0.254264,5.417063,-0.003531,0.084642,12.0,12.0,72.0,64.0,14,0
2,400938887,90,118,35,38,118,90,-3,-3.0,73,...,20.215692,21.895034,0.273185,0.291934,13.0,12.0,74.0,75.0,-3,home
3,400941786,101,64,22,42,64,64,-20,20.5,64,...,20.925079,-1.439283,0.209251,-0.016355,20.0,19.0,100.0,88.0,-20,away
4,400935257,119,26,7,62,26,26,-55,29.5,69,...,44.376863,-8.791357,0.583906,-0.141796,13.0,12.0,76.0,62.0,-55,away
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,400944885,27,57,25,32,57,57,-7,1.0,57,...,2.238064,1.155540,0.033404,0.012164,14.0,17.0,67.0,95.0,-7,away
830,400955154,115,114,17,14,115,115,3,-10.0,31,...,8.351740,4.714115,0.121040,0.066396,13.0,13.0,69.0,71.0,3,home
831,400955155,91,59,38,3,91,91,35,-12.5,41,...,-14.396845,14.644238,-0.239947,0.192687,14.0,14.0,60.0,76.0,35,home
832,400955156,46,31,21,27,31,31,-6,3.5,48,...,6.857683,0.859018,0.097967,0.011454,15.0,13.0,70.0,75.0,-6,away


In [15]:
# Dimensionality reduction check: Ridge (L2) and p-values via OLS (for interpretability)
# Target: home win (binary). We'll fit a Ridge regression on numeric features and compute OLS p-values
# Note: p-values come from an unregularized OLS; ridge coefficients show regularized importance.

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import re

# Collect mutated/mutating DataFrames: prefer `mutated_season_dfs` mapping if present, else look for games_YYYY
if 'mutated_season_dfs' in globals():
    season_mapping = globals()['mutated_season_dfs']
else:
    season_mapping = {name: val for name,val in globals().items() if isinstance(val, pd.DataFrame) and re.match(r'^(games|df)_(19|20)\d{2}$', name)}

if not season_mapping:
    raise RuntimeError('No season DataFrames found. Run the feature engineering cell first to create mutated DataFrames.')

model_results = {}

# Columns to exclude from modeling (identifiers, direct score/line targets):
EXCLUDE_COLUMNS = {'game_id', 'home_points', 'away_points', 'point_differential', 'spread', 'total', 'o/u'}

def prepare_df_for_model(df):
    # copy and compute binary target: home_win=1 if winner_team_id==home_team_id, 0 if winner==away
    d = df.copy()
    if 'winner_team_id' not in d.columns:
        raise RuntimeError('winner_team_id column required')
    # binary target
    d['home_win'] = d.apply(lambda r: 1 if r.get('winner_team_id') == r.get('home_team_id')
                                      else (0 if pd.notna(r.get('winner_team_id')) and r.get('winner_team_id') != r.get('home_team_id') else np.nan), axis=1)
    d = d[d['home_win'].notna()].copy()
    # select numeric features only (exclude identifiers and the target)
    numeric = d.select_dtypes(include=[np.number]).columns.tolist()
    exclude = {'home_win', 'winner_team_id'}
    # also exclude team id fields
    exclude.update({'home_team_id','away_team_id'})
    # apply user-requested exclusions
    exclude.update(EXCLUDE_COLUMNS)
    features = [c for c in numeric if c not in exclude]
    X = d[features].fillna(0)
    y = d['home_win'].astype(float)
    return X, y, d

for season, df in season_mapping.items():
    try:
        X, y, dclean = prepare_df_for_model(df)
    except Exception as e:
        model_results[season] = {'error': str(e)}
        continue
    if X.shape[0] < 10 or X.shape[1] == 0:
        model_results[season] = {'error': f'Insufficient data rows={X.shape[0]} or no numeric features={X.shape[1]}'}
        continue

    # scale features for ridge
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)

    # fit ridge (regression treating binary target as continuous)
    ridge = Ridge(alpha=1.0)
    ridge.fit(Xs, y.values)
    coef = pd.Series(ridge.coef_, index=X.columns).sort_values(key=lambda s: np.abs(s), ascending=False)

    # OLS for p-values (unregularized) - add constant
    X_sm = sm.add_constant(X)
    try:
        ols = sm.OLS(y.values, X_sm.values).fit()
        pvalues = pd.Series(ols.pvalues, index=['const'] + list(X_sm.columns[1:]))
        tvalues = pd.Series(ols.tvalues, index=['const'] + list(X_sm.columns[1:]))
    except Exception as e:
        pvalues = pd.Series(dtype=float)
        tvalues = pd.Series(dtype=float)

    model_results[season] = {
        'n_rows': X.shape[0],
        'n_features': X.shape[1],
        'features': X.columns.tolist(),
        'ridge_coefficients': coef,
        'ols_pvalues': pvalues,
        'ols_tvalues': tvalues,
        'ols_summary': ols.summary() if 'ols' in locals() else None
    }

# also run on all seasons concatenated
all_dfs = []
for s, df in season_mapping.items():
    all_dfs.append(df)
all_concat = pd.concat(all_dfs, ignore_index=True)
X_all, y_all, d_all = prepare_df_for_model(all_concat)
if X_all.shape[0] >= 10 and X_all.shape[1] > 0:
    scaler = StandardScaler()
    Xs_all = scaler.fit_transform(X_all)
    ridge_all = Ridge(alpha=1.0).fit(Xs_all, y_all.values)
    coef_all = pd.Series(ridge_all.coef_, index=X_all.columns).sort_values(key=lambda s: np.abs(s), ascending=False)
    X_sm_all = sm.add_constant(X_all)
    ols_all = sm.OLS(y_all.values, X_sm_all.values).fit()
    pvals_all = pd.Series(ols_all.pvalues, index=['const'] + list(X_sm_all.columns[1:]))
    tvals_all = pd.Series(ols_all.tvalues, index=['const'] + list(X_sm_all.columns[1:]))
    model_results['ALL_SEASONS'] = {
        'n_rows': X_all.shape[0],
        'n_features': X_all.shape[1],
        'features': X_all.columns.tolist(),
        'ridge_coefficients': coef_all,
        'ols_pvalues': pvals_all,
        'ols_tvalues': tvals_all,
        'ols_summary': ols_all.summary()
    }
else:
    model_results['ALL_SEASONS'] = {'error': 'Insufficient concatenated data or no features.'}

# expose results to globals and return
globals()['model_results'] = model_results
print('Completed ridge + OLS p-values. Results in `model_results` mapping (per-season + ALL_SEASONS).')
model_results

Completed ridge + OLS p-values. Results in `model_results` mapping (per-season + ALL_SEASONS).


{'games_2016': {'n_rows': 832,
  'n_features': 159,
  'features': ['favorite',
   'week',
   'home_defensiveTDs',
   'away_defensiveTDs',
   'home_firstDowns',
   'away_firstDowns',
   'home_fumblesLost',
   'away_fumblesLost',
   'home_fumblesRecovered',
   'away_fumblesRecovered',
   'home_interceptions',
   'away_interceptions',
   'home_kickingPoints',
   'away_kickingPoints',
   'home_netPassingYards',
   'away_netPassingYards',
   'home_passesDeflected',
   'away_passesDeflected',
   'home_passesIntercepted',
   'away_passesIntercepted',
   'home_passingTDs',
   'away_passingTDs',
   'home_qbHurries',
   'away_qbHurries',
   'home_rushingAttempts',
   'away_rushingAttempts',
   'home_rushingTDs',
   'away_rushingTDs',
   'home_rushingYards',
   'away_rushingYards',
   'home_sacks',
   'away_sacks',
   'home_tackles',
   'away_tackles',
   'home_tacklesForLoss',
   'away_tacklesForLoss',
   'home_totalFumbles',
   'away_totalFumbles',
   'home_totalYards',
   'away_totalYards',
  

In [18]:
model_results['ALL_SEASONS']['ols_pvalues']

const                1.603721e-14
favorite             9.462825e-01
week                 2.642489e-01
home_defensiveTDs    2.264451e-01
away_defensiveTDs    2.959064e-01
                         ...     
home_def_drives      7.644342e-17
away_def_drives      4.075814e-08
home_def_plays       1.420461e-18
away_def_plays       1.603246e-10
point_diff           2.870489e-21
Length: 160, dtype: float64