In [19]:
import sys
import importlib
import pandas as pd
import numpy as np
import nfl_data_py as nfl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression, LinearRegression

# Remove the wrong one if it's already imported
if 'functions' in sys.modules:
    del sys.modules['functions']

# Ensure correct path
sys.path.insert(0, '/Users/aidanbeilke/Desktop/Football Projects')

# Now re-import
from functions import bayes_modeling_process

In [20]:
pbp = nfl.import_pbp_data(list(range(2000,2025)))

pbp = pbp[pbp['season_type'] == 'REG']

2000 done.
2001 done.
2002 done.
2003 done.
2004 done.
2005 done.
2006 done.
2007 done.
2008 done.
2009 done.
2010 done.
2011 done.
2012 done.
2013 done.
2014 done.
2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
2024 done.
Downcasting floats.


### Passing Yards Model

In [None]:
pbp = pbp[pbp['season_type'] == 'REG']

summary = (
    pbp.groupby(['posteam', 'season'], as_index=False)
      .agg({
          'pass_attempt': 'sum',
          'passing_yards': 'sum',
          'air_epa': 'sum',
          'complete_pass': 'sum'
      })
).sort_values(['posteam', 'season'])

summary['completion_perc'] = summary['complete_pass'] / summary['pass_attempt']

summary['passing_yards_last_year'] = summary.groupby('posteam')['passing_yards'].shift(1)
summary['pass_attempt_last_year'] = summary.groupby('posteam')['pass_attempt'].shift(1)
summary['completion_perc_last_year'] = summary.groupby('posteam')['completion_perc'].shift(1)
summary['passing_yards_two_year'] = summary.groupby('posteam')['passing_yards'].shift(2)
summary['pass_attempt_two_year'] = summary.groupby('posteam')['pass_attempt'].shift(2)
summary['completion_perc_two_year'] = summary.groupby('posteam')['completion_perc'].shift(2)

summary = summary.dropna()

In [60]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

features = [
    'passing_yards_last_year',
    'pass_attempt_last_year',
    'completion_perc_last_year',
    'passing_yards_two_year',
    'pass_attempt_two_year',
    'completion_perc_two_year'
]

# Drop rows with missing values in target or features
model_df = summary.dropna(subset=features + ['passing_yards'])

X = model_df[features].values
y = model_df['passing_yards'].values

# Fit linear regression
lr = LinearRegression()
lr.fit(X, y)

# Predict and calculate MSE
preds = lr.predict(X)
mse = mean_squared_error(y, preds)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error: {rmse:.2f}")


Root Mean Squared Error: 470.97


In [61]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# --- Define features & target ---
features = [
    'passing_yards_last_year',
    'pass_attempt_last_year',
    'completion_perc_last_year',
    'passing_yards_two_year',
    'pass_attempt_two_year',
    'completion_perc_two_year'
]
target = 'passing_yards'

# --- Drop missing values ---
data = summary.dropna(subset=features + [target])

# --- Split train/test ---
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Ridge Regression ---
ridge = Ridge(alpha=1.0)  # You can tune alpha
ridge.fit(X_train, y_train)

# --- Predictions ---
y_pred = ridge.predict(X_test)

# --- Evaluation ---
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"RMSE: {rmse:.2f}")

RMSE: 474.88


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


### Rushing Yards Model

In [10]:
pbp = pbp[pbp['season_type'] == 'REG']

summary = (
    pbp.groupby(['posteam', 'season'], as_index=False)
      .agg({
          'rush_attempt': 'sum',
          'rushing_yards': 'sum',
      })
).sort_values(['posteam', 'season'])

summary['yards_per_attempt'] = summary['rushing_yards'] / summary['rush_attempt']

summary['rushing_yards_last_year'] = summary.groupby('posteam')['rushing_yards'].shift(1)
summary['rush_attempt_last_year'] = summary.groupby('posteam')['rush_attempt'].shift(1)
summary['yards_per_attempt_last_year'] = summary.groupby('posteam')['yards_per_attempt'].shift(1)
summary['rushing_yards_two_year'] = summary.groupby('posteam')['rushing_yards'].shift(2)
summary['rush_attempt_two_year'] = summary.groupby('posteam')['rush_attempt'].shift(2)
summary['yards_per_attempt_two_year'] = summary.groupby('posteam')['yards_per_attempt'].shift(2)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

features = [
    'rushing_yards_last_year',
    'rush_attempt_last_year',
    'yards_per_attempt_last_year',
    'rushing_yards_two_year',
    'rush_attempt_two_year',
    'yards_per_attempt_two_year'
]

# Drop rows with missing values in target or features
model_df = summary.dropna(subset=features + ['rushing_yards'])

X = model_df[features].values
y = model_df['rushing_yards'].values

# Fit linear regression
lr = LinearRegression()
lr.fit(X, y)

# Predict and calculate MSE
preds = lr.predict(X)
mse = mean_squared_error(y, preds)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error: {rmse:.2f}")


Root Mean Squared Error: 316.28


### EPA Modeling

In [None]:
def generate_predicted_qb_epa(pbp, holdout_season, n_splits=5):

    summary = (
        pbp.groupby(['posteam', 'season'], as_index=False)
        .agg({
            'fumble_forced': 'sum',
            'air_epa': 'mean',
            'yac_epa': 'mean',
            'qb_scramble' : 'sum'
        })
    ).sort_values(['posteam', 'season'])

    teams = summary['posteam'].unique()

    placeholder_rows = pd.DataFrame({
    'posteam': teams,
    'season': 2025,
    'qb_epa': np.nan,
    'air_epa': np.nan,
    'yac_epa': np.nan,
    'qb_scramble' : np.nan
    })

    summary = pd.concat([summary, placeholder_rows], ignore_index=True).sort_values(['posteam', 'season'])

    for lag in [1, 2]:
        summary[f'qb_epa_lag{lag}'] = summary.groupby('posteam')['qb_epa'].shift(lag)
        summary[f'air_epa_lag{lag}'] = summary.groupby('posteam')['air_epa'].shift(lag)
        summary[f'yac_epa_lag{lag}'] = summary.groupby('posteam')['yac_epa'].shift(lag)
        summary[f'qb_scrambles_lag{lag}'] = summary.groupby('posteam')['qb_scramble'].shift(lag)

    summary = summary[~((summary['season'] <= 2024) & summary.isna().any(axis=1))]
    summary = summary[summary['posteam'] != '']
    summary = summary[summary['season'] >= 2002]

    features = [
        'qb_epa_lag1',
        'air_epa_lag1',
        'yac_epa_lag1',
        'qb_scrambles_lag1'
        'qb_epa_lag2',
        'yac_epa_lag2',
        'air_epa_lag2',
        'qb_scrambles_lag2'
    ]

    # Separate holdout season
    holdout_df = summary[summary['season'] == holdout_season]
    train_df = summary[summary['season'] < holdout_season].copy().dropna()

    # Scale features
    scaler = StandardScaler()
    X_train_all = train_df[features].values
    X_train_all_scaled = scaler.fit_transform(X_train_all)

    y_train_all = train_df['qb_epa'].values

    # 1. Cross-validation predictions on train_df
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(train_df))  # out-of-fold predictions

    for train_idx, val_idx in kf.split(X_train_all_scaled):
        X_tr, X_val = X_train_all_scaled[train_idx], X_train_all_scaled[val_idx]
        y_tr = y_train_all[train_idx]

        model = LinearRegression()
        model.fit(X_tr, y_tr)
        oof_preds[val_idx] = model.predict(X_val)

    train_df['pred_qb_epa'] = oof_preds
    train_df['team'] = train_df['posteam']

    # 2. Fit on all training data, predict holdout season
    model = LinearRegression()
    model.fit(X_train_all_scaled, y_train_all)

    X_holdout = scaler.transform(holdout_df[features].values)
    holdout_df = holdout_df.copy()
    holdout_df['pred_qb_epa'] = model.predict(X_holdout)
    holdout_df['team'] = holdout_df['posteam']

    # 3. Combine and return
    combined_df = pd.concat([train_df, holdout_df], ignore_index=True)

    return combined_df

### Offensive Interceptions

In [18]:
def generate_predicted_offensive_interceptions(pbp, holdout_season, n_splits=5):

    summary = (
        pbp.groupby(['posteam', 'season'], as_index=False)
        .agg({
            'passing_yards': 'sum',
            'pass_attempt': 'sum',
            'interception': 'sum'
        })
    ).sort_values(['posteam', 'season'])

    teams = summary['posteam'].unique()

    placeholder_rows = pd.DataFrame({
    'posteam': teams,
    'season': 2025,
    'passing_yards': np.nan,
    'pass_attempt': np.nan,
    'interception': np.nan
    })

    summary = pd.concat([summary, placeholder_rows], ignore_index=True).sort_values(['defteam', 'season'])

    for lag in [1, 2]:
        summary[f'passing_yards_lag{lag}'] = summary.groupby('posteam')['passing_yards'].shift(lag)
        summary[f'pass_attempts_lag{lag}'] = summary.groupby('posteam')['pass_attempt'].shift(lag)
        summary[f'interceptions_lag{lag}'] = summary.groupby('posteam')['interception'].shift(lag)

    summary = summary[~((summary['season'] <= 2024) & summary.isna().any(axis=1))]
    summary = summary[summary['posteam'] != '']
    summary = summary[summary['season'] >= 2002]

    features = [
        'passing_yards_lag1',
        'pass_attempts_lag1',
        'interceptions_lag1',
        'passing_yards_lag2',
        'pass_attempts_lag2',
        'interceptions_lag2'
    ]

    # Separate holdout season
    holdout_df = summary[summary['season'] == holdout_season]
    train_df = summary[summary['season'] < holdout_season].copy().dropna()

    # Scale features
    scaler = StandardScaler()
    X_train_all = train_df[features].values
    X_train_all_scaled = scaler.fit_transform(X_train_all)

    y_train_all = train_df['interception'].values

    # 1. Cross-validation predictions on train_df
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(train_df))  # out-of-fold predictions

    for train_idx, val_idx in kf.split(X_train_all_scaled):
        X_tr, X_val = X_train_all_scaled[train_idx], X_train_all_scaled[val_idx]
        y_tr = y_train_all[train_idx]

        model = LinearRegression()
        model.fit(X_tr, y_tr)
        oof_preds[val_idx] = model.predict(X_val)

    train_df['pred_interceptions'] = oof_preds
    train_df['team'] = train_df['posteam']

    # 2. Fit on all training data, predict holdout season
    model = LinearRegression()
    model.fit(X_train_all_scaled, y_train_all)

    X_holdout = scaler.transform(holdout_df[features].values)
    holdout_df = holdout_df.copy()
    holdout_df['pred_interceptions'] = model.predict(X_holdout)
    holdout_df['team'] = holdout_df['posteam']

    # 3. Combine and return
    combined_df = pd.concat([train_df, holdout_df], ignore_index=True)

    return combined_df

### Time to Throw

In [36]:
ngs = nfl.import_ngs_data('passing', list(range(2000, 2024)))

In [37]:
[c for c in ngs.columns]

['season',
 'season_type',
 'week',
 'player_display_name',
 'player_position',
 'team_abbr',
 'avg_time_to_throw',
 'avg_completed_air_yards',
 'avg_intended_air_yards',
 'avg_air_yards_differential',
 'aggressiveness',
 'max_completed_air_distance',
 'avg_air_yards_to_sticks',
 'attempts',
 'pass_yards',
 'pass_touchdowns',
 'interceptions',
 'passer_rating',
 'completions',
 'completion_percentage',
 'expected_completion_percentage',
 'completion_percentage_above_expectation',
 'avg_air_distance',
 'max_air_distance',
 'player_gsis_id',
 'player_first_name',
 'player_last_name',
 'player_jersey_number',
 'player_short_name']

In [39]:
def generate_predicted_time_to_throw(pbp, holdout_season, n_splits=5):

    pbp = nfl.import_ngs_data('passing', list(range(2000, 2024)))
    pbp['posteam'] = pbp['team_abbr']

    summary = (
        pbp.groupby(['posteam', 'season'], as_index=False)
        .agg({
            'avg_time_to_throw': 'mean',
            'attempts': 'sum',
            'aggressiveness': 'mean'
        })
    ).sort_values(['posteam', 'season'])

    teams = summary['posteam'].unique()

    placeholder_rows = pd.DataFrame({
    'posteam': teams,
    'season': 2025,
    'avg_time_to_throw': np.nan,
    'attempts': np.nan,
    'aggressiveness': np.nan,
    })

    summary = pd.concat([summary, placeholder_rows], ignore_index=True).sort_values(['posteam', 'season'])

    for lag in [1, 2]:
        summary[f'avg_time_to_throw_lag{lag}'] = summary.groupby('posteam')['avg_time_to_throw'].shift(lag)
        summary[f'attempts_lag{lag}'] = summary.groupby('posteam')['attempts'].shift(lag)
        summary[f'aggressiveness_lag{lag}'] = summary.groupby('posteam')['aggressiveness'].shift(lag)

    summary = summary[~((summary['season'] <= 2024) & summary.isna().any(axis=1))]
    summary = summary[summary['posteam'] != '']
    summary = summary[summary['season'] >= 2002]

    features = [
        'avg_time_to_throw_lag1',
        'attempts_lag1',
        'aggressiveness_lag1',
        'avg_time_to_throw_lag2',
        'attempts_lag2',
        'avg_time_to_throw_lag2',
    ]

    # Separate holdout season
    holdout_df = summary[summary['season'] == holdout_season]
    train_df = summary[summary['season'] < holdout_season].copy().dropna()

    # Scale features
    scaler = StandardScaler()
    X_train_all = train_df[features].values
    X_train_all_scaled = scaler.fit_transform(X_train_all)

    y_train_all = train_df['avg_time_to_throw'].values

    # 1. Cross-validation predictions on train_df
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(train_df))  # out-of-fold predictions

    for train_idx, val_idx in kf.split(X_train_all_scaled):
        X_tr, X_val = X_train_all_scaled[train_idx], X_train_all_scaled[val_idx]
        y_tr = y_train_all[train_idx]

        model = LinearRegression()
        model.fit(X_tr, y_tr)
        oof_preds[val_idx] = model.predict(X_val)

    train_df['pred_time_to_throw'] = oof_preds
    train_df['team'] = train_df['posteam']

    # 2. Fit on all training data, predict holdout season
    model = LinearRegression()
    model.fit(X_train_all_scaled, y_train_all)

    X_holdout = scaler.transform(holdout_df[features].values)
    holdout_df = holdout_df.copy()
    holdout_df['pred_time_to_throw'] = model.predict(X_holdout)
    holdout_df['team'] = holdout_df['posteam']

    # 3. Combine and return
    combined_df = pd.concat([train_df, holdout_df], ignore_index=True)

    return combined_df

### Passer Rating

In [40]:
def generate_predicted_passer_rating(pbp, holdout_season, n_splits=5):

    pbp = nfl.import_ngs_data('passing', list(range(2000, 2025)))
    pbp['posteam'] = pbp['team_abbr']

    summary = (
        pbp.groupby(['posteam', 'season'], as_index=False)
        .agg({
            'passer_rating': 'mean',
            'pass_yards': 'sum',
            'pass_touchdowns': 'sum',
            'interceptions' : 'sum',
            'avg_time_to_throw' : 'mean'
        })
    ).sort_values(['posteam', 'season'])

    summary['td_int_ratio'] = summary['pass_touchdowns'] / summary['interceptions']

    teams = summary['posteam'].unique()

    placeholder_rows = pd.DataFrame({
    'posteam': teams,
    'season': 2025,
    'passer_rating': np.nan,
    'pass_yards': np.nan,
    'pass_touchdowns': np.nan,
    'interceptions' : np.nan,
    'avg_time_to_throw' : np.nan,
    'td_int_ratio' : np.nan
    })

    summary = pd.concat([summary, placeholder_rows], ignore_index=True).sort_values(['posteam', 'season'])

    for lag in [1, 2]:
        summary[f'passer_rating_lag{lag}'] = summary.groupby('posteam')['passer_rating'].shift(lag)
        summary[f'pass_yards_lag{lag}'] = summary.groupby('posteam')['pass_yards'].shift(lag)
        summary[f'pass_touchdowns_lag{lag}'] = summary.groupby('posteam')['aggressiveness'].shift(lag)
        summary[f'interceptions_lag{lag}'] = summary.groupby('posteam')['interceptions'].shift(lag)
        summary[f'avg_time_to_throw_lag{lag}'] = summary.groupby('posteam')['avg_time_to_throw'].shift(lag)
        summary[f'td_int_ratio_lag{lag}'] = summary.groupby('posteam')['td_int_ratio'].shift(lag)

    summary = summary[~((summary['season'] <= 2024) & summary.isna().any(axis=1))]
    summary = summary[summary['posteam'] != '']
    summary = summary[summary['season'] >= 2002]

    features = [
        'passer_rating_lag1',
        'pass_yards_lag1',
        'pass_touchdowns_lag1',
        'interceptions_lag1',
        'avg_time_to_throw_lag1',
        'td_int_ratio_lag1',
        'passer_rating_lag2',
        'pass_yards_lag2',
        'pass_touchdowns_lag2',
        'interceptions_lag2',
        'avg_time_to_throw_lag2',
        'td_int_ratio_lag2'
    ]


    # Separate holdout season
    holdout_df = summary[summary['season'] == holdout_season]
    train_df = summary[summary['season'] < holdout_season].copy().dropna()

    # Scale features
    scaler = StandardScaler()
    X_train_all = train_df[features].values
    X_train_all_scaled = scaler.fit_transform(X_train_all)

    y_train_all = train_df['passer_rating'].values

    # 1. Cross-validation predictions on train_df
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(train_df))  # out-of-fold predictions

    for train_idx, val_idx in kf.split(X_train_all_scaled):
        X_tr, X_val = X_train_all_scaled[train_idx], X_train_all_scaled[val_idx]
        y_tr = y_train_all[train_idx]

        model = LinearRegression()
        model.fit(X_tr, y_tr)
        oof_preds[val_idx] = model.predict(X_val)

    train_df['pred_passer_rating'] = oof_preds
    train_df['team'] = train_df['posteam']

    # 2. Fit on all training data, predict holdout season
    model = LinearRegression()
    model.fit(X_train_all_scaled, y_train_all)

    X_holdout = scaler.transform(holdout_df[features].values)
    holdout_df = holdout_df.copy()
    holdout_df['pred_passer_rating'] = model.predict(X_holdout)
    holdout_df['team'] = holdout_df['posteam']

    # 3. Combine and return
    combined_df = pd.concat([train_df, holdout_df], ignore_index=True)

    return combined_df

### Defensive Modeling

In [11]:
summary = (
    pbp.groupby(['defteam', 'season'], as_index=False)
    .agg({
        'fumble_forced': 'sum',
        'sack': 'sum',
        'interception': 'sum',
        'incomplete_pass' : 'sum',
        'passing_yards' : 'sum',
        'rushing_yards' : 'sum'
    })
).sort_values(['defteam', 'season'])
summary['total_yards'] = summary['passing_yards'] + summary['rushing_yards']

summary[summary['season'] == 2024].sort_values(by = 'total_yards')

Unnamed: 0,defteam,season,fumble_forced,sack,interception,incomplete_pass,passing_yards,rushing_yards,total_yards
647,PHI,2024,21.0,41.0,13.0,192.0,3266.0,1771.0,5037.0
772,TEN,2024,12.0,32.0,11.0,148.0,3216.0,2277.0,5493.0
497,MIA,2024,8.0,35.0,10.0,197.0,3829.0,1763.0,5592.0
722,SF,2024,12.0,37.0,11.0,167.0,3476.0,2118.0,5594.0
622,NYJ,2024,15.0,43.0,7.0,187.0,3588.0,2059.0,5647.0
299,GB,2024,18.0,45.0,17.0,167.0,3959.0,1689.0,5648.0
322,HOU,2024,9.0,49.0,19.0,209.0,3753.0,1925.0,5678.0
397,KC,2024,14.0,39.0,13.0,181.0,3970.0,1731.0,5701.0
249,DEN,2024,12.0,63.0,15.0,198.0,4112.0,1639.0,5751.0
447,LAC,2024,10.0,46.0,15.0,183.0,3762.0,1997.0,5759.0


In [None]:
def generate_predicted_yards_allowed(pbp, holdout_season, n_splits=5):

    summary = (
        pbp.groupby(['defteam', 'season'], as_index=False)
        .agg({
            'fumble_forced': 'sum',
            'sack': 'sum',
            'interception': 'sum',
            'incomplete_pass' : 'sum',
            'passing_yards' : 'sum',
            'rushing_yards' : 'sum'
        })
    ).sort_values(['defteam', 'season'])
    summary['total_yards'] = summary['passing_yards'] + summary['rushing_yards']

    teams = summary['defteam'].unique()

    placeholder_rows = pd.DataFrame({
    'defteam': teams,
    'season': 2025,
    'fumble_forced': np.nan,
    'sack': np.nan,
    'interception': np.nan,
    'incomplete_pass' : np.nan,
    'passing_yards' : np.nan,
    'rushing_yards' : np.nan,
    'total_yards' : np.nan
    })

    summary = pd.concat([summary, placeholder_rows], ignore_index=True).sort_values(['defteam', 'season'])

    for lag in [1, 2]:
        summary[f'forced_fumbles_lag{lag}'] = summary.groupby('defteam')['fumble_forced'].shift(lag)
        summary[f'sacks_lag{lag}'] = summary.groupby('defteam')['sack'].shift(lag)
        summary[f'interceptions_lag{lag}'] = summary.groupby('defteam')['interception'].shift(lag)
        summary[f'incomplete_passes_lag{lag}'] = summary.groupby('defteam')['incomplete_pass'].shift(lag)
        summary[f'total_yards_lag{lag}'] = summary.groupby('defteam')['total_yards'].shift(lag)

    summary = summary[~((summary['season'] <= 2024) & summary.isna().any(axis=1))]
    summary = summary[summary['defteam'] != '']
    summary = summary[summary['season'] >= 2002]

    features = [
        'forced_fumbles_lag1',
        'sacks_lag1',
        'interceptions_lag1',
        'incomplete_passes_lag1',
        'total_yards_lag1',
        'forced_fumbles_lag2',
        'sacks_lag2',
        'interceptions_lag2',
        'incomplete_passes_lag2',
        'total_yards_lag2'
    ]

    # Separate holdout season
    holdout_df = summary[summary['season'] == holdout_season]
    train_df = summary[summary['season'] < holdout_season].copy().dropna()

    # Scale features
    scaler = StandardScaler()
    X_train_all = train_df[features].values
    X_train_all_scaled = scaler.fit_transform(X_train_all)

    y_train_all = train_df['total_yards'].values

    # 1. Cross-validation predictions on train_df
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(train_df))  # out-of-fold predictions

    for train_idx, val_idx in kf.split(X_train_all_scaled):
        X_tr, X_val = X_train_all_scaled[train_idx], X_train_all_scaled[val_idx]
        y_tr = y_train_all[train_idx]

        model = LinearRegression()
        model.fit(X_tr, y_tr)
        oof_preds[val_idx] = model.predict(X_val)

    train_df['pred_total_yards'] = oof_preds
    train_df['team'] = train_df['defteam']

    # 2. Fit on all training data, predict holdout season
    model = LinearRegression()
    model.fit(X_train_all_scaled, y_train_all)

    X_holdout = scaler.transform(holdout_df[features].values)
    holdout_df = holdout_df.copy()
    holdout_df['pred_total_yards'] = model.predict(X_holdout)
    holdout_df['team'] = holdout_df['defteam']

    # 3. Combine and return
    combined_df = pd.concat([train_df, holdout_df], ignore_index=True)

    return combined_df

In [None]:
def generate_predicted_interceptions(pbp, holdout_season, n_splits=5):

    summary = (
        pbp.groupby(['defteam', 'season'], as_index=False)
        .agg({
            'fumble_forced': 'sum',
            'sack': 'sum',
            'interception': 'sum',
            'incomplete_pass' : 'sum',
            'passing_yards' : 'sum',
            'rushing_yards' : 'sum'
        })
    ).sort_values(['defteam', 'season'])
    summary['total_yards'] = summary['passing_yards'] + summary['rushing_yards']

    teams = summary['defteam'].unique()

    placeholder_rows = pd.DataFrame({
    'defteam': teams,
    'season': 2025,
    'fumble_forced': np.nan,
    'sack': np.nan,
    'interception': np.nan,
    'incomplete_pass' : np.nan,
    'passing_yards' : np.nan,
    'rushing_yards' : np.nan,
    'total_yards' : np.nan
    })

    summary = pd.concat([summary, placeholder_rows], ignore_index=True).sort_values(['defteam', 'season'])

    for lag in [1, 2]:
        summary[f'forced_fumbles_lag{lag}'] = summary.groupby('defteam')['fumble_forced'].shift(lag)
        summary[f'sacks_lag{lag}'] = summary.groupby('defteam')['sack'].shift(lag)
        summary[f'interceptions_lag{lag}'] = summary.groupby('defteam')['interception'].shift(lag)
        summary[f'incomplete_passes_lag{lag}'] = summary.groupby('defteam')['incomplete_pass'].shift(lag)
        summary[f'total_yards_lag{lag}'] = summary.groupby('defteam')['total_yards'].shift(lag)

    summary = summary[~((summary['season'] <= 2024) & summary.isna().any(axis=1))]
    summary = summary[summary['defteam'] != '']
    summary = summary[summary['season'] >= 2002]

    features = [
        'forced_fumbles_lag1',
        'sacks_lag1',
        'interceptions_lag1',
        'incomplete_passes_lag1',
        'total_yards_lag1',
        'forced_fumbles_lag2',
        'sacks_lag2',
        'interceptions_lag2',
        'incomplete_passes_lag2',
        'total_yards_lag2'
    ]

    # Separate holdout season
    holdout_df = summary[summary['season'] == holdout_season]
    train_df = summary[summary['season'] < holdout_season].copy().dropna()

    # Scale features
    scaler = StandardScaler()
    X_train_all = train_df[features].values
    X_train_all_scaled = scaler.fit_transform(X_train_all)

    y_train_all = train_df['interception'].values

    # 1. Cross-validation predictions on train_df
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(train_df))  # out-of-fold predictions

    for train_idx, val_idx in kf.split(X_train_all_scaled):
        X_tr, X_val = X_train_all_scaled[train_idx], X_train_all_scaled[val_idx]
        y_tr = y_train_all[train_idx]

        model = LinearRegression()
        model.fit(X_tr, y_tr)
        oof_preds[val_idx] = model.predict(X_val)

    train_df['pred_interceptions'] = oof_preds
    train_df['team'] = train_df['defteam']

    # 2. Fit on all training data, predict holdout season
    model = LinearRegression()
    model.fit(X_train_all_scaled, y_train_all)

    X_holdout = scaler.transform(holdout_df[features].values)
    holdout_df = holdout_df.copy()
    holdout_df['pred_interceptions'] = model.predict(X_holdout)
    holdout_df['team'] = holdout_df['defteam']

    # 3. Combine and return
    combined_df = pd.concat([train_df, holdout_df], ignore_index=True)

    return combined_df

In [None]:
def generate_predicted_sacks(pbp, holdout_season, n_splits=5):

    summary = (
        pbp.groupby(['defteam', 'season'], as_index=False)
        .agg({
            'fumble_forced': 'sum',
            'sack': 'sum',
            'interception': 'sum',
            'incomplete_pass' : 'sum',
            'passing_yards' : 'sum',
            'rushing_yards' : 'sum'
        })
    ).sort_values(['defteam', 'season'])
    summary['total_yards'] = summary['passing_yards'] + summary['rushing_yards']

    teams = summary['defteam'].unique()

    placeholder_rows = pd.DataFrame({
    'defteam': teams,
    'season': 2025,
    'fumble_forced': np.nan,
    'sack': np.nan,
    'interception': np.nan,
    'incomplete_pass' : np.nan,
    'passing_yards' : np.nan,
    'rushing_yards' : np.nan,
    'total_yards' : np.nan
    })

    summary = pd.concat([summary, placeholder_rows], ignore_index=True).sort_values(['defteam', 'season'])

    for lag in [1, 2]:
        summary[f'forced_fumbles_lag{lag}'] = summary.groupby('defteam')['fumble_forced'].shift(lag)
        summary[f'sacks_lag{lag}'] = summary.groupby('defteam')['sack'].shift(lag)
        summary[f'interceptions_lag{lag}'] = summary.groupby('defteam')['interception'].shift(lag)
        summary[f'incomplete_passes_lag{lag}'] = summary.groupby('defteam')['incomplete_pass'].shift(lag)
        summary[f'total_yards_lag{lag}'] = summary.groupby('defteam')['total_yards'].shift(lag)

    summary = summary[~((summary['season'] <= 2024) & summary.isna().any(axis=1))]
    summary = summary[summary['defteam'] != '']
    summary = summary[summary['season'] >= 2002]

    features = [
        'forced_fumbles_lag1',
        'sacks_lag1',
        'interceptions_lag1',
        'incomplete_passes_lag1',
        'total_yards_lag1',
        'forced_fumbles_lag2',
        'sacks_lag2',
        'interceptions_lag2',
        'incomplete_passes_lag2',
        'total_yards_lag2'
    ]

    # Separate holdout season
    holdout_df = summary[summary['season'] == holdout_season]
    train_df = summary[summary['season'] < holdout_season].copy().dropna()

    # Scale features
    scaler = StandardScaler()
    X_train_all = train_df[features].values
    X_train_all_scaled = scaler.fit_transform(X_train_all)

    y_train_all = train_df['sack'].values

    # 1. Cross-validation predictions on train_df
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(train_df))  # out-of-fold predictions

    for train_idx, val_idx in kf.split(X_train_all_scaled):
        X_tr, X_val = X_train_all_scaled[train_idx], X_train_all_scaled[val_idx]
        y_tr = y_train_all[train_idx]

        model = LinearRegression()
        model.fit(X_tr, y_tr)
        oof_preds[val_idx] = model.predict(X_val)

    train_df['pred_total_sacks'] = oof_preds
    train_df['team'] = train_df['defteam']

    # 2. Fit on all training data, predict holdout season
    model = LinearRegression()
    model.fit(X_train_all_scaled, y_train_all)

    X_holdout = scaler.transform(holdout_df[features].values)
    holdout_df = holdout_df.copy()
    holdout_df['pred_total_sacks'] = model.predict(X_holdout)
    holdout_df['team'] = holdout_df['defteam']

    # 3. Combine and return
    combined_df = pd.concat([train_df, holdout_df], ignore_index=True)

    return combined_df