## Dependencies

In [1]:
# Import dependencies
# NFL pbp data
import nfl_data_py as nfl

# Basics / visualizations
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Machine learning tools
import lightgbm as lgb
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV

# Stats Stuff
from scipy import stats
from scipy.stats import skew
import statsmodels.api as sm


# Turn off max columns for pandas DataFrame
pd.set_option('display.max_columns', None)

## Importing and Prepping Schedules

In [2]:
# Pull the schedule table for 2024 through nfl_data_py; it is passed to get_upcoming_week below
schedules_df = nfl.import_schedules([2024])

In [3]:
def get_upcoming_week(schedule):
    """
    Return the lowest week number that still has an unplayed game.

    A game counts as unplayed when both its 'home_score' and 'away_score'
    are missing. If no row qualifies, the minimum of an empty selection is
    returned (NaN).
    """
    # Keep only the rows where neither score has been recorded yet
    not_played_yet = schedule['home_score'].isna() & schedule['away_score'].isna()

    # The smallest week among those rows is the next one on the calendar
    return schedule.loc[not_played_yet, 'week'].min()

In [4]:
# Show the earliest week in the loaded schedule whose games have no scores recorded yet
get_upcoming_week(schedules_df)

4

## Importing PBP Data

In [107]:
# Play-by-play fields used by the passer and defense aggregations below
columns = [
    'game_id', 'passer_player_name', 'posteam', 'defteam', 'season', 'week',
    'home_team', 'away_team', 'play_type', 'air_yards', 'yards_after_catch',
    'epa', 'complete_pass', 'incomplete_pass', 'interception', 'qb_hit',
    'sack', 'pass_touchdown', 'passing_yards', 'cpoe', 'roof', 'surface',
]

# Download play-by-play for the 2014 through 2024 seasons
data = nfl.import_pbp_data(range(2014, 2025), columns, include_participation=False)

# nfl-data-py returns extra columns beyond the ones requested, so trim to our list
# and, in the same step, keep only the rows that are pass plays
data = data.loc[data['play_type'] == 'pass', columns]

# play_type is constant after the filter, so the passer frame does not need it
passer_data = data.drop(columns='play_type')

2014 done.
2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
2024 done.
Downcasting floats.


## Passer DF Prep

In [108]:
# How each play-level column is rolled up to one row per passer per game:
# descriptive fields take their first value, counting stats are summed, cpoe is averaged
passer_agg_spec = {
    **dict.fromkeys(['posteam', 'defteam', 'home_team', 'away_team', 'roof', 'surface'], 'first'),
    **dict.fromkeys(['air_yards', 'yards_after_catch', 'epa', 'complete_pass', 'incomplete_pass',
                     'interception', 'qb_hit', 'sack', 'pass_touchdown', 'passing_yards'], 'sum'),
    'cpoe': 'mean',
}

# One row per (game, passer, week, season)
passer_game_totals = passer_data.groupby(
    ['game_id', 'passer_player_name', 'week', 'season'], as_index=False
).agg(passer_agg_spec)

# Attempts are completions plus incompletions
passer_attempts = passer_game_totals['complete_pass'] + passer_game_totals['incomplete_pass']

# Derive the rate / count / venue features; home_flag is a boolean that is True
# when the passer's team (posteam) is the home team
passer_game_totals = passer_game_totals.assign(
    completion_percentage=passer_game_totals['complete_pass'] / passer_attempts,
    pass_attempts=passer_attempts,
    home_flag=passer_game_totals['home_team'] == passer_game_totals['posteam'],
)

# Final column selection: this both orders the columns and leaves out the helper
# columns (complete_pass, incomplete_pass, home_team, away_team)
passer_df = passer_game_totals[[
    'game_id', 'passer_player_name', 'posteam', 'defteam', 'season', 'week',
    'passing_yards', 'home_flag', 'completion_percentage', 'pass_attempts',
    'air_yards', 'yards_after_catch', 'epa', 'interception', 'qb_hit', 'sack',
    'pass_touchdown', 'cpoe', 'roof', 'surface',
]]

## Defense DF Prep

In [109]:
# Columns taken from the pass-play frame for the defensive view
defense_columns = [
    'defteam', 'season', 'week', 'home_team', 'away_team', 'play_type',
    'air_yards', 'yards_after_catch', 'epa', 'complete_pass', 'incomplete_pass',
    'interception', 'qb_hit', 'sack', 'pass_touchdown', 'passing_yards',
    'cpoe', 'roof', 'surface',
]

# Narrow the already pass-filtered `data` to those columns, then discard play_type
defense_data = data.loc[:, defense_columns].drop(columns='play_type')

# One row per (defending team, week, season): descriptive fields take the first
# value, counting stats are summed and cpoe is averaged
defense_totals = defense_data.groupby(['defteam', 'week', 'season'], as_index=False).agg(
    home_team=('home_team', 'first'),
    away_team=('away_team', 'first'),
    air_yards=('air_yards', 'sum'),
    yards_after_catch=('yards_after_catch', 'sum'),
    epa=('epa', 'sum'),
    complete_pass=('complete_pass', 'sum'),
    incomplete_pass=('incomplete_pass', 'sum'),
    interception=('interception', 'sum'),
    qb_hit=('qb_hit', 'sum'),
    sack=('sack', 'sum'),
    pass_touchdown=('pass_touchdown', 'sum'),
    passing_yards=('passing_yards', 'sum'),
    cpoe=('cpoe', 'mean'),
    roof=('roof', 'first'),
    surface=('surface', 'first'),
)

# Pass attempts faced = completions allowed + incompletions forced
defense_attempts_faced = defense_totals['complete_pass'] + defense_totals['incomplete_pass']
defense_totals['completion_percentage'] = defense_totals['complete_pass'] / defense_attempts_faced
defense_totals['pass_attempts'] = defense_attempts_faced

# Boolean flag: True when the defending team is the home team
defense_totals['home_flag'] = defense_totals['home_team'].eq(defense_totals['defteam'])

# Final column selection: orders the columns and leaves out the helper columns
# (complete_pass, incomplete_pass, home_team, away_team)
defense_df = defense_totals[[
    'defteam', 'season', 'week', 'home_flag', 'passing_yards',
    'completion_percentage', 'pass_attempts', 'air_yards', 'yards_after_catch',
    'epa', 'interception', 'qb_hit', 'sack', 'pass_touchdown', 'cpoe',
    'roof', 'surface',
]]

In [110]:
# Display the aggregated defensive frame (one row per defteam / week / season group)
defense_df

Unnamed: 0,defteam,season,week,home_flag,passing_yards,completion_percentage,pass_attempts,air_yards,yards_after_catch,epa,interception,qb_hit,sack,pass_touchdown,cpoe,roof,surface
0,ARI,2014,1,True,238.0,0.600000,35.0,345.0,90.0,0.799435,1.0,4.0,0.0,1.0,-6.609174,closed,grass
1,ARI,2015,1,True,355.0,0.638298,47.0,233.0,277.0,1.829823,1.0,4.0,2.0,1.0,-8.648963,closed,grass
2,ARI,2016,1,True,264.0,0.727273,33.0,210.0,142.0,13.317495,0.0,4.0,2.0,1.0,5.301393,closed,grass
3,ARI,2017,1,False,292.0,0.725000,40.0,317.0,88.0,-2.884409,1.0,4.0,1.0,4.0,5.455871,dome,fieldturf
4,ARI,2018,1,True,255.0,0.700000,30.0,101.0,180.0,9.827952,0.0,9.0,3.0,2.0,1.424664,closed,grass
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5543,WAS,2015,18,True,210.0,0.583333,36.0,365.0,78.0,10.725159,0.0,2.0,1.0,2.0,1.810942,outdoors,grass
5544,WAS,2020,18,True,381.0,0.550000,40.0,461.0,119.0,19.551708,0.0,7.0,3.0,2.0,-2.638113,outdoors,grass
5545,WAS,2021,18,False,103.0,0.517241,29.0,215.0,63.0,-19.281746,2.0,8.0,3.0,1.0,-16.412041,outdoors,fieldturf
5546,WAS,2022,18,True,128.0,0.378378,37.0,401.0,35.0,-14.503402,1.0,4.0,1.0,1.0,-24.287834,outdoors,grass


In [111]:
# def filter_most_recent_weeks(df, weeks_needed=2):
#     """
#     Filters the dataframe to only include the most recent weeks needed for EWMA calculation.
#     Automatically handles offense and defense data by checking for 'passer_player_name' or 'defteam'.
    
#     :param df: The dataframe to filter (either passer_df or defense_df)
#     :param weeks_needed: Number of recent weeks needed for EWMA calculation
#     :return: Filtered dataframe with only the most recent weeks
#     """
#     # Dynamically detect whether to use passer_player_name or defteam
#     group_column = 'passer_player_name' if 'passer_player_name' in df.columns else 'defteam'
    
#     # Sort by the group column and 'week' in descending order
#     df = df.sort_values([group_column, 'week'], ascending=[True, False])
    
#     # Group by the detected column and get the most recent N weeks
#     recent_weeks = df.groupby(group_column).head(weeks_needed)
    
#     return recent_weeks

In [112]:
# passer_df.head(15)

## Preprocessing

In [113]:
def calculate_offensive_ewma(passer_df):
    """
    Add a lagged exponentially weighted moving average for each passer stat.

    The frame is sorted by passer, season and week. Within every
    (passer_player_name, season) group each stat is shifted down one row
    before the EWMA (span=2, min_periods=1) is taken, so a row's
    '<stat>_ewma' value is built only from that passer's earlier rows in the
    same season. The first row of every group is therefore NaN.

    Returns the sorted copy with eleven '<stat>_ewma' columns appended; the
    frame passed in is not modified.
    """
    # Stats that receive an '<stat>_ewma' companion column, in output order
    stat_columns = (
        'completion_percentage', 'pass_attempts', 'air_yards', 'yards_after_catch',
        'epa', 'interception', 'qb_hit', 'sack', 'pass_touchdown',
        'passing_yards', 'cpoe',
    )

    def lagged_ewma(values):
        # shift() drops the current row's value out of its own average
        return values.shift().ewm(min_periods=1, span=2).mean()

    ordered = passer_df.sort_values(by=['passer_player_name', 'season', 'week'])

    for stat in stat_columns:
        per_passer_season = ordered.groupby(['passer_player_name', 'season'])[stat]
        ordered[f'{stat}_ewma'] = per_passer_season.transform(lagged_ewma)

    return ordered

In [114]:
# def calculate_defensive_ewma(defense_df):
#     """
#     Calculates EWMA for defensive columns using previous weeks' data (excluding the current week).
#     """
#     # Sort by 'defteam', 'season', and 'week' in ascending order
#     defense_df = defense_df.sort_values(['defteam', 'season', 'week'], ascending=[True, True, True])

#     # Shift data to exclude the current week's data for EWMA calculations
#     defense_df['completion_percentage_shifted'] = defense_df.groupby(['defteam'])['completion_percentage'].shift()
#     defense_df['pass_attempts_shifted'] = defense_df.groupby(['defteam'])['pass_attempts'].shift()
#     defense_df['air_yards_shifted'] = defense_df.groupby(['defteam'])['air_yards'].shift()
#     defense_df['yards_after_catch_shifted'] = defense_df.groupby(['defteam'])['yards_after_catch'].shift()
#     defense_df['epa_shifted'] = defense_df.groupby(['defteam'])['epa'].shift()
#     defense_df['interception_shifted'] = defense_df.groupby(['defteam'])['interception'].shift()
#     defense_df['qb_hit_shifted'] = defense_df.groupby(['defteam'])['qb_hit'].shift()
#     defense_df['sack_shifted'] = defense_df.groupby(['defteam'])['sack'].shift()
#     defense_df['pass_touchdown_shifted'] = defense_df.groupby(['defteam'])['pass_touchdown'].shift()
#     defense_df['passing_yards_shifted'] = defense_df.groupby(['defteam'])['passing_yards'].shift()
#     defense_df['cpoe_shifted'] = defense_df.groupby(['defteam'])['cpoe'].shift()

#     # Calculate the exponentially weighted moving average for each defensive feature
#     defense_df['completion_percentage_ewma'] = defense_df.groupby('defteam')['completion_percentage_shifted']\
#         .transform(lambda x: x.ewm(min_periods=1, span=2).mean())
    
#     defense_df['pass_attempts_ewma'] = defense_df.groupby('defteam')['pass_attempts_shifted']\
#         .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

#     defense_df['air_yards_ewma'] = defense_df.groupby('defteam')['air_yards_shifted']\
#         .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

#     defense_df['yards_after_catch_ewma'] = defense_df.groupby('defteam')['yards_after_catch_shifted']\
#         .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

#     defense_df['epa_ewma'] = defense_df.groupby('defteam')['epa_shifted']\
#         .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

#     defense_df['interception_ewma'] = defense_df.groupby('defteam')['interception_shifted']\
#         .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

#     defense_df['qb_hit_ewma'] = defense_df.groupby('defteam')['qb_hit_shifted']\
#         .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

#     defense_df['sack_ewma'] = defense_df.groupby('defteam')['sack_shifted']\
#         .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

#     defense_df['pass_touchdown_ewma'] = defense_df.groupby('defteam')['pass_touchdown_shifted']\
#         .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

#     defense_df['passing_yards_ewma'] = defense_df.groupby('defteam')['passing_yards_shifted']\
#         .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

#     defense_df['cpoe_ewma'] = defense_df.groupby('defteam')['cpoe_shifted']\
#         .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

#     # Drop the shifted columns (as they were just used for EWMA calculation)
#     defense_df = defense_df.drop(columns=[col for col in defense_df.columns if 'shifted' in col])

#     return defense_df

In [115]:
def calculate_defensive_ewma(defense_df, weeks_needed=2):
    """
    Add a lagged exponentially weighted moving average for each defensive stat.

    Rows are sorted by defteam, season and week, then grouped by defteam
    only, so a team's averages carry over from one season into the next.
    Each stat is shifted down one row before the EWMA (span=2,
    min_periods=1) is taken, which keeps the current row out of its own
    '<stat>_ewma' value.

    `weeks_needed` is accepted but not used by this function.

    Returns the sorted copy with eleven '<stat>_ewma' columns appended; the
    frame passed in is not modified.
    """
    tracked_stats = [
        'completion_percentage', 'pass_attempts', 'air_yards', 'yards_after_catch',
        'epa', 'interception', 'qb_hit', 'sack', 'pass_touchdown',
        'passing_yards', 'cpoe',
    ]

    # Chronological order inside each team is what makes shift(1) mean "previous game"
    ordered = defense_df.sort_values(by=['defteam', 'season', 'week'])
    by_team = ordered.groupby('defteam')

    ewma_columns = {
        f'{stat}_ewma': by_team[stat].transform(
            lambda history: history.shift(1).ewm(min_periods=1, span=2).mean()
        )
        for stat in tracked_stats
    }

    return ordered.assign(**ewma_columns)

In [125]:
def calculate_defensive_ewma_2(defense_df):
    """
    Add a lagged, per-season exponentially weighted moving average for each
    defensive stat.

    Rows are sorted by defteam, season and week and grouped by
    (defteam, season), so the averages restart every season. Each stat is
    shifted down one row before the EWMA (span=2, min_periods=1) is taken;
    the first row of every team-season is therefore NaN and no row
    contributes to its own '<stat>_ewma' value.

    Returns the sorted copy with eleven '<stat>_ewma' columns appended; the
    frame passed in is not modified.
    """
    group_keys = ['defteam', 'season']
    source_stats = (
        'completion_percentage', 'pass_attempts', 'air_yards', 'yards_after_catch',
        'epa', 'interception', 'qb_hit', 'sack', 'pass_touchdown',
        'passing_yards', 'cpoe',
    )

    # Time-ordered copy; new columns are written onto this copy
    result = defense_df.sort_values(by=['defteam', 'season', 'week'])

    for source in source_stats:
        prior_games_average = result.groupby(group_keys)[source].transform(
            lambda weekly: weekly.shift(1).ewm(min_periods=1, span=2).mean()
        )
        result[source + '_ewma'] = prior_games_average

    return result

In [119]:
# Build the lagged EWMA features for every passer
passer_df_with_ewma = calculate_offensive_ewma(passer_df)

# Spot check: for one passer and season, each row's EWMA should be explainable
# from the earlier weeks alone (no current-week data)
example_player = 'P.Mahomes'
example_season = 2023
example_week = 4

# Rows for that passer and season, up to and including the example week
is_passer_example = (
    passer_df_with_ewma['passer_player_name'].eq(example_player)
    & passer_df_with_ewma['season'].eq(example_season)
    & passer_df_with_ewma['week'].le(example_week)
)
player_data = passer_df_with_ewma[is_passer_example]

# Raw value next to its EWMA so the two can be compared week by week
player_data.loc[:, ['passer_player_name', 'season', 'week',
                    'completion_percentage', 'completion_percentage_ewma']]

Unnamed: 0,passer_player_name,season,week,completion_percentage,completion_percentage_ewma
5936,P.Mahomes,2023,1,0.552632,
5970,P.Mahomes,2023,2,0.725,0.552632
6001,P.Mahomes,2023,3,0.727273,0.681908
6044,P.Mahomes,2023,4,0.642857,0.713314


In [127]:
# Build the lagged, per-season EWMA features for every defense
defense_df_with_ewma = calculate_defensive_ewma_2(defense_df)

# Spot check: for one defense and season, each row's EWMA should be explainable
# from the earlier weeks alone (no current-week data)
example_team = 'KC'  # Example defensive team
example_season = 2024
example_week = 4

# Rows for that team and season, up to and including the example week
is_defense_example = (
    defense_df_with_ewma['defteam'].eq(example_team)
    & defense_df_with_ewma['season'].eq(example_season)
    & defense_df_with_ewma['week'].le(example_week)
)
team_data = defense_df_with_ewma[is_defense_example]

# Raw value next to its EWMA so the two can be compared week by week
team_data.loc[:, ['defteam', 'season', 'week',
                  'completion_percentage', 'completion_percentage_ewma']]

Unnamed: 0,defteam,season,week,completion_percentage,completion_percentage_ewma
2599,KC,2024,1,0.634146,
2610,KC,2024,2,0.638889,0.634146
2621,KC,2024,3,0.714286,0.637703


## Model

In [None]:
# Hyperparameters carried over from a Bayesian optimization run
# NOTE(review): in LightGBM `subsample` is an alias of `bagging_fraction` and
# `colsample_bytree` is an alias of `feature_fraction`; each pair is given two
# different values here, so only one of each is effective — confirm which is intended.
best_params = {
    'bagging_fraction': 0.7,
    'boosting_type': 'goss',
    'colsample_bytree': 0.9,
    'feature_fraction': 0.8,
    'lambda_l1': 1,
    'lambda_l2': 0,
    'learning_rate': 0.01,
    'max_depth': 5,
    'metric': 'rmse',
    'min_data_in_leaf': 10,
    'min_split_gain': 0.2,
    'n_estimators': 1000,
    'num_leaves': 10,
    'objective': 'regression',
    'subsample': 0.9,
    'subsample_for_bin': 30000
}

# Step 1: Separate data into training (before 2024) and testing (2024)

# NOTE(review): `full_with_ewma` is not created in any cell visible above this one;
# it must exist in the kernel (with 'season' and 'week' columns) before this cell runs.
train_df = full_with_ewma[full_with_ewma['season'] < 2024]
test_df = full_with_ewma[full_with_ewma['season'] == 2024]

# Step 2: Define features and target
# Every column except the target and the identifier/time columns is used as a feature.
# NOTE(review): if full_with_ewma still holds same-game raw stats or string columns,
# they end up in `features` too — verify the column set.
features = [col for col in full_with_ewma.columns if col not in ['passing_yards', 'game_id', 'season', 'week']]
target = 'passing_yards'

# Prepare training data
X_train = train_df[features]
y_train = train_df[target]

# Step 3: Train the LGBM model on all pre-2024 data
model = lgb.LGBMRegressor(**best_params, random_state=42)
model.fit(X_train, y_train)

# Step 4: Week-by-week testing on the 2024 season
for week in sorted(test_df['week'].unique()):
    print(f"Testing on Week {week}")
    
    # Filter test data for the current week
    week_test_df = test_df[test_df['week'] == week]
    
    # Prepare test features and target
    X_test = week_test_df[features]
    y_test = week_test_df[target]
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate RMSE for the current week.
    # The `squared=False` argument of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
    # gives the same number for this single-target case on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Week {week} RMSE: {rmse}")
    
    # Optional: Re-train the model incrementally with more 2024 data
    # (Uncomment if you'd like to update the model as new weeks come in)
    # X_train = pd.concat([X_train, X_test])
    # y_train = pd.concat([y_train, y_test])
    # model.fit(X_train, y_train, init_model=model)