## Dependencies

In [634]:
# Import dependencies
# NFL pbp data
import nfl_data_py as nfl

# Basics / visualizations
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Machine learning tools
import lightgbm as lgb
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Stats Stuff
from scipy import stats
from scipy.stats import skew
import statsmodels.api as sm


# Turn off max columns for pandas DataFrame
pd.set_option('display.max_columns', None)

## Importing and Prepping Schedules

In [647]:
schedules_df = nfl.import_schedules([2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024])

In [648]:
def calculate_implied_totals(df):
    """
    Calculate the implied home and away team totals based on the spread and total lines.
    """
    # Implied totals based on the total and spread lines
    df['implied_home_total'] = (df['total_line'] + df['spread_line']) / 2
    df['implied_away_total'] = (df['total_line'] - df['spread_line']) / 2
    
    return df

In [649]:
schedules_df = calculate_implied_totals(schedules_df)
# explore this if we can prove lines are out before upcoming week

In [650]:
schedules_df.drop(columns= ['away_coach', 'home_coach', 'referee',
       'stadium_id', 'stadium', 'away_score', 'home_score',
       'location', 'result', 'total', 'overtime', 'old_game_id', 'gsis',
       'nfl_detail_id', 'pfr', 'pff', 'espn', 'ftn', 'away_rest', 'home_rest',
       'away_moneyline', 'home_moneyline', 'spread_line', 'away_spread_odds',
       'home_spread_odds', 'total_line', 'under_odds', 'over_odds', 'div_game', 'away_qb_id', 'home_qb_id', 'gameday', 'weekday',
       'gametime'], inplace=True)

In [652]:
# Create a DataFrame for away quarterbacks
away_df = schedules_df[['game_id', 'season', 'game_type', 'week', 'away_team', 'home_team', 'away_qb_name', 'implied_away_total', 'roof', 'surface',]].copy()
away_df.rename(columns={'away_qb_name': 'qb_name', 'implied_away_total': 'implied_total'}, inplace=True)
away_df['home_away'] = 'away'
away_df['posteam'] = away_df['away_team']
away_df['defteam'] = away_df['home_team']

# Create a DataFrame for home quarterbacks
home_df = schedules_df[['game_id', 'season', 'game_type', 'week', 'away_team', 'home_team', 'home_qb_name', 'implied_home_total', 'roof', 'surface']].copy()
home_df.rename(columns={'home_qb_name': 'qb_name', 'implied_home_total': 'implied_total'}, inplace=True)
home_df['home_away'] = 'home'
home_df['posteam'] = home_df['home_team']
home_df['defteam'] = home_df['away_team']

# Concatenate both DataFrames
combined_schedule_df = pd.concat([away_df, home_df])

# Reset index for cleanliness
combined_schedule_df.reset_index(drop=True, inplace=True)

In [653]:
def format_passer_name(qb_name):
    if pd.isna(qb_name):  # Check if the name is NaN
        return ""
    
    name_parts = qb_name.split()
    
    # Extract the first name and last name
    first_name = name_parts[0]
    last_name = name_parts[-1]  # Last name should always be the last part
    
    return f"{first_name[0]}.{last_name}"

# Apply the function to create the new 'player_passer_name' column
combined_schedule_df['passer_player_name'] = combined_schedule_df['qb_name'].apply(format_passer_name)

In [654]:
#Reorder Columns
combined_schedule_df = combined_schedule_df[['game_id', 'season', 'game_type', 'week', 'roof', 'surface',
                                             'posteam', 'defteam', 'home_team', 'away_team', 'qb_name', 'passer_player_name', 'implied_total', 'home_away']]

## Importing PBP Training Data

In [656]:
# Select only the relevant columns
columns = ['game_id', 'passer_player_name', 'posteam', 'defteam', 'season', 'week', 'home_team', 'away_team', 'play_type', 'air_yards', 
           'yards_after_catch', 'epa', 'complete_pass', 'incomplete_pass', 'interception', 'qb_hit', 'sack', 'pass_touchdown',
           'passing_yards', 'cpoe', 'roof', 'surface']

# Loading in the NFL pbp data
data = nfl.import_pbp_data(range(2014,2024 + 1), columns, include_participation=False)

# nfl-data-py still loads other columns, so we again need to set our data equal to only the columns we want
data = data[columns]

# Drop all rows that are not a pass
data = data[data['play_type'] == 'pass']

# Drop the play type column
passer_data = data.drop(columns=['play_type'])

2014 done.
2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
2024 done.
Downcasting floats.


## Passer DF Prep

In [657]:
# Group the data together by passer, week, season and aggregate
passer_df = passer_data.groupby(['game_id', 'passer_player_name', 'week', 'season'], as_index=False).agg(
    {'posteam' : 'first',
     'defteam' : 'first',
     'home_team' : 'first',
     'away_team' : 'first',
     'air_yards' : 'sum',
     'yards_after_catch' : 'sum',
     'epa' : 'sum',
     'complete_pass' : 'sum',
     'incomplete_pass' : 'sum',
     'interception' : 'sum',
     'qb_hit' : 'sum',
     'sack' : 'sum',
     'pass_touchdown' : 'sum',
     'passing_yards' : 'sum',
     'cpoe' : 'mean',
     'roof' : 'first',
     'surface' : 'first'
     }
)

# Create a new column that is completion percentage
passer_df['completion_percentage'] = passer_df['complete_pass'] / (passer_df['complete_pass'] + passer_df['incomplete_pass'])

# Create a new column that is the number of pass attempts
passer_df['pass_attempts'] = passer_df['complete_pass'] + passer_df['incomplete_pass']

# Create a new column that equals 1 if the passer is the home team and 0 if the passer is the away team
passer_df['home_flag'] = passer_df['home_team'] == passer_df['posteam']

# Reorder the columns
passer_df = passer_df[['home_team', 'away_team', 'complete_pass', 'incomplete_pass' ,'game_id', 'passer_player_name', 'posteam', 'defteam', 'season', 'week', 'passing_yards', 'home_flag', 'completion_percentage', 'pass_attempts',
                       'air_yards',  'yards_after_catch', 'epa', 'interception', 'qb_hit', 'sack', 'pass_touchdown', 
                        'cpoe', 'roof', 'surface']]

## Defense DF Prep

In [658]:
# Select only the relevant columns
defense_columns = ['defteam', 'season', 'week', 'home_team', 'away_team', 'play_type', 'air_yards',
                   'yards_after_catch', 'epa', 'complete_pass', 'incomplete_pass', 'interception', 'qb_hit', 'sack', 'pass_touchdown',
                   'passing_yards', 'cpoe', 'roof', 'surface']


# nfl-data-py still loads other columns, so we again need to set our data equal to only the columns we want
defense_data = data[defense_columns]

# Drop the play type column
defense_data = defense_data.drop(columns=['play_type'])

# Group the data together by passer, week, season and aggregate
defense_df = defense_data.groupby(['defteam', 'week', 'season'], as_index=False).agg(
    {'home_team': 'first',
     'away_team': 'first',
     'air_yards': 'sum',
     'yards_after_catch': 'sum',
     'epa': 'sum',
     'complete_pass': 'sum',
     'incomplete_pass': 'sum',
     'interception': 'sum',
     'qb_hit': 'sum',
     'sack': 'sum',
     'pass_touchdown': 'sum',
     'passing_yards': 'sum',
     'cpoe': 'mean',
     'roof': 'first',
     'surface': 'first'
     }
)

# Create a new column that is completion percentage
defense_df['completion_percentage'] = defense_df['complete_pass'] / (defense_df['complete_pass'] + defense_df['incomplete_pass'])

# Create a new column that is the number of pass attempts
defense_df['pass_attempts'] = defense_df['complete_pass'] + defense_df['incomplete_pass']

# Create a new column that equals 1 if the defense is the home team and 0 if the defense is the away team
defense_df['home_flag'] = defense_df['home_team'] == defense_df['defteam']

# Reorder the columns
defense_df = defense_df[['home_team', 'away_team', 'complete_pass', 'incomplete_pass', 'defteam', 'season', 'week', 'home_flag', 'passing_yards', 'completion_percentage', 'pass_attempts',
                       'air_yards',  'yards_after_catch', 'epa', 'interception', 'qb_hit', 'sack', 'pass_touchdown', 
                       'cpoe', 'roof', 'surface']]

## Preprocessing

In [659]:
def calculate_offensive_ewma(passer_df):
    """
    Calculates EWMA for offensive columns using previous weeks' data, ensuring no leakage by excluding the current week.
    Takes into account multiple seasons.
    """
    # Sort by passer, season, and week
    passer_df = passer_df.sort_values(by=['passer_player_name', 'season', 'week'])

    # Calculate the exponentially weighted moving average for each offensive feature, excluding the current week
    passer_df['completion_percentage_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['completion_percentage']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    passer_df['pass_attempts_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['pass_attempts']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    passer_df['air_yards_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['air_yards']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    passer_df['yards_after_catch_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['yards_after_catch']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    passer_df['epa_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['epa']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    passer_df['interception_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['interception']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    passer_df['qb_hit_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['qb_hit']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    passer_df['sack_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['sack']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    passer_df['pass_touchdown_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['pass_touchdown']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    passer_df['passing_yards_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['passing_yards']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    passer_df['cpoe_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['cpoe']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    return passer_df

In [660]:
def pass_cleaner(passer_df):
    """
    Preps passer df for merging; drops unnecessary columns
    """
    passer_df.drop(columns=['home_team', 'away_team', 'complete_pass', 'incomplete_pass', 'completion_percentage', 'air_yards', 'yards_after_catch', 'epa', 
                                    'interception', 'qb_hit', 'sack', 'pass_touchdown', 'cpoe', 'home_team', 'away_team', 
                                    'complete_pass', 'incomplete_pass'], inplace=True)
    
    return passer_df

In [661]:
passer_emwa = calculate_offensive_ewma(passer_df)

In [662]:
passer_prepped = pass_cleaner(passer_emwa)

In [663]:
def calculate_defensive_ewma(defense_df):
    """
    Calculates EWMA for defensive columns using previous weeks' data (excluding the current week).
    """
    # Sort by 'defteam', 'season', and 'week' in ascending order (to ensure time order)
    defense_df = defense_df.sort_values(by=['defteam', 'season', 'week'])

    # Ensure proper grouping by both defteam and season
    defense_df['completion_percentage_ewma'] = defense_df.groupby(['defteam', 'season'])['completion_percentage']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())
    
    defense_df['pass_attempts_ewma'] = defense_df.groupby(['defteam', 'season'])['pass_attempts']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    defense_df['air_yards_ewma'] = defense_df.groupby(['defteam', 'season'])['air_yards']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    defense_df['yards_after_catch_ewma'] = defense_df.groupby(['defteam', 'season'])['yards_after_catch']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    defense_df['epa_ewma'] = defense_df.groupby(['defteam', 'season'])['epa']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    defense_df['interception_ewma'] = defense_df.groupby(['defteam', 'season'])['interception']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    defense_df['qb_hit_ewma'] = defense_df.groupby(['defteam', 'season'])['qb_hit']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    defense_df['sack_ewma'] = defense_df.groupby(['defteam', 'season'])['sack']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    defense_df['pass_touchdown_ewma'] = defense_df.groupby(['defteam', 'season'])['pass_touchdown']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    defense_df['passing_yards_ewma'] = defense_df.groupby(['defteam', 'season'])['passing_yards']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    defense_df['cpoe_ewma'] = defense_df.groupby(['defteam', 'season'])['cpoe']\
        .transform(lambda x: x.shift().ewm(min_periods=1, span=2).mean())

    return defense_df

In [664]:
def def_cleaner(defense_df):
    """
    Preps passer df for merging; drops unnecessary columns
    """
    # Drop the non-ewma columns
    defense_df.drop(columns=['passing_yards','completion_percentage',
                            'air_yards', 'yards_after_catch', 'epa',     
                            'interception', 'qb_hit', 'sack', 'pass_touchdown', 'pass_attempts', 'cpoe', 'complete_pass', 'incomplete_pass',
                            'home_team', 'away_team'
                            ], inplace=True)
    
    return defense_df

In [665]:
defense_ewma = calculate_defensive_ewma(defense_df)

In [666]:
defense_prepped = def_cleaner(defense_ewma)

In [667]:
# Merge the defense and passer dataframes together
full_with_ewma = passer_prepped.merge(defense_prepped, how='inner', on=['defteam', 'season', 'week', 'roof', 'surface'], suffixes=('_passer', '_defense'))

# Get rid of flukey rows
filtered_with_ewma = full_with_ewma[(full_with_ewma['pass_attempts'] > 8) & (full_with_ewma['passing_yards'] >= 10)]

#Ford Field is empty, you hate to see that
filtered_with_ewma['surface'].replace('', 'fieldturf', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_with_ewma['surface'].replace('', 'fieldturf', inplace=True)


## Week 1 is NaN's (duh, we use past weeks for EWMA's) for MVP I'll just get rid but for future seasons/rookies I do need a strategy

In [668]:
filtered_with_ewma = filtered_with_ewma.dropna()

In [669]:
# Perform the merge based on game_id, season, week, and team
filtered_with_ewma_it = pd.merge(filtered_with_ewma, combined_schedule_df[['game_id', 'season', 'week', 'posteam', 'implied_total']], 
                     on=['game_id', 'season', 'week', 'posteam'], 
                     how='inner')

## Modeling

In [671]:
# Define the target and features
target = 'passing_yards'
features = filtered_with_ewma_it.columns.drop(['pass_attempts', 'passer_player_name', 'passing_yards', 'posteam', 'defteam', 'season', 'week', 'game_id'])

# Define categorical and numeric columns
categorical_columns = ['roof', 'surface']
numeric_columns = [col for col in features if col not in categorical_columns]

# Split the data into training (up to 2023) and testing (2024)
train_data = filtered_with_ewma_it[filtered_with_ewma_it['season'] <= 2023]
test_data = filtered_with_ewma_it[filtered_with_ewma_it['season'] == 2024]

# Separate features and target for both training and testing
X_train = train_data[features]
y_train = train_data[target]
X_test = test_data[features]
y_test = test_data[target]


# Create a ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns),
        ('num', StandardScaler(), numeric_columns)
    ])

# Apply the transformations to the training and test sets
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Convert the transformed data into a LightGBM-compatible format
train_data_lgb = lgb.Dataset(X_train_transformed, label=y_train)
test_data_lgb = lgb.Dataset(X_test_transformed, label=y_test, reference=train_data_lgb)

# Set up the parameters for LightGBM
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 20,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.6,
    'bagging_freq': 5,
    'verbose': -1,
    'bagging_freq': 1
}

# Train the model with early stopping
lgb_model = lgb.train(params, train_data_lgb, valid_sets=[test_data_lgb], callbacks=[lgb.early_stopping(stopping_rounds=10)], num_boost_round=1000)

# Make predictions on the test set
y_pred = lgb_model.predict(X_test_transformed, num_iteration=lgb_model.best_iteration)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE: {rmse}")

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[108]	valid_0's rmse: 72.3338
RMSE: 72.33378639523052


In [706]:
# features

In [673]:
# Ensure test_data contains the 2024 season
test_data = filtered_with_ewma_it[filtered_with_ewma_it['season'] == 2024]

In [728]:
# Add predictions to the test_data (2024) DataFrame
test_data['predicted_passing_yards'] = y_pred

# Calculate whether the model over-predicted or under-predicted
test_data['pyoe'] = test_data['passing_yards'] - test_data['predicted_passing_yards']

# # Display the updated dataframe with actual, predicted, and over/under information
test_data[['week', 'passer_player_name', 'passing_yards', 'predicted_passing_yards', 'pyoe']].sort_values(by='predicted_passing_yards', key=abs, ascending=False).tail(60)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_passing_yards'] = y_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['pyoe'] = test_data['passing_yards'] - test_data['predicted_passing_yards']


Unnamed: 0,week,passer_player_name,passing_yards,predicted_passing_yards,pyoe
3387,6,K.Murray,214.0,230.893137,-16.893137
4629,3,S.Darnold,181.0,230.363347,-49.363347
2052,4,D.Watson,176.0,229.429676,-53.429676
229,3,A.Richardson,167.0,228.869172,-61.869172
2051,3,D.Watson,196.0,228.699899,-32.699899
4939,2,T.Lawrence,220.0,227.810904,-7.810904
2914,2,J.Herbert,130.0,227.530256,-97.530256
5066,3,W.Levis,260.0,227.014732,32.985268
2179,4,G.Minshew,130.0,226.37604,-96.37604
2524,3,J.Daniels,254.0,226.3335,27.6665


## Current Week Preprocessing

In [675]:
def calculate_ewma_tester_off(passer_df):
    """
    Calculates EWMA for offensive columns using the current and previous weeks' data, including the current week.
    Takes into account multiple seasons.
    """
    # Sort by passer, season, and week
    passer_df = passer_df.sort_values(by=['passer_player_name', 'season', 'week'])

    # Calculate the exponentially weighted moving average for each offensive feature, including the current week
    passer_df['completion_percentage_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['completion_percentage']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    passer_df['pass_attempts_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['pass_attempts']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    passer_df['air_yards_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['air_yards']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    passer_df['yards_after_catch_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['yards_after_catch']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    passer_df['epa_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['epa']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    passer_df['interception_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['interception']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    passer_df['qb_hit_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['qb_hit']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    passer_df['sack_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['sack']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    passer_df['pass_touchdown_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['pass_touchdown']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    passer_df['passing_yards_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['passing_yards']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    passer_df['cpoe_ewma'] = passer_df.groupby(['passer_player_name', 'season'])['cpoe']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    return passer_df

In [676]:
passer_df_tester = passer_df[(passer_df['season'] == 2024)]

In [677]:
prediction_set_passer_nc = calculate_ewma_tester_off(passer_df_tester)

In [678]:
prediction_set_passer = pass_cleaner(prediction_set_passer_nc)

In [679]:
def calculate_ewma_tester_def(defense_df):
    """
    Calculates EWMA for defensive columns using previous weeks' data (excluding the current week).
    """
    # Sort by 'defteam', 'season', and 'week' in ascending order (to ensure time order)
    defense_df = defense_df.sort_values(by=['defteam', 'season', 'week'])

    # Ensure proper grouping by both defteam and season
    defense_df['completion_percentage_ewma'] = defense_df.groupby(['defteam', 'season'])['completion_percentage']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())
    
    defense_df['pass_attempts_ewma'] = defense_df.groupby(['defteam', 'season'])['pass_attempts']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    defense_df['air_yards_ewma'] = defense_df.groupby(['defteam', 'season'])['air_yards']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    defense_df['yards_after_catch_ewma'] = defense_df.groupby(['defteam', 'season'])['yards_after_catch']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    defense_df['epa_ewma'] = defense_df.groupby(['defteam', 'season'])['epa']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    defense_df['interception_ewma'] = defense_df.groupby(['defteam', 'season'])['interception']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    defense_df['qb_hit_ewma'] = defense_df.groupby(['defteam', 'season'])['qb_hit']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    defense_df['sack_ewma'] = defense_df.groupby(['defteam', 'season'])['sack']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    defense_df['pass_touchdown_ewma'] = defense_df.groupby(['defteam', 'season'])['pass_touchdown']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    defense_df['passing_yards_ewma'] = defense_df.groupby(['defteam', 'season'])['passing_yards']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    defense_df['cpoe_ewma'] = defense_df.groupby(['defteam', 'season'])['cpoe']\
        .transform(lambda x: x.ewm(min_periods=1, span=2).mean())

    return defense_df

In [680]:
defense_df_tester = defense_df[defense_df['season'] == 2024]

In [681]:
prediction_set_defense_nc = calculate_ewma_tester_def(defense_df_tester)

In [682]:
prediction_set_defense = def_cleaner(prediction_set_defense_nc)

In [683]:
# Merge the defense and passer dataframes together
prediction_set = prediction_set_passer.merge(prediction_set_defense, how='inner', on=['defteam', 'season', 'week', 'roof', 'surface'], suffixes=('_passer', '_defense'))

# Get rid of flukey rows
filtered_prediction_set = prediction_set[(prediction_set['pass_attempts'] > 8) & (prediction_set['passing_yards'] >= 10)]

#Ford Field is empty, you hate to see that
filtered_prediction_set['surface'].replace('', 'fieldturf', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_prediction_set['surface'].replace('', 'fieldturf', inplace=True)


In [684]:
filtered_prediction_set = filtered_prediction_set[filtered_prediction_set['week'] ==  5]
#need to make this dynamic instead of a manual change

In [687]:
filtered_prediction_set.drop(columns = ['game_id', 'passing_yards', 'pass_attempts'], inplace=True)

In [688]:
defense_columns = [
    'season', 'week', 'defteam', 'completion_percentage_ewma_defense', 'pass_attempts_ewma_defense',
    'air_yards_ewma_defense', 'yards_after_catch_ewma_defense', 'epa_ewma_defense',
    'interception_ewma_defense', 'qb_hit_ewma_defense', 'sack_ewma_defense',
    'pass_touchdown_ewma_defense', 'passing_yards_ewma_defense', 'cpoe_ewma_defense'
]

defense_df_tbj = filtered_prediction_set[defense_columns]

In [689]:
offense_columns = [
    'season', 'week', 'posteam', 'passer_player_name',
    'completion_percentage_ewma_passer', 'pass_attempts_ewma_passer', 'air_yards_ewma_passer',
    'yards_after_catch_ewma_passer', 'epa_ewma_passer', 'interception_ewma_passer',
    'qb_hit_ewma_passer', 'sack_ewma_passer', 'pass_touchdown_ewma_passer',
    'passing_yards_ewma_passer', 'cpoe_ewma_passer'
]

offense_df_tbj = filtered_prediction_set[offense_columns]

In [690]:
#Get EWMA's to future week for joining
offense_df_tbj.loc[:, 'week'] += 1
defense_df_tbj.loc[:, 'week'] += 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offense_df_tbj.loc[:, 'week'] += 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  defense_df_tbj.loc[:, 'week'] += 1


In [691]:
combined_schedule_preds = combined_schedule_df[combined_schedule_df['season'] == combined_schedule_df['season'].max()]
# combined_schedule_preds[combined_schedule_preds['week'] == 6]

In [693]:
# Merge the two DataFrames on relevant columns, avoiding duplicates
off_plus_schedule = pd.merge(
    offense_df_tbj, 
    combined_schedule_preds, 
    on=['season', 'week', 'posteam', 'passer_player_name'], 
    suffixes=('', '_implied'), 
    how='inner'
)

# Drop redundant columns if needed but keep 'game_id' as identifier
# If columns like 'posteam' or 'defteam' are duplicated, keep only one version
off_plus_schedule = off_plus_schedule.loc[:, ~off_plus_schedule.columns.duplicated()]

In [694]:
# Perform the merge on season, week, and defteam
full_pred_features = pd.merge(
    off_plus_schedule,
    defense_df_tbj,
    on=['season', 'week', 'defteam'],  # Merging on these columns
    suffixes=('_offense', '_defense'),
    how='inner'  # Use inner join to avoid boof rows
)

In [697]:
# List of the new column order
new_column_order = ['game_id', 'season', 'week','game_type','posteam', 'defteam', 'home_team', 'away_team', 'qb_name', 'passer_player_name',
                    'home_away', 'implied_total', 'roof', 'surface',
       'completion_percentage_ewma_passer', 'pass_attempts_ewma_passer',
       'air_yards_ewma_passer', 'yards_after_catch_ewma_passer',
       'epa_ewma_passer', 'interception_ewma_passer', 'qb_hit_ewma_passer',
       'sack_ewma_passer', 'pass_touchdown_ewma_passer',
       'passing_yards_ewma_passer', 'cpoe_ewma_passer', 'completion_percentage_ewma_defense',
       'pass_attempts_ewma_defense', 'air_yards_ewma_defense',
       'yards_after_catch_ewma_defense', 'epa_ewma_defense',
       'interception_ewma_defense', 'qb_hit_ewma_defense', 'sack_ewma_defense',
       'pass_touchdown_ewma_defense', 'passing_yards_ewma_defense',
       'cpoe_ewma_defense']

# Reorder inplace
full_pred_features = full_pred_features[new_column_order]

In [719]:
# Create the home_flag_passer column
full_pred_features['home_flag_passer'] = full_pred_features['posteam'] == full_pred_features['home_team']

# Create the home_flag_defense column
full_pred_features['home_flag_defense'] = full_pred_features['defteam'] == full_pred_features['home_team']

In [704]:
full_pred_features['game_id'].value_counts()

#three qb changes (LV, NO, NE), how the fuck do I wanna handle this?
#PITTSBURGH IS MIA WHY ITS NOT EVEN THE NEW QB FUCK FUCK FUCK FUCK FUCK FUCK FUCK FUCK FUCK

2024_06_ATL_CAR    2
2024_06_BUF_NYJ    2
2024_06_SF_SEA     2
2024_06_JAX_CHI    2
2024_06_CIN_NYG    2
2024_06_WAS_BAL    2
2024_06_ARI_GB     2
2024_06_PIT_LV     1
2024_06_TB_NO      1
2024_06_HOU_NE     1
Name: game_id, dtype: int64

In [712]:
# Function to make predictions on a new dataframe
def make_predictions(new_data, model, preprocessor, features, categorical_columns, numeric_columns):
    # Ensure the new data has the required columns
    new_data = new_data[features]

    # Apply the same preprocessing (OneHotEncoding and Scaling)
    new_data_transformed = preprocessor.transform(new_data)
    
    # Use the trained model to make predictions
    predictions = model.predict(new_data_transformed, num_iteration=model.best_iteration)
    
    return predictions

In [723]:
# Call the function to make predictions
predictions = make_predictions(full_pred_features, lgb_model, preprocessor, features, categorical_columns, numeric_columns)

full_pred_features['predicted_passing_yards'] = predictions

In [726]:
full_pred_features

Unnamed: 0,game_id,season,week,game_type,posteam,defteam,home_team,away_team,qb_name,passer_player_name,home_away,implied_total,roof,surface,completion_percentage_ewma_passer,pass_attempts_ewma_passer,air_yards_ewma_passer,yards_after_catch_ewma_passer,epa_ewma_passer,interception_ewma_passer,qb_hit_ewma_passer,sack_ewma_passer,pass_touchdown_ewma_passer,passing_yards_ewma_passer,cpoe_ewma_passer,completion_percentage_ewma_defense,pass_attempts_ewma_defense,air_yards_ewma_defense,yards_after_catch_ewma_defense,epa_ewma_defense,interception_ewma_defense,qb_hit_ewma_defense,sack_ewma_defense,pass_touchdown_ewma_defense,passing_yards_ewma_defense,cpoe_ewma_defense,home_flag_passer,home_flag_defense,predicted_passing_yards
0,2024_06_ATL_CAR,2024,6,REG,CAR,ATL,CAR,ATL,Andy Dalton,A.Dalton,home,19.75,outdoors,grass,0.646933,29.8,225.675,72.825,-11.152178,0.9,5.475,2.175,0.675,165.225,-2.434265,0.785135,27.61157,145.033058,112.239669,6.473052,0.322314,2.016529,0.933884,2.181818,195.793388,5.745237,True,False,210.151235
1,2024_06_PIT_LV,2024,6,REG,LV,PIT,LV,PIT,Aidan O'Connell,A.O'Connell,home,16.5,dome,grass,0.582237,17.25,161.25,30.5,-2.61917,0.75,3.5,1.0,0.25,91.0,-6.053392,0.699495,35.966942,295.876033,142.719008,9.376312,1.404959,6.107438,2.22314,1.867769,307.429752,3.785156,True,False,204.741886
2,2024_06_BUF_NYJ,2024,6,REG,NYJ,BUF,NYJ,BUF,Aaron Rodgers,A.Rodgers,home,19.75,outdoors,fieldturf,0.585787,47.024793,365.049587,118.214876,-12.770483,2.016529,10.661157,3.330579,1.545455,240.190083,-6.285805,0.7368,33.504132,220.479339,151.743802,3.542846,0.818182,5.595041,1.347107,1.22314,278.404959,6.35576,True,False,233.892222
3,2024_06_TB_NO,2024,6,REG,TB,NO,NO,TB,Baker Mayfield,B.Mayfield,away,23.0,dome,sportturf,0.753639,29.628099,162.31405,130.371901,6.738869,0.099174,1.942149,1.768595,2.586777,217.024793,3.049693,0.710883,36.983471,218.247934,205.404959,0.944839,1.033058,5.520661,1.966942,0.024793,306.561983,5.768513,False,True,243.889471
4,2024_06_SF_SEA,2024,6,REG,SF,SEA,SEA,SF,Brock Purdy,B.Purdy,away,26.25,outdoors,fieldturf,0.593954,31.231405,321.07438,95.099174,2.084596,1.586777,4.661157,1.801653,1.140496,259.140496,-0.41128,0.741618,30.380165,155.206612,138.38843,7.994588,0.016529,5.371901,3.214876,2.033058,257.429752,8.749064,False,True,252.141805
5,2024_06_HOU_NE,2024,6,REG,HOU,NE,NE,HOU,C.J. Stroud,C.Stroud,away,22.5,outdoors,fieldturf,0.732376,36.933884,257.305785,170.016529,6.806071,0.818182,5.975207,1.520661,1.231405,322.933884,3.953158,0.612779,29.793388,303.842975,83.68595,0.699661,0.892562,4.016529,2.479339,0.396694,224.495868,-2.789433,False,True,246.233541
6,2024_06_JAX_CHI,2024,6,REG,CHI,JAX,CHI,JAX,Caleb Williams,C.Williams,home,21.5,outdoors,grass,0.696975,29.297521,220.297521,156.942149,8.753779,0.198347,2.661157,1.826446,1.710744,270.619835,4.207281,0.73517,42.504132,321.446281,159.495868,13.451544,0.0,4.991736,3.198347,2.760331,360.561983,6.988943,True,False,244.932029
7,2024_06_CIN_NYG,2024,6,REG,NYG,CIN,NYG,CIN,Daniel Jones,D.Jones,home,20.75,outdoors,fieldturf,0.689986,35.016529,231.322314,121.702479,6.786679,0.239669,4.884298,2.446281,1.53719,258.247934,5.909178,0.646348,39.520661,345.181818,145.933884,15.591248,0.272727,5.512397,0.876033,3.322314,305.68595,1.023459,True,False,242.101048
8,2024_06_ATL_CAR,2024,6,REG,ATL,CAR,CAR,ATL,Kirk Cousins,K.Cousins,away,25.75,outdoors,grass,0.724993,47.396694,377.917355,200.363636,14.920277,0.983471,8.438017,3.090909,2.809917,418.206612,3.537554,0.701612,29.677686,206.115702,166.61157,11.668436,0.322314,2.289256,0.92562,2.008264,282.165289,2.782019,False,True,264.073071
9,2024_06_SF_SEA,2024,6,REG,SEA,SF,SEA,SF,Geno Smith,G.Smith,home,22.75,outdoors,fieldturf,0.708425,42.694215,224.818182,155.760331,-6.334534,0.380165,7.842975,5.669421,1.0,309.272727,0.238671,0.644292,29.090909,214.239669,96.520661,-4.697065,0.92562,4.123967,2.338843,1.033058,192.785124,-3.240653,True,False,243.70359


## Charts for model stuff if I need to look. It's a pretty shit model. It does ok I guess but not a lot of variance in projection lmao

In [710]:
# # Plotting the histogram of PYOE
# plt.figure(figsize=(10, 6))
# plt.hist(test_data['pyoe'], bins=25, edgecolor='black', alpha=0.7)
# plt.title('Distribution of Pass Yards Over Expected (PYOE)', fontsize=16)
# plt.xlabel('PYOE (Predicted Passing Yards - Actual Passing Yards)', fontsize=12)
# plt.ylabel('Frequency', fontsize=12)
# plt.grid(True, alpha=0.5)

# # Show the plot
# plt.show()

In [709]:
# # Plotting the histogram of PYOE
# plt.figure(figsize=(10, 6))
# plt.hist(test_data['passing_yards'], bins=30, edgecolor='black', alpha=0.7)
# plt.title('Distribution of Pass Yards', fontsize=16)
# plt.xlabel('Passing Yards', fontsize=12)
# plt.ylabel('Frequency', fontsize=12)
# plt.grid(True, alpha=0.5)

# # Show the plot
# plt.show()

In [708]:
# # Plotting the histogram of predicted passing yards and true passing yards
# plt.figure(figsize=(10, 6))

# # Plot histogram for true passing yards
# plt.hist(test_data['passing_yards'], bins=30, alpha=0.5, label='True Passing Yards', color='blue')

# # Plot histogram for predicted passing yards
# plt.hist(test_data['predicted_passing_yards'], bins=30, alpha=0.5, label='Predicted Passing Yards', color='orange')

# # Adding labels and title
# plt.xlabel('Passing Yards')
# plt.ylabel('Frequency')
# plt.title('Histogram of True vs Predicted Passing Yards on 2024 Season, Bins = 25')
# plt.legend(loc='upper right')

# # Show the plot
# plt.show()