In [1]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import numpy as np
import pandas as pd

## A lot of the below follows this guide youtube.com/watch?v=GrJP9FLV3FE&t=407s ##
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold

  from pandas import MultiIndex, Int64Index


In [2]:
## folder the fitted model and the score CSVs are written to ##
## on Colab use '/content/drive/MyDrive/Colab Notebooks/data' instead ##
output_folder = 'data/'

In [3]:
## load the saved play-level model frame ##
## on Colab read '/content/drive/MyDrive/Colab Notebooks/data/nfl_df_cv.zip' instead ##
df = pd.read_csv(
    'data/nfl_df_cv.zip',
    index_col=False,
    low_memory=False,
)

<details>
<summary><h3> Wrangling </h3></summary>
I had the data on my local machine, so I read it in and then dropped the columns I did not need, to reduce the amount of disk space used.

```python
import nfl_data_py as nfl

## pull play-by-play for the 1999-2021 seasons ##
df = nfl.import_pbp_data(years = range(1999, 2022))

## keep only the columns the wrangling below needs ##
## each column is listed once: a repeated label would make df['game_seconds_remaining'] ##
## return two columns and break the feature arithmetic further down ##
df = df[[
  'play_id', 'game_id', 'season', 'posteam', 'posteam_type', 'spread_line', 
  'game_seconds_remaining', 'play_type', 'game_half', 'result', 
  'score_differential', 'half_seconds_remaining', 
  'down', 'ydstogo', 'yardline_100', 'posteam_timeouts_remaining', 'defteam_timeouts_remaining' 
]]
```

The dataset is 1,098,040 rows by 382 columns, so you might need to break this into multiple steps. Since I had the data locally, it was slightly quicker for me.  

Using `R` you can use

```r
library(tidyverse)

# Presumably the upstream location of the per-season files read below (verify before relying on it):
# #https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_{x}.rds
# Read one local .rds file per season (1999-2021) and combine them into a single data frame.
df <- purrr::map_df(c(1999:2021), function(x){
  read_rds(glue::glue('~/Downloads/nfl_pbp/play_by_play_{x}.rds'))
  })

# Keep only the columns used by the wrangling steps.
pbp <- df |>
  select(play_id, game_id, season, posteam, posteam_type, 
         result, play_type, down, ydstogo, yardline_100,
         spread_line, game_seconds_remaining, game_half, 
         score_differential, half_seconds_remaining, 
         posteam_timeouts_remaining, defteam_timeouts_remaining)
```

Continue

A lot of wrangling was done, which is why I saved the output. The steps taken were:

```python
## derive the extra model features ##
## (most inputs are used as delivered by nflfastR) ##

## which side has the ball on each play ##
home_has_ball = df['posteam_type'] == 'home'
away_has_ball = df['posteam_type'] == 'away'

## SPREAD_LINE_DIFFERENTIAL ##
## the score margin measured against the spread rather than against zero, ##
## ie how close the possession team is to covering ##
home_margin_vs_spread = df['score_differential'] - df['spread_line']
away_margin_vs_spread = df['spread_line'] + df['score_differential']
df['spread_line_differential'] = np.where(
    home_has_ball,
    home_margin_vs_spread,
    np.where(away_has_ball, away_margin_vs_spread, np.nan)
)

## elapsed share, spread_time, and Diff_Time_Ratio are all custom features from nflfastR's model ##
## https://raw.githubusercontent.com/mrcaseb/nflfastR/master/R/helper_add_ep_wp.R ##
## elapsed share: fraction of the 3600 game seconds already played ##
df['elapsed_share'] = (3600 - df['game_seconds_remaining']) / 3600

## the spread from the possession team's point of view ##
df['posteam_spread'] = np.where(home_has_ball, df['spread_line'], -1 * df['spread_line'])

## both remaining features share the same exponential time weight ##
time_weight = np.exp(-4 * df['elapsed_share'])

## spread_time ##
df['spread_time'] = df['posteam_spread'] * time_weight

## Diff_Time_Ratio ##
df['diff_time_ratio'] = df['score_differential'] / time_weight

## RECEIVE_2H_KO ##
## determine who received the first kickoff ##
## keep the first kickoff row of every game; its posteam_type is taken as the receiving side ##
kickoff_df = df[df['play_type'] == 'kickoff'].groupby(['game_id'])[['game_id','posteam_type']].head(1)

## add back to df ##
## left merge: every play keeps its row; a game with no kickoff row gets a missing received_first_ko ##
df = pd.merge(df,
kickoff_df.rename(columns={
    'posteam_type' : 'received_first_ko'
    }),
    on=['game_id'],
    how='left'
)

## create receive 2nd half ko variable ##
## 1 on first-half plays where the possession side is not the side that received the opening kickoff ##
## NOTE(review): a missing received_first_ko compares as "not equal", so those first-half plays ##
## would also be flagged 1 - confirm this is acceptable ##
df['receive_2h_ko'] = np.where(
    (df['game_half'] == 'Half1') & (df['posteam_type'] != df['received_first_ko']),
    1, 0
)

## masks for the side in possession ##
home_on_offense = df['posteam_type'] == 'home'
away_on_offense = df['posteam_type'] == 'away'

## IS_PAT || 1 when the play is an extra point ##
df['is_pat'] = np.where(df['play_type'] == 'extra_point', 1, 0)

## POSTEAM_IS_HOME || 1 for home, 0 for away, missing when posteam_type is neither ##
away_or_missing = np.where(away_on_offense, 0, np.nan)
df['posteam_is_home'] = np.where(home_on_offense, 1, away_or_missing)

## COVER_RESULT ##
## the home side covers when result beats the spread line; the away side when it falls short of it ##
home_covered = np.where(df['result'] - df['spread_line'] > 0, 1, 0)
away_covered = np.where(df['spread_line'] - df['result'] > 0, 1, 0)
df['cover_result'] = np.where(
    home_on_offense,
    home_covered,
    np.where(away_on_offense, away_covered, np.nan)
)

## only needed for train/test split ##
split_key_cols = ['game_id', 'season']
## dependent var ##
target_cols = ['cover_result']
## independent vars from WP model ##
wp_feature_cols = [
    'spread_time',
    'score_differential',
    'diff_time_ratio',
    'posteam_is_home',
    'half_seconds_remaining',
    'game_seconds_remaining',
    'down',
    'ydstogo',
    'yardline_100',
    'posteam_timeouts_remaining',
    'defteam_timeouts_remaining',
    'receive_2h_ko',
]
## new features for CP model ##
cp_feature_cols = ['is_pat', 'spread_line_differential']

## select the model columns and remove NAs ##
model_df = df[split_key_cols + target_cols + wp_feature_cols + cp_feature_cols].dropna()

# save as a .zip to reduce size
zip_settings = dict(method='zip', archive_name='nfl_df_cv.csv')
model_df.to_csv('nfl_df_cv.zip', index=False, compression=zip_settings)
```

</details>

In [4]:
## preview the first three plays of the loaded frame ##
df.head(3)

Unnamed: 0,game_id,season,cover_result,spread_time,score_differential,diff_time_ratio,posteam_is_home,half_seconds_remaining,game_seconds_remaining,down,ydstogo,yardline_100,posteam_timeouts_remaining,defteam_timeouts_remaining,receive_2h_ko,is_pat,spread_line_differential
0,1999_01_ARI_PHI,1999,1.0,-3.0,0.0,0.0,1.0,1800.0,3600.0,1.0,10,77.0,3.0,3.0,0,0,3.0
1,1999_01_ARI_PHI,1999,1.0,-3.0,0.0,0.0,1.0,1800.0,3600.0,2.0,10,77.0,3.0,3.0,0,0,3.0
2,1999_01_ARI_PHI,1999,1.0,-3.0,0.0,0.0,1.0,1800.0,3600.0,3.0,9,76.0,3.0,3.0,0,0,3.0


In [5]:
## (rows, columns) of the loaded frame ##
df.shape

(913762, 17)

## MODEL 

Split the dependent and independent data frames. Since the data is at the play level and we want to predict something that occurs at the game level, we can't just take a random sample of plays. Instead, we will take a random sample of games and assign the test/train sets that way. This ensures no game has plays in both the test and train sets.

In [4]:
## we'll also hold out the last two seasons (2020 and 2021) for validation ##
model_construction_df = df[df['season'] < 2020].copy()
model_validation_df = df[df['season'] >= 2020].copy()

## get df of unique games ##
set_key_df = model_construction_df[['game_id']].drop_duplicates().reset_index(drop=True)

## assign to test / train randomly, one draw per game ##
## a seeded generator keeps the split identical on every Restart & Run All, ##
## so the saved model and the score CSVs stay comparable between runs ##
split_rng = np.random.default_rng(42)
set_key_df['rand_float'] = split_rng.uniform(
    low=0,
    high=1,
    size=len(set_key_df)
)

## assign set: draws above .33 go to training, the rest to test ##
set_key_df['is_training_set'] = np.where(
    set_key_df['rand_float'] > .33,
    1, 0
)

## match back to model df so every play inherits its game's flag ##
model_construction_df = pd.merge(
    model_construction_df,
    set_key_df[['game_id', 'is_training_set']],
    on=['game_id'],
    how='left'
)

## every play must have picked up exactly one flag from its game ##
assert model_construction_df['is_training_set'].isin([0, 1]).all()

In [5]:
## split the construction frame on the per-game flag ##
flagged_train = model_construction_df['is_training_set'] == 1
flagged_test = model_construction_df['is_training_set'] == 0
training_df = model_construction_df[flagged_train].copy()
test_df = model_construction_df[flagged_test].copy()

## x versions: everything except the ids, the split flag and the target ##
non_feature_cols = ['game_id', 'season', 'is_training_set', 'cover_result']
X_train = training_df.drop(columns=non_feature_cols).copy()
X_test = test_df.drop(columns=non_feature_cols).copy()

## y versions: the target only ##
y_train = training_df['cover_result'].copy()
y_test = test_df['cover_result'].copy()

In [None]:
## baseline model with default hyperparameters, to make sure everything works ##
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False)

## watch aucpr on the test set and stop early after 10 rounds ##
baseline_fit_options = dict(
    verbose=True,
    early_stopping_rounds=10,
    eval_metric='aucpr',
    eval_set=[(X_test, y_test)],
)
clf_xgb.fit(X_train, y_train, **baseline_fit_options)

In [7]:
## Hyperparameter Optimization ##
## do some hyper parameter optimization ##
## Round 1 ##
param_grid = {
    'max_depth' : [3, 4, 5, 6, 7],
    'learning_rate' : [0.1, 0.05, 0.01, 0.025, 0.005],
    'gamma' : [0.25],
    'reg_lambda' : [4, 5, 6, 7, 8],
    'n_estimators' : [100, 500, 1000, 1125, 1250],
}

## cross-validate by game, not by play ##
## a plain cv=3 can put plays of one game in both the fitting and the scoring fold, ##
## which is the leak the game-level train/test split was built to avoid ##
## X_train keeps training_df's row order, so its game_id column lines up as the group key ##
game_level_folds = list(
    GroupKFold(n_splits=3).split(X_train, y_train, groups=training_df['game_id'])
)

## set up grid search ##
optimal_params = GridSearchCV(
    estimator=xgb.XGBClassifier(
        objective='binary:logistic',
        subsample=0.9,
        colsample_bytree=0.75
    ),
    param_grid=param_grid,
    scoring='roc_auc',
    verbose=0,
    cv=game_level_folds
)

In [None]:
## fit ##
## no eval_set here: without early stopping it cannot change which parameters win, ##
## it only scores and logs the test set on every boosting round of every candidate ##
optimal_params.fit(
    X_train,
    y_train
)

In [None]:
## show the winning combination from the grid search ##
## (clf_xgb is still the baseline model, so its params say nothing about the search) ##
optimal_params.best_params_

In [None]:
## Round 1 Results ##
## {'gamma': 0., 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'reg_lambda': 1}

## Round 2 ##
## narrower candidate lists for the second search ##
param_grid = dict(
    max_depth=[5, 6, 7],
    learning_rate=[0.005, 0.01, 0.025],
    gamma=[.25],
    reg_lambda=[6, 8, 10],
    n_estimators=[1000, 1250, 1500],
)

## Round 2 Results ##
## {'gamma': 0.25, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000, 'reg_lambda': 6}

In [None]:
## Round 3 ##
## depth, learning rate and gamma are fixed; only reg_lambda and n_estimators still vary ##
param_grid = dict(
    max_depth=[5],
    learning_rate=[0.01],
    gamma=[.25],
    reg_lambda=[2, 4, 6],
    n_estimators=[1000, 1125],
)

## Round 3 Results ##
## {'gamma': 0.25, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000, 'reg_lambda': 6}

In [None]:
## cross-validate by game, not by play ##
## a plain cv=3 can put plays of one game in both the fitting and the scoring fold, ##
## which is the leak the game-level train/test split was built to avoid ##
## X_train keeps training_df's row order, so its game_id column lines up as the group key ##
game_level_folds = list(
    GroupKFold(n_splits=3).split(X_train, y_train, groups=training_df['game_id'])
)

## set up grid search ##
optimal_params = GridSearchCV(
    estimator=xgb.XGBClassifier(
        objective='binary:logistic',
        subsample=0.9,
        colsample_bytree=0.75
    ),
    param_grid=param_grid,
    scoring='roc_auc',
    verbose=0,
    cv=game_level_folds
)

In [None]:
## fit ##
## no eval_set here: without early stopping it cannot change which parameters win, ##
## it only scores and logs the test set on every boosting round of every candidate ##
optimal_params.fit(
    X_train,
    y_train
)

In [None]:
## rebuild the classifier with the values the grid search settled on ##
tuned_params = {
    'gamma': 0.25,
    'max_depth': 5,
    'reg_lambda': 6,
    'learning_rate': 0.01,
    'n_estimators': 1000,
}
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', **tuned_params)

In [None]:
## fit the tuned model; watch aucpr on the test set and stop early after 10 rounds ##
tuned_fit_options = dict(
    verbose=True,
    early_stopping_rounds=10,
    eval_metric='aucpr',
    eval_set=[(X_test, y_test)],
)
clf_xgb.fit(X_train, y_train, **tuned_fit_options)

In [None]:
## save model for future use ##
## os.path.join copes with output_folder ending in a slash or not ##
clf_xgb.save_model(
    os.path.join(output_folder, 'cp.model')
)

In [None]:
## a function for saving csvs of model performance locally. Not necessary to run ##
def score_models(model_arrays, model=None, output_dir=None):
    """Score a fitted classifier on several data sets and save the results as CSVs.

    For every data set this predicts cover probabilities and builds three
    one-set tables: observed cover rate per probability bin, confusion counts
    at a 0.5 threshold, and log loss / ROC AUC. The tables of all sets are
    stacked and written to ``binned_results.csv``, ``confusion_results.csv``
    and ``metric_results.csv``.

    Parameters
    ----------
    model_arrays : sequence of (DataFrame, str)
        Pairs of a frame and the label stored in the ``set_type`` column.
        Each frame must contain ``cover_result``; every other column is
        passed to the model as a feature.
    model : object with ``predict_proba``, optional
        Classifier to score. Defaults to the notebook-level ``clf_xgb``.
    output_dir : str, optional
        Folder the CSVs are written to. Defaults to the notebook-level
        ``output_folder``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``model_arrays`` is empty.
    """
    ## fall back to the notebook globals so existing calls keep working ##
    if model is None:
        model = clf_xgb
    if output_dir is None:
        output_dir = output_folder
    model_arrays = list(model_arrays)
    if not model_arrays:
        raise ValueError('score_models needs at least one [data frame, name] pair')

    ## the bin edges are the same for every set, so build them once ##
    bins = np.linspace(0, 1, 100)

    bin_dfs = []
    confusion_dfs = []
    metric_dfs = []
    for entry in model_arrays:
        scored_df = entry[0].copy()
        output_name = entry[1]
        ## create predictions: column 1 of predict_proba is the positive (cover) class ##
        scored_df['cover_prob'] = model.predict_proba(
            scored_df.drop(columns=['cover_result'])
        )[:, 1]
        ## bins ##
        ## note: the 'cover_prob' column of this table holds the np.digitize bin number, ##
        ## not a probability ##
        binned_df = scored_df.groupby(
            np.digitize(scored_df['cover_prob'], bins)
        ).agg(
            cover_average = ('cover_result', 'mean'),
            observations = ('cover_result', 'count'),
        ).reset_index().rename(columns={
            'index' : 'cover_prob'
        })
        binned_df['set_type'] = output_name
        ## confusion ##
        ## a probability of exactly .5 counts as a predicted non-cover, so every row ##
        ## with a 0/1 label lands in exactly one of the four cells ##
        predicted_cover = scored_df['cover_prob'] > .5
        actual_cover = scored_df['cover_result'] == 1
        actual_no_cover = scored_df['cover_result'] == 0
        confusion_df = pd.DataFrame([{
            'set_type' : output_name,
            'true_positive' : (predicted_cover & actual_cover).sum(),
            'false_positive' : (predicted_cover & actual_no_cover).sum(),
            'true_negative' : (~predicted_cover & actual_no_cover).sum(),
            'false_negative' : (~predicted_cover & actual_cover).sum(),
        }])
        ## log loss and roc auc ##
        metric_df = pd.DataFrame([{
            'set_type' : output_name,
            'log_loss' : log_loss(scored_df['cover_result'], scored_df['cover_prob']),
            'roc_auc' : roc_auc_score(scored_df['cover_result'], scored_df['cover_prob']),
        }])
        bin_dfs.append(binned_df)
        confusion_dfs.append(confusion_df)
        metric_dfs.append(metric_df)
    ## output ##
    pd.concat(bin_dfs).to_csv(os.path.join(output_dir, 'binned_results.csv'))
    pd.concat(confusion_dfs).to_csv(os.path.join(output_dir, 'confusion_results.csv'))
    pd.concat(metric_dfs).to_csv(os.path.join(output_dir, 'metric_results.csv'))

In [None]:
## pair every data set with its label, keeping only cover_result and the feature columns ##
split_only_cols = ['game_id', 'season', 'is_training_set']
test_arrays = [
    [training_df.drop(columns=split_only_cols).copy(), 'training'],
    [test_df.drop(columns=split_only_cols).copy(), 'test'],
    ## the validation frame was never merged with the is_training_set flag ##
    [model_validation_df.drop(columns=['game_id', 'season']).copy(), 'validate'],
]

score_models(test_arrays)

In [None]:
import nfl_data_py as nfl
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as plticker
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

## matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later dropped ##
## the old names, so use whichever spelling this installation knows ##
talk_style = 'seaborn-v0_8-talk' if 'seaborn-v0_8-talk' in plt.style.available else 'seaborn-talk'
plt.style.use(talk_style)
plt.style.use('ggplot')

## show at most 7 columns when a frame is displayed ##
pd.set_option('display.max_columns', 7)

2020


Data not available for 2021
Downcasting floats.


Index(['play_id', 'game_id', 'old_game_id', 'home_team', 'away_team',
       'season_type', 'week', 'posteam', 'posteam_type', 'defteam',
       ...
       'possession_team', 'offense_formation', 'offense_personnel',
       'defenders_in_box', 'defense_personnel', 'number_of_pass_rushers',
       'offense_players', 'n_offense', 'defense_players', 'n_defense'],
      dtype='object', length=382)