In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#libraries for classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, accuracy_score

# Plot appearance: FiveThirtyEight theme with a 15 x 7.5 default figure size
plt.style.use('fivethirtyeight')
plt.rcParams.update({"figure.figsize": [15, 7.5]})

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [2]:
# Columns kept for modelling: the target ('classification') followed by the predictors
model_columns = [
    'classification',
    'defendersInTheBox',
    'DL',
    'LB',
    'DB',
    'quarter',
    'down.x',
    'yardsToGo',
    'yardline_100',
]

# Read the modelling file and keep only those columns, in the order listed
df = pd.read_csv('Model Data.csv')[model_columns].reset_index(drop=True)

# Preview the first ten rows
df.head(10)

Unnamed: 0,classification,defendersInTheBox,DL,LB,DB,quarter,down.x,yardsToGo,yardline_100
0,pass short right,6,4,2,5,1,1,10,77
1,run middle,6,4,2,5,1,2,6,73
2,pass short middle,5,4,1,6,1,3,4,71
3,run end,7,4,3,4,1,1,10,56
4,pass short left,5,4,1,6,1,3,12,58
5,run end,6,3,4,4,1,1,10,26
6,pass short middle,6,2,4,5,1,2,5,21
7,run end,7,2,4,5,1,1,10,85
8,pass short right,5,3,3,5,1,2,10,85
9,pass deep right,4,1,3,7,1,3,8,83


In [3]:
# Hold out 25% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both parts, and the seed fixes the split
train, test = train_test_split(
    df,
    test_size=.25,
    stratify=df['classification'],
    random_state=801,
)

# Separate the predictors (x) from the label (y) for each part
x_train, y_train = train.drop(columns='classification'), train['classification']
x_test, y_test = test.drop(columns='classification'), test['classification']

In [4]:
# Fit the initial gradient boosting model (trees limited to depth 3).
# random_state is fixed, matching the seed used for the train/test split, so
# the fitted model and the accuracy computed from it are the same on every run.
gb = GradientBoostingClassifier(max_depth=3, random_state=801)
gb.fit(x_train, y_train)

# Predicted class label for every row of the held-out test set
yhat_gb = gb.predict(x_test)

In [5]:
# Share of test rows whose predicted label equals the true label
gb_accuracy = accuracy_score(y_test, yhat_gb)

print('Gradient Boosting:')
print('Accuracy:', gb_accuracy)

Gradient Boosting:
Accuracy: 0.2043906131718395


In [22]:
# Candidate tree depths for the gradient boosting model: 1 through 14
parameters = {'max_depth': list(range(1, 15))}
gb_par = GradientBoostingClassifier()

# The search does not need to run every time the kernel is restarted, so it is
# skipped unless this flag is switched on. A previous run selected
# max_depth = 3, which is the value used for the fitted model.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    # Cross-validated search over the candidate depths
    param_gb = GridSearchCV(gb_par, parameters)
    param_gb.fit(x_train, y_train)

    # print out optimal parameter
    print(param_gb.best_params_)

{'max_depth': 3}


In [23]:
# Fit the initial random forest model with default hyper-parameters.
# A random forest draws random samples and features while growing its trees;
# random_state is fixed (same seed as the train/test split) so the fitted
# model and the accuracy computed from it are the same on every run.
rf = RandomForestClassifier(random_state=801)
rf.fit(x_train, y_train)

# Predicted class label for every row of the held-out test set
yhat_rf = rf.predict(x_test)

In [24]:
# Validating metric for the random forest predictions (yhat_rf).
# The heading names the model being scored; it previously read
# 'Gradient Boosting:' even though the random forest was evaluated.
print('Random Forest:')
print('Accuracy:', accuracy_score(y_test, yhat_rf))

Gradient Boosting:
Accuracy: 0.1684330052990159


In [6]:
# Load the play-by-play file, order the rows by offense, game and play, then
# renumber them and expose that row number as a regular column (the later
# merge joins on it under the name 'index')
plays = (
    pd.read_csv('Plays Data.csv', encoding='unicode_escape')
    .sort_values(['possessionTeam', 'gameId', 'playId'])
    .reset_index(drop=True)
    .reset_index()
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Rolling Offensive EPA

In [24]:
# Running average EPA per play type for every offense.
#
# `plays` is ordered by possessionTeam, gameId, playId, so each team's rows are
# walked in game/play order. Every play produces one output row holding the
# team's average EPA on pass plays and on run plays up to and including that
# play; plays of any other type carry the previous averages forward. Both
# averages start at 0 until the team's first play of that type.
#
# NOTE(review): each average includes the current play's own EPA - confirm that
# is intended before using it as a predictor for that same play.
# NOTE(review): a missing epa on a run/pass play makes that type's running
# total, and every later average of that type, NaN - confirm epa has no nulls.
off_frames = []
for team in plays.possessionTeam.unique():
    temp_team = plays.query('possessionTeam == @team')

    # running totals, play counts and latest averages for each play type
    run_total, run_count, run_average = 0, 0, 0
    pass_total, pass_count, pass_average = 0, 0, 0
    run_rolling_avg = []
    pass_rolling_avg = []

    for play_type, epa in zip(temp_team.play_type, temp_team.epa):
        if play_type == "run":
            run_total += epa
            run_count += 1
            # divide by the number of run plays so far, not by all plays so far
            run_average = run_total / run_count
        elif play_type == "pass":
            pass_total += epa
            pass_count += 1
            # divide by the number of pass plays so far, not by all plays so far
            pass_average = pass_total / pass_count
        # every play records both averages; an untouched one is carried forward
        run_rolling_avg.append(run_average)
        pass_rolling_avg.append(pass_average)

    # "ind" is the row label in `plays`, used later as the merge key
    off_frames.append(pd.DataFrame({"ind": temp_team.index,
                                    "rolling_pass_off_epa": pass_rolling_avg,
                                    "rolling_run_off_epa": run_rolling_avg}))

# Build the result once: DataFrame.append was removed in pandas 2.0, and
# growing a frame inside the loop copies it on every iteration.
if off_frames:
    rolling_off_epa = pd.concat(off_frames)
else:
    rolling_off_epa = pd.DataFrame(columns=["ind", "rolling_pass_off_epa", "rolling_run_off_epa"])
    

# Rolling Defensive EPA

In [40]:
# Running average EPA allowed per play type for every defense.
#
# Rows are ordered by defteam, gameId, playId so that each defense's plays are
# accumulated in game/play order. (Ordering by possessionTeam, as before,
# walked a defense's plays grouped by opponent instead.) The row labels of
# `plays` are kept, so nothing is renumbered here.
#
# Every play produces one output row holding the defense's average EPA on pass
# plays and on run plays up to and including that play; plays of any other type
# carry the previous averages forward. Both averages start at 0 until the
# defense's first play of that type.
#
# NOTE(review): each average includes the current play's own EPA - confirm that
# is intended before using it as a predictor for that same play.
# NOTE(review): a missing epa on a run/pass play makes that type's running
# total, and every later average of that type, NaN - confirm epa has no nulls.
defense_sorted = plays.sort_values(['defteam', 'gameId', 'playId'])

def_frames = []
for team in defense_sorted.defteam.unique():
    temp_team = defense_sorted.query('defteam == @team')

    # running totals, play counts and latest averages for each play type
    run_total, run_count, run_average = 0, 0, 0
    pass_total, pass_count, pass_average = 0, 0, 0
    run_rolling_avg = []
    pass_rolling_avg = []

    for play_type, epa in zip(temp_team.play_type, temp_team.epa):
        if play_type == "run":
            run_total += epa
            run_count += 1
            # divide by the number of run plays so far, not by all plays so far
            run_average = run_total / run_count
        elif play_type == "pass":
            pass_total += epa
            pass_count += 1
            # divide by the number of pass plays so far, not by all plays so far
            pass_average = pass_total / pass_count
        # every play records both averages; an untouched one is carried forward
        run_rolling_avg.append(run_average)
        pass_rolling_avg.append(pass_average)

    # "ind" carries the play's 'index' column from `plays`, which is the key
    # the defensive merge joins on
    def_frames.append(pd.DataFrame({"ind": temp_team['index'].to_numpy(),
                                    "rolling_pass_def_epa": pass_rolling_avg,
                                    "rolling_run_def_epa": run_rolling_avg}))

# Build the result once: DataFrame.append was removed in pandas 2.0, and
# growing a frame inside the loop copies it on every iteration.
if def_frames:
    rolling_def_epa = pd.concat(def_frames)
else:
    rolling_def_epa = pd.DataFrame(columns=["ind", "rolling_pass_def_epa", "rolling_run_def_epa"])

In [41]:
# Attach each play's rolling defensive EPA by matching the play's 'index'
# column to the 'ind' key of the rolling table
def_merged = plays.merge(
    rolling_def_epa,
    how='left',
    left_on='index',
    right_on='ind',
)

# Attach the rolling offensive EPA through the 'ind' key both tables share
merged = def_merged.merge(rolling_off_epa, how='left', on='ind')

In [47]:
# Keep only the play identifiers and the four rolling EPA columns
epa_columns = [
    'gameId',
    'playId',
    'rolling_pass_def_epa',
    'rolling_run_def_epa',
    'rolling_pass_off_epa',
    'rolling_run_off_epa',
]
final_merged = merged[epa_columns]

In [48]:
# Write the per-play rolling EPA table to a CSV file in the working directory
final_merged.to_csv(path_or_buf='rolling epa per play.csv')