In [317]:
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as smf
import seaborn as sns  # useful for exploratory data analysis (EDA)
import sklearn         # machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import nflfastpy as nfl
#comments

In [None]:
# Load play-by-play data one season at a time, tag every row with its season,
# and stack the per-season frames into a single DataFrame.
season_frames = []
for season in range(1999, 2021):
    season_pbp = nfl.load_pbp_data(season)
    season_frames.append(season_pbp.assign(season=season))
data = pd.concat(season_frames)

In [None]:
def _mean_home_epa(game):
    """Return the mean EPA over the plays of one game where the home team has possession."""
    home_has_ball = game['posteam'] == game['home_team']
    return game[home_has_ball]['epa'].mean()


# One row per game_id with the home side's mean EPA per play.
home_epa_play = (
    data.groupby(['game_id'])
    .apply(_mean_home_epa)
    .reset_index()
    .rename(columns={0: 'home_epa_play'})
)

In [None]:
# Preview the per-game home-team EPA frame built in the previous cell.
home_epa_play

In [None]:
# Mean EPA per play over the away team's possessions, one row per game_id.
# The value column is named 'away_epa_play' so it cannot be confused with, or
# accidentally joined against, the home-side 'home_epa_play' column.
away_epa_play = (
    data.groupby(['game_id'])
    .apply(lambda x: x[x['posteam'] == x['away_team']]['epa'].mean())
    .reset_index()
    .rename(columns={0: 'away_epa_play'})
)
away_epa_play

In [None]:
# Display the away-team EPA frame (it is also displayed at the end of the cell that builds it).
away_epa_play


In [None]:
# Join the home and away EPA frames on the game key only.
# Without an explicit `on`, merge() joins on every column the two frames share,
# which can include a float EPA column and then silently drops almost every row.
# validate='one_to_one' raises if either side has more than one row per game_id.
df2 = home_epa_play.merge(away_epa_play, on='game_id', validate='one_to_one')

# df1 is kept so that any existing reference to it still resolves; it is an
# independent copy of the same joined frame.
df1 = df2.copy()

In [None]:
# Relabel the merged frame's three columns by position: key, home value, away value.
merged_column_names = ['game_id', 'home_epa_play', 'away_epa_play']
df2.columns = merged_column_names

In [None]:
# Take the last recorded home/away score within each game, then add their sum as 'total'.
scores = (
    data.groupby(['game_id'])[['home_score', 'away_score']]
    .last()
    .reset_index()
)
scores['total'] = scores['home_score'] + scores['away_score']
scores

In [None]:
# Combine scores with the per-game EPA columns.
# The join key is stated explicitly so the result does not depend on which
# column names the two frames happen to share.
final_prep = scores.merge(df2, on='game_id')
final_prep

In [None]:
# Apply the plot style *before* creating the figure: a style sheet only affects
# artists created after it is set, so setting it last leaves this figure unstyled.
# matplotlib 3.6 renamed the bundled seaborn styles ('seaborn-dark' became
# 'seaborn-v0_8-dark'); use whichever name this installation provides.
dark_style = 'seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark'
plt.style.use(dark_style)

# x: total score of the game; y: home + away mean EPA per play.
x = final_prep.total
y = final_prep.home_epa_play + final_prep.away_epa_play

fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(x, y)
ax.set_xlabel('Total score (home + away)')
ax.set_ylabel('Combined EPA per play (home + away)')
ax.set_title('Combined EPA per play vs. total score')
plt.show()

In [None]:
# Pairwise relationship grid over final_prep, coloured by the 'total' column.
pairplot_options = {'hue': 'total', 'palette': "YlOrBr"}
g = sns.pairplot(final_prep, **pairplot_options)

In [None]:
# Hold out 20% of the games for evaluation.
# A fixed random_state makes the split (and every number derived from it)
# reproducible across kernel restarts; 10 matches the seed used for the
# classification split later in this notebook.
train, test = train_test_split(final_prep, test_size=.2, random_state=10)

In [None]:
# Fit a linear regression of total score on the two per-game EPA columns,
# then record R^2 on the same training rows.
epa_columns = ['home_epa_play', 'away_epa_play']
train_inputs = train[epa_columns]
train_target = train['total']

lin_reg = LinearRegression()
trained_model = lin_reg.fit(train_inputs, train_target)
r_sq = lin_reg.score(train_inputs, train_target)

In [None]:
# Score the held-out rows and compute residuals (actual - predicted).
# assign() returns a new frame instead of writing columns into the object
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning
# and keeps the cell safe to re-run.
test = test.assign(
    pred_total=trained_model.predict(test[['home_epa_play', 'away_epa_play']])
)
test = test.assign(resid=test['total'] - test['pred_total'])
test

In [None]:
# New, more complex model: mean EPA split by play type (rush / pass) and by
# side of the ball (offense keyed on posteam, defense keyed on defteam).
rush_plays = data.loc[data['rush_attempt'] == 1, :]
pass_plays = data.loc[data['pass_attempt'] == 1, :]

offense_keys = ['game_id', 'posteam', 'season', 'week']
defense_keys = ['game_id', 'defteam', 'season', 'week']

rushing_offense_epa = rush_plays.groupby(offense_keys, as_index=False)['epa'].mean()

rushing_defense_epa = rush_plays.groupby(defense_keys, as_index=False)['epa'].mean()

passing_offense_epa = pass_plays.groupby(offense_keys, as_index=False)['epa'].mean()

passing_defense_epa = pass_plays.groupby(defense_keys, as_index=False)['epa'].mean()

In [None]:
# Add lagged variables: for each team, the EPA recorded two rows earlier.
# shift() with a POSITIVE period moves values down, so row i receives the value
# from row i-2 of the same team (a lag). A negative period would instead pull
# values from later rows, i.e. from games that have not happened yet, and leak
# future information into the features.
# NOTE(review): this relies on rows being in chronological order within each
# team. The frames come out of a groupby keyed on game_id first; confirm that
# game_id sorts by date.
LAG_GAMES = 2  # lags of 3 and 4 games were also considered (epa_shifted3 / epa_shifted4)

for epa_frame, team_column in (
    (rushing_offense_epa, 'posteam'),
    (rushing_defense_epa, 'defteam'),
    (passing_offense_epa, 'posteam'),
    (passing_defense_epa, 'defteam'),
):
    # The column name stays 'epa_shifted2' because later cells refer to it by that name.
    epa_frame['epa_shifted2'] = epa_frame.groupby(team_column)['epa'].shift(LAG_GAMES)

In [None]:
# Pair rushing with passing EPA for each side of the ball, relabel the team
# column to a common name, then join offense and defense per team/season/week.
offense_join_keys = ['posteam', 'season', 'week']
defense_join_keys = ['defteam', 'season', 'week']
play_type_suffixes = ('_rushing', '_passing')

offense_epa = (
    rushing_offense_epa
    .merge(passing_offense_epa, on=offense_join_keys, suffixes=play_type_suffixes)
    .rename(columns={'posteam': 'team'})
)
defense_epa = (
    rushing_defense_epa
    .merge(passing_defense_epa, on=defense_join_keys, suffixes=play_type_suffixes)
    .rename(columns={'defteam': 'team'})
)
epa = offense_epa.merge(
    defense_epa,
    on=['team', 'season', 'week'],
    suffixes=('_offense', '_defense'),
)

In [None]:
# Use the rushing-offense copy of the game key as the frame's 'game_id' column.
game_id_relabel = {'game_id_rushing_offense': 'game_id'}
epa = epa.rename(columns=game_id_relabel)

In [None]:
# Distinct (season, week, teams, scores) rows plus a 0/1 flag for a home win.
schedule_columns = ['season', 'week', 'home_team', 'away_team', 'home_score', 'away_score']
schedule = (
    data[schedule_columns]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(home_team_win=lambda x: (x.home_score > x.away_score).astype(int))
)

# Attach the EPA columns twice: once keyed on the home team, once on the away
# team. Overlapping column names from the second join get _home / _away suffixes.
home_side_epa = epa.rename(columns={'team': 'home_team'})
away_side_epa = epa.rename(columns={'team': 'away_team'})
df = (
    schedule
    .merge(home_side_epa, on=['home_team', 'season', 'week'])
    .merge(away_side_epa, on=['away_team', 'season', 'week'], suffixes=('_home', '_away'))
)

df.head()

In [None]:
# Subset of df: away score, season, and the four home-side EPA columns.
epa2_columns = [
    'away_score',
    'season',
    'epa_rushing_offense_home',
    'epa_passing_offense_home',
    'epa_rushing_defense_home',
    'epa_passing_defense_home',
]
epa2 = df[epa2_columns]

In [None]:
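# NOTE: left disabled. epa2 does not include a 'home_team_win' column, so
# hue='home_team_win' would fail as written; add that column to epa2 before re-enabling.
#a = sns.pairplot(epa2, hue='home_team_win', palette = "YlOrBr")
#plt.savefig('EPA Correlations')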

In [None]:
# Keep only complete rows (for example, the shifted EPA columns are NaN where
# a team has no game at the shifted position).
df = df.dropna()

# Columns that identify a game or give its score, rather than describe team performance.
non_feature_columns = [
    'season', 'home_score', 'away_score', 'home_team', 'week', 'away_team',
    'game_id_home', 'game_id_passing_defense_home', 'game_id_passing_offense_home',
    'game_id_rushing_defense_home',
    'game_id_away', 'game_id_passing_offense_away', 'game_id_rushing_defense_away',
    'game_id_passing_defense_away',
]
df2 = df.drop(non_feature_columns, axis=1)

# Predictor names only: 'home_team_win' is the target, so it must not appear
# in the feature list.
features = [column for column in df2.columns if column != 'home_team_win']
features

In [None]:
# Separate predictors from the home-win target, then hold out 20% of rows
# using a fixed seed.
target_column = 'home_team_win'
model_inputs = df2.drop(columns=[target_column])
model_target = df2[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    model_inputs,
    model_target,
    test_size=0.20,
    random_state=10,
)

In [None]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the training split.
logit = LogisticRegression(random_state=0, solver='liblinear')
logit.fit(X_train, y_train)

# Predict on the held-out split.
y_pred = logit.predict(X_test)

# Accuracy on the held-out split, expressed as a percentage.
logit_accuracy_pct = 100*metrics.accuracy_score(y_test, y_pred)
print(f'The Logistic model correctly predicts a win {logit_accuracy_pct:.2f}% of the time.')

In [None]:
# First (and, for a binary target, only) row of the fitted coefficient matrix.
importance = logit.coef_[0]

# Print every coefficient next to its positional index.
for position, coefficient in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, coefficient))

# Bar chart of the coefficients, one bar per position.
bar_positions = list(range(len(importance)))
plt.bar(bar_positions, importance)
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

# Counts of actual vs. predicted labels on the held-out split.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(7, 7))

sns.set(font_scale=1.4)  # for label size
# fmt='d' prints the cell counts as plain integers; seaborn's default
# annotation format ('.2g') renders counts of 100 or more in scientific notation.
sns.heatmap(cm, ax=ax, annot=True, fmt='d', annot_kws={"size": 16})  # font size

ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()

In [None]:
# Random forest model
from sklearn.ensemble import RandomForestClassifier

# Build the classifier (100 trees, fixed seed) and fit it on the training split.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X_train, y_train)

# Predict on the held-out split; this rebinds y_pred to the forest's predictions.
y_pred = forest.predict(X_test)

# Share of held-out rows predicted correctly, as a percentage.
forest_accuracy = metrics.accuracy_score(y_test, y_pred)
forest_score = forest_accuracy*100
print(f'The Random Forest model correctly predicts a win {forest_score:.2f}% of the time.')

In [None]:
# Recover feature importance scores, labelled with the column names the forest
# was actually trained on. Taking the labels from X_train.columns keeps names
# and scores aligned even if the feature set or its column order changes; a
# hand-maintained list would silently mislabel the scores in that case.
feature_imp = pd.Series(
    forest.feature_importances_,
    index=list(X_train.columns),
).sort_values(ascending=False)

# Importance scores, largest first.
feature_imp

In [None]:
# import sklearn.cluster
import matplotlib.pyplot as plt

# Set the resolution *before* creating the figure: rcParams are read when a
# figure is created, so changing them afterwards does not affect this plot.
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

fig, ax = plt.subplots(figsize=(10, 6))

# Horizontal bars: one per feature, in the order held by feature_imp.
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)

ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Important Features")
sns.despine()

plt.show()

In [None]:
# TODO / ideas for follow-up:
#  - Look at DVOA, weighted DVOA, and DYAR.
#  - Compare play-action success with run success.
#  - Hypothesis: defenses become more predictable when the offense has a good rushing attack.