In [1]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
import requests
gc.collect()

from tqdm import tqdm

# Load Processed Data

In [2]:
# Location of the processed play-by-play CSV that the rest of the notebook works on.
PATH = './output/processed.csv'

df = pd.read_csv(PATH)

# Row count as a quick sanity check on the load.
print(df.shape[0])

1927115


## Challenges

There are two main challenges when weighting an EPA model.  

First, when a team goes up by 30 points, it stops running efficient plays and just runs out the clock. Plays need some sort of penalty weight when the score is out of hand. I'll try to arrive at this penalty empirically.  

Second, the Ron Yurko et al. paper I'm referencing assigns weighting penalties when the next score is 4 or 5 drives in the future. Essentially, in that case, the current drive has little bearing on the eventual score, so it should count for less when estimating expected points. Again, I'll try to arrive at this penalty empirically.  

I'll start with the second challenge, and I'm going to approach it slightly differently. First, I'm going to take a small subset of the data and try to predict the number of drives until the next score. This should be near zero when in the opponent's red zone, and might max out (at a guess, around 1.5) near a team's own 25. With multinomial logit, I can get a probability of zero, one, two, etc. drives until the next score. Then, using the rest of the data, I can group by drives until the next score and predict the probabilities of each type of score that way.

In [3]:
# One row per (game_id, drive_id); the `down` column holds the number of
# rows in that drive with a non-null `down` value.
gb = (
    df.groupby(['game_id', 'drive_id'])['down']
      .count()
      .reset_index()
)

# Largest per-drive count, reused when binning the histogram of play counts.
_max = gb['down'].max()
print(_max)
print(len(gb))

36
296840


In [4]:
# Just for fun/validation: look at the distribution of plays per drive.
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

x = pd.Series(gb.down, name="play_count")
fig, ax = plt.subplots(figsize=[15, 9])

# One unit-wide bin per play count. The edges must run through _max + 1:
# with edges stopping at _max - 1, the two largest in-range counts share the
# final bin and drives with exactly _max plays are dropped from the histogram.
# `density=True` reproduces the old `norm_hist=True` scaling without relying on
# the deprecated `sns.distplot`; alpha=0.4 matches distplot's default look.
ax.hist(x, bins=np.arange(0, _max + 2), density=True, alpha=0.4)
ax.set(xlabel='play_count', ylabel='percentage of drives')
ax.set_title('Distribution of Play Count on 300,000 CFB Drives')
plt.show()

# NOTE: the ./plots directory must already exist, otherwise savefig raises.
fig.savefig("./plots/play_counts.png")

<Figure size 1500x900 with 1 Axes>

In [5]:
# add indicator if it's a scoring drive

In [94]:
# Rows per drive_result label (only rows with a non-null `down` are counted).
# NOTE: this rebinds `gb`, replacing the per-drive play counts computed earlier.
gb = df.groupby('drive_result')['down'].count()
gb

drive_result
BLOCKED FG                       144
BLOCKED FG (TD) TD               159
BLOCKED PUNT                     159
BLOCKED PUNT TD                  125
DOWNS                          51298
END OF GAME                    33926
END OF HALF                    32523
FG GOOD                       239806
FG GOOD TD                        31
FG MISSED                      89035
FG MISSED TD                     128
FUMBLE                         73702
FUMBLE RETURN TD                1497
FUMBLE TD                       2652
INCOMPLETE                       600
INT                            98930
INT RETURN TOUCH                 162
INT TD                          6653
KICKOFF                           39
KICKOFF RETURN TD                 30
LATERAL                            2
MISSED FG TD                     275
PASS COMPLETE                    297
PASSING TD                    159048
PASSING TD TD                    153
PENALTY                          223
POSS. ON DOWNS           

In [108]:
# Inspect a single drive in full, then its play-by-play text line by line.
test = df.loc[df['drive_id'] == 4005484217]
print(test)

for play in test['play_text'].values:
    print(play)

# Every row whose drive_result is 'KICKOFF'.
uncat = df.loc[df['drive_result'] == 'KICKOFF']

print(len(uncat))
inspect_cols = ['game_id', 'drive_id', 'tr_game', 'offense', 'defense', 'play_text']
print(uncat[inspect_cols])

# Distinct drives affected; these are reviewed by hand.
print(list(uncat['drive_id'].unique()))

                     away           defense defense_conference  defense_score  \
1256922  Georgia Southern  Georgia Southern           Sun Belt            0.0   
1256923  Georgia Southern  Georgia Southern           Sun Belt            0.0   
1256924  Georgia Southern  Georgia Southern           Sun Belt            0.0   
1256925  Georgia Southern  Georgia Southern           Sun Belt            0.0   
1256926  Georgia Southern  Georgia Southern           Sun Belt            0.0   
1256927  Georgia Southern  Georgia Southern           Sun Belt            0.0   
1256928  Georgia Southern  Georgia Southern           Sun Belt            0.0   
1256929  Georgia Southern  Georgia Southern           Sun Belt            0.0   

         down    drive_id              home                  id  \
1256922   3.0  4005484217  New Mexico State  400548421102849906   
1256923   2.0  4005484217  New Mexico State  400548421102849905   
1256924   1.0  4005484217  New Mexico State  400548421102849904   
12

In [104]:
# Dump the play text of every 'KICKOFF'-labelled row, one play per line.
for play_text in uncat['play_text'].values:
    print(play_text)

Favian Upshaw run for a loss of 13 yards to the GeoSo 21
Favian Upshaw run for no gain to the GeoSo 34
Matt Breida run for 4 yds to the GeoSo 34
Austin Howard pass complete to Randall Menard for 11 yds to the Sthrn 20 for a 1ST down
Austin Howard pass complete to Randall Menard for 10 yds to the Sthrn 30 for a 1ST down
Austin Howard pass complete to Dillon Beard for 8 yds to the Sthrn 38
Karson Roberts pass complete to Alex Ludowig for 10 yds to the AFA 39 for a 1ST down
Garrett Brown run for a loss of 5 yards to the AFA 29
D.J. Johnson run for 2 yds to the AFA 41
Karson Roberts pass incomplete
Jacobi Owens run for 6 yds to the AFA 34
Karson Roberts run for 1 yd to the AFA 42
Kurt Palandech run for a loss of 1 yard to the UNLV 29
Kurt Palandech pass incomplete to Devonte Boyd
SAN JOSÉ ST Penalty, Defensive offside (5 Yards) to the UNLV 30 for a 1ST down
Kurt Palandech pass incomplete to Devonte Boyd
Team run for a loss of 1 yard to the Wash 31
Josh Cleveland run for 6 yds to the BwGrn 

In [77]:
# Drives labelled 'KICKOFF RETURN TD' usually turn out to be penalties.
# Game 252602572 is the exception: its data looks badly off, so every row
# belonging to it is removed.
keep_rows = df['game_id'] != 252602572
df = df[keep_rows]

In [100]:
# Repair drives whose drive_result is 'KICKOFF'.
kos = df.loc[df['drive_result'].eq('KICKOFF')]
dids = list(kos['drive_id'].unique())

# Among those drives, any with a play whose text mentions 'Field Goal'
# is relabelled 'FG GOOD'.
in_kickoff_drive = df['drive_id'].isin(dids)
mentions_field_goal = df['play_text'].str.contains('Field Goal')
fgs = df.loc[in_kickoff_drive & mentions_field_goal]
fg_ids = list(fgs['drive_id'].unique())

df.loc[df['drive_id'].isin(fg_ids), 'drive_result'] = 'FG GOOD'

# Hand-labelled drives.
punts = [4005483434]
end_of_half = [40054770815]

for drive_ids, label in ((punts, 'PUNT'), (end_of_half, 'END OF HALF')):
    df.loc[df['drive_id'].isin(drive_ids), 'drive_result'] = label

# Special case: drive 24269009911 is labelled a rushing TD and its rows are
# renumbered to drive 24269009912. The mask is taken before the renumbering
# so both assignments hit the same rows.
special_case = df['drive_id'] == 24269009911
df.loc[special_case, 'drive_result'] = 'RUSHING TD'
df.loc[special_case, 'drive_id'] = 24269009912

In [71]:
# Fix 'FG MISSED TD' drives that were really blocked field goals.
#
# A drive is relabelled 'BLOCKED FG (TD) TD' when any of its rows currently
# labelled 'FG MISSED TD' has play text containing 'BLOCKED' or 'blocked'.
# This replaces three copy-pasted passes with one: the old third pass
# ('blocked,') could never match anything, because every such row also
# contains 'blocked' and had already been relabelled by the second pass.
# NOTE(review): matching stays case-sensitive, so title-case 'Blocked' is
# still not caught - confirm whether that is intended.
is_fg_missed_td = df['drive_result'] == 'FG MISSED TD'
# na=False treats rows with missing play text as non-matching.
mentions_block = df['play_text'].str.contains('BLOCKED|blocked', na=False)

brt = df.loc[is_fg_missed_td & mentions_block]
brt_ids = list(brt.drive_id.unique())

df.loc[df['drive_id'].isin(brt_ids), 'drive_result'] = 'BLOCKED FG (TD) TD'

In [58]:
# Collapse synonymous field-goal labels onto one canonical spelling each.
made_fg_aliases = ['FG', 'MADE FG']
df.loc[df['drive_result'].isin(made_fg_aliases), 'drive_result'] = 'FG GOOD'

missed_fg_aliases = ['MISSED FG']
df.loc[df['drive_result'].isin(missed_fg_aliases), 'drive_result'] = 'FG MISSED'

In [50]:
# Fix the 'end of half TD' drive results: hand-picked drive ids, grouped by
# the label each drive is given.
end_of_half = [29290230615, 29304013515, 32259006215, 32308025415, 32315015219,
               33250230613, 33285263815, 33327000821, 40060388027]
fumble_tds = [30279211614]
int_tds = [30282006815, 40079088219]
rush_tds = [30247263316, 30268006217, 32243211716, 32301000917, 40054794315]
pass_tds = [30324002112, 32329023512]
block_fg_td = [40054834612]

# Applied in this order, one label per id list.
relabel_plan = (
    (end_of_half, 'END OF HALF'),
    (fumble_tds, 'FUMBLE RETURN TD'),
    (int_tds, 'INT TD'),
    (rush_tds, 'RUSHING TD'),
    (pass_tds, 'PASSING TD'),
    (block_fg_td, 'BLOCKED FG (TD) TD'),
)
for drive_ids, label in relabel_plan:
    df.loc[df['drive_id'].isin(drive_ids), 'drive_result'] = label

In [21]:
# Fix the "end of game TD" drive results using hand-picked drive ids.
int_tds = [30324020406, 40087609227]
fumble_tds = [40076354226, 40086912120]
end_of_game = [40078746229, 40094526122]

# Applied in this order, one label per id list.
relabel_plan = (
    (int_tds, 'INT TD'),
    (fumble_tds, 'FUMBLE RETURN TD'),
    (end_of_game, 'END OF GAME'),
)
for drive_ids, label in relabel_plan:
    df.loc[df['drive_id'].isin(drive_ids), 'drive_result'] = label


In [9]:
# Fix the "downs TD" drive results using hand-picked drive ids.
passing_tds = [40054825721, 4007635338, 40076343011, 40086953320, 4010133465, 4010128564]
rushing_tds = [40054786022, 40060392118, 40086953316]
interception_tds = [4005482704, 40075690213, 40076355220, 4008696135, 40086963817]

# Applied in this order, one label per id list.
relabel_plan = (
    (passing_tds, 'PASSING TD'),
    (rushing_tds, 'RUSHING TD'),
    (interception_tds, 'INT TD'),
)
for drive_ids, label in relabel_plan:
    df.loc[df['drive_id'].isin(drive_ids), 'drive_result'] = label

In [10]:
# Fix the 'FG TD' drive result: drive 40054786811 is split in two.
# Rows where Baylor is on offense become an 'FG GOOD' drive under a new id.
# The mask is taken once, before the id changes, so both assignments hit
# the same rows.
baylor_rows = (df['drive_id'] == 40054786811) & (df['offense'] == 'Baylor')
df.loc[baylor_rows, 'drive_result'] = 'FG GOOD'
df.loc[baylor_rows, 'drive_id'] = 4005478681100

# Whatever still carries the original id is labelled a rushing TD.
remaining_rows = df['drive_id'] == 40054786811
df.loc[remaining_rows, 'drive_result'] = 'RUSHING TD'

In [11]:
# Fold the "end of 4th" label into 'END OF GAME'.
ended_in_4th = df['drive_result'].eq('END OF 4TH QUARTER')
df.loc[ended_in_4th, 'drive_result'] = 'END OF GAME'

In [12]:
# Merge hand-picked blocked-FG drives into the drive immediately before them.
# bfg_ids is not used in this cell (presumably the matching game ids - verify).
bfg_ids = [242550275, 242760259, 243250150, 243250344]
bfg_dids = [24255027512, 24276025916, 24325015008, 24325034420]
to_fix = [drive_id - 1 for drive_id in bfg_dids]

# Label the preceding drive (id - 1) first ...
df.loc[df['drive_id'].isin(to_fix), 'drive_result'] = 'BLOCKED FG (TD) TD'

# ... then renumber the listed drives onto that preceding id. The renumbered
# rows keep whatever drive_result they already had.
# NOTE: not idempotent - re-running relabels rows that still carry an id in
# to_fix, although the renumbering finds nothing unless an id in bfg_dids
# reappears.
rows_to_merge = df['drive_id'].isin(bfg_dids)
df.loc[rows_to_merge, 'drive_id'] = df.loc[rows_to_merge, 'drive_id'] - 1


In [13]:
# Merge every 'BLOCKED PUNT TD' drive into the drive immediately before it.
is_blocked_punt_td = df['drive_result'] == 'BLOCKED PUNT TD'
bps = list(df.loc[is_blocked_punt_td, 'drive_id'].unique())
to_fix = [drive_id - 1 for drive_id in bps]

# Label the preceding drive (id - 1) first ...
df.loc[df['drive_id'].isin(to_fix), 'drive_result'] = 'BLOCKED PUNT TD'

# ... then renumber the blocked-punt rows onto that preceding id.
# NOTE: not idempotent - bps is derived from the current labels, so a second
# run would pick up the drives just relabelled and shift ids again.
rows_to_merge = df['drive_id'].isin(bps)
df.loc[rows_to_merge, 'drive_id'] = df.loc[rows_to_merge, 'drive_id'] - 1