In [1]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
import requests
gc.collect()

from tqdm import tqdm

# Load Processed Data

In [2]:
PATH = './output/processed.csv'
df = pd.read_csv(PATH)

## Challenges

There are two main challenges when weighting an EPA model.  

One, when teams get up 30 points, they will stop running efficient plays and just run clock. There needs to be some sort of penalty weight for plays when the score is out of hand. I'll try and arrive to this penalty empirically.  

Two, in the Ron Yurko et. al. paper I'm referencing, they assign weighting penalties if the next score is 4 or 5 drives in the future. Essentially, in that case, the current drive isn't deterministic when it comes to expected points. Again, I'll try to arrive to this penalty empirically.  

I'll start with the second challenge, and I'm going to approach it slightly differently. First, I'm going to take a small subset of the data and try to predict drives till next score. This should be near zero when in the opponent's redzone, and might max out (just a guess, 1.5) around a team's own 25. With multinomial logit, I can get a probability of zero, one, two, etc drives till next score. Then, using the rest of the data, I can group by drives till next score and predict probabilities of each type of score that way.

In [3]:
gb = df.groupby(['game_id','drive_id'])['down'].count().reset_index()
# gb = gb.groupby(['down'])['drive_id'].count()
# gb = gb.sort_values(ascending=False)
# gb = gb.reset_index()
# gb = gb.rename(columns={'drive_id':'play_count'})
_max = gb.down.max()
print(_max)
print(len(gb))

36
275017


In [4]:
# just for fun/validation, wanted to look at 
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

x = pd.Series(gb.down, name="play_count")
fig = plt.figure(figsize=[15,9])
# ax1 = sns.kdeplot(x, bw=0.14, label="play_count")
ax = sns.distplot(x, bins=np.arange(0,_max), kde=False, norm_hist=True)
ax.set(xlabel='play_count', ylabel='percentage of drives')
ax.set_title('Distribution of Play Count on 300,000 CFB Drives')
plt.show()

fig.savefig("./plots/play_counts.png")

<Figure size 1500x900 with 1 Axes>

In [5]:
# Drives labelled "FG MISSED TD" are actually made field goals, ensuing kickoff returned for TD

df.loc[df.drive_result=='FG MISSED TD', 'drive_result'] = 'FG GOOD'

In [6]:
# fix 'FG GOOD TD'
# for text in list(cat.tr_game.values):
#     print(text)
fgs = [30282005814]
rush_tds = [32252239316, 32266019408]

df.loc[df.drive_id==30282005814, 'drive_result'] = 'FG GOOD'

df.loc[df.drive_id.isin(rush_tds), 'drive_result'] = 'RUSHING TD'

In [7]:
# standardize
df.loc[df.drive_result=='FUMBLE RETURN TD', 'drive_result'] = 'FUMBLE TD'

df.loc[df.drive_result=='PUNT TD', 'drive_result'] = 'PUNT RETURN TD'

df.loc[df.drive_result == 'INT RETURN TOUCH', 'drive_result'] = 'INT TD'

df.loc[df.drive_result == 'BLOCKED FG (TD) TD', 'drive_result'] = 'MISSED FG TD'

df.loc[df.drive_result=='POSS. ON DOWNS', 'drive_result'] = 'TURNOVER ON DOWNS'

In [8]:
# for incompletes, if there is barely any time left, drive result = end of half or end of game. 
# otherwise turnover on downs

incomp = df.loc[df.drive_result == 'INCOMPLETE']

icb = incomp.groupby(['drive_id'])['tr_game'].min().reset_index()

eofg = icb.loc[icb.tr_game < 30]
eofg_ids = list(eofg.drive_id.values)

incomp = incomp.loc[~incomp.drive_id.isin(eofg_ids)]

icb = incomp.groupby(['drive_id'])['tr_half'].min().reset_index()
eofh = icb.loc[icb.tr_half < 30]
eofh_ids = list(eofh.drive_id.values)

incomp = incomp.loc[~incomp.drive_id.isin(eofh_ids)]

down_ids = list(incomp.drive_id.values)

df.loc[df.drive_id.isin(eofg_ids), 'drive_result'] = 'END OF GAME'
df.loc[df.drive_id.isin(eofh_ids), 'drive_result'] = 'END OF HALF'
df.loc[df.drive_id.isin(down_ids), 'drive_result'] = 'TURNOVER ON DOWNS'

In [9]:
# same with completes

comp = df.loc[df.drive_result == 'PASS COMPLETE']

ccb = comp.groupby(['drive_id'])['tr_game'].min().reset_index()

eofg = ccb.loc[ccb.tr_game < 30]
eofg_ids = list(eofg.drive_id.values)

comp = comp.loc[~comp.drive_id.isin(eofg_ids)]

ccb = comp.groupby(['drive_id'])['tr_half'].min().reset_index()
eofh = ccb.loc[ccb.tr_half < 30]
eofh_ids = list(eofh.drive_id.values)

comp = comp.loc[~comp.drive_id.isin(eofh_ids)]

down_ids = list(comp.drive_id.values)

df.loc[df.drive_id.isin(eofg_ids), 'drive_result'] = 'END OF GAME'
df.loc[df.drive_id.isin(eofh_ids), 'drive_result'] = 'END OF HALF'
df.loc[df.drive_id.isin(down_ids), 'drive_result'] = 'TURNOVER ON DOWNS'

In [10]:
# same with penalties

pens = df.loc[df.drive_result == 'PENALTY']

pcb = pens.groupby(['drive_id'])['tr_game'].min().reset_index()

eofg = pcb.loc[pcb.tr_game < 30]
eofg_ids = list(eofg.drive_id.values)

pens = pens.loc[~pens.drive_id.isin(eofg_ids)]

pcb = pens.groupby(['drive_id'])['tr_half'].min().reset_index()
eofh = pcb.loc[pcb.tr_half < 30]
eofh_ids = list(eofh.drive_id.values)

pens = pens.loc[~pens.drive_id.isin(eofh_ids)]

down_ids = list(pens.drive_id.values)

df.loc[df.drive_id.isin(eofg_ids), 'drive_result'] = 'END OF GAME'
df.loc[df.drive_id.isin(eofh_ids), 'drive_result'] = 'END OF HALF'
df.loc[df.drive_id.isin(down_ids), 'drive_result'] = 'TURNOVER ON DOWNS'

In [11]:
# for drive result == kickoffs, if there is barely any time remain in half, end of half
# otherwise, i think they are just random plays out of other drives. dropping them

kos = df.loc[df.drive_result == 'KICKOFF']

kgb = kos.groupby(['drive_id'])['tr_half'].min().reset_index()

eofh = kgb.loc[kgb.tr_half <= 60]
eofh_ids = list(eofh.drive_id.values)

kos = kos.loc[~kos.drive_id.isin(eofh_ids)]
drops = list(kos.drive_id.values)

df.loc[df.drive_id.isin(eofh_ids), 'drive_result'] = 'END OF HALF'

df = df.loc[~df.drive_id.isin(drops)]

In [12]:
# drop these, they're kickoff penalties
df = df.loc[df.drive_result != 'KICKOFF RETURN TD']

In [13]:
# Rushing TD TD often is two drives with same ids

# 24318000805 Ole Miss

# 24339003820 Colorado

# 24276002413 Stanford

df.loc[((df.drive_id == 24318000805)&(df.offense=='Ole Miss')), 'drive_id'] = 2431800080500

df.loc[((df.drive_id == 24339003820)&(df.offense=='Colorado')), 'drive_id'] = 2433900382000

df.loc[((df.drive_id == 24276002413)&(df.offense=='Stanford')), 'drive_id'] = 2427600241300

df.loc[df.drive_id==2431800080500, 'drive_time'] = 150
df.loc[df.drive_id==2431800080500, 'drive_result'] = 'TURNOVER ON DOWNS'

df.loc[df.drive_id==2433900382000, 'drive_time'] = 100
df.loc[df.drive_id==2433900382000, 'drive_result'] = 'TURNOVER ON DOWNS'

df.loc[df.drive_id==2427600241300, 'drive_time'] = 100
df.loc[df.drive_id==2427600241300, 'drive_result'] = 'TURNOVER ON DOWNS'

df.loc[df.drive_result == 'RUSHING TD TD', 'drive_result'] = 'RUSHING TD'

In [14]:
# Same with PASSING TD TD (two drives with same ids)

df.loc[((df.drive_id == 24248001223)&(df.offense=='Northern Arizona')), 'drive_id'] = 2424800122300
df.loc[df.drive_id==2424800122300, 'drive_time'] = 100
df.loc[df.drive_id==2424800122300, 'drive_result'] = 'TURNOVER ON DOWNS'
df.loc[((df.drive_id == 24248001223)&(df.offense=='Arizona')), 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24255025914)&(df.offense=='Western Michigan')), 'drive_id'] = 24255025913
df.loc[df.drive_id==24255025913, 'drive_result'] = 'BLOCKED FG'
df.loc[((df.drive_id == 24255025914)&(df.offense=='Virginia Tech')), 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24262006614)&(df.offense=='Northern Illinois')), 'drive_id'] = 24262006616
df.loc[((df.drive_id == 24262006616)), 'drive_result'] = 'PASSING TD'
df.loc[((df.drive_id == 24262006614)&(df.offense=='Iowa State')), 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24283003001)&(df.offense=='California')), 'drive_id'] = 2428300300100
df.loc[((df.drive_id == 2428300300100)), 'drive_result'] = 'TURNOVER ON DOWNS'
df.loc[((df.drive_id == 24283003001)&(df.offense=='USC')), 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24283007707)&(df.offense=='Northwestern')), 'drive_id'] = 2428300770700
df.loc[(df.drive_id == 2428300770700), 'drive_result'] = 'FG MISSED'
df.loc[((df.drive_id == 24283007707)&(df.offense=='Indiana')), 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24295027705)&(df.offense=='Syracuse')), 'drive_id'] = 24295027704
df.loc[df.drive_id == 24295027704, 'drive_result'] = 'BLOCKED FG'
df.loc[df.drive_id == 24295027705, 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24304015435)&(df.offense=='Wake Forest')), 'drive_id'] = 24304015434
df.loc[df.drive_id == 24304015434, 'drive_result'] = 'BLOCKED FG'
df.loc[df.drive_id == 24304015435, 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24311023826)&(df.offense=='Vanderbilt')), 'drive_id'] = 24311023825
df.loc[df.drive_id == 24311023825, 'drive_result'] = 'BLOCKED FG'
df.loc[df.drive_id == 24311023826, 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24304257915)&(df.offense=='South Carolina')), 'drive_id'] = 2430425791500
df.loc[df.drive_id == 2430425791500, 'drive_result'] = 'FG MISSED'
df.loc[df.drive_id == 24304257915, 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24325025430)&(df.offense=='BYU')), 'drive_id'] = 2432502543000
df.loc[df.drive_id == 2432502543000, 'drive_result'] = 'FG MISSED'
df.loc[df.drive_id == 24325025430, 'drive_result'] = 'PASSING TD'

# df.loc[df.drive_result == 'RUSHING TD TD', 'drive_result'] = 'RUSHING TD'



In [15]:
df.loc[df.drive_id==29311022818,'drive_result'] = 'RUSHING TD'
df.loc[df.drive_id==30338222604,'drive_result'] = 'RUSHING TD'
df.loc[df.drive_id==32252230601,'drive_result'] = 'RUSHING TD'
df.loc[df.drive_id==32301000216,'drive_result'] = 'RUSHING TD'
df.loc[df.drive_id==31001020115,'drive_result'] = 'INT TD'
df.loc[df.drive_id==31281025112,'drive_result'] = 'INT TD'

df.loc[df.drive_id==30254002525,'drive_id'] = 3025400252500
df.loc[df.drive_id==3025400252500, 'drive_result'] = 'FUMBLE TD'
df.loc[df.drive_id==30254002525,'drive_result'] = 'RUSHING TD'

df.loc[((df.drive_id==30282027706)&(df.offense=='West Virginia')),'drive_result'] = 'RUSHING TD'
df.loc[((df.drive_id==30282027706)&(df.offense=='UNLV')),'drive_id'] = 3028202770600
df.loc[df.drive_id==3028202770600,'drive_result'] = 'INT'

df.loc[((df.drive_id==31260015815)&(df.offense=='Nebraska')),'drive_result'] = 'RUSHING TD'
df.loc[((df.drive_id==31260015815)&(df.offense=='Washington')),'drive_id'] = 3126001581500
df.loc[df.drive_id==3126001581500,'drive_result'] = 'PUNT'

df.loc[((df.drive_id==32294006809)&(df.offense=='Boise State')),'drive_result'] = 'RUSHING TD'
df.loc[((df.drive_id==32294006809)&(df.offense=='UNLV')),'drive_id'] = 3229400680900
df.loc[df.drive_id==3229400680900,'drive_result'] = 'INT'

df.loc[((df.drive_id==32307002518)&(df.offense=='Washington')),'drive_result'] = 'PASSING TD'
df.loc[((df.drive_id==32307002518)&(df.offense=='California')),'drive_id'] = 3230700251800
df.loc[df.drive_id==3230700251800,'drive_result'] = 'FG GOOD'

df.loc[((df.drive_id==33242224720)&(df.offense=='Samford')),'drive_result'] = 'RUSHING TD'
df.loc[((df.drive_id==33242224720)&(df.offense=='Georgia State')),'drive_id'] = 3324222472000
df.loc[df.drive_id==3324222472000,'drive_result'] = 'PUNT'

df.loc[((df.drive_id==33311002415)&(df.offense=='Oregon')),'drive_result'] = 'PASSING TD'
df.loc[((df.drive_id==33311002415)&(df.offense=='Stanford')),'drive_id'] = 3331100241500
df.loc[df.drive_id==3331100241500,'drive_result'] = 'MISSED FG TD'

In [16]:
# none of these returned for a TD, i checked
ints = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains('intercepted'))]
int_ids = list(ints.drive_id.unique())

df.loc[df.drive_id.isin(int_ids), 'drive_result'] = 'INT'

In [17]:
safeties = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains('SAFETY'))]
sf_ids = list(safeties.drive_id.unique())

df.loc[df.drive_id.isin(sf_ids), 'drive_result'] = 'SF'

In [18]:
# checked for touchdowns, again only punts
punts = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains('punt'))]
punt_ids = list(punts.drive_id.unique())

df.loc[df.drive_id.isin(punt_ids), 'drive_result'] = 'PUNT'

In [19]:
base = r'^{}'
expr = '(?=.*{})'
words = ['field', 'goal', 'GOOD']
fg_good = base.format(''.join(expr.format(w) for w in words))

fgg = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(fg_good,regex=True))]
fgg_ids = list(fgg.drive_id.unique())
df.loc[df.drive_id.isin(fgg_ids), 'drive_result'] = 'FG GOOD'

In [20]:
words = ['field', 'goal', 'MISSED']
fg_missed = base.format(''.join(expr.format(w) for w in words))

fgm = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(fg_missed,regex=True))]
fgm_ids = list(fgm.drive_id.unique())
df.loc[df.drive_id.isin(fgm_ids), 'drive_result'] = 'FG MISSED'

In [21]:
words = ['field', 'goal', 'BLOCKED']
fg_blocked = base.format(''.join(expr.format(w) for w in words))

fgb = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(fg_blocked,regex=True))]
fgb_ids = list(fgb.drive_id.unique())
df.loc[df.drive_id.isin(fgb_ids), 'drive_result'] = 'BLOCKED FG'

In [22]:
words = ['rush','TOUCHDOWN']
rtd = base.format(''.join(expr.format(w) for w in words))

rush_td = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(rtd,regex=True))]
rtd_ids = list(rush_td.drive_id.unique())
df.loc[df.drive_id.isin(rtd_ids), 'drive_result'] = 'RUSHING TD'

In [23]:
words = ['pass','complete','TOUCHDOWN']
ptd = base.format(''.join(expr.format(w) for w in words))

pass_td = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(ptd,regex=True))]
ptd_ids = list(pass_td.drive_id.unique())
df.loc[df.drive_id.isin(ptd_ids), 'drive_result'] = 'PASSING TD'

In [24]:
fumbles = df.loc[(df.drive_result=='Uncategorized')&(df.play_type=='Fumble Recovery (Opponent)')]
fids = list(fumbles.drive_id.unique())
df.loc[df.drive_id.isin(fids), 'drive_result'] = 'FUMBLE'

fumbles = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains('fumble|fumbles'))&(df.play_type=='Fumble Recovery (Own)')]
fids = list(fumbles.drive_id.unique())
df.loc[df.drive_id.isin(fids), 'drive_result'] = 'FUMBLE'


In [25]:
safeties = df.loc[(df.drive_result=='Uncategorized')&(df.play_type=='Safety')]
sids = list(safeties.drive_id.unique())
df.loc[df.drive_id.isin(sids), 'drive_result'] = 'SF'

In [39]:
end_of_game = df.loc[(df.drive_result=='Uncategorized')&(df.tr_game<=60)]
eog_ids = list(end_of_game.drive_id.unique())
df.loc[df.drive_id.isin(eog_ids), 'drive_result'] = 'END OF GAME'

end_of_half = df.loc[(df.drive_result=='Uncategorized')&(df.tr_half<=60)]
eoh_ids = list(end_of_half.drive_id.unique())
df.loc[df.drive_id.isin(eoh_ids), 'drive_result'] = 'END OF HALF'

In [47]:
# QB kneels, end of game
words = ['TEAM','rush']
kneel = base.format(''.join(expr.format(w) for w in words))
kneel_df = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(kneel,regex=True))&(df.tr_game<=1800)]
kneel_ids = list(kneel_df.drive_id.unique())
df.loc[df.drive_id.isin(kneel_ids), 'drive_result'] = 'END OF GAME'

words = ['TEAM','run','loss']
kneel = base.format(''.join(expr.format(w) for w in words))
kneel_df = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(kneel,regex=True))&(df.tr_game<=1800)]
kneel_ids = list(kneel_df.drive_id.unique())
df.loc[df.drive_id.isin(kneel_ids), 'drive_result'] = 'END OF GAME'

# QB kneels, end of half
words = ['TEAM','rush']
kneel = base.format(''.join(expr.format(w) for w in words))
kneel_df = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(kneel,regex=True))]
kneel_ids = list(kneel_df.drive_id.unique())
df.loc[df.drive_id.isin(kneel_ids), 'drive_result'] = 'END OF HALF'

words = ['TEAM','run','loss']
kneel = base.format(''.join(expr.format(w) for w in words))
kneel_df = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(kneel,regex=True))]
kneel_ids = list(kneel_df.drive_id.unique())
df.loc[df.drive_id.isin(kneel_ids), 'drive_result'] = 'END OF HALF'

In [54]:
# turnover on downs
tod = df.loc[(df.drive_result=='Uncategorized')&(df.down==4)&df.play_text.str.contains('incomplete')]
tod_ids = list(tod.drive_id.unique())
df.loc[df.drive_id.isin(tod_ids), 'drive_result'] = 'TURNOVER ON DOWNS'

In [64]:
# drop the rest, probably got drive ids split up
df = df.loc[df.drive_result != 'Uncategorized']

In [65]:
gb = df.groupby(['drive_result'])['down'].count()
gb

drive_result
BLOCKED FG               51
DOWNS                 51256
END OF GAME           36068
END OF HALF           32601
FG GOOD              231259
FG MISSED             85557
FUMBLE                66236
FUMBLE TD              3918
INT                   89036
INT TD                 6626
MISSED FG TD            401
PASSING TD           150217
PUNT                 592344
PUNT RETURN TD         3737
RUSHING TD           164017
SF                     2334
TD                   212689
TURNOVER ON DOWNS     69171
Name: down, dtype: int64

In [63]:
cat = df.loc[(df.drive_result=='Uncategorized')]
cat = cat.sort_values(by=['drive_id','tr_game'])
# print(cat[['drive_id','offense','play_text']])

# print(len(cat.drive_id.unique()))
print(cat)

                  away        defense defense_conference  defense_score  down  \
1658126           UTEP           UTEP     Conference USA            0.0   1.0   
1658127           UTEP           UTEP     Conference USA            0.0   1.0   
1781858   Prairie View           UNLV      Mountain West           13.0   2.0   
1781859   Prairie View           UNLV      Mountain West           13.0   1.0   
1781860   Prairie View           UNLV      Mountain West           13.0   3.0   
1781861   Prairie View           UNLV      Mountain West           13.0   2.0   
1781862   Prairie View           UNLV      Mountain West           13.0   1.0   
466         Vanderbilt    Wake Forest                ACC            3.0   2.0   
467         Vanderbilt    Wake Forest                ACC            3.0   1.0   
468         Vanderbilt    Wake Forest                ACC            3.0   3.0   
469         Vanderbilt    Wake Forest                ACC            3.0   2.0   
470         Vanderbilt    Wa

In [28]:
# df[df['col_name'].str.contains(r'^(?=.*apple)(?=.*banana)')]

# base = r'^{}'
# expr = '(?=.*{})'
# words = ['apple', 'banana', 'cat']  # example
# base.format(''.join(expr.format(w) for w in words))

In [30]:
print(fumbles[['offense','defense','play_type','play_text']])

            offense   defense              play_type  \
1384522    Missouri  Kentucky  Fumble Recovery (Own)   
1455213  Pittsburgh   Clemson  Fumble Recovery (Own)   
1677693  Notre Dame  Stanford  Fumble Recovery (Own)   
1719401     Clemson      Duke  Fumble Recovery (Own)   

                                                 play_text  
1384522  Drew Lock sacked by Josh Allen for 0 yards to ...  
1455213  Chris Blewitt kickoff for 40 yds , Grant Radak...  
1677693  Avery Davis run for 3 yds to the Stanf 41 Aver...  
1719401  Chase Brice run for no gain to the Clem 48 Cha...  


In [61]:
for pt in list(cat.play_text.values):
    print(pt)

Jay Cutler rush for loss of 1 yard to the WFrst34.
Paul Dombrowski pass complete to Brandon Allen for 11 yards to the TTL 44.
Isaiah Stanback pass complete to Anthony Russo for no gain to the AFA17, clock 00:43.
Timeout WASHINGTON, clock 00:52.
Timeout AIR FORCE, clock 00:52.
UCLA penalty 54 yard illegal block accepted, no play.
LIBERTY penalty 5 yard delay of game accepted.
Steven Jyles rush for 2 yards to the Wyom29.
Steven Jyles pass complete to Drouzon Quillen for 11 yards to the Wyom24, clock 04:45.
Unlv penalty 4 yard delay of game accepted.
Ucla penalty 64 yard illegal block accepted, no play.
Oklahoma penalty 46 yard holding accepted, no play.
Timeout MARYLAND, clock 01:01.
Danny Baugher rush for 36 yards to the Oregn 44 for a 1ST down.
Arizona penalty 5 yard false start accepted.
Timeout UTAH, clock 03:36.
Miami Oh penalty 36 yard holding accepted, no play.
Timeout MIAMI OH, clock 01:08.
Demyron Martin rush for no gain to the NoTex 30 for a 1ST down, North Texas penalty 13 yar