In [1]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
import requests
gc.collect()

from tqdm import tqdm

# Load Processed Data

In [2]:
# Location of the processed play-by-play export this notebook works from.
PATH = './output/processed.csv'

# Read the whole file into memory and report how many rows it holds.
df = pd.read_csv(PATH)
print(df.shape[0])

1878292


## Challenges

There are two main challenges when weighting an EPA model.  

One, when a team gets up by 30 points, it will stop running efficient plays and just run out the clock. There needs to be some sort of penalty weight for plays when the score is out of hand. I'll try to arrive at this penalty empirically.  

Two, in the Ron Yurko et al. paper I'm referencing, the authors assign weighting penalties if the next score is 4 or 5 drives in the future. Essentially, in that case, the current drive isn't deterministic when it comes to expected points. Again, I'll try to arrive at this penalty empirically.  

I'll start with the second challenge, and I'm going to approach it slightly differently. First, I'm going to take a small subset of the data and try to predict the number of drives until the next score. This should be near zero when in the opponent's red zone, and might max out (just a guess: 1.5) around a team's own 25. With multinomial logit, I can get a probability of zero, one, two, etc. drives until the next score. Then, using the rest of the data, I can group by drives until the next score and predict the probability of each type of score that way.

In [3]:
# For every (game, drive) pair, count the rows that have a recorded `down`;
# after reset_index() the `down` column holds that per-drive count.
gb = (
    df.groupby(['game_id', 'drive_id'])['down']
    .count()
    .reset_index()
)

# Largest per-drive count, followed by the number of drives found.
_max = gb['down'].max()
print(_max)
print(gb.shape[0])

36
286844


In [4]:
# Just for fun/validation: look at how many plays a drive contains.
import os

import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

x = pd.Series(gb.down, name="play_count")

# One unit-wide bin per integer play count. The edges run through _max + 1 so
# the longest drives get a bin of their own; edges that stop at _max - 1 drop
# those drives entirely and merge the two counts below them into the closing
# bin (the last histogram bin is closed on both sides).
bin_edges = np.arange(0, _max + 2)

fig, ax = plt.subplots(figsize=[15, 9])
# Drawn with matplotlib directly because seaborn's distplot is deprecated.
# density=True normalises the bars; with unit-wide bins each bar's height is
# the share of drives with that play count. alpha=0.4 keeps the translucent
# bars distplot used for its histograms.
ax.hist(x, bins=bin_edges, density=True, alpha=0.4)
ax.set(xlabel='play_count', ylabel='percentage of drives')
ax.set_title('Distribution of Play Count on 300,000 CFB Drives')

# Make sure the output folder exists and write the file before showing, so the
# save neither fails on a fresh checkout nor depends on how the backend treats
# a figure that has already been shown.
os.makedirs("./plots", exist_ok=True)
fig.savefig("./plots/play_counts.png")

plt.show()

<Figure size 1500x900 with 1 Axes>

In [5]:
# Drives labelled "FG MISSED TD" are actually made field goals whose ensuing
# kickoff was returned for a TD, so they are relabelled as made field goals.
mislabelled_fg = df['drive_result'] == 'FG MISSED TD'
df.loc[mislabelled_fg, 'drive_result'] = 'FG GOOD'

In [6]:
# Fix 'FG GOOD TD': these drives are corrected by hand, keyed on drive id.
# Each list holds the drive ids that should receive the label assigned below.
fgs = [30282005814]
rush_tds = [32252239316, 32266019408]

# Both corrections read their ids from the lists above, so adding another
# drive only means extending the matching list.
df.loc[df.drive_id.isin(fgs), 'drive_result'] = 'FG GOOD'
df.loc[df.drive_id.isin(rush_tds), 'drive_result'] = 'RUSHING TD'

In [40]:
# Standardize: map alternate spellings of a drive result onto a single label.
label_aliases = {
    'FUMBLE RETURN TD': 'FUMBLE TD',
    'INT RETURN TOUCH': 'INT TD',
    'BLOCKED FG (TD) TD': 'MISSED FG TD',
    'POSS. ON DOWNS': 'TURNOVER ON DOWNS',
}

# No replacement label is itself a key above, so the order of the rewrites
# does not matter.
for old_label, new_label in label_aliases.items():
    df.loc[df['drive_result'] == old_label, 'drive_result'] = new_label

In [8]:
# Drives whose result is recorded as 'INCOMPLETE': if there is barely any time
# left, the drive result becomes end of game or end of half; otherwise it is
# treated as a turnover on downs.
# NOTE(review): `tr_game` / `tr_half` are presumably the time remaining in the
# game / half - confirm their units against the preprocessing step.
incomp = df[df['drive_result'] == 'INCOMPLETE']

# Lowest `tr_game` reached on each drive; below 30 counts as the game ending.
icb = incomp.groupby('drive_id')['tr_game'].min().reset_index()
eofg = icb[icb['tr_game'] < 30]
eofg_ids = eofg['drive_id'].tolist()

# Among the drives that are left, a lowest `tr_half` below 30 counts as the
# half ending.
incomp = incomp[~incomp['drive_id'].isin(eofg_ids)]
icb = incomp.groupby('drive_id')['tr_half'].min().reset_index()
eofh = icb[icb['tr_half'] < 30]
eofh_ids = eofh['drive_id'].tolist()

# Whatever remains had time on the clock.
incomp = incomp[~incomp['drive_id'].isin(eofh_ids)]
down_ids = incomp['drive_id'].tolist()

# Write the new labels back onto every row carrying one of the collected ids.
for drive_ids, new_result in (
    (eofg_ids, 'END OF GAME'),
    (eofh_ids, 'END OF HALF'),
    (down_ids, 'TURNOVER ON DOWNS'),
):
    df.loc[df['drive_id'].isin(drive_ids), 'drive_result'] = new_result

In [33]:
# Same treatment for drives whose result is recorded as 'PASS COMPLETE': end
# of game or end of half when barely any time is left, otherwise a turnover on
# downs.
comp = df[df['drive_result'] == 'PASS COMPLETE']

# Lowest `tr_game` reached on each drive; below 30 counts as the game ending.
ccb = comp.groupby('drive_id')['tr_game'].min().reset_index()
eofg = ccb[ccb['tr_game'] < 30]
eofg_ids = eofg['drive_id'].tolist()

# Among the drives that are left, a lowest `tr_half` below 30 counts as the
# half ending.
comp = comp[~comp['drive_id'].isin(eofg_ids)]
ccb = comp.groupby('drive_id')['tr_half'].min().reset_index()
eofh = ccb[ccb['tr_half'] < 30]
eofh_ids = eofh['drive_id'].tolist()

# Whatever remains had time on the clock.
comp = comp[~comp['drive_id'].isin(eofh_ids)]
down_ids = comp['drive_id'].tolist()

# Write the new labels back onto every row carrying one of the collected ids.
for drive_ids, new_result in (
    (eofg_ids, 'END OF GAME'),
    (eofh_ids, 'END OF HALF'),
    (down_ids, 'TURNOVER ON DOWNS'),
):
    df.loc[df['drive_id'].isin(drive_ids), 'drive_result'] = new_result

In [36]:
# Same treatment for drives whose result is recorded as 'PENALTY': end of game
# or end of half when barely any time is left, otherwise a turnover on downs.
pens = df[df['drive_result'] == 'PENALTY']

# Lowest `tr_game` reached on each drive; below 30 counts as the game ending.
pcb = pens.groupby('drive_id')['tr_game'].min().reset_index()
eofg = pcb[pcb['tr_game'] < 30]
eofg_ids = eofg['drive_id'].tolist()

# Among the drives that are left, a lowest `tr_half` below 30 counts as the
# half ending.
pens = pens[~pens['drive_id'].isin(eofg_ids)]
pcb = pens.groupby('drive_id')['tr_half'].min().reset_index()
eofh = pcb[pcb['tr_half'] < 30]
eofh_ids = eofh['drive_id'].tolist()

# Whatever remains had time on the clock.
pens = pens[~pens['drive_id'].isin(eofh_ids)]
down_ids = pens['drive_id'].tolist()

# Write the new labels back onto every row carrying one of the collected ids.
for drive_ids, new_result in (
    (eofg_ids, 'END OF GAME'),
    (eofh_ids, 'END OF HALF'),
    (down_ids, 'TURNOVER ON DOWNS'),
):
    df.loc[df['drive_id'].isin(drive_ids), 'drive_result'] = new_result

In [12]:
# Drives whose result is recorded as 'KICKOFF': if barely any time remains in
# the half they become 'END OF HALF'. The rest look like stray plays split off
# from other drives, so they are dropped.
kos = df[df['drive_result'] == 'KICKOFF']

# Lowest `tr_half` reached on each kickoff drive; 60 or less counts as the
# half ending.
kgb = kos.groupby('drive_id')['tr_half'].min().reset_index()
eofh = kgb[kgb['tr_half'] <= 60]
eofh_ids = eofh['drive_id'].tolist()

# Every other kickoff drive is slated for removal.
kos = kos[~kos['drive_id'].isin(eofh_ids)]
drops = kos['drive_id'].tolist()

# Relabel first, then remove every row belonging to a dropped drive id.
df.loc[df['drive_id'].isin(eofh_ids), 'drive_result'] = 'END OF HALF'
df = df[~df['drive_id'].isin(drops)]

In [22]:
# Drop the rows labelled 'KICKOFF RETURN TD'; they're kickoff penalties.
# Rows with a missing drive_result compare as "not equal" and are kept.
rows_to_keep = df['drive_result'].ne('KICKOFF RETURN TD')
df = df[rows_to_keep]

In [41]:
# Number of rows with a recorded `down` under each drive_result label, shown
# as the cell output to see which labels still need cleaning up.
gb = df['down'].groupby(df['drive_result']).count()
gb

drive_result
BLOCKED FG                 137
BLOCKED PUNT               138
BLOCKED PUNT TD            120
DOWNS                    51256
END OF GAME              34072
END OF HALF              32698
FG GOOD                 239813
FG MISSED                88936
FUMBLE                   69659
FUMBLE TD                 4028
INT                      93414
INT TD                    6712
MISSED FG TD               434
PASSING TD              158987
PASSING TD TD              153
PUNT                    619434
PUNT RETURN TD            2721
PUNT RETURN TD TD           44
PUNT TD                   1017
Punt                        48
RUSH                      1855
RUSHING TD              173461
RUSHING TD TD               55
SACK                       400
SF                        2073
TD                      212689
TIMEOUT                    148
TURNOVER ON DOWNS        73951
TURNOVER ON DOWNS TD       133
Uncategorized             9666
Name: down, dtype: int64

In [45]:
# Pull out the plays labelled 'RUSHING TD TD', ordered by drive and then by
# `tr_game`, and print who had the ball and what happened on each play.
is_rushing_td_td = df['drive_result'] == 'RUSHING TD TD'
cat = df[is_rushing_td_td].sort_values(by=['drive_id', 'tr_game'])
print(cat[['drive_id', 'offense', 'play_text']])

          drive_id     offense  \
29625  24276002413  Washington   
29627  24276002413  Washington   
29628  24276002413  Washington   
29629  24276002413  Washington   
29630  24276002413  Washington   
29631  24276002413  Washington   
29632  24276002413  Washington   
29633  24276002413  Washington   
29634  24276002413  Washington   
29635  24276002413  Washington   
29636  24276002413  Washington   
29637  24276002413  Washington   
29638  24276002413  Washington   
29639  24276002413  Washington   
29640  24276002413    Stanford   
29641  24276002413    Stanford   
29642  24276002413    Stanford   
29643  24276002413    Stanford   
60154  24311026408  Washington   
60155  24311026408  Washington   
60156  24311026408  Washington   
60157  24311026408  Washington   
60158  24311026408  Washington   
60159  24311026408  Washington   
60160  24311026408  Washington   
62642  24318000805    Arkansas   
62643  24318000805    Arkansas   
62644  24318000805    Arkansas   
62645  2431800

In [None]:
# 'RUSHING TD TD' is often two drives that share one drive id. For each shared
# id listed here, the plays run by the named offense are moved onto a new id
# (the old id with "00" appended) so the two drives can be told apart.
shared_id_fixes = [
    # (shared drive id, offense to move, new drive id)
    (24318000805, 'Ole Miss', 2431800080500),
    (24339003820, 'Colorado', 2433900382000),
    (24276002413, 'Stanford', 2427600241300),
]

for shared_id, team, new_id in shared_id_fixes:
    team_plays = (df['drive_id'] == shared_id) & (df['offense'] == team)
    df.loc[team_plays, 'drive_id'] = new_id




In [43]:
# Print the full text of every play in `cat`, one per line, so nothing is
# truncated the way it is in the tabular printout.
for play_description in cat['play_text'].tolist():
    print(play_description)

Kenny James (UW) rushed right side for a 1 yard touchdown.
James Sims Jr. (UW) rushed up the middle for 3 yards.
Kenny James (UW) rushed right side for 6 yards.
Huskies timeout; 05:48 remaining 2nd quarter
Carl Bonnell (UW) pass right side complete to Sonny Shackelford (UW) for 4 yards.
Carl Bonnell (UW) pass incomplete to the left side.
Carl Bonnell (UW) pass right side complete to Jon Lyon (UW) for 14 yards.
Carl Bonnell (UW) rushed right side for 7 yards.
James Sims Jr. (UW) rushed up the middle for 6 yards.
James Sims Jr. (UW) rushed up the middle for 3 yards.
Kenny James (UW) rushed up the middle for 6 yards.
James Sims Jr. (UW) rushed up the middle for 6 yards.
5 yard penalty on Huskies.
Kenny James (UW) rushed up the middle for 13 yards.
Trent Edwards (STAN) pass incomplete across the middle.; turnover on downs.
Trent Edwards (STAN) pass right side complete to Greg Camarillo (STAN) for 7 yards.
Trent Edwards (STAN) pass incomplete to the right side.
Kenneth Tolon (STAN) rushed u