In [22]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
import datetime
gc.collect()

from tqdm import tqdm

# Load Data

In [23]:
# load play by play & drive data
years = list(range(2005, int(datetime.datetime.now().year)))

df = pd.DataFrame()
for year in tqdm(years):
    path = './output/'+str(year)+'/'+str(year)+'_pbp.csv'
    sea_df = pd.read_csv(path)
    
    drive_path = './output/'+str(year)+'/'+str(year)+'_drives.csv'
    drive_df = pd.read_csv(drive_path)
    
    drive_df = drive_df.rename(columns={'id':'drive_id'})
    
    sea_df = pd.merge(left=sea_df, right=drive_df, how='left', on=['drive_id','drive_id'])
    df = pd.concat([df,sea_df])

num_plays = len(df)
print(df.shape)
print(str(num_plays) + " plays were loaded")

100%|██████████| 14/14 [00:27<00:00,  2.67s/it]

(2031893, 41)
2031893 plays were loaded





In [24]:
# offense_x seems to be correct while offense_y is not

df = df.drop(columns=['defense_y','defense_conference_y','offense_y','offense_conference_y'])

df = df.rename(columns={'defense_x':'defense','defense_conference_x':'defense_conference','offense_x':'offense','offense_conference_x':'offense_conference'})


In [25]:
# fix yard_line, it's w.r.t the home team
df = df.rename(columns={'yard_line':'wrong_yardline'})

df['yard_line'] = np.where(df['offense']==df['home'],df['wrong_yardline'],100-df['wrong_yardline'])
# print(df[['home','offense','yard_line','wrong_yardline']].head(50))
df = df.drop(columns=['wrong_yardline'])

In [26]:
df['alt_game_id'] = df['game_id'].copy().astype(str)
df['alt_drive_id'] = df['drive_id'].copy().astype(str)

def replace_id(x,y):
    return x.replace(y,'')
df['alt_drive_id'] = df.apply(lambda row: replace_id(row['alt_drive_id'], row['alt_game_id']), axis=1)

#  strip leading zeros from 1-9 drive numbers
df['alt_drive_id'] = df['alt_drive_id'].str.lstrip("0")

# this also eliminates drive "zeros", so replace empty space with zero
df['alt_drive_id'] = df['alt_drive_id'].replace(r'^\s*$', '0', regex=True)

# print(df.groupby(['alt_drive_id'])['down'].count().sort_values(ascending=False))

## Special Teams

Overtimes are for future work, doing point afters last here (or maybe not at all)

In [27]:
# print(len(df))
pat = ['2pt Conversion','Offensive 1pt Safety','Defensive 2pt Conversion','Extra Point Good','Extra Point Missed']

df = df.loc[~df.play_type.isin(pat)]


# kos = ['Kickoff','Kickoff Return (Offense)','Kickoff Return Touchdown']
print(len(df))

1986166


In [28]:
# also drop overtime

df = df.loc[(df.period > 0) & (df.period <= 4)]
print(len(df))


1979243


Quickly, I want to clean/aggregate kickoffs and get that out of the way

In [29]:
# df.loc[df.play_type=='Kickoff Return Touchdown'].play_text
df.loc[df['play_type']=='Kickoff Return (Offense)','play_type'] = 'Kickoff'

krts = df.loc[(df['play_type']=='Kickoff') & (df['play_text'].str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

krts = krts.loc[~krts['play_text'].str.contains('TTD')]

base = r'^{}'
expr = '(?=.*{})'
words = ['penalty', 'accepted']
called_back = base.format(''.join(expr.format(w) for w in words))

krts = krts.loc[~krts.play_text.str.contains(called_back,regex=True)]

# no fumbles
fkrt = krts.loc[krts.play_text.str.contains('fumbled')]
# this play isn't a fumble return touchdown 
fkrt = fkrt.loc[~fkrt.play_text.str.contains('Cole Lerch')]
# fumble return touchdowns
frt_ids = list(fkrt.id.values)

# kick return touchdowns
krt_ids = list(krts.loc[~krts['id'].isin(frt_ids)].id.values)


df.loc[df.id.isin(frt_ids), 'play_type'] = 'Fumble Return Touchdown (Kick Team)'
df.loc[df.id.isin(krt_ids), 'play_type'] = 'Kickoff Return Touchdown'



In [30]:
# drives with two kickoffs probably have a penalty on one kickoff
# gb = df.groupby(['drive_id','play_type'])['id'].count().reset_index()

# two_kos_ids = list(gb.loc[(gb['play_type']=='Kickoff')&(gb['id']>1)].drive_id.values)
# two_kos = df.loc[df.drive_id.isin(two_kos_ids)]
# tkos = two_kos.loc[two_kos.play_type=='Kickoff']

# some duplicate kickoffs
# print(len(tkos))
# tkos = tkos.drop_duplicates(subset=['drive_id','offense','clock.minutes','clock.seconds'])
# print(len(tkos))

# tkos = tkos.dropna(subset=['play_text'])
# subset = tkos.loc[tkos.play_text.str.contains('penalty')]
# print(len(subset))
# # subset[['drive_id','offense','clock.minutes','clock.seconds','play_text']]
# for pt in list(subset.play_text.values):
#     print(pt)

In [31]:
onsides = df.loc[(df.play_type=='Kickoff')&(df.play_text.str.contains('on-side'))]
os_ids = list(onsides.id.values)
df.loc[df['id'].isin(os_ids), 'play_type'] = 'Onside Recovery'


## need further cleaning to figure out who recovered the onside probably 
## below was my first crack at it
# df['next_play_offense'] = df['offense'].copy().shift(-1).fillna('none')
# df['prev_play_offense'] = df['offense'].copy().shift(1).fillna('none')
# df[['next_play_offense','prev_play_offense','offense']]

# onside_ids = list(onsides.id.values)
# df['onside'] = np.where(df['id'].isin(onside_ids), 1, 0)

# # if the onside kick is on the first play of the game, it's always assumed to be recovered by the Receiving Team (I'm being lazy)
# df.loc[(df['onside']==1)&(df['prev_play_offense']==df['next_play_offense']), 'play_type'] = 'Onside Recovery (Kick Team)'
# df.loc[(df['onside']==1)&(df['prev_play_offense']!=df['next_play_offense']), 'play_type'] = 'Onside Recovery (Rec Team)'


In [32]:
# gb = df.groupby(['drive_id','play_type'])['id'].count().reset_index()
# gb

## Clock

Clock data is unreliable because maybe 25% of the games have only have one time for each play, and that time is when the drive started. I played with trying to predict time per play based on play type, but the data was very messy. So I decided to get the total time of each drive, and then assume each play took the same amount of time. EPA shouldn't be significantly affected most of the time, i.e. a 70 yard pass will be considered a good play no matter what. The only time it might have an adverse effect is toward the end of a game, when seconds matter. I think that in college football, when the clock stops for a first down, and incompletions, that all pass plays probably do take a somewhat similar amount of time. Drives in this situation will consist mostly of the same play type, and plays of the same play type likely take similar amounts of time. I'll compare it to the clock data I do have to make sure.

In [33]:
# fix clock data first so drives can be figured out
time_cols = ['clock.minutes','clock.seconds','start_time.minutes','start_time.seconds',
            'end_time.minutes','end_time.seconds']
for tc in time_cols:
    df[tc] = df[tc].fillna(0)

# get time remaining in game
df['tr_game'] = (4-df['period']) * 900 + (df['clock.minutes'] * 60) + df['clock.seconds']
df['tr_half'] = np.where(df['period']>2,df['tr_game'], df['tr_game']-1800)

df = df.drop(columns=['clock.minutes','clock.seconds'])

In [34]:
# fill empties
df['elapsed.minutes'] = df['elapsed.minutes'].copy().fillna(0)
df['elapsed.seconds'] = df['elapsed.seconds'].copy().fillna(0)
df['drive_time'] = 60*df['elapsed.minutes'] + df['elapsed.seconds']

In [35]:
# a lot of those drive times are negative... and other problems. so here's an alt drive time
# alt clock

# these get the start and end time of every drive
maxs = df.groupby(['game_id','drive_id'])['tr_game'].max().reset_index()
mins = df.groupby(['game_id','drive_id'])['tr_game'].min().reset_index()
maxs = maxs.rename(columns={'tr_game':'drive_start'})
mins = mins.rename(columns={'tr_game':'drive_end'})

# sometimes the drive end time is the same as the drive start. in that case, I use the next drive start
maxs = maxs.sort_values(by=['game_id','drive_start'],ascending=False)
next_max = maxs.groupby(['game_id'])['drive_start'].shift(-1)
next_max = pd.Series(next_max, name='next_drive_start')
new_max = pd.concat([maxs, next_max], axis=1)
new_max['next_drive_start'] = new_max['next_drive_start'].fillna(0)

# sometimes (rarely, 2%ish of the time) both the next drive start and the drive end are the same as the drive start
# i can explore this more in future work
# i'm fairly sure most of the time it's when a timeout or something divides the same drive into two.

mins = mins.sort_values(by=['game_id','drive_end'],ascending=False)
next_min = mins.groupby(['game_id'])['drive_end'].shift(-1)
next_min = pd.Series(next_min, name='next_drive_end')
new_min = pd.concat([mins, next_min], axis=1)
new_min['next_drive_end'] = new_min['next_drive_end'].fillna(0)
new_min = new_min.drop(columns='game_id')
times = pd.merge(left=new_max,right=new_min,on=['drive_id','drive_id'],how='left')


# attempt 1 (works on ~95.5% of data)
times['drive_time_1'] = times['drive_start']-times['next_drive_start']
# plan B (95.8% of data)
times['drive_time'] = np.where(times['drive_time_1']>0,times['drive_time_1'],(times['drive_start']-times['drive_end']))
# last resort (didn't implement)
# times['drive_time'] = np.where(times['drive_time_2']>0,times['drive_time_2'],(times['drive_start']-times['next_drive_end']))

not_good = times.loc[times.drive_time<=0]
print(len(not_good))

good = times.loc[times.drive_time>0]
print(len(good))

print(good.drive_time.mean())

times = times[['drive_id','drive_time']]
times = times.rename(columns={'drive_time':'alt_drive_time'})


11340
277556
145.28465606940583


In [36]:
df = pd.merge(left=df,right=times,how='left',on=['drive_id','drive_id'])

In [37]:
# longest drive in CFB history is 882. so need to drop anything above 900
# also drop anything below or equal to 0

df['correct_time_1'] = np.where(df['drive_time'] > 0, df['drive_time'], df['alt_drive_time'])

df = df.loc[df['correct_time_1'] > 0]

df['correct_drive_time'] = np.where(df['drive_time'] < 900, df['drive_time'], df['alt_drive_time'])

df = df.loc[df['correct_drive_time'] < 900]

print(len(df))

print("correlation between primary and approximate drive time")
print(df[['drive_time','alt_drive_time','correct_drive_time']].corr())

df = df.drop(columns=['drive_time','alt_drive_time','correct_time_1'])
df = df.rename(columns={'correct_drive_time':'drive_time'})

1958149
correlation between primary and approximate drive time
                    drive_time  alt_drive_time  correct_drive_time
drive_time            1.000000        0.854848            0.991366
alt_drive_time        0.854848        1.000000            0.862340
correct_drive_time    0.991366        0.862340            1.000000


In [38]:
print(len(df))
df = df.dropna(subset=['play_text'])
print(len(df))

1958149
1957877


TODO: examine drives so that end of half and end of game drives are the only back-to-back drives that can result in same team possessing the ball. Also muffed punts fall into this category.

In [39]:
gb = df.groupby(['game_id','drive_id','alt_drive_id'])['offense'].nunique().reset_index()
gb = gb.rename(columns={'offense':'num_offenses'})
gb = gb.groupby(['drive_id','num_offenses'])['game_id'].count().reset_index()
print(gb.groupby(['num_offenses'])['game_id'].count())
# about 18% of drives have two offenses
# most of these are drives that include kickoffs with the offense & defense reverse
# i'll choose to make the return team the offense
two_os = gb.loc[gb.num_offenses == 2]
to_ids = list(two_os.drive_id.unique())

num_offenses
1    231865
2     51873
Name: game_id, dtype: int64


In [40]:
kos = df.loc[(df.drive_id.isin(to_ids))]


kos = kos.loc[kos.play_type=='Kickoff']
wrong_teams = list(kos.id.values)

df = df.rename(columns={'offense':'wrong_offense','defense':'wrong_defense'})

df['offense'] = np.where(df['id'].isin(wrong_teams),df['wrong_defense'],df['wrong_offense'])
df['defense'] = np.where(df['id'].isin(wrong_teams),df['wrong_offense'],df['wrong_defense'])


df = df.drop(columns=['wrong_offense','wrong_defense'])



In [41]:
# switching kickoffs fixes vast majority
gb = df.groupby(['game_id','drive_id','alt_drive_id'])['offense'].nunique().reset_index()
gb = gb.rename(columns={'offense':'num_offenses'})
gb = gb.groupby(['drive_id','num_offenses'])['game_id'].count().reset_index()
print(gb.groupby(['num_offenses'])['game_id'].count())


two_os = gb.loc[gb.num_offenses == 2]
to_ids = list(two_os.drive_id.unique())



num_offenses
1    281883
2      1855
Name: game_id, dtype: int64


In [42]:
# can switch "no play" penalties as well
tos = df.loc[(df.drive_id.isin(to_ids))]

tos.loc[:,'drive_id'] = tos['drive_id'].astype(str)
tos.loc[:,'comb_id'] = tos['offense'].copy()+tos['drive_id'].copy()

gb = tos.groupby(['drive_id','offense'])['down'].count().reset_index()
gb.loc[:,'comb_id'] = gb['offense'].copy()+gb['drive_id'].copy()

op = gb.loc[gb['down']==1]

one_plays = list(op.comb_id.values)

switch = tos.loc[(tos['comb_id'].isin(one_plays))&(tos['play_type']=='Penalty')]
wrong_teams = list(switch.id.values)

df = df.rename(columns={'offense':'wrong_offense','defense':'wrong_defense'})

df['offense'] = np.where(df['id'].isin(wrong_teams),df['wrong_defense'],df['wrong_offense'])
df['defense'] = np.where(df['id'].isin(wrong_teams),df['wrong_offense'],df['wrong_defense'])


df = df.drop(columns=['wrong_offense','wrong_defense'])

In [43]:

gb = df.groupby(['game_id','drive_id','alt_drive_id'])['offense'].nunique().reset_index()
gb = gb.rename(columns={'offense':'num_offenses'})
gb = gb.groupby(['drive_id','num_offenses'])['game_id'].count().reset_index()

two_os = gb.loc[gb.num_offenses == 2]
to_ids = list(two_os.drive_id.unique())
df['multiple_offenses'] = np.where(df['drive_id'].isin(to_ids),1,0)


In [None]:
# attempting to fix drives with multiple offenses sucks.

mos = df.loc[df.multiple_offenses == 1].copy()
# mos[['drive_id','play_type','offense','last_offense','last_di','tr_game','off_switch']].head(50)
# some of the problems are end of periods and end of halfs
bad = ['End Period','End of Half']
mos = mos.loc[~mos.play_type.isin(bad)]

# drop drives that only have one now
gb = mos.groupby(['drive_id'])['offense'].nunique().reset_index()
gb = gb.loc[gb.offense==1]
# end of half or end of period drive ids
gb_ids = list(gb.drive_id.unique())
mos = mos.loc[~mos.drive_id.isin(gb_ids)]

mos = mos.sort_values(by=['drive_id','tr_game','offense'],ascending=False)
mos['last_offense'] = mos['offense'].shift(1)
mos['last_di'] = mos['drive_id'].shift(1).fillna(0).astype(int)
# mark every time the offense switches with a 1
mos['off_switch'] = np.where(((mos['offense']!=mos['last_offense'])&(mos['drive_id']==mos['last_di'])),1,0)


gb = mos.groupby(['drive_id'])['off_switch'].sum().reset_index()
targs = gb.loc[gb['off_switch']==1]

# focus on drives with just one switch between offenses
targ_ids = list(targs.drive_id.values)
can_fix = mos.loc[mos.drive_id.isin(targ_ids)]


df.loc[:,'mo_identifier'] = df['drive_id'].astype(str) + '_' + df['offense']
can_fix.loc[:,'mo_identifier'] = can_fix['drive_id'].astype(str) + '_' + can_fix['offense']

cf_ids = list(can_fix.loc[can_fix['off_switch']==1].mo_identifier.values)
# need to make drive results uncategorized
all_cf_ids = list(can_fix.drive_id.values)

df.loc[df.drive_id.isin(all_cf_ids), 'drive_result'] = 'Uncategorized'

df['alt_drive_id'] = df['alt_drive_id'].astype(float)
df['drive_id'] = np.where(df['mo_identifier'].isin(cf_ids),df['drive_id']+0.5, df['drive_id'])
df['alt_drive_id'] = np.where(df['mo_identifier'].isin(cf_ids),df['alt_drive_id']+0.5, df['alt_drive_id'])



In [None]:
# can attempt to figure out other multiple offenses later
# timeouts seem to be an issue, try switching them

mos = mos.loc[~mos.drive_id.isin(all_cf_ids)]

mos.groupby('play_type')['down'].count()
mos = mos.rename(columns={'offense':'wrong_offense','defense':'wrong_defense'})
mos['offense'] = np.where(mos['play_type']=='Timeout', mos['wrong_defense'], mos['wrong_offense'])
mos['defense'] = np.where(mos['play_type']=='Timeout', mos['wrong_offense'], mos['wrong_defense'])

mos = mos.drop(columns=['wrong_offense','wrong_defense'])

gb = mos.groupby(['drive_id'])['offense'].nunique().sort_values().reset_index()



In [None]:
ones = mos.loc[mos.drive_id.isin(list(gb.loc[gb.offense==1].drive_id.values))]

targ_ids = list(ones.drive_id.values)
# print(ones[['drive_id','offense','play_type']].head(5))

# go ahead and fix these
df = df.rename(columns={'offense':'wrong_offense','defense':'wrong_defense'})
df['offense'] = np.where(((df['drive_id'].isin(targ_ids))&(df['play_type']=='Timeout')),df['defense'],df['offense'])
df['defense'] = np.where(((df['drive_id'].isin(targ_ids))&(df['play_type']=='Timeout')),df['offense'],df['defense'])

# print(len(df.loc[df['drive_id'].isin(targ_ids)]))


In [None]:
mos = mos.loc[mos.drive_id.isin(list(gb.loc[gb.offense!=1].drive_id.values))]

#mark these that I havent fixed with a 1
mos_ids = list(mos.drive_id.values)

df['multiple_offenses'] = np.where(df['drive_id'].isin(mos_ids),1,0)

# mos['last_offense'] = mos['offense'].shift(1)
# mos['last_di'] = mos['drive_id'].shift(1).fillna(0).astype(int)
# # mark every time the offense switches with a 1
# mos['off_switch'] = np.where(((mos['offense']!=mos['last_offense'])&(mos['drive_id']==mos['last_di'])),1,0)


# # mos[['drive_id','offense','tr_game','play_type','off_switch','play_text']]

# len(mos.loc[(mos.play_type=='Kickoff')&(mos.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))])

# some objectives

1) fix "uncategorized" play type  
2) aggregate, clean, validate all play types  
3) fix "uncategorized" drive results  
4) aggregate, clean, validate all drive results  
5) compare play types to drive results to make sure they match

In [19]:
# fixing uncategorized play types

base = r'^{}'
expr = '(?=.*{})'
words = ['End', 'of', 'Half']
end_period = base.format(''.join(expr.format(w) for w in words))

df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains(end_period,regex=True)), 'play_type'] = 'End of Half'

words = ['End', 'of', 'Quarter']
end_period = base.format(''.join(expr.format(w) for w in words))

df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains(end_period,regex=True)), 'play_type'] = 'End of Half'

words = ['fumbled','run', 'for']
fumbles = base.format(''.join(expr.format(w) for w in words))

df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains(fumbles,regex=True)), 'play_type'] = 'Fumble Recovery (Own)'

df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains('Penalty')), 'play_type'] = 'Penalty'


In [20]:
# fix individual
# many of the ones left are fumbles, and then something
df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains('recovered by UTEP Aaron Jones')), 'play_type'] = 'Penalty'

df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains('intercepted')), 'play_type'] = 'Pass Interception'

df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains('SAFETY')), 'play_type'] = 'Safety'

words = ['fumbled','pass', 'complete']
complete = base.format(''.join(expr.format(w) for w in words))
df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains(complete,regex=True)), 'play_type'] = 'Pass Completion'

words = ['TD','punt', 'blocked']
td_pb = base.format(''.join(expr.format(w) for w in words))
df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains(td_pb,regex=True)), 'play_type'] = 'Blocked Punt Touchdown'


words = ['run','for', 'TD']
run_td = base.format(''.join(expr.format(w) for w in words))
df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains(run_td,regex=True)), 'play_type'] = 'Rushing Touchdown'

df.loc[(df.play_type=='Uncategorized')&(df.down==4), 'play_type'] = 'Punt'

words = ['return','for', 'TD']
fumb_td = base.format(''.join(expr.format(w) for w in words))
df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains(fumb_td,regex=True)), 'play_type'] = 'Fumble Return Touchdown'

df.loc[(df.play_type=='Uncategorized')&df.play_text.str.contains('return for'), 'play_type'] = 'Fumble Recovery (Opponent)'

df.loc[(df.play_type=='Uncategorized')&df.play_text.str.contains('run for'), 'play_type'] = 'Rush'

print("how many uncategorized plays are left?")
print(len(df.loc[df['play_type']=='Uncategorized']))


how many uncategorized plays are left?
0


In [21]:
print(df.loc[df['play_type']=='Uncategorized'].play_text)

Series([], Name: play_text, dtype: object)


So, the next objective is to aggregate/clean all play types. I can mostly do that at this point. However, I'd like to try to account for every possession transition first, because this helps identify things like which team recovered a fumble. The only time when two consecutive drive ids should have the same offense is 1) end of half and 2) muffed punts

In [41]:
gb = df.groupby(['game_id','drive_id','alt_drive_id'])['offense','drive_result','multiple_offenses'].last().reset_index()
gb['alt_drive_id'] = gb['alt_drive_id'].astype(int)
gb = gb.sort_values(by=['game_id','alt_drive_id'],ascending=True)

In [42]:
gb['last_offense'] = gb['offense'].shift(1)
gb['prev_drive_result'] = gb['drive_result'].shift(1)
gb.loc[gb['alt_drive_id']==1, 'last_offense'] = 'Start of Game'
gb.loc[gb['alt_drive_id']==1, 'prev_drive_result'] = 'Start of Game'

In [43]:
bad = gb.loc[(gb['offense'] == gb['last_offense'])]
print(len(bad))
bad = bad.loc[gb['prev_drive_result'] != 'END OF HALF']
bad = bad.loc[gb['prev_drive_result'] != 'END OF HALF TD']
print(len(bad))

KeyboardInterrupt: 

In [None]:
# let's examine some of these problems

bad_ids = list(bad.drive_id.unique())
bdf = df.loc[df.drive_id.isin(bad_ids)]


In [None]:
print(bdf.groupby(['drive_result'])['down'].count())

In [None]:
# looked up the kicker on this one play -- it was a muffed kickoff
df.loc[df.drive_result=='NETRCV', 'drive_result'] = 'KICKOFF FUMBLE RECOVERY (KICK TEAM)'

# looked up this guy too
df.loc[df.id==400934568104999803, 'drive_result'] = 'ONSIDE KICK RECOVERY (KICK TEAM)'

In [None]:
# so the biggie here is identifying muffed punts
punts = bdf.loc[(bdf.play_type=='Punt')|(bdf.play_text.str.contains('punt'))]
muffed_punts = punts.loc[punts.play_text.str.contains('fumble')]

muff_ids = list(muffed_punts.drive_id.unique())
df.loc[df.drive_id.isin(muff_ids), 'drive_result'] = 'MUFFED PUNT (PUNT TEAM REC)'

In [None]:
# drop drives that are the last drive in the half.
end_of_half = bdf.loc[bdf.play_text=='End of 2nd Quarter']
eoh_ids = list(end_of_half.drive_id.values)
bad = bad.loc[~bad.drive_id.isin(eoh_ids)]

In [None]:
bad_ids = list(bad.drive_id.unique())
bdf = df.loc[df.drive_id.isin(bad_ids)]


In [None]:
bdf.groupby(['drive_result'])['down'].count()

In [None]:

# for pt in list(bdf.loc[bdf.multiple_offenses==1].play_text.values):
#     print(pt)


In [None]:
# these plays don't have anything important
nah_part_2 = ['End Period','End of Half','End of Game']
bs = df.loc[df.play_type.isin(nah_part_2)&df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD')]
del bs
df = df.loc[~df.play_type.isin(nah_part_2)]

In [None]:
# standardize
df.loc[df.play_type=='Interception', 'play_type'] = 'Pass Interception'
df.loc[df.play_type=='Pass Interception Return', 'play_type'] = 'Pass Interception'

df.loc[df.play_type=='Pass Reception', 'play_type'] = 'Pass Completion'



In [None]:
gb = df.groupby(['play_type'])['down'].count()
gb

1) fix uncategorized play type (check)  
2) clean/validate/aggregate play types


In [None]:
# start with the top: blocked FG. make sure none are touchdowns
bfg = df.loc[(df.play_type=='Blocked Field Goal')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

print("There are a few!")
print(len(bfg))
print("All are defensive scores")

df.loc[((df.play_type=='Blocked Field Goal')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))), 'play_type']='Blocked Field Goal Touchdown'

del bfg

In [None]:
# same thing for blocked punt
bp = df.loc[(df.play_type=='Blocked Punt')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

print(len(bp))
print("punts to fix")

# pretty safe to assume these are all defensive
df.loc[((df.play_type=='Blocked Punt')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))), 'play_type']='Blocked Punt Touchdown'

print('fixed')
del bp

In [None]:
# check Fumble Recovery (Opponent)
fro = df.loc[(df.play_type=='Fumble Recovery (Opponent)')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

print(len(fro))
print("fumble recoveries that were TDs")

# i think all but 1 or two are defensive, don't know a good way to sort those out
df.loc[(df.play_type=='Fumble Recovery (Opponent)')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD')), 'play_type'] = 'Fumble Return Touchdown'

del fro


In [None]:
fro = df.loc[(df.play_type=='Fumble Recovery (Own)')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

print(len(fro))
print("own fumble recoveries that were TDs")

df.loc[(df.play_type=='Fumble Recovery (Own)')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD')), 'play_type'] = 'Rushing Touchdown'

# missed field goal returns are gucci

In [None]:
# divide 'pass' play type into subcategories
pa = df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

#TTD is a team abbreviation that gets picked up
pa = pa.loc[(~pa.play_text.str.contains('intercepted|Intercepted|INTERCEPTED'))&
            (~pa.play_text.str.contains('fumbled'))&
            (~pa.play_text.str.contains('Penalty|PENALTY|penalty'))&
            (~pa.play_text.str.contains('TTD'))]

pa_ids = list(pa.id.values)

df.loc[df.id.isin(pa_ids), 'play_type'] = 'Passing Touchdown'

In [None]:
pa = df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

pa = pa.loc[((pa.play_text.str.contains('Penalty|PENALTY|penalty'))&
        (pa.play_text.str.contains('ACCEPTED|accepted|Accepted')))]

pa_ids = list(pa.id.values)
df.loc[df.id.isin(pa_ids), 'play_type'] = 'Penalty'


In [None]:
pa = df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

pa = pa.loc[pa.play_text.str.contains('intercepted|Intercepted|INTERCEPTED')]

pa_ids = list(pa.id.values)
df.loc[df.id.isin(pa_ids), 'play_type'] = 'Interception Return Touchdown'

In [None]:
pa = df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

pa = pa.loc[pa.play_text.str.contains('fumbled')]

fbr_ids = list(pa.loc[pa.drive_result=='FUMBLE RETURN TD'].id.values)
df.loc[df.id.isin(fbr_ids),'play_type'] = 'Fumble Return Touchdown'

fbr_ids = list(pa.loc[pa.drive_result=='PASSING TD'].id.values)
df.loc[df.id.isin(fbr_ids),'play_type'] = 'Passing Touchdown'



In [None]:
pa = df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

pa = pa.loc[pa.play_text.str.contains('fumbled')]

# slightly guessing but i think it's right

ptd = pa.loc[pa.play_text.str.contains('pass complete')]
ptd_ids = list(ptd.id.values)
df.loc[df.id.isin(ptd_ids), 'play_type'] = 'Passing Touchdown'

pa = df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]
pa_ids = list(pa.id.values)
df.loc[df.id.isin(pa_ids), 'play_type'] = 'Fumble Return Touchdown'

del pa
gc.collect()

In [None]:
# change rushing tds categorized as 'rush' to rushing tds
rush = df.loc[(df.play_type=='Rush')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]
# for penalties that don't stop a touchdown
words = ['0 yard','accepted']
pens = base.format(''.join(expr.format(w) for w in words))
rush_tds = rush.loc[(~rush.play_text.str.contains('Penalty|PENALTY|penalty')) | (rush.play_text.str.contains('declined|DECLINED') | (rush.play_text.str.contains(pens)))]
rush_tds = rush_tds.loc[~rush.play_text.str.contains('fumbled')]
rtd_ids = list(rush_tds.id.values)
df.loc[df.id.isin(rtd_ids),'play_type'] = 'Rushing Touchdown'

del rush_tds
# rtd_ids = list(rush_tds.id.values)

In [None]:
# some fumble 6 rushes that need to be categorized as such

rush = df.loc[(df.play_type=='Rush')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]
words = ['fumbled','returned by']
ftds = base.format(''.join(expr.format(w) for w in words))
fumble_tds = rush.loc[(rush.play_text.str.contains(ftds))]

fbtd_ids = list(fumble_tds.id.values)
df.loc[df.id.isin(fbtd_ids),'play_type'] = 'Fumble Return Touchdown'

words = ['fumbled','loss of']
ftds = base.format(''.join(expr.format(w) for w in words))
fumble_tds = rush.loc[(rush.play_text.str.contains(ftds))]

fbtd_ids = list(fumble_tds.id.values)
df.loc[df.id.isin(fbtd_ids),'play_type'] = 'Fumble Return Touchdown'

# subset of fumble 6s always say 'to the {other team} 0' 
words = ['to the','0']
ftds = base.format(''.join(expr.format(w) for w in words))
fumble_tds = rush.loc[(rush.play_text.str.contains(ftds))]

fbtd_ids = list(fumble_tds.id.values)
df.loc[df.id.isin(fbtd_ids),'play_type'] = 'Fumble Return Touchdown'

df.loc[((df.play_type=='Rush')&(df.play_text.str.contains('penalty|PENALTY|Penalty'))), 'play_type'] = 'Penalty'



In [None]:
# hard to determine which fumbles go for offensive TD vs defensive TD from just play text
# lean on drive result

df.loc[(df.play_type=='Rush')&(df.drive_result=='RUSHING TD'), 'play_type'] = 'Rushing Touchdown'

df.loc[(df.play_type=='Rush')&(df.drive_result=='FUMBLE RETURN TD'), 'play_type'] = 'Fumble Return Touchdown'

# i verified these
df.loc[((df.play_type=='Rush')&(df.yard_line>90)&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))), 'play_type'] = 'Rushing Touchdown'

# rest seem to be defensive. might be one or two offensive that leaked through
df.loc[((df.play_type=='Rush')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))), 'play_type'] = 'Fumble Return Touchdown'
                               


In [None]:
clean = ['Pass','Rush']
sa = df.loc[(df.play_type.isin(clean))&(df.play_text.str.contains('Safety|safety|SAFETY'))]
sa_ids = list(sa.id.values)
print(len(sa))
df.loc[df.id.isin(sa_ids), 'play_type'] = 'Safety'

In [None]:
# divide up the "pass" play type
df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('incomplete')), 'play_type'] = 'Pass Incompletion'
df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('complete')), 'play_type'] = 'Pass Completion'
df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('intercepted')), 'play_type'] = 'Interception'
df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('sacked')), 'play_type'] = 'Sack'
    

In [None]:
pa = df.loc[(df.play_type=='Pass')]

# only things left are fumbles
# lean on drive result, tough to know who recovered fumble 

li = ['FUMBLE','Uncategorized']

fro = pa.loc[~pa.drive_result.isin(li)]
fro_ids = list(fro.id.values)
df.loc[df.id.isin(fro_ids), 'play_type'] = 'Fumble Recovery (Own)'

fum = pa.loc[pa.drive_result == 'FUMBLE']
fum_ids = list(fum.id.values)
df.loc[df.id.isin(fum_ids), 'play_type'] = 'Fumble Recovery (Opponent)'


In [None]:
# uncategorized
pa = df.loc[(df.play_type=='Pass')]

df.loc[df.id==253370030087, 'play_type'] = 'Fumble Recovery (Opponent)'
df = df.loc[df.id != 262590259175]


In [None]:
tds = ['FUMBLE RETURN TD','FUMBLE TD']

td = df.loc[(df.play_type == 'Sack') & (df.play_text.str.contains('fumbled')) & (df.drive_result.isin(tds))]
td_ids = list(td.id.values)

df.loc[df.id.isin(td_ids), 'play_type'] = 'Fumble Return Touchdown'
del tds

sa = df.loc[(df.play_type == 'Sack') & (df.play_text.str.contains('fumbled'))]

li = ['FUMBLE','Uncategorized']

fro = sa.loc[~sa.drive_result.isin(li)]
fro_ids = list(fro.id.values)
df.loc[df.id.isin(fro_ids), 'play_type'] = 'Fumble Recovery (Own)'

fum = sa.loc[sa.drive_result == 'FUMBLE']
fum_ids = list(fum.id.values)
df.loc[df.id.isin(fum_ids), 'play_type'] = 'Fumble Recovery (Opponent)'

sa = df.loc[(df.play_type=='Sack')&(df.play_text.str.contains('fumbled'))]

# only recovery by the defense
df.loc[df.id==322450120161, 'play_type'] = 'Fumble Recovery (Opponent)'

df.loc[(df.play_type=='Sack')&(df.play_text.str.contains('fumbled')), 'play_type'] = 'Fumble Recovery (Own)'

del sa

gc.collect()

In [None]:
td = df.loc[(df.play_type == 'Pass Completion') & (df.play_text.str.contains('fumbled')) & (df.drive_result=='FUMBLE TD')]

words = ['to the','0']
ftds = base.format(''.join(expr.format(w) for w in words))
fumble_tds = td.loc[(td.play_text.str.contains(ftds))]

fbtd_ids = list(fumble_tds.id.values)
df.loc[df.id.isin(fbtd_ids),'play_type'] = 'Fumble Return Touchdown'

# rest are just fumble recoveries
td = df.loc[(df.play_type == 'Pass Completion') & (df.play_text.str.contains('fumbled')) & (df.drive_result=='FUMBLE TD')]
td_ids = list(td.id.values)
df.loc[df.id.isin(td_ids),'play_type'] = 'Fumble Recovery (Opponent)'

del td

In [None]:
df.loc[(df.play_type == 'Pass Completion') & (df.play_text.str.contains('fumbled')) & (df.drive_result=='FUMBLE RETURN TD'), 'play_type'] = 'Passing Touchdown'


In [None]:
# only defensive returned touchdown in set
df.loc[df.id==272932393004, 'play_type'] = 'Fumble Return Touchdown'

pa = df.loc[(df.play_type == 'Pass Completion') & (df.play_text.str.contains('fumbled'))]

fumbles = pa.loc[pa.drive_result=='Fumble']

fum_ids = list(fumbles.id.values)
df.loc[df.id.isin(fum_ids), 'play_type'] = 'Fumble Recovery (Opponent)'

pa = df.loc[(df.play_type == 'Pass Completion') & (df.play_text.str.contains('fumbled')) & (df.drive_result == 'Uncategorized')]



# individual fixes
fr_own = ['UTEP','Arkansas State','Colorado','Notre Dame','Iowa State','Penn St.','TCU','Florida Atlantic']

fr_o = pa.loc[pa.offense.isin(fr_own)]
fr_o_ids = list(fr_o.id.values)

df.loc[df.id.isin(fr_o_ids), 'play_type'] = 'Fumble Recovery (Own)'


fr_opp = pa.loc[~pa.offense.isin(fr_own)]
fr_opp_ids = list(fr_opp.id.values)
df.loc[df.id.isin(fr_opp_ids), 'play_type'] = 'Fumble Recovery (Opponent)'

In [None]:
df.loc[(df.play_type == 'Pass Completion') & (df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD')), 'play_type']='Passing Touchdown'


In [None]:
bfg = df.loc[(df.play_type=='Field Goal Missed')&(df.play_text.str.contains('Blocked|BLOCKED|blocked'))]

bfg_td = bfg.loc[bfg.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD')]

bfg_td_ids = list(bfg_td.id.values)

bfg = bfg.loc[~bfg.id.isin(bfg_td_ids)]
bfg_ids = list(bfg.id.values)

df.loc[df.id.isin(bfg_td_ids), 'play_type'] = 'Blocked Field Goal Touchdown'
df.loc[df.id.isin(bfg_ids), 'play_type'] = 'Blocked Field Goal'


In [None]:
# now punts

bfg = df.loc[(df.play_type=='Punt')&(df.play_text.str.contains('Blocked|BLOCKED|blocked'))]

bfg_td = bfg.loc[bfg.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD')]


bfg_td_ids = list(bfg_td.id.values)

bfg = bfg.loc[~bfg.id.isin(bfg_td_ids)]
bfg_ids = list(bfg.id.values)

df.loc[df.id.isin(bfg_td_ids), 'play_type'] = 'Blocked Punt Touchdown'
df.loc[df.id.isin(bfg_ids), 'play_type'] = 'Blocked Punt'


In [None]:
# punt return touchdowns

prtd = df.loc[(df.play_type=='Punt')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

# check for penalties
words = ['penalty','accepted']
called_back = base.format(''.join(expr.format(w) for w in words))
prtd = prtd.loc[~prtd.play_text.str.contains(called_back, regex=True)]

prtd = prtd.loc[~prtd.play_text.str.contains('fumbled|TTD')]
prtd = prtd.loc[~prtd.play_text.str.contains('NO PLAY')]

prtd_ids = list(prtd.id.values)

df.loc[df.id.isin(prtd_ids), 'play_type'] = 'Punt Return Touchdown'


In [None]:
# lastly, fix declined penalties
# seem to be mostly right except some rushes
# decline = df.loc[(df.play_type=='Penalty')&(df.play_text.str.contains('declined|DECLINED'))]
# print(len(decline))

# rush = decline.loc[(decline.play_text.str.contains('rush|rushed'))&~(decline.play_text.str.contain)]

# comp = decline.loc[decline.play_text.str.contains('pass complete')]

# incomp = decline.loc[decline.play_text.str.contains('incomplete')]

# for pt in list(decline.play_text.values):
#     print(pt)

1) fix "uncategorized" play type (check)
2) aggregate, clean, validate all play types (check)
3) fix "uncategorized" drive results
4) aggregate, clean, validate all drive results
5) compare play types to drive results to make sure they match

In [None]:
gb = df.groupby(['play_type'])['down'].count()
gb

In [None]:
# make expected drive results to compare to given drive result data
pass_tds = df.loc[df.play_type == 'Passing Touchdown']
ptd_ids = list(pass_tds.drive_id.unique())
del pass_tds

rush_tds = df.loc[df.play_type == 'Rushing Touchdown']
rtd_ids = list(rush_tds.drive_id.unique())
del rush_tds

fgg = df.loc[df.play_type == 'Field Goal Good']
fgg_ids = list(fgg.drive_id.unique())
del fgg

fgm = df.loc[df.play_type == 'Field Goal Missed']
fgm_ids = list(fgm.drive_id.unique())
del fgm

sf = df.loc[df.play_type == 'Safety']
sf_ids = list(sf.drive_id.unique())
del sf

intn = df.loc[df.play_type == 'Pass Interception']
int_ids = list(intn.drive_id.unique())
del intn

fropp = df.loc[df.play_type == 'Fumble Recovery (Opponent)']
fropp_ids = list(fropp.drive_id.unique())
del fropp

ftd = ['Missed Field Goal Return Touchdown','Blocked Field Goal Touchdown']
fg_td = df.loc[df.play_type.isin(ftd)]
ftd_ids = list(fg_td.drive_id.unique())
del fg_td

prtd = ['Punt Return Touchdown','Blocked Punt Touchdown']
punt_td = df.loc[df.play_type.isin(prtd)]
prtd_ids = list(punt_td.drive_id.unique())
del punt_td

punts = df.loc[df.play_type =='Punt']
punt_ids = list(punts.drive_id.unique())
del punts

df['exp_drive_result'] = np.nan

drs = [ptd_ids, rtd_ids,fgg_ids,fgm_ids,sf_ids,int_ids,fropp_ids,ftd_ids,prtd_ids,punt_ids]

df.loc[df.drive_id.isin(ptd_ids), 'exp_drive_result'] = 'PASSING TD'
df.loc[df.drive_id.isin(rtd_ids), 'exp_drive_result'] = 'RUSHING TD'
df.loc[df.drive_id.isin(fgg_ids), 'exp_drive_result'] = 'FG GOOD'
df.loc[df.drive_id.isin(fgm_ids), 'exp_drive_result'] = 'FG MISSED'
df.loc[df.drive_id.isin(sf_ids), 'exp_drive_result'] = 'SF'
df.loc[df.drive_id.isin(int_ids), 'exp_drive_result'] = 'INT'
df.loc[df.drive_id.isin(fropp_ids), 'exp_drive_result'] = 'FUMBLE'
df.loc[df.drive_id.isin(ftd_ids), 'exp_drive_result'] = 'FG MISSED TD'
df.loc[df.drive_id.isin(prtd_ids), 'exp_drive_result'] = 'PUNT RETURN TD'
df.loc[df.drive_id.isin(punt_ids), 'exp_drive_result'] = 'PUNT'

# print(len(df))
# most = df.dropna(subset=['exp_drive_result'])
# print(len(most))

# del most
gc.collect()

Time to clean/validate/aggregate drive results

In [None]:
gb = df.groupby(['drive_result'])['down'].count()
gb

In [None]:
targ = df.loc[(df.drive_result == 'PUNT') & (df.play_type=='Punt Return Touchdown')]
targ_ids = list(targ.drive_id.values)

df.loc[df.drive_id.isin(targ_ids), 'drive_result'] = 'PUNT RETURN TD'

# muffed punts recovered by return team

targ = df.loc[(df.exp_drive_result=='FUMBLE')&(df.drive_result == 'PUNT') & (df.play_type=='Punt')]
targ_ids = list(targ.drive_id.values)
df.loc[df.drive_id.isin(targ_ids), 'drive_result'] = 'PUNT'

# safeties that are called punts because of the punt afterward

targ = df.loc[(df.exp_drive_result=='PUNT')&(df.drive_result == 'SF')]
targ_ids = list(targ.drive_id.values)
df.loc[df.drive_id.isin(targ_ids), 'exp_drive_result'] = 'SF'

# weird 
targ = df.loc[(df.exp_drive_result=='FG GOOD')&(df.drive_result == 'FG MISSED') & (df.play_type=='Field Goal Good')]
targ_ids = list(targ.drive_id.values)
df.loc[df.drive_id.isin(targ_ids), 'drive_result'] = 'FG GOOD'

# counting punt returns and blocked punts in same category
targ = df.loc[(df.drive_result == 'PUNT')&(df.exp_drive_result=='PUNT RETURN TD')& (df.play_type=='Blocked Punt Touchdown')]
targ_ids = list(targ.drive_id.values)
df.loc[df.drive_id.isin(targ_ids), 'drive_result'] = 'PUNT RETURN TD'



In [None]:
compare = ['PASSING TD','RUSHING TD','FG GOOD','FG MISSED','SF','INT','FUMBLE','FG MISSED TD','PUNT RETURN TD','PUNT']

comp = df.loc[df.drive_result.isin(compare)].copy()

comp['match'] = np.where(comp['drive_result']==comp['exp_drive_result'],1,0)

print("What pct of drives sampled match our expected drive result?")
print(str(np.round((comp.match.sum()/len(comp))*100,1))+'%')

comp['exp_drive_result'] = comp['exp_drive_result'].fillna(comp['drive_result'])


# TTD abbreviation threw everything off
comp['exp_drive_result'] = np.where(comp['exp_drive_result'].str.contains('TTD'),comp['drive_result'],comp['exp_drive_result'])

comp['match'] = np.where(comp['drive_result']==comp['exp_drive_result'],1,0)



In [None]:
wrong = comp.loc[comp.match==0].copy()

print(wrong[['drive_result','exp_drive_result','play_type','drive_id']])
for pt in list(wrong.play_text.values):
    print(pt)

In [None]:
# drop games that are clearly off
gb = wrong.groupby(['game_id'])['drive_result'].count()
gb = gb.loc[gb > 15]
bad_games = list(gb.index.values)

df = df.loc[~df.game_id.isin(bad_games)]

"intercepted" - interceptions
"

In [None]:
# gb = df.groupby(['game_id','drive_id'])['offense','tr_game'].first().reset_index()

