In [122]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
import datetime
gc.collect()

from tqdm import tqdm

# Load Data

In [123]:
# load play by play & drive data
years = list(range(2005, int(datetime.datetime.now().year)))

df = pd.DataFrame()
for year in tqdm(years):
    path = './output/'+str(year)+'/'+str(year)+'_pbp.csv'
    sea_df = pd.read_csv(path)
    
    drive_path = './output/'+str(year)+'/'+str(year)+'_drives.csv'
    drive_df = pd.read_csv(drive_path)
    
    drive_df = drive_df.rename(columns={'id':'drive_id'})
    
    sea_df = pd.merge(left=sea_df, right=drive_df, how='left', on=['drive_id','drive_id'])
    df = pd.concat([df,sea_df])

num_plays = len(df)
print(df.shape)
print(str(num_plays) + " plays were loaded")

100%|██████████| 14/14 [00:29<00:00,  2.72s/it]

(2031893, 41)
2031893 plays were loaded





In [124]:
# offense_x seems to be correct while offense_y is not

df = df.drop(columns=['defense_y','defense_conference_y','offense_y','offense_conference_y'])

df = df.rename(columns={'defense_x':'defense','defense_conference_x':'defense_conference','offense_x':'offense','offense_conference_x':'offense_conference'})


In [125]:
# fix yard_line, it's w.r.t the home team
df = df.rename(columns={'yard_line':'wrong_yardline'})

df['yard_line'] = np.where(df['offense']==df['home'],df['wrong_yardline'],100-df['wrong_yardline'])
# print(df[['home','offense','yard_line','wrong_yardline']].head(50))
df = df.drop(columns=['wrong_yardline'])

In [126]:
df['alt_game_id'] = df['game_id'].copy().astype(str)
df['alt_drive_id'] = df['drive_id'].copy().astype(str)

def replace_id(x,y):
    return x.replace(y,'')
df['alt_drive_id'] = df.apply(lambda row: replace_id(row['alt_drive_id'], row['alt_game_id']), axis=1)

# df['len_did'] = df['alt_drive_id'].str.len()
df['alt_drive_id'] = df['alt_drive_id'].str.lstrip("0")

df['alt_drive_id'] = df['alt_drive_id'].replace(r'^\s*$', '0', regex=True)

print(df.groupby(['alt_drive_id'])['down'].count().sort_values(ascending=False))

alt_drive_id
1     82048
6     80812
8     80231
7     80150
2     80055
5     79401
3     79125
4     78787
9     78590
10    77403
18    76959
19    76738
17    76393
20    76304
16    75386
11    75370
15    74395
12    74270
21    73813
13    73725
14    73270
22    70395
23    65024
24    57809
25    50329
26    41988
27    33872
28    26384
29    19810
30    14116
31     9791
32     6782
33     4573
34     3232
35     1836
36      942
37      612
38      378
0       314
39      208
40      123
41       72
42       50
43       13
44        9
45        3
46        3
Name: down, dtype: int64


## Special Teams

I'll do special teams and overtimes in future work. Right now, keeping it simple.

In [127]:
# print(len(df))
pat = ['2pt Conversion','Offensive 1pt Safety','Defensive 2pt Conversion','Extra Point Good','Extra Point Missed']

df = df.loc[~df.play_type.isin(pat)]


# actually will include kos
kos = ['Kickoff','Kickoff Return (Offense)','Kickoff Return Touchdown']
print(len(df))

1986166


In [128]:
# also drop overtime

df = df.loc[(df.period > 0) & (df.period <= 4)]
print(len(df))


1979243


## Clock

Clock data is unreliable because maybe 25% of the games have only have one time for each play, and that time is when the drive started. I played with trying to predict time per play based on play type, but the data was very messy. So I decided to get the total time of each drive, and then assume each play took the same amount of time. EPA shouldn't be significantly affected most of the time, i.e. a 70 yard pass will be considered a good play no matter what. The only time it might have an adverse effect is toward the end of a game, when seconds matter. I think that in college football, when the clock stops for a first down, and incompletions, that all pass plays probably do take a somewhat similar amount of time. Drives in this situation will consist mostly of the same play type, and plays of the same play type likely take similar amounts of time. I'll compare it to the clock data I do have to make sure.

In [129]:
# fix clock data first so drives can be figured out
time_cols = ['clock.minutes','clock.seconds','start_time.minutes','start_time.seconds',
            'end_time.minutes','end_time.seconds']
for tc in time_cols:
    df[tc] = df[tc].fillna(0)

# get time remaining in game
df['tr_game'] = (4-df['period']) * 900 + (df['clock.minutes'] * 60) + df['clock.seconds']
df['tr_half'] = np.where(df['period']>2,df['tr_game'], df['tr_game']-1800)

df = df.drop(columns=['clock.minutes','clock.seconds'])

In [130]:
# fill empties
df['elapsed.minutes'] = df['elapsed.minutes'].copy().fillna(0)
df['elapsed.seconds'] = df['elapsed.seconds'].copy().fillna(0)
df['drive_time'] = 60*df['elapsed.minutes'] + df['elapsed.seconds']

In [131]:
# a lot of those drive times are negative... and other problems. so here's an alt drive time
# alt clock

# these get the start and end time of every drive
maxs = df.groupby(['game_id','drive_id'])['tr_game'].max().reset_index()
mins = df.groupby(['game_id','drive_id'])['tr_game'].min().reset_index()
maxs = maxs.rename(columns={'tr_game':'drive_start'})
mins = mins.rename(columns={'tr_game':'drive_end'})

# sometimes the drive end time is the same as the drive start. in that case, I use the next drive start
maxs = maxs.sort_values(by=['game_id','drive_start'],ascending=False)
next_max = maxs.groupby(['game_id'])['drive_start'].shift(-1)
next_max = pd.Series(next_max, name='next_drive_start')
new_max = pd.concat([maxs, next_max], axis=1)
new_max['next_drive_start'] = new_max['next_drive_start'].fillna(0)

# sometimes (rarely, 2%ish of the time) both the next drive start and the drive end are the same as the drive start
# i can explore this more in future work
# i'm fairly sure most of the time it's when a timeout or something divides the same drive into two.

mins = mins.sort_values(by=['game_id','drive_end'],ascending=False)
next_min = mins.groupby(['game_id'])['drive_end'].shift(-1)
next_min = pd.Series(next_min, name='next_drive_end')
new_min = pd.concat([mins, next_min], axis=1)
new_min['next_drive_end'] = new_min['next_drive_end'].fillna(0)
new_min = new_min.drop(columns='game_id')
times = pd.merge(left=new_max,right=new_min,on=['drive_id','drive_id'],how='left')


# attempt 1 (works on ~95.5% of data)
times['drive_time_1'] = times['drive_start']-times['next_drive_start']
# plan B (95.8% of data)
times['drive_time'] = np.where(times['drive_time_1']>0,times['drive_time_1'],(times['drive_start']-times['drive_end']))
# last resort (didn't implement)
# times['drive_time'] = np.where(times['drive_time_2']>0,times['drive_time_2'],(times['drive_start']-times['next_drive_end']))

not_good = times.loc[times.drive_time<=0]
print(len(not_good))

good = times.loc[times.drive_time>0]
print(len(good))

print(good.drive_time.mean())

times = times[['drive_id','drive_time']]
times = times.rename(columns={'drive_time':'alt_drive_time'})


11340
277556
145.28465606940583


In [132]:
df = pd.merge(left=df,right=times,how='left',on=['drive_id','drive_id'])

In [133]:
# longest drive in CFB history is 882. so need to drop anything above 900
# also drop anything below or equal to 0

df['correct_time_1'] = np.where(df['drive_time'] > 0, df['drive_time'], df['alt_drive_time'])

df = df.loc[df['correct_time_1'] > 0]

df['correct_drive_time'] = np.where(df['drive_time'] < 900, df['drive_time'], df['alt_drive_time'])

df = df.loc[df['correct_drive_time'] < 900]

print(len(df))

print("correlation between primary and approximate drive time")
print(df[['drive_time','alt_drive_time','correct_drive_time']].corr())

df = df.drop(columns=['drive_time','alt_drive_time','correct_time_1'])
df = df.rename(columns={'correct_drive_time':'drive_time'})

1958149
correlation between primary and approximate drive time
                    drive_time  alt_drive_time  correct_drive_time
drive_time            1.000000        0.854848            0.991366
alt_drive_time        0.854848        1.000000            0.862340
correct_drive_time    0.991366        0.862340            1.000000


In [134]:
print(len(df))
df = df.dropna(subset=['play_text'])
print(len(df))

1958149
1957877


examine drives so that end of half and end of game drives are the only back-to-back drives that can result in same team possessing the ball

In [135]:
gb = df.groupby(['game_id','drive_id','alt_drive_id'])['offense'].nunique().reset_index()
gb = gb.rename(columns={'offense':'num_offenses'})
gb = gb.groupby(['drive_id','num_offenses'])['game_id'].count().reset_index()
print(gb.groupby(['num_offenses'])['game_id'].count())
# about 18% of drives have two offenses

two_os = gb.loc[gb.num_offenses == 2]
to_ids = list(two_os.drive_id.unique())

num_offenses
1    231865
2     51873
Name: game_id, dtype: int64


In [136]:
kos = df.loc[(df.drive_id.isin(to_ids))]

kos.loc[kos['play_type'] == 'Kickoff Return (Offense)', 'play_type'] = 'Kickoff'
kos.loc[kos['play_type'] == 'Kickoff Return Touchdown', 'play_type'] = 'Kickoff'


kos = kos.loc[kos.play_type=='Kickoff']
wrong_teams = list(kos.id.values)

df = df.rename(columns={'offense':'wrong_offense','defense':'wrong_defense'})

df['offense'] = np.where(df['id'].isin(wrong_teams),df['wrong_defense'],df['wrong_offense'])
df['defense'] = np.where(df['id'].isin(wrong_teams),df['wrong_offense'],df['wrong_defense'])


df = df.drop(columns=['wrong_offense','wrong_defense'])



In [137]:
# switching kickoffs fixes vast majority
gb = df.groupby(['game_id','drive_id','alt_drive_id'])['offense'].nunique().reset_index()
gb = gb.rename(columns={'offense':'num_offenses'})
gb = gb.groupby(['drive_id','num_offenses'])['game_id'].count().reset_index()
print(gb.groupby(['num_offenses'])['game_id'].count())
# about 18% of drives have two offenses

two_os = gb.loc[gb.num_offenses == 2]
to_ids = list(two_os.drive_id.unique())

num_offenses
1    282463
2      1275
Name: game_id, dtype: int64


In [151]:
tos = df.loc[(df.drive_id.isin(to_ids))]
# print(list(tos))

gb = tos.groupby(['drive_id','offense'])['down'].count().reset_index()

one_plays = gb.loc[gb['down']==1]
one_plays['op_id'] = one_plays['offense'].copy()+'_'+one_plays['drive_id'].copy().astype(str)
tos['op_id'] = tos['offense'].copy()+'_'+tos['drive_id'].copy().astype(str)

op_di = list(one_plays.op_id.values)

bad = tos.loc[tos['op_id'].isin(op_di)]

bad_ids = 



                 id           offense play_type  \
90     252440009110     Arizona State   Penalty   
17402  252532751150  Louisiana Monroe      Pass   
21910  252600197001    Arkansas State   Penalty   
29247  252670202031             Tulsa      Rush   
45698  252810164106           Rutgers   Penalty   
47100  252810251113             Texas   Penalty   
47678  252812084002           Buffalo      Rush   
60585  252950070098             Idaho   Penalty   
78836  253092305037            Kansas   Penalty   
90917  253230167112        New Mexico   Kickoff   

                                               play_text  
90     ARIZONA ST PENALTY 5 yard illegal formation AC...  
17402  Steven Jyles pass incomplete to Drouzon Quille...  
21910  Oklahoma St penalty 31 yard holding accepted, ...  
29247    Chris Kindred rush for 3 yards for a TOUCHDOWN.  
45698           Rutgers penalty 5 yard offside accepted.  
47100    Texas penalty 5 yard offside accepted, no play.  
47678        James Vann r

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# some objectives

1) fix "uncategorized" play type  
2) aggregate, clean, validate all play types  
3) fix "uncategorized" drive results  
4) aggregate, clean, validate all drive results  
5) compare play types to drive results to make sure they match

In [13]:
# fixing uncategorized play types

base = r'^{}'
expr = '(?=.*{})'
words = ['End', 'of', 'Quarter']
end_period = base.format(''.join(expr.format(w) for w in words))

df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains(end_period,regex=True)), 'play_type'] = 'End Period'

words = ['fumbled','run', 'for']
fumbles = base.format(''.join(expr.format(w) for w in words))

df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains(fumbles,regex=True)), 'play_type'] = 'Fumble Recovery (Own)'

df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains('Penalty')), 'play_type'] = 'Penalty'


In [14]:
# fix individual
# many of the ones left are fumbles, and then something
df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains('recovered by UTEP Aaron Jones')), 'play_type'] = 'Penalty'

df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains('intercepted')), 'play_type'] = 'Pass Interception'

df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains('SAFETY')), 'play_type'] = 'Safety'

words = ['fumbled','pass', 'complete']
complete = base.format(''.join(expr.format(w) for w in words))
df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains(complete,regex=True)), 'play_type'] = 'Pass Completion'

words = ['TD','punt', 'blocked']
td_pb = base.format(''.join(expr.format(w) for w in words))
df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains(td_pb,regex=True)), 'play_type'] = 'Blocked Punt Touchdown'


words = ['run','for', 'TD']
run_td = base.format(''.join(expr.format(w) for w in words))
df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains(run_td,regex=True)), 'play_type'] = 'Rushing Touchdown'

df.loc[(df.play_type=='Uncategorized')&(df.down==4), 'play_type'] = 'Punt'

words = ['return','for', 'TD']
fumb_td = base.format(''.join(expr.format(w) for w in words))
df.loc[(df.play_type=='Uncategorized')&(df.play_text.str.contains(fumb_td,regex=True)), 'play_type'] = 'Fumble Return Touchdown'

df.loc[(df.play_type=='Uncategorized')&df.play_text.str.contains('return for'), 'play_type'] = 'Fumble Recovery (Opponent)'

df.loc[(df.play_type=='Uncategorized')&df.play_text.str.contains('run for'), 'play_type'] = 'Rush'

print("how many uncategorized plays are left?")
print(len(df.loc[df['play_type']=='Uncategorized']))


how many uncategorized plays are left?
0


In [15]:
# these plays don't have anything important
nah_part_2 = ['End Period','End of Half','End of Game']
bs = df.loc[df.play_type.isin(nah_part_2)&df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD')]
del bs
df = df.loc[~df.play_type.isin(nah_part_2)]

In [16]:
# standardize
df.loc[df.play_type=='Interception', 'play_type'] = 'Pass Interception'
df.loc[df.play_type=='Pass Interception Return', 'play_type'] = 'Pass Interception'

df.loc[df.play_type=='Pass Reception', 'play_type'] = 'Pass Completion'

In [17]:
gb = df.groupby(['play_type'])['down'].count()
gb

play_type
Blocked Field Goal                       446
Blocked Field Goal Touchdown              24
Blocked Punt                             347
Blocked Punt Touchdown                    19
Field Goal Good                        24014
Field Goal Missed                       8756
Fumble Recovery (Opponent)              3671
Fumble Recovery (Own)                   4322
Fumble Return Touchdown                  147
Interception Return Touchdown            822
Missed Field Goal Return                  20
Missed Field Goal Return Touchdown         3
Pass                                   41590
Pass Completion                       370077
Pass Incompletion                     246857
Pass Interception                      18083
Passing Touchdown                      13821
Penalty                                97536
Punt                                  107878
Punt Return Touchdown                     52
Rush                                  769502
Rushing Touchdown                      14991


1) fix uncategorized play type (check)  
2) clean/validate/aggregate play types


In [18]:
# start with the top: blocked FG. make sure none are touchdowns
bfg = df.loc[(df.play_type=='Blocked Field Goal')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

print("There are a few!")
print(len(bfg))
print("All are defensive scores")

df.loc[((df.play_type=='Blocked Field Goal')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))), 'play_type']='Blocked Field Goal Touchdown'

del bfg

There are a few!
11
All are defensive scores


In [19]:
# same thing for blocked punt
bp = df.loc[(df.play_type=='Blocked Punt')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

print(len(bp))
print("punts to fix")

# pretty safe to assume these are all defensive
df.loc[((df.play_type=='Blocked Punt')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))), 'play_type']='Blocked Punt Touchdown'

print('fixed')
del bp

92
punts to fix
fixed


In [20]:
# check Fumble Recovery (Opponent)
fro = df.loc[(df.play_type=='Fumble Recovery (Opponent)')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

print(len(fro))
print("fumble recoveries that were TDs")

# i think all but 1 or two are defensive, don't know a good way to sort those out
df.loc[(df.play_type=='Fumble Recovery (Opponent)')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD')), 'play_type'] = 'Fumble Return Touchdown'

del fro


68
fumble recoveries that were TDs


In [21]:
fro = df.loc[(df.play_type=='Fumble Recovery (Own)')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

print(len(fro))
print("own fumble recoveries that were TDs")

df.loc[(df.play_type=='Fumble Recovery (Own)')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD')), 'play_type'] = 'Rushing Touchdown'

# missed field goal returns are gucci

18
own fumble recoveries that were TDs


In [22]:
# divide 'pass' play type into subcategories
pa = df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

#TTD is a team abbreviation that gets picked up
pa = pa.loc[(~pa.play_text.str.contains('intercepted|Intercepted|INTERCEPTED'))&
            (~pa.play_text.str.contains('fumbled'))&
            (~pa.play_text.str.contains('Penalty|PENALTY|penalty'))&
            (~pa.play_text.str.contains('TTD'))]

pa_ids = list(pa.id.values)

df.loc[df.id.isin(pa_ids), 'play_type'] = 'Passing Touchdown'

In [23]:
pa = df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

pa = pa.loc[((pa.play_text.str.contains('Penalty|PENALTY|penalty'))&
        (pa.play_text.str.contains('ACCEPTED|accepted|Accepted')))]

pa_ids = list(pa.id.values)
df.loc[df.id.isin(pa_ids), 'play_type'] = 'Penalty'


In [24]:
pa = df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

pa = pa.loc[pa.play_text.str.contains('intercepted|Intercepted|INTERCEPTED')]

pa_ids = list(pa.id.values)
df.loc[df.id.isin(pa_ids), 'play_type'] = 'Interception Return Touchdown'

In [25]:
pa = df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

pa = pa.loc[pa.play_text.str.contains('fumbled')]

fbr_ids = list(pa.loc[pa.drive_result=='FUMBLE RETURN TD'].id.values)
df.loc[df.id.isin(fbr_ids),'play_type'] = 'Fumble Return Touchdown'

fbr_ids = list(pa.loc[pa.drive_result=='PASSING TD'].id.values)
df.loc[df.id.isin(fbr_ids),'play_type'] = 'Passing Touchdown'



In [26]:
pa = df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]

pa = pa.loc[pa.play_text.str.contains('fumbled')]

# slightly guessing but i think it's right

ptd = pa.loc[pa.play_text.str.contains('pass complete')]
ptd_ids = list(ptd.id.values)
df.loc[df.id.isin(ptd_ids), 'play_type'] = 'Passing Touchdown'

pa = df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]
pa_ids = list(pa.id.values)
df.loc[df.id.isin(pa_ids), 'play_type'] = 'Fumble Return Touchdown'

del pa
gc.collect()

125

In [27]:
# change rushing tds categorized as 'rush' to rushing tds
rush = df.loc[(df.play_type=='Rush')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]
# for penalties that don't stop a touchdown
words = ['0 yard','accepted']
pens = base.format(''.join(expr.format(w) for w in words))
rush_tds = rush.loc[(~rush.play_text.str.contains('Penalty|PENALTY|penalty')) | (rush.play_text.str.contains('declined|DECLINED') | (rush.play_text.str.contains(pens)))]
rush_tds = rush_tds.loc[~rush.play_text.str.contains('fumbled')]
rtd_ids = list(rush_tds.id.values)
df.loc[df.id.isin(rtd_ids),'play_type'] = 'Rushing Touchdown'

del rush_tds
# rtd_ids = list(rush_tds.id.values)

In [28]:
# some fumble 6 rushes that need to be categorized as such

rush = df.loc[(df.play_type=='Rush')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]
words = ['fumbled','returned by']
ftds = base.format(''.join(expr.format(w) for w in words))
fumble_tds = rush.loc[(rush.play_text.str.contains(ftds))]

fbtd_ids = list(fumble_tds.id.values)
df.loc[df.id.isin(fbtd_ids),'play_type'] = 'Fumble Return Touchdown'

words = ['fumbled','loss of']
ftds = base.format(''.join(expr.format(w) for w in words))
fumble_tds = rush.loc[(rush.play_text.str.contains(ftds))]

fbtd_ids = list(fumble_tds.id.values)
df.loc[df.id.isin(fbtd_ids),'play_type'] = 'Fumble Return Touchdown'

# subset of fumble 6s always say 'to the {other team} 0' 
words = ['to the','0']
ftds = base.format(''.join(expr.format(w) for w in words))
fumble_tds = rush.loc[(rush.play_text.str.contains(ftds))]

fbtd_ids = list(fumble_tds.id.values)
df.loc[df.id.isin(fbtd_ids),'play_type'] = 'Fumble Return Touchdown'

df.loc[((df.play_type=='Rush')&(df.play_text.str.contains('penalty|PENALTY|Penalty'))), 'play_type'] = 'Penalty'



In [29]:
# hard to determine which fumbles go for offensive TD vs defensive TD from just play text
# lean on drive result

df.loc[(df.play_type=='Rush')&(df.drive_result=='RUSHING TD'), 'play_type'] = 'Rushing Touchdown'

df.loc[(df.play_type=='Rush')&(df.drive_result=='FUMBLE RETURN TD'), 'play_type'] = 'Fumble Return Touchdown'

# i verified these
df.loc[((df.play_type=='Rush')&(df.yard_line>90)&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))), 'play_type'] = 'Rushing Touchdown'

# rest seem to be defensive. might be one or two offensive that leaked through
df.loc[((df.play_type=='Rush')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))), 'play_type'] = 'Fumble Return Touchdown'
                               


In [30]:
clean = ['Pass','Rush']
sa = df.loc[(df.play_type.isin(clean))&(df.play_text.str.contains('Safety|safety|SAFETY'))]
sa_ids = list(sa.id.values)
print(len(sa))
df.loc[df.id.isin(sa_ids), 'play_type'] = 'Safety'

3


In [31]:
# divide up the "pass" play type
df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('incomplete')), 'play_type'] = 'Pass Incompletion'
df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('complete')), 'play_type'] = 'Pass Completion'
df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('intercepted')), 'play_type'] = 'Interception'
df.loc[(df.play_type=='Pass')&(df.play_text.str.contains('sacked')), 'play_type'] = 'Sack'
    

In [32]:
pa = df.loc[(df.play_type=='Pass')]

# only things left are fumbles
# lean on drive result, tough to know who recovered fumble 

li = ['FUMBLE','Uncategorized']

fro = pa.loc[~pa.drive_result.isin(li)]
fro_ids = list(fro.id.values)
df.loc[df.id.isin(fro_ids), 'play_type'] = 'Fumble Recovery (Own)'

fum = pa.loc[pa.drive_result == 'FUMBLE']
fum_ids = list(fum.id.values)
df.loc[df.id.isin(fum_ids), 'play_type'] = 'Fumble Recovery (Opponent)'


In [33]:
# uncategorized
pa = df.loc[(df.play_type=='Pass')]

df.loc[df.id==253370030087, 'play_type'] = 'Fumble Recovery (Opponent)'
df = df.loc[df.id != 262590259175]


In [34]:
tds = ['FUMBLE RETURN TD','FUMBLE TD']

td = df.loc[(df.play_type == 'Sack') & (df.play_text.str.contains('fumbled')) & (df.drive_result.isin(tds))]
td_ids = list(td.id.values)

df.loc[df.id.isin(td_ids), 'play_type'] = 'Fumble Return Touchdown'
del tds

sa = df.loc[(df.play_type == 'Sack') & (df.play_text.str.contains('fumbled'))]

li = ['FUMBLE','Uncategorized']

fro = sa.loc[~sa.drive_result.isin(li)]
fro_ids = list(fro.id.values)
df.loc[df.id.isin(fro_ids), 'play_type'] = 'Fumble Recovery (Own)'

fum = sa.loc[sa.drive_result == 'FUMBLE']
fum_ids = list(fum.id.values)
df.loc[df.id.isin(fum_ids), 'play_type'] = 'Fumble Recovery (Opponent)'

sa = df.loc[(df.play_type=='Sack')&(df.play_text.str.contains('fumbled'))]

# only recovery by the defense
df.loc[df.id==322450120161, 'play_type'] = 'Fumble Recovery (Opponent)'

df.loc[(df.play_type=='Sack')&(df.play_text.str.contains('fumbled')), 'play_type'] = 'Fumble Recovery (Own)'

del sa

gc.collect()

33

In [35]:
td = df.loc[(df.play_type == 'Pass Completion') & (df.play_text.str.contains('fumbled')) & (df.drive_result=='FUMBLE TD')]

words = ['to the','0']
ftds = base.format(''.join(expr.format(w) for w in words))
fumble_tds = td.loc[(td.play_text.str.contains(ftds))]

fbtd_ids = list(fumble_tds.id.values)
df.loc[df.id.isin(fbtd_ids),'play_type'] = 'Fumble Return Touchdown'

# rest are just fumble recoveries
td = df.loc[(df.play_type == 'Pass Completion') & (df.play_text.str.contains('fumbled')) & (df.drive_result=='FUMBLE TD')]
td_ids = list(td.id.values)
df.loc[df.id.isin(td_ids),'play_type'] = 'Fumble Recovery (Opponent)'

del td

In [36]:
df.loc[(df.play_type == 'Pass Completion') & (df.play_text.str.contains('fumbled')) & (df.drive_result=='FUMBLE RETURN TD'), 'play_type'] = 'Passing Touchdown'


In [37]:
# only defensive returned touchdown in set
df.loc[df.id==272932393004, 'play_type'] = 'Fumble Return Touchdown'

pa = df.loc[(df.play_type == 'Pass Completion') & (df.play_text.str.contains('fumbled'))]

fumbles = pa.loc[pa.drive_result=='Fumble']

fum_ids = list(fumbles.id.values)
df.loc[df.id.isin(fum_ids), 'play_type'] = 'Fumble Recovery (Opponent)'

pa = df.loc[(df.play_type == 'Pass Completion') & (df.play_text.str.contains('fumbled')) & (df.drive_result == 'Uncategorized')]



# individual fixes
fr_own = ['UTEP','Arkansas State','Colorado','Notre Dame','Iowa State','Penn St.','TCU','Florida Atlantic']

fr_o = pa.loc[pa.offense.isin(fr_own)]
fr_o_ids = list(fr_o.id.values)

df.loc[df.id.isin(fr_o_ids), 'play_type'] = 'Fumble Recovery (Own)'


fr_opp = pa.loc[~pa.offense.isin(fr_own)]
fr_opp_ids = list(fr_opp.id.values)
df.loc[df.id.isin(fr_opp_ids), 'play_type'] = 'Fumble Recovery (Opponent)'

In [38]:
df.loc[(df.play_type == 'Pass Completion') & (df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD')), 'play_type']='Passing Touchdown'


In [39]:
bfg = df.loc[(df.play_type=='Field Goal Missed')&(df.play_text.str.contains('Blocked|BLOCKED|blocked'))]

bfg_td = bfg.loc[bfg.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD')]

bfg_td_ids = list(bfg_td.id.values)

bfg = bfg.loc[~bfg.id.isin(bfg_td_ids)]
bfg_ids = list(bfg.id.values)

df.loc[df.id.isin(bfg_td_ids), 'play_type'] = 'Blocked Field Goal Touchdown'
df.loc[df.id.isin(bfg_ids), 'play_type'] = 'Blocked Field Goal'


In [40]:
# now punts

bfg = df.loc[(df.play_type=='Punt')&(df.play_text.str.contains('Blocked|BLOCKED|blocked'))]

bfg_td = bfg.loc[bfg.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD')]


bfg_td_ids = list(bfg_td.id.values)

bfg = bfg.loc[~bfg.id.isin(bfg_td_ids)]
bfg_ids = list(bfg.id.values)

df.loc[df.id.isin(bfg_td_ids), 'play_type'] = 'Blocked Punt Touchdown'
df.loc[df.id.isin(bfg_ids), 'play_type'] = 'Blocked Punt'


In [41]:
# punt return touchdowns

prtd = df.loc[(df.play_type=='Punt')&(df.play_text.str.contains('TOUCHDOWN|touchdown|Touchdown|TD'))]


words = ['penalty','accepted']
called_back = base.format(''.join(expr.format(w) for w in words))
prtd = prtd.loc[~prtd.play_text.str.contains(called_back, regex=True)]

prtd = prtd.loc[~prtd.play_text.str.contains('fumbled|TTD')]
prtd = prtd.loc[~prtd.play_text.str.contains('NO PLAY')]

prtd_ids = list(prtd.id.values)

df.loc[df.id.isin(prtd_ids), 'play_type'] = 'Punt Return Touchdown'


In [42]:
# lastly, fix declined penalties
# seem to be mostly right except some rushes
# decline = df.loc[(df.play_type=='Penalty')&(df.play_text.str.contains('declined|DECLINED'))]
# print(len(decline))

# rush = decline.loc[(decline.play_text.str.contains('rush|rushed'))&~(decline.play_text.str.contain)]

# comp = decline.loc[decline.play_text.str.contains('pass complete')]

# incomp = decline.loc[decline.play_text.str.contains('incomplete')]

# for pt in list(decline.play_text.values):
#     print(pt)

1) fix "uncategorized" play type (check)
2) aggregate, clean, validate all play types (check)
3) fix "uncategorized" drive results
4) aggregate, clean, validate all drive results
5) compare play types to drive results to make sure they match

In [43]:
gb = df.groupby(['play_type'])['down'].count()
gb

play_type
Blocked Field Goal                      1196
Blocked Field Goal Touchdown              49
Blocked Punt                             694
Blocked Punt Touchdown                   330
Field Goal Good                        24014
Field Goal Missed                       7981
Fumble Recovery (Opponent)              5562
Fumble Recovery (Own)                   5366
Fumble Return Touchdown                 1439
Interception                            1132
Interception Return Touchdown            921
Missed Field Goal Return                  20
Missed Field Goal Return Touchdown         3
Pass Completion                       371455
Pass Incompletion                     261974
Pass Interception                      18083
Passing Touchdown                      34531
Penalty                               109818
Punt                                  106619
Punt Return Touchdown                    653
Rush                                  657905
Rushing Touchdown                     113410


In [44]:
# make expected drive results to compare to given drive result data
pass_tds = df.loc[df.play_type == 'Passing Touchdown']
ptd_ids = list(pass_tds.drive_id.unique())
del pass_tds

rush_tds = df.loc[df.play_type == 'Rushing Touchdown']
rtd_ids = list(rush_tds.drive_id.unique())
del rush_tds

fgg = df.loc[df.play_type == 'Field Goal Good']
fgg_ids = list(fgg.drive_id.unique())
del fgg

fgm = df.loc[df.play_type == 'Field Goal Missed']
fgm_ids = list(fgm.drive_id.unique())
del fgm

sf = df.loc[df.play_type == 'Safety']
sf_ids = list(sf.drive_id.unique())
del sf

intn = df.loc[df.play_type == 'Pass Interception']
int_ids = list(intn.drive_id.unique())
del intn

fropp = df.loc[df.play_type == 'Fumble Recovery (Opponent)']
fropp_ids = list(fropp.drive_id.unique())
del fropp

ftd = ['Missed Field Goal Return Touchdown','Blocked Field Goal Touchdown']
fg_td = df.loc[df.play_type.isin(ftd)]
ftd_ids = list(fg_td.drive_id.unique())
del fg_td

prtd = ['Punt Return Touchdown','Blocked Punt Touchdown']
punt_td = df.loc[df.play_type.isin(prtd)]
prtd_ids = list(punt_td.drive_id.unique())
del punt_td

punts = df.loc[df.play_type =='Punt']
punt_ids = list(punts.drive_id.unique())
del punts

df['exp_drive_result'] = np.nan

drs = [ptd_ids, rtd_ids,fgg_ids,fgm_ids,sf_ids,int_ids,fropp_ids,ftd_ids,prtd_ids,punt_ids]

df.loc[df.drive_id.isin(ptd_ids), 'exp_drive_result'] = 'PASSING TD'
df.loc[df.drive_id.isin(rtd_ids), 'exp_drive_result'] = 'RUSHING TD'
df.loc[df.drive_id.isin(fgg_ids), 'exp_drive_result'] = 'FG GOOD'
df.loc[df.drive_id.isin(fgm_ids), 'exp_drive_result'] = 'FG MISSED'
df.loc[df.drive_id.isin(sf_ids), 'exp_drive_result'] = 'SF'
df.loc[df.drive_id.isin(int_ids), 'exp_drive_result'] = 'INT'
df.loc[df.drive_id.isin(fropp_ids), 'exp_drive_result'] = 'FUMBLE'
df.loc[df.drive_id.isin(ftd_ids), 'exp_drive_result'] = 'FG MISSED TD'
df.loc[df.drive_id.isin(prtd_ids), 'exp_drive_result'] = 'PUNT RETURN TD'
df.loc[df.drive_id.isin(punt_ids), 'exp_drive_result'] = 'PUNT'

# print(len(df))
# most = df.dropna(subset=['exp_drive_result'])
# print(len(most))

# del most
gc.collect()

0

Time to clean/validate/aggregate drive results

In [45]:
gb = df.groupby(['drive_result'])['down'].count()
gb

drive_result
DOWNS                          51299
DOWNS TD                          99
END OF 4TH QUARTER               234
END OF GAME                    33673
END OF GAME TD                    22
END OF HALF                    32489
END OF HALF TD                    75
FG                             91065
FG GOOD                       139212
FG GOOD TD                        31
FG MISSED                      53590
FG MISSED TD                     229
FG TD                             22
FUMBLE                         68676
FUMBLE RETURN TD                1407
FUMBLE TD                       2580
INT                            91974
INT RETURN TOUCH                 113
INT TD                          6512
KICKOFF                           77
KICKOFF RETURN TD                 23
MISSED FG                      31796
MISSED FG TD                     275
PASSING TD                    149349
POSSESSION (FOR OT DRIVES)        24
PUNT                          608778
PUNT RETURN TD           

In [46]:
targ = df.loc[(df.drive_result == 'PUNT') & (df.play_type=='Punt Return Touchdown')]
targ_ids = list(targ.drive_id.values)

df.loc[df.drive_id.isin(targ_ids), 'drive_result'] = 'PUNT RETURN TD'

# muffed punts recovered by return team

targ = df.loc[(df.exp_drive_result=='FUMBLE')&(df.drive_result == 'PUNT') & (df.play_type=='Punt')]
targ_ids = list(targ.drive_id.values)
df.loc[df.drive_id.isin(targ_ids), 'drive_result'] = 'PUNT'

# safeties that are called punts because of the punt afterward

targ = df.loc[(df.exp_drive_result=='PUNT')&(df.drive_result == 'SF')]
targ_ids = list(targ.drive_id.values)
df.loc[df.drive_id.isin(targ_ids), 'exp_drive_result'] = 'SF'

# weird 
targ = df.loc[(df.exp_drive_result=='FG GOOD')&(df.drive_result == 'FG MISSED') & (df.play_type=='Field Goal Good')]
targ_ids = list(targ.drive_id.values)
df.loc[df.drive_id.isin(targ_ids), 'drive_result'] = 'FG GOOD'

# counting punt returns and blocked punts in same category
targ = df.loc[(df.drive_result == 'PUNT')&(df.exp_drive_result=='PUNT RETURN TD')& (df.play_type=='Blocked Punt Touchdown')]
targ_ids = list(targ.drive_id.values)
df.loc[df.drive_id.isin(targ_ids), 'drive_result'] = 'PUNT RETURN TD'



In [47]:
compare = ['PASSING TD','RUSHING TD','FG GOOD','FG MISSED','SF','INT','FUMBLE','FG MISSED TD','PUNT RETURN TD','PUNT']

comp = df.loc[df.drive_result.isin(compare)].copy()

comp['match'] = np.where(comp['drive_result']==comp['exp_drive_result'],1,0)

print("What pct of drives sampled match our expected drive result?")
print(str(np.round((comp.match.sum()/len(comp))*100,1))+'%')

comp['exp_drive_result'] = comp['exp_drive_result'].fillna(comp['drive_result'])


# TTD abbreviation threw everything off
comp['exp_drive_result'] = np.where(comp['exp_drive_result'].str.contains('TTD'),comp['drive_result'],comp['exp_drive_result'])

comp['match'] = np.where(comp['drive_result']==comp['exp_drive_result'],1,0)



What pct of drives sampled match our expected drive result?
94.4%


In [48]:
wrong = comp.loc[comp.match==0].copy()

print(wrong[['drive_result','exp_drive_result','play_type','drive_id']])
for pt in list(wrong.play_text.values):
    print(pt)

        drive_result exp_drive_result                play_type     drive_id
6838      PASSING TD       RUSHING TD        Pass Incompletion  25246242618
6839      PASSING TD       RUSHING TD                     Rush  25246242618
6840      PASSING TD       RUSHING TD                  Timeout  25246242618
6841      PASSING TD       RUSHING TD          Pass Completion  25246242618
6842      PASSING TD       RUSHING TD        Pass Incompletion  25246242618
6843      PASSING TD       RUSHING TD        Pass Incompletion  25246242618
6844      PASSING TD       RUSHING TD        Pass Incompletion  25246242618
6845      PASSING TD       RUSHING TD             Interception  25246242618
6846      PASSING TD       RUSHING TD                     Rush  25246242618
6847      PASSING TD       RUSHING TD          Pass Completion  25246242618
6848      PASSING TD       RUSHING TD          Pass Completion  25246242618
6849      PASSING TD       RUSHING TD                     Rush  25246242618
6850      PA

Zac Robinson pass incomplete to Hubert Anyiam.
Riley Skinner pass complete to Mike Rinfrette for 4 yards to the WFrst 18.
Brandon Pendergrass rush for 4 yards to the WFrst 14.
Brandon Pendergrass rush for a loss of 1 yard to the WFrst 10.
Shane Popham punt for 37 yards, returned by Michael Wade for no gain, fumbled at the Clem 45.
Rusty Smith pass complete to Chris Bonner for 12 yards to the FlAtl 32 for a 1ST down.
Rusty Smith pass incomplete.
William Rose rush for 29 yards to the NoTex 17 for a 1ST down.
William Rose rush for 5 yards to the FlAtl 47.
Timeout FLORIDA ATLANTIC, clock 5:17.
Rusty Smith pass complete to William Rose for 7 yards to the NoTex 46 for a 1ST down.
Rusty Smith pass incomplete.
Rusty Smith rush for 17 yards for a TOUCHDOWN.
Alfred Morris rush for 10 yards to the FlAtl 42 for a 1ST down.
Kase Whitehead punt for 39 yards, returned by Jock Sanders, fumbled, forced by Sean McClellan, recovered by Marsh John Jacobs at the WVirg 42, John Jacobs for 18 yards, to the W

James Sims Rush rush for 2 yards for a TOUCHDOWN.
Jamal Woodyard rush for 2 yards to the SoMis 49, SOUTHERN MISS penalty 10 yard Holding accepted.
Austin Davis pass complete to Ryan Balentine for 14 yards to the SoMis 29 for a 1ST down.
UCF penalty 14 yard Pass Interference accepted.
Austin Davis pass incomplete to Tracy Lampley.
Austin Davis pass complete to Desmond Johnson for 3 yards to the SoMis 33.
UCF penalty 10 yard Holding accepted.
SOUTHERN MISS penalty 5 yard False Start accepted.
Jamal Woodyard rush for 6 yards to the SoMis 35.
Peter Boehme punt for 45 yards, returned by J.J. Worton for 9 yards, fumbled at the UCF 25.
Austin Davis pass incomplete to Dominique Sullivan.
Austin Davis pass incomplete to Tracy Lampley.
Austin Davis pass complete to Tracy Lampley for 2 yards to the SoMis 39, SOUTHERN MISS penalty 10 yard Holding accepted.
Austin Davis pass complete to Desmond Johnson for 8 yards to the SoMis 47.
SOUTHERN MISS penalty 10 yard Holding accepted.
Brandon Weeden pass 

Malik Stokes rush for 3 yards to the NflkS 18, tackled by Preston Smith and Scott Wiggins.
OLDDOMINION penalty 15 yard personal foul on Rashaad Coward accepted, no play.
Rolandan Finch rush for 1 yard to the NflkS 15, tackled by John Darr and Terrell Reid.
Malik Stokes rush for 3 yards to the NflkS 18, tackled by Preston Smith and Scott Wiggins.
Malik Stokes rush for 1 yard to the NflkS 34, tackled by TJ Ricks.
Taylor Heinicke pass complete to Antonio Vaughan for 7 yards to the ODU 44, tackled by Collin Frazer out-of-bounds, NORFOLK ST penalty substitution infraction declined for a 1ST down.
Taylor Heinicke pass complete to Melvin Vaughn for 9 yards to the ODU 37, tackled by Darrin Marrow out-of-bounds.
Taylor Heinicke pass incomplete to Antonio Vaughan.
Taylor Heinicke pass complete to Antonio Vaughan for 7 yards to the ODU 44, tackled by Collin Frazer out-of-bounds, NORFOLK ST penalty substitution infraction declined for a 1ST down.
Taylor Heinicke pass complete to Larry Pinkard for 

CLEMSON Penalty, False Start (Dorian O'Daniel) to the Clem 10
Deshaun Watson pass incomplete to Germone Hopper
Andy Teasdall punt for 36 yds , Ryan Switzer returns for no gain to the Clem 46 Ryan Switzer fumbled, recovered by NCaro Ryan Switzer
Deshaun Watson run for 4 yds to the Clem 15
JK Scott punt for 53 yds, Antonio Callaway fumbled, recovered by Callaway at the Fla 7, returns 1 yard to the Fla 8
Jake Coker pass incomplete to Calvin Ridley, broken up by Vernon Hargreaves III
Derrick Henry run for a loss of 2 yards to the Alab 36
Jake Coker sacked by Jonathan Bullard for a loss of 7 yards to the Alab 29
Shock Linwood run for 1 yd to the Bayl 30
Drew Galitz punt for 42 yds , Daje Johnson returns for 2 yds to the Texas 29 Daje Johnson fumbled, recovered by Texas Jake Oliver
BAYLOR Penalty, Delay of Game (-5 Yards) to the Bayl 31
Lynx Hawthorne run for 11 yds to the Bayl 36
Lynx Hawthorne pass incomplete to Corey Coleman
BAYLOR Penalty, False Start (Jarell Broxton) to the Bayl 25
Luke

In [49]:
# drop games that are clearly off
gb = wrong.groupby(['game_id'])['drive_result'].count()
gb = gb.loc[gb > 15]
bad_games = list(gb.index.values)

df = df.loc[~df.game_id.isin(bad_games)]

"intercepted" - interceptions
"

In [61]:
# gb = df.groupby(['game_id','drive_id'])['offense','tr_game'].first().reset_index()



        alt_game_id alt_drive_id    game_id     drive_id
0         252440009           01  252440009  25244000901
1         252440009           01  252440009  25244000901
2         252440009           01  252440009  25244000901
3         252440009           01  252440009  25244000901
4         252440009           01  252440009  25244000901
5         252440009           02  252440009  25244000902
6         252440009           02  252440009  25244000902
7         252440009           02  252440009  25244000902
8         252440009           02  252440009  25244000902
9         252440009           02  252440009  25244000902
10        252440009           02  252440009  25244000902
11        252440009           02  252440009  25244000902
12        252440009           03  252440009  25244000903
13        252440009           03  252440009  25244000903
14        252440009           03  252440009  25244000903
15        252440009           03  252440009  25244000903
16        252440009           0

Unnamed: 0,game_id,drive_id,offense,tr_game
0,252440009,25244000901,Arizona State,3526.0
1,252440009,25244000902,Temple,3420.0
2,252440009,25244000903,Arizona State,3085.0
3,252440009,25244000904,Temple,2933.0
4,252440009,25244000905,Arizona State,2713.0
5,252440009,25244000906,Temple,2490.0
6,252440009,25244000907,Arizona State,2411.0
7,252440009,25244000908,Temple,2350.0
8,252440009,25244000909,Arizona State,2313.0
9,252440009,25244000910,Temple,2178.0
