In [1]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
import datetime
gc.collect()

from tqdm import tqdm

# Load Data

In [2]:
# load play by play & drive data
years = list(range(2005, int(datetime.datetime.now().year)))

df = pd.DataFrame()
for year in tqdm(years):
    path = './output/'+str(year)+'/'+str(year)+'_pbp.csv'
    sea_df = pd.read_csv(path)
    
    drive_path = './output/'+str(year)+'/'+str(year)+'_drives.csv'
    drive_df = pd.read_csv(drive_path)
    
    drive_df = drive_df.rename(columns={'id':'drive_id'})
    
    sea_df = pd.merge(left=sea_df, right=drive_df, how='left', on=['drive_id','drive_id'])
    df = pd.concat([df,sea_df])

num_plays = len(df)
print(df.shape)
print(str(num_plays) + " plays were loaded")

100%|██████████| 14/14 [00:30<00:00,  3.06s/it]

(2031893, 41)
2031893 plays were loaded





# Clean Data 

In [3]:
# fix clock data first so drives can be figured out
time_cols = ['clock.minutes','clock.seconds','start_time.minutes','start_time.seconds',
            'end_time.minutes','end_time.seconds']
for tc in time_cols:
    df[tc] = df[tc].fillna(0)

# get time remaining in game
df['tr_game'] = (4-df['period']) * 900 + (df['clock.minutes'] * 60) + df['clock.seconds']
df['tr_half'] = np.where(df['period']>2,df['tr_game'], df['tr_game']-1800)

df = df.drop(columns=['clock.minutes','clock.seconds'])

In [4]:
# fix uncategorized
uncat = df.loc[df['play_type']=='Uncategorized']
print(len(uncat))
print(uncat.play_text.head(10))

52
51929                     End of half, clock 00:00.
65245                     End of half, clock 00:00.
86881                     End of half, clock 00:00.
95268                     End of half, clock 00:00.
106328                    End of half, clock 15:00.
14553                     End of half, clock 00:00.
14768     BOWLING GREEN drive start at 15:00 (OT ).
18425            PURDUE drive start at 15:00 (OT ).
22311                                  Clock 15:00.
28877                     End of half, clock 00:00.
Name: play_text, dtype: object


In [5]:
print(df.play_type.unique())
def fix_uncat(play_type, play_text):
    if play_type != 'Uncategorized':
        return play_type
    else:
        if isinstance(play_text,str):
            if "Start of the 2nd quarter." in play_text:
                return "End Period"
            elif "Start of the 3rd quarter." in play_text:
                return "End of Half"
            elif "Start of the 4th quarter." in play_text:
                return "End Period"
            elif "Start of overtime." in play_text:
                return "End Period"
            elif "End of the game." in play_text:
                return "End of Game"
            elif "Extra point" in play_text:
                if "is good" in play_text:
                    return "Extra Point Good"
                elif "is no good." in play_text[-13:]:
                    return "Extra Point Missed"
                else:
                    return play_type
            elif "field goal" in play_text:
                if "is good" in play_text:
                    return "Field Goal Good"
                elif "is no good." in play_text[-13:]:
                    return "Field Goal Missed"
                else:
                    print(play_text)
                    return play_type
            elif "missed PAT returned." in play_text:
                return "Extra Point Missed"
            elif "took lateral and rushed" in play_text:
                return "Rush"
            # mostly fumbled snaps recovered by own team
            elif "fumbled" in play_text:
                return "Fumble Recovery (Own)"
            elif "return for" in play_text:
                return "Punt Return"
            elif "End of" in play_text:
                return "End Period"
            elif "run for" in play_text:
                return "Rush"
            elif "SAFETY" in play_text:
                return "Safety"
            elif "Penalty" in play_text:
                return "Penalty"
            else:
                return play_type
    return play_type

df['play_type'] = df.apply(lambda row: fix_uncat(row['play_type'], row['play_text']),axis=1)

# uncat = df.loc[df.play_type=='Uncategorized']
# print(len(uncat))

['Punt' 'Rush' 'Pass' 'Penalty' 'Kickoff' 'Extra Point Good' 'Timeout'
 'Field Goal Good' 'Field Goal Missed' 'Extra Point Missed' 'End Period'
 'Safety' 'Uncategorized' 'Pass Incompletion' 'Pass Completion'
 'Pass Interception' 'Sack' 'Fumble Return Touchdown'
 'Punt Return Touchdown' '2pt Conversion' 'Offensive 1pt Safety'
 'Kickoff Return (Offense)' 'Pass Reception' 'Fumble Recovery (Opponent)'
 'Fumble Recovery (Own)' 'Passing Touchdown' 'Rushing Touchdown'
 'Pass Interception Return' 'Interception Return Touchdown' 'End of Half'
 'Blocked Field Goal' 'Blocked Punt' 'End of Game'
 'Kickoff Return Touchdown' 'Blocked Field Goal Touchdown'
 'Defensive 2pt Conversion' 'Blocked Punt Touchdown'
 'Missed Field Goal Return' 'Interception'
 'Missed Field Goal Return Touchdown']


In [6]:
# isolate extra point attempts
xp_cats = ['Two Point Pass','Two Point Rush','Blocked PAT','Extra Point Good','Extra Point Missed', '2pt Conversion',
          'Offensive 1pt Safety','Defensive 2pt Conversion']
xps = df.loc[df['play_type'].isin(xp_cats)]
df = df.loc[~df['play_type'].isin(xp_cats)]

In [7]:
# isolate kickoffs
kickoffs_cats = ['Kickoff Return (Offense)', 'Kickoff Return Touchdown', 'Kickoff Return (Defense)', 'Kickoff']
kickoff_penalty_plays = ['KICKOFF', 'KICKOFF RETURN TD']
kickoffs = df.loc[(df['play_type'].isin(kickoffs_cats)) | (df['drive_result']).isin(kickoff_penalty_plays)]
df = df.loc[~df['play_type'].isin(kickoffs_cats)]


In [8]:
# isolate OT
ot = df.loc[(df['period'] > 4) | (df['drive_result'] == 'POSSESSION (FOR OT DRIVES)')]
df = df.loc[(df['period'] <= 4) & (df['period'] >0)]
df = df.loc[df.drive_result != 'POSSESSION (FOR OT DRIVES)']


In [9]:
# drop end of period plays

eop = ['End of Game','End of Half','End Period']
df = df.loc[~df['play_type'].isin(eop)]


In [10]:
# validate
# bad = df.loc[df['offense_x']!=df['offense_y']]
# print(bad[['tr_game','play_text','offense_x','offense_y']].head(25))
# offense_x seems to be correct while offense_y is not

df = df.drop(columns=['defense_y','defense_conference_y','offense_y','offense_conference_y'])

df = df.rename(columns={'defense_x':'defense','defense_conference_x':'defense_conference','offense_x':'offense','offense_conference_x':'offense_conference'})


In [11]:
# fix bad distances

zeros = df.loc[df['distance'] == 0]
# print(len(zeros))
# print(zeros.groupby(['play_type'])['distance'].count())
# print(zeros.play_text.tail(50))

# drop negative distances. change 0 distances to 0.5 yard
df = df.loc[df['distance']>=0]

df = df.rename(columns={'distance':'wrong_distance'})
df['distance'] = np.where(df['wrong_distance']>0,df['wrong_distance'],0.5)
df = df.drop(columns=['wrong_distance'])


In [12]:
# fix bad downs
zero_down = df.loc[df['down']==0]
print(len(zero_down))

# impute down + 1 from previous play, to max of 4
df['down'] = np.where(df['down']>0,df['down'],df['down'].shift()+1)
# still 18 bad

df = df.loc[(df['down']>0)&(df['down']<5)]

28


## Clock
Also, sometimes clock data is wrong. I messed with trying to predict time per play based on play type, but the data was prohibitively messy. Instead I'll just assume every play takes up the same percentage of drive time. The worst effect this will have is it will make incompletions look worse in late game situations, because incompletions will have the same time elapsed as completion. I guess this matters less in college football, because the clock stops on a first down.

In [13]:
# fix negative drive times first, they mostly contain actual plays
df['elapsed.minutes'] = df['elapsed.minutes'].copy().fillna(0)
df['elapsed.seconds'] = df['elapsed.seconds'].copy().fillna(0)
df['drive_time'] = 60*df['elapsed.minutes'] + df['elapsed.seconds']
cols = list(df)
print(cols)

['away', 'defense', 'defense_conference', 'defense_score', 'down', 'drive_id', 'home', 'id', 'offense', 'offense_conference', 'offense_score', 'period', 'play_text', 'play_type', 'yard_line', 'yards_gained', 'season', 'week', 'season_type', 'drive_result', 'elapsed.minutes', 'elapsed.seconds', 'end_period', 'end_time.minutes', 'end_time.seconds', 'end_yardline', 'game_id', 'plays', 'scoring', 'start_period', 'start_time.minutes', 'start_time.seconds', 'start_yardline', 'yards', 'tr_game', 'tr_half', 'distance', 'drive_time']


In [14]:
# can't find systematic error, just going to reverse start and end time. the reversed data
# seems to make sense


negs = df.loc[df.drive_time < 0].copy()
negs = negs[cols]
neg_ids = list(negs.drive_id.unique())
df = df.loc[~df.drive_id.isin(neg_ids)]

negs = negs.rename(columns={'elapsed.minutes':'wrong_em','elapsed.seconds':'wrong_es',
                            'start_time.minutes':'wrong_sm','start_time.seconds':'wrong_ss',
                            'end_time.minutes':'wrong_etm','end_time.seconds':'wrong_ets',
                            'drive_time':'wrong_dt'
                           })

negs['start_time.minutes'] = negs['wrong_etm'].copy()
negs['start_time.seconds'] = negs['wrong_ets'].copy()

negs['end_time.minutes'] = negs['wrong_sm'].copy()
negs['end_time.seconds'] = negs['wrong_ss'].copy()

negs['elapsed.minutes'] = negs['wrong_em'].copy().abs()
negs['elapsed.seconds'] = negs['wrong_es'].copy().abs()
negs['drive_time'] = negs['wrong_dt'].copy().abs()

drop_cols = ['wrong_em','wrong_es','wrong_sm','wrong_ss','wrong_etm','wrong_ets','wrong_dt']
negs = negs.drop(columns=drop_cols)

negs = negs[cols]
df = pd.concat([df,negs],sort=False)
df = df.sort_values(by=['game_id','tr_game','drive_id'])

In [15]:
# now work on fixing nan drive timea
df['drive_time'] = 60*df['elapsed.minutes'] + df['elapsed.seconds']


In [16]:
zeros = df.loc[df.drive_time == 0]
zgb = zeros.groupby(['plays'])['start_time.minutes'].count()

# zero play drives are fumble recoveries by the other team or punt returns
# fumble recoveries are on the first play of the drive and are actually correct
# need to fix punt returns

# print(zplay.drive_id.head())

zpunts = zeros.loc[(zeros.plays==0)&(zeros.play_type=='Punt Return')]



zp_dids = list(zpunts.drive_id.unique())
zp_mo = [(x-1) for x in zp_dids]

if len(zp_dids) > 0:
    df['did-1'] = df['drive_id'] - 1
    df['drive_id'] = np.where(df['drive_id'].isin(zp_dids),df['did-1'], df['drive_id'])
    df.loc[df.drive_id.isin(zp_mo), 'drive_result'] = 'PUNT'
    df.loc[df.drive_id.isin(zp_mo), 'drive_time'] = 150
    
    
    
# do the same for drive result == 'punt return td td'
prtt = df.loc[df.drive_result=='PUNT RETURN TD TD']
prtt_dids = list(prtt.drive_id.unique())
prtt_mo = [(x-1) for x in prtt_dids]

if len(prtt_dids) > 0:
    df['drive_id'] = np.where(df['drive_id'].isin(prtt_dids),df['did-1'], df['drive_id'])
    df.loc[df.drive_id.isin(prtt_mo), 'drive_result'] = 'PUNT RETURN TD'
    df = df.drop(columns=['did-1'])
    

I tried to fix drive times of zero seconds with more than zero plays. My idea was get the time remaining of the next drive, and subtract that from the time remaining of the current drive. Unfortunately that only worked for about half the drives. And, most of the drives it did work on, only resulted in drive times of under a minute. That seemed off and I didn't see an obvious reason why. I decided just to drop these drives. That's about 3% of the data.

In [17]:
print(len(df))
df = df.loc[df['drive_time']>0]
print(len(df))

1846914
1799493


## Fix Bad Drive Results, Standardize Others

In [18]:
# fix drive result kickoffs
kos = df.loc[(df['drive_result']=='KICKOFF')]
dids = list(kos.drive_id.unique())

fgs = df.loc[(df['drive_id'].isin(dids))&df['play_text'].str.contains('Field Goal')]
fg_ids = list(fgs.drive_id.unique())

df.loc[df.drive_id.isin(fg_ids), 'drive_result'] = 'FG GOOD'


punts = [4005483434]
end_of_half = [40054770815]

df.loc[df.drive_id.isin(punts), 'drive_result'] = 'PUNT'
df.loc[df.drive_id.isin(end_of_half), 'drive_result'] = 'END OF HALF'

# special cases

df.loc[df.drive_id == 24269009911, 'drive_result'] = 'RUSHING TD'
df.loc[df.drive_id == 24269009911, 'drive_id'] = 24269009912

In [19]:
# fix Fg missed TDs
brt = df.loc[(df['drive_result']=='FG MISSED TD')&df['play_text'].str.contains('BLOCKED')]
brt_ids = list(brt.drive_id.unique())

df.loc[df['drive_id'].isin(brt_ids), 'drive_result'] = 'BLOCKED FG (TD) TD'

brt = df.loc[(df['drive_result']=='FG MISSED TD')&df['play_text'].str.contains('blocked')]
brt_ids = list(brt.drive_id.unique())

df.loc[df['drive_id'].isin(brt_ids), 'drive_result'] = 'BLOCKED FG (TD) TD'

brt = df.loc[(df['drive_result']=='FG MISSED TD')&df['play_text'].str.contains('blocked,')]
brt_ids = list(brt.drive_id.unique())

df.loc[df['drive_id'].isin(brt_ids), 'drive_result'] = 'BLOCKED FG (TD) TD'

In [20]:
# merge 'FG' and 'FG GOOD'
df.loc[df['drive_result']=='FG', 'drive_result'] = 'FG GOOD' 
df.loc[df['drive_result']=='MADE FG', 'drive_result'] = 'FG GOOD' 

df.loc[df['drive_result']=='MISSED FG', 'drive_result'] = 'FG MISSED' 

In [21]:
# fix 'end of half TD'
end_of_half = [29290230615,29304013515,32259006215,32308025415,32315015219,33250230613,33285263815,33327000821,40060388027]
fumble_tds = [30279211614]
int_tds = [30282006815,40079088219]
rush_tds = [30247263316,30268006217,32243211716,32301000917,40054794315]
pass_tds = [30324002112,32329023512]
block_fg_td = [40054834612]

df.loc[df['drive_id'].isin(end_of_half), 'drive_result'] = 'END OF HALF'
df.loc[df['drive_id'].isin(fumble_tds), 'drive_result'] = 'FUMBLE RETURN TD'
df.loc[df['drive_id'].isin(int_tds), 'drive_result'] = 'INT TD'
df.loc[df['drive_id'].isin(rush_tds), 'drive_result'] = 'RUSHING TD'
df.loc[df['drive_id'].isin(pass_tds), 'drive_result'] = 'PASSING TD'
df.loc[df['drive_id'].isin(block_fg_td), 'drive_result'] = 'BLOCKED FG (TD) TD'

In [22]:
# fix "end of game TD"
int_tds = [30324020406,40087609227]
fumble_tds = [40076354226,40086912120]
end_of_game = [40078746229,40094526122]

df.loc[df['drive_id'].isin(int_tds), 'drive_result'] = 'INT TD'
df.loc[df['drive_id'].isin(fumble_tds), 'drive_result'] = 'FUMBLE RETURN TD'
df.loc[df['drive_id'].isin(end_of_game), 'drive_result'] = 'END OF GAME'

In [23]:
# fix "downs TD"
passing_tds = [40054825721,4007635338,40076343011,40086953320,4010133465,4010128564]
rushing_tds = [40054786022,40060392118,40086953316]
interception_tds = [4005482704,40075690213,40076355220,4008696135,40086963817]

df.loc[df['drive_id'].isin(passing_tds), 'drive_result'] = 'PASSING TD'
df.loc[df['drive_id'].isin(rushing_tds), 'drive_result'] = 'RUSHING TD'
df.loc[df['drive_id'].isin(interception_tds), 'drive_result'] = 'INT TD'

In [24]:
# fix 'FG TD' drive result
df.loc[((df.drive_id == 40054786811)&(df.offense=='Baylor')), 'drive_result'] = 'FG GOOD'
df.loc[((df.drive_id == 40054786811)&(df.offense=='Baylor')), 'drive_id'] = 4005478681100

df.loc[(df.drive_id == 40054786811), 'drive_result'] = 'RUSHING TD'

In [25]:
# fix "end of 4th"
df.loc[df['drive_result']=='END OF 4TH QUARTER', 'drive_result'] = 'END OF GAME'

In [26]:
# more blocked field goals
bfg_ids = [242550275,242760259,243250150,243250344]
bfg_dids = [24255027512,24276025916,24325015008,24325034420]
to_fix = [(x-1) for x in bfg_dids]

df.loc[df['drive_id'].isin(to_fix), 'drive_result'] = 'BLOCKED FG (TD) TD' 
df['did-1'] = df['drive_id'] - 1
df['drive_id'] = np.where(df['drive_id'].isin(bfg_dids),df['did-1'], df['drive_id'])
df = df.drop(columns=['did-1'])

In [27]:
# blocked punts
bps = list(df.loc[df['drive_result']=='BLOCKED PUNT TD'].drive_id.unique())
to_fix = [(x-1) for x in bps]

df.loc[df['drive_id'].isin(to_fix), 'drive_result'] = 'BLOCKED PUNT TD' 
df['did-1'] = df['drive_id'] - 1
df['drive_id'] = np.where(df['drive_id'].isin(bps),df['did-1'], df['drive_id'])
df = df.drop(columns=['did-1'])

In [28]:
# the drive data in these games seems very off
bad_ids = [252602572,272560120,272562005]
df = df.loc[~df.game_id.isin(bad_ids)]

In [29]:
# fix 'FG GOOD TD'
# for text in list(cat.tr_game.values):
#     print(text)
fgs = [30282005814]
rush_tds = [32252239316, 32266019408]

df.loc[df.drive_id==30282005814, 'drive_result'] = 'FG GOOD'

df.loc[df.drive_id.isin(rush_tds), 'drive_result'] = 'RUSHING TD'

In [30]:
# Drives labelled "FG MISSED TD" are actually made field goals, ensuing kickoff returned for TD

df.loc[df.drive_result=='FG MISSED TD', 'drive_result'] = 'FG GOOD'

In [31]:
# for incompletes, if there is barely any time left, drive result = end of half or end of game. 
# otherwise turnover on downs

incomp = df.loc[df.drive_result == 'INCOMPLETE']

icb = incomp.groupby(['drive_id'])['tr_game'].min().reset_index()

eofg = icb.loc[icb.tr_game < 30]
eofg_ids = list(eofg.drive_id.values)

incomp = incomp.loc[~incomp.drive_id.isin(eofg_ids)]

icb = incomp.groupby(['drive_id'])['tr_half'].min().reset_index()
eofh = icb.loc[icb.tr_half < 30]
eofh_ids = list(eofh.drive_id.values)

incomp = incomp.loc[~incomp.drive_id.isin(eofh_ids)]

down_ids = list(incomp.drive_id.values)

df.loc[df.drive_id.isin(eofg_ids), 'drive_result'] = 'END OF GAME'
df.loc[df.drive_id.isin(eofh_ids), 'drive_result'] = 'END OF HALF'
df.loc[df.drive_id.isin(down_ids), 'drive_result'] = 'TURNOVER ON DOWNS'

In [32]:
# same with completes

comp = df.loc[df.drive_result == 'PASS COMPLETE']

ccb = comp.groupby(['drive_id'])['tr_game'].min().reset_index()

eofg = ccb.loc[ccb.tr_game < 30]
eofg_ids = list(eofg.drive_id.values)

comp = comp.loc[~comp.drive_id.isin(eofg_ids)]

ccb = comp.groupby(['drive_id'])['tr_half'].min().reset_index()
eofh = ccb.loc[ccb.tr_half < 30]
eofh_ids = list(eofh.drive_id.values)

comp = comp.loc[~comp.drive_id.isin(eofh_ids)]

down_ids = list(comp.drive_id.values)

df.loc[df.drive_id.isin(eofg_ids), 'drive_result'] = 'END OF GAME'
df.loc[df.drive_id.isin(eofh_ids), 'drive_result'] = 'END OF HALF'
df.loc[df.drive_id.isin(down_ids), 'drive_result'] = 'TURNOVER ON DOWNS'

In [33]:
# same with penalties

pens = df.loc[df.drive_result == 'PENALTY']

pcb = pens.groupby(['drive_id'])['tr_game'].min().reset_index()

eofg = pcb.loc[pcb.tr_game < 30]
eofg_ids = list(eofg.drive_id.values)

pens = pens.loc[~pens.drive_id.isin(eofg_ids)]

pcb = pens.groupby(['drive_id'])['tr_half'].min().reset_index()
eofh = pcb.loc[pcb.tr_half < 30]
eofh_ids = list(eofh.drive_id.values)

pens = pens.loc[~pens.drive_id.isin(eofh_ids)]

down_ids = list(pens.drive_id.values)

df.loc[df.drive_id.isin(eofg_ids), 'drive_result'] = 'END OF GAME'
df.loc[df.drive_id.isin(eofh_ids), 'drive_result'] = 'END OF HALF'
df.loc[df.drive_id.isin(down_ids), 'drive_result'] = 'TURNOVER ON DOWNS'

In [34]:
# for drive result == kickoffs, if there is barely any time remain in half, end of half
# otherwise, i think they are just random plays out of other drives. dropping them

kos = df.loc[df.drive_result == 'KICKOFF']

kgb = kos.groupby(['drive_id'])['tr_half'].min().reset_index()

eofh = kgb.loc[kgb.tr_half <= 60]
eofh_ids = list(eofh.drive_id.values)

kos = kos.loc[~kos.drive_id.isin(eofh_ids)]
drops = list(kos.drive_id.values)

df.loc[df.drive_id.isin(eofh_ids), 'drive_result'] = 'END OF HALF'

df = df.loc[~df.drive_id.isin(drops)]

In [35]:
# drop these, they're kickoff penalties
df = df.loc[df.drive_result != 'KICKOFF RETURN TD']

In [36]:
# Rushing TD TD often is two drives with same ids

df.loc[((df.drive_id == 24318000805)&(df.offense=='Ole Miss')), 'drive_id'] = 2431800080500

df.loc[((df.drive_id == 24339003820)&(df.offense=='Colorado')), 'drive_id'] = 2433900382000

df.loc[((df.drive_id == 24276002413)&(df.offense=='Stanford')), 'drive_id'] = 2427600241300

df.loc[df.drive_id==2431800080500, 'drive_time'] = 150
df.loc[df.drive_id==2431800080500, 'drive_result'] = 'TURNOVER ON DOWNS'

df.loc[df.drive_id==2433900382000, 'drive_time'] = 100
df.loc[df.drive_id==2433900382000, 'drive_result'] = 'TURNOVER ON DOWNS'

df.loc[df.drive_id==2427600241300, 'drive_time'] = 100
df.loc[df.drive_id==2427600241300, 'drive_result'] = 'TURNOVER ON DOWNS'

df.loc[df.drive_result == 'RUSHING TD TD', 'drive_result'] = 'RUSHING TD'

In [37]:
# Same with PASSING TD TD (two drives with same ids)

df.loc[((df.drive_id == 24248001223)&(df.offense=='Northern Arizona')), 'drive_id'] = 2424800122300
df.loc[df.drive_id==2424800122300, 'drive_time'] = 100
df.loc[df.drive_id==2424800122300, 'drive_result'] = 'TURNOVER ON DOWNS'
df.loc[((df.drive_id == 24248001223)&(df.offense=='Arizona')), 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24255025914)&(df.offense=='Western Michigan')), 'drive_id'] = 24255025913
df.loc[df.drive_id==24255025913, 'drive_result'] = 'FG MISSED'
df.loc[((df.drive_id == 24255025914)&(df.offense=='Virginia Tech')), 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24262006614)&(df.offense=='Northern Illinois')), 'drive_id'] = 24262006616
df.loc[((df.drive_id == 24262006616)), 'drive_result'] = 'PASSING TD'
df.loc[((df.drive_id == 24262006614)&(df.offense=='Iowa State')), 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24283003001)&(df.offense=='California')), 'drive_id'] = 2428300300100
df.loc[((df.drive_id == 2428300300100)), 'drive_result'] = 'TURNOVER ON DOWNS'
df.loc[((df.drive_id == 24283003001)&(df.offense=='USC')), 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24283007707)&(df.offense=='Northwestern')), 'drive_id'] = 2428300770700
df.loc[(df.drive_id == 2428300770700), 'drive_result'] = 'FG MISSED'
df.loc[((df.drive_id == 24283007707)&(df.offense=='Indiana')), 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24295027705)&(df.offense=='Syracuse')), 'drive_id'] = 24295027704
df.loc[df.drive_id == 24295027704, 'drive_result'] = 'FG MISSED'
df.loc[df.drive_id == 24295027705, 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24304015435)&(df.offense=='Wake Forest')), 'drive_id'] = 24304015434
df.loc[df.drive_id == 24304015434, 'drive_result'] = 'FG MISSED'
df.loc[df.drive_id == 24304015435, 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24311023826)&(df.offense=='Vanderbilt')), 'drive_id'] = 24311023825
df.loc[df.drive_id == 24311023825, 'drive_result'] = 'FG MISSED'
df.loc[df.drive_id == 24311023826, 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24304257915)&(df.offense=='South Carolina')), 'drive_id'] = 2430425791500
df.loc[df.drive_id == 2430425791500, 'drive_result'] = 'FG MISSED'
df.loc[df.drive_id == 24304257915, 'drive_result'] = 'PASSING TD'

df.loc[((df.drive_id == 24325025430)&(df.offense=='BYU')), 'drive_id'] = 2432502543000
df.loc[df.drive_id == 2432502543000, 'drive_result'] = 'FG MISSED'
df.loc[df.drive_id == 24325025430, 'drive_result'] = 'PASSING TD'


In [38]:
df.loc[df.drive_id==29311022818,'drive_result'] = 'RUSHING TD'
df.loc[df.drive_id==30338222604,'drive_result'] = 'RUSHING TD'
df.loc[df.drive_id==32252230601,'drive_result'] = 'RUSHING TD'
df.loc[df.drive_id==32301000216,'drive_result'] = 'RUSHING TD'
df.loc[df.drive_id==31001020115,'drive_result'] = 'INT TD'
df.loc[df.drive_id==31281025112,'drive_result'] = 'INT TD'

df.loc[df.drive_id==30254002525,'drive_id'] = 3025400252500
df.loc[df.drive_id==3025400252500, 'drive_result'] = 'FUMBLE TD'
df.loc[df.drive_id==30254002525,'drive_result'] = 'RUSHING TD'

df.loc[((df.drive_id==30282027706)&(df.offense=='West Virginia')),'drive_result'] = 'RUSHING TD'
df.loc[((df.drive_id==30282027706)&(df.offense=='UNLV')),'drive_id'] = 3028202770600
df.loc[df.drive_id==3028202770600,'drive_result'] = 'INT'

df.loc[((df.drive_id==31260015815)&(df.offense=='Nebraska')),'drive_result'] = 'RUSHING TD'
df.loc[((df.drive_id==31260015815)&(df.offense=='Washington')),'drive_id'] = 3126001581500
df.loc[df.drive_id==3126001581500,'drive_result'] = 'PUNT'

df.loc[((df.drive_id==32294006809)&(df.offense=='Boise State')),'drive_result'] = 'RUSHING TD'
df.loc[((df.drive_id==32294006809)&(df.offense=='UNLV')),'drive_id'] = 3229400680900
df.loc[df.drive_id==3229400680900,'drive_result'] = 'INT'

df.loc[((df.drive_id==32307002518)&(df.offense=='Washington')),'drive_result'] = 'PASSING TD'
df.loc[((df.drive_id==32307002518)&(df.offense=='California')),'drive_id'] = 3230700251800
df.loc[df.drive_id==3230700251800,'drive_result'] = 'FG GOOD'

df.loc[((df.drive_id==33242224720)&(df.offense=='Samford')),'drive_result'] = 'RUSHING TD'
df.loc[((df.drive_id==33242224720)&(df.offense=='Georgia State')),'drive_id'] = 3324222472000
df.loc[df.drive_id==3324222472000,'drive_result'] = 'PUNT'

df.loc[((df.drive_id==33311002415)&(df.offense=='Oregon')),'drive_result'] = 'PASSING TD'
df.loc[((df.drive_id==33311002415)&(df.offense=='Stanford')),'drive_id'] = 3331100241500
df.loc[df.drive_id==3331100241500,'drive_result'] = 'MISSED FG TD'

In [39]:
# none of these returned for a TD, i checked
ints = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains('intercepted'))]
int_ids = list(ints.drive_id.unique())

df.loc[df.drive_id.isin(int_ids), 'drive_result'] = 'INT'

In [40]:
safeties = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains('SAFETY'))]
sf_ids = list(safeties.drive_id.unique())

df.loc[df.drive_id.isin(sf_ids), 'drive_result'] = 'SF'

In [41]:
# checked for touchdowns, again only punts
punts = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains('punt'))]
punt_ids = list(punts.drive_id.unique())

df.loc[df.drive_id.isin(punt_ids), 'drive_result'] = 'PUNT'

Fix uncategorized

In [42]:
base = r'^{}'
expr = '(?=.*{})'
words = ['field', 'goal', 'GOOD']
fg_good = base.format(''.join(expr.format(w) for w in words))

fgg = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(fg_good,regex=True))]
fgg_ids = list(fgg.drive_id.unique())
df.loc[df.drive_id.isin(fgg_ids), 'drive_result'] = 'FG GOOD'

In [43]:
words = ['field', 'goal', 'MISSED']
fg_missed = base.format(''.join(expr.format(w) for w in words))

fgm = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(fg_missed,regex=True))]
fgm_ids = list(fgm.drive_id.unique())
df.loc[df.drive_id.isin(fgm_ids), 'drive_result'] = 'FG MISSED'

In [44]:
words = ['field', 'goal', 'BLOCKED']
fg_blocked = base.format(''.join(expr.format(w) for w in words))

fgb = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(fg_blocked,regex=True))]
fgb_ids = list(fgb.drive_id.unique())
df.loc[df.drive_id.isin(fgb_ids), 'drive_result'] = 'FG MISSED'

In [45]:
words = ['rush','TOUCHDOWN']
rtd = base.format(''.join(expr.format(w) for w in words))

rush_td = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(rtd,regex=True))]
rtd_ids = list(rush_td.drive_id.unique())
df.loc[df.drive_id.isin(rtd_ids), 'drive_result'] = 'RUSHING TD'

In [46]:
words = ['pass','complete','TOUCHDOWN']
ptd = base.format(''.join(expr.format(w) for w in words))

pass_td = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(ptd,regex=True))]
ptd_ids = list(pass_td.drive_id.unique())
df.loc[df.drive_id.isin(ptd_ids), 'drive_result'] = 'PASSING TD'

In [47]:
fumbles = df.loc[(df.drive_result=='Uncategorized')&(df.play_type=='Fumble Recovery (Opponent)')]
fids = list(fumbles.drive_id.unique())
df.loc[df.drive_id.isin(fids), 'drive_result'] = 'FUMBLE'

fumbles = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains('fumble|fumbles'))&(df.play_type=='Fumble Recovery (Own)')]
fids = list(fumbles.drive_id.unique())
df.loc[df.drive_id.isin(fids), 'drive_result'] = 'FUMBLE'

In [48]:
safeties = df.loc[(df.drive_result=='Uncategorized')&(df.play_type=='Safety')]
sids = list(safeties.drive_id.unique())
df.loc[df.drive_id.isin(sids), 'drive_result'] = 'SF'

In [49]:
end_of_game = df.loc[(df.drive_result=='Uncategorized')&(df.tr_game<=60)]
eog_ids = list(end_of_game.drive_id.unique())
df.loc[df.drive_id.isin(eog_ids), 'drive_result'] = 'END OF GAME'

end_of_half = df.loc[(df.drive_result=='Uncategorized')&(df.tr_half<=60)]
eoh_ids = list(end_of_half.drive_id.unique())
df.loc[df.drive_id.isin(eoh_ids), 'drive_result'] = 'END OF HALF'


In [50]:
# QB kneels, end of game
words = ['TEAM','rush']
kneel = base.format(''.join(expr.format(w) for w in words))
kneel_df = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(kneel,regex=True))&(df.tr_game<=1800)]
kneel_ids = list(kneel_df.drive_id.unique())
df.loc[df.drive_id.isin(kneel_ids), 'drive_result'] = 'END OF GAME'

words = ['TEAM','run','loss']
kneel = base.format(''.join(expr.format(w) for w in words))
kneel_df = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(kneel,regex=True))&(df.tr_game<=1800)]
kneel_ids = list(kneel_df.drive_id.unique())
df.loc[df.drive_id.isin(kneel_ids), 'drive_result'] = 'END OF GAME'

# QB kneels, end of half
words = ['TEAM','rush']
kneel = base.format(''.join(expr.format(w) for w in words))
kneel_df = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(kneel,regex=True))]
kneel_ids = list(kneel_df.drive_id.unique())
df.loc[df.drive_id.isin(kneel_ids), 'drive_result'] = 'END OF HALF'

words = ['TEAM','run','loss']
kneel = base.format(''.join(expr.format(w) for w in words))
kneel_df = df.loc[(df.drive_result=='Uncategorized')&(df.play_text.str.contains(kneel,regex=True))]
kneel_ids = list(kneel_df.drive_id.unique())
df.loc[df.drive_id.isin(kneel_ids), 'drive_result'] = 'END OF HALF'

In [51]:
# turnover on downs
tod = df.loc[(df.drive_result=='Uncategorized')&(df.down==4)&df.play_text.str.contains('incomplete')]
tod_ids = list(tod.drive_id.unique())
df.loc[df.drive_id.isin(tod_ids), 'drive_result'] = 'TURNOVER ON DOWNS'

In [52]:
# drop the rest, probably got drive ids split up
df = df.loc[df.drive_result != 'Uncategorized']

In [53]:
# standardize
df.loc[df.drive_result=='FUMBLE RETURN TD', 'drive_result'] = 'FUMBLE TD'

df.loc[df.drive_result=='PUNT TD', 'drive_result'] = 'PUNT RETURN TD'

df.loc[df.drive_result == 'INT RETURN TOUCH', 'drive_result'] = 'INT TD'

df.loc[df.drive_result == 'BLOCKED FG (TD) TD', 'drive_result'] = 'MISSED FG TD'

df.loc[df.drive_result=='POSS. ON DOWNS', 'drive_result'] = 'TURNOVER ON DOWNS'

# turnover on downs
df.loc[df.drive_result == 'DOWNS', 'drive_result'] = 'TURNOVER ON DOWNS'

### Fix miscategorized

In [54]:
often_missed = ['TURNOVER ON DOWNS','END OF GAME','END OF HALF']

om = df.loc[(df.drive_result.isin(often_missed))&(df.play_type=='Field Goal Good')]
om_ids = list(om.drive_id.unique())
df.loc[df.drive_id.isin(om_ids), 'drive_result'] = 'FG GOOD'

om = df.loc[(df.drive_result.isin(often_missed))&(df.play_type=='Field Goal Missed')]
om_ids = list(om.drive_id.unique())
df.loc[df.drive_id.isin(om_ids), 'drive_result'] = 'FG MISSED'

words = ['pass','complete','TOUCHDOWN']
pass_td = base.format(''.join(expr.format(w) for w in words))
pdf = df.loc[(df.drive_result.isin(often_missed))&(df.play_text.str.contains(pass_td,regex=True))]
pids = list(pdf.drive_id.unique())
df.loc[df.drive_id.isin(pids), 'drive_result'] = 'PASSING TD'

words = ['pass','complete','for','TD']
pass_td = base.format(''.join(expr.format(w) for w in words))
pdf = df.loc[(df.drive_result.isin(often_missed))&(df.play_text.str.contains(pass_td,regex=True))]
pdf = pdf.loc[~pdf.play_text.str.contains('TTD')]
pids = list(pdf.drive_id.unique())
df.loc[df.drive_id.isin(pids), 'drive_result'] = 'PASSING TD'

words = ['rush','for','TOUCHDOWN']
rush_td = base.format(''.join(expr.format(w) for w in words))
rdf = df.loc[(df.drive_result.isin(often_missed))&(df.play_text.str.contains(rush_td,regex=True))]
rdf = rdf.loc[~rdf.play_text.str.contains('TTD')]
rids = list(rdf.drive_id.unique())
df.loc[df.drive_id.isin(rids), 'drive_result'] = 'RUSHING TD'

words = ['run','for','TD']
rush_td = base.format(''.join(expr.format(w) for w in words))
rdf = df.loc[(df.drive_result.isin(often_missed))&(df.play_text.str.contains(rush_td,regex=True))]
rids = list(rdf.drive_id.unique())
df.loc[df.drive_id.isin(rids), 'drive_result'] = 'RUSHING TD'

words = ['punt','touchback']
punt = base.format(''.join(expr.format(w) for w in words))
punt_df = df.loc[(df.drive_result.isin(often_missed))&(df.play_text.str.contains(punt,regex=True))]
punt_df = punt_df.loc[~punt_df.play_text.str.contains('PENALTY')]
punt_ids = list(punt_df.drive_id.unique())
df.loc[df.drive_id.isin(punt_ids), 'drive_result'] = 'PUNT'

    
    
    

In [55]:
gb = df.groupby(['drive_result'])['down'].count()
gb

drive_result
END OF GAME           35159
END OF HALF           27914
FG GOOD              236266
FG MISSED             86818
FUMBLE                66236
FUMBLE TD              3918
INT                   89036
INT TD                 6626
MISSED FG TD            401
PASSING TD           150882
PUNT                 592493
PUNT RETURN TD         3737
RUSHING TD           164488
SF                     2334
TD                   212689
TURNOVER ON DOWNS    118521
Name: down, dtype: int64

In [56]:
print(len(df))
df = df.dropna(subset=['play_text'])
print(len(df))

1797518
1797259


# Feature Engineering

### Need 6 Features:
-Down (check)  
-Seconds left in half (check)  
-Alternate Clock
-Yards to go for touchdown (log?)  
-Yards to go for first down (log?)    
-Goal to goal indicator  
-Under 2 minutes indicator  

In [57]:
print(list(df))

['away', 'defense', 'defense_conference', 'defense_score', 'down', 'drive_id', 'home', 'id', 'offense', 'offense_conference', 'offense_score', 'period', 'play_text', 'play_type', 'yard_line', 'yards_gained', 'season', 'week', 'season_type', 'drive_result', 'elapsed.minutes', 'elapsed.seconds', 'end_period', 'end_time.minutes', 'end_time.seconds', 'end_yardline', 'game_id', 'plays', 'scoring', 'start_period', 'start_time.minutes', 'start_time.seconds', 'start_yardline', 'yards', 'tr_game', 'tr_half', 'distance', 'drive_time']


Clock data is unreliable because maybe 25% of the games have only have one time for each play, and that time is when the drive started. So I decided to get the total time of each drive, and then assume each play took the same amount of time. EPA shouldn't be significantly affected most of the time, i.e. a 70 yard pass will be considered a good play no matter what. The only time it might have an adverse effect is toward the end of a game, when seconds matter. I think that in college football, when the clock stops for a first down, and incompletions, that all pass plays probably do take a somewhat similar amount of time. Drives in this situation will consist mostly of the same play type, and plays of the same play type likely take similar amounts of time. I'll compare it to the clock data I do have to make sure.

In [58]:
# alt clock

# these get the start and end time of every drive
maxs = df.groupby(['game_id','drive_id'])['tr_game'].max().reset_index()
mins = df.groupby(['game_id','drive_id'])['tr_game'].min().reset_index()
maxs = maxs.rename(columns={'tr_game':'drive_start'})
mins = mins.rename(columns={'tr_game':'drive_end'})

# sometimes the drive end time is the same as the drive start. in that case, I use the next drive start
maxs = maxs.sort_values(by=['game_id','drive_start'],ascending=False)
next_max = maxs.groupby(['game_id'])['drive_start'].shift(-1)
next_max = pd.Series(next_max, name='next_drive_start')
new_max = pd.concat([maxs, next_max], axis=1)
new_max['next_drive_start'] = new_max['next_drive_start'].fillna(0)

# sometimes (rarely, 2%ish of the time) both the next drive start and the drive end are the same as the drive start
# in that case, as a last resort, i use the next drive end time. 
# i'm fairly sure most of the time it's when a timeout or something divides the same drive into two.
# i can explore this more in future work
mins = mins.sort_values(by=['game_id','drive_end'],ascending=False)
next_min = mins.groupby(['game_id'])['drive_end'].shift(-1)
next_min = pd.Series(next_min, name='next_drive_end')
new_min = pd.concat([mins, next_min], axis=1)
new_min['next_drive_end'] = new_min['next_drive_end'].fillna(0)
new_min = new_min.drop(columns='game_id')
times = pd.merge(left=new_max,right=new_min,on=['drive_id','drive_id'],how='left')

# attempt 1 (works on ~75% of data)
times['drive_time_1'] = times['drive_start'] - times['drive_end']
# plan b (works on 98% of data)
times['drive_time_2'] = np.where(times['drive_time_1']>0,times['drive_time_1'],(times['drive_start']-times['next_drive_start']))
# last resort (works on 99.3% of data)
times['drive_time'] = np.where(times['drive_time_2']>0,times['drive_time_2'],(times['drive_start']-times['next_drive_end']))

# if all 3 don't work, drop that crap
times = times[['drive_id','drive_time']]
times = times.rename(columns={'drive_time':'alt_drive_time'})


In [59]:
df = pd.merge(left=df,right=times,how='left',on=['drive_id','drive_id'])
df = df.loc[df.drive_time>0]

In [60]:
# need to divide the drive time up amongst the plays. 
# separate timeouts and penalties
tps = ['Timeout','Penalty']
ts_and_ps = df.loc[df['play_type'].isin(tps)]

df = df.loc[~df['play_type'].isin(tps)]

In [61]:
# in my cleaning sometimes i merged or divided up drives. using my own play count.
gb = df.groupby(['drive_id'])['down'].count().reset_index()
gb = gb.rename(columns={'down':'alt_plays'})

df = pd.merge(left=df,right=gb,how='left',on=['drive_id','drive_id'])

In [62]:
# validate
print(df[['plays','alt_plays']].corr())

df = df.drop(columns=['plays','drive_time'])
df = df.rename(columns={'alt_plays':'plays', 'alt_drive_time':'drive_time'})

              plays  alt_plays
plays      1.000000   0.973881
alt_plays  0.973881   1.000000


In [63]:
# time per play
df['tpp'] = df['drive_time']/df['plays']

In [69]:
print(list(df))

['away', 'defense', 'defense_conference', 'defense_score', 'down', 'drive_id', 'home', 'id', 'offense', 'offense_conference', 'offense_score', 'period', 'play_text', 'play_type', 'yards_gained', 'season', 'week', 'season_type', 'drive_result', 'elapsed.minutes', 'elapsed.seconds', 'end_period', 'end_time.minutes', 'end_time.seconds', 'end_yardline', 'game_id', 'scoring', 'start_period', 'start_time.minutes', 'start_time.seconds', 'start_yardline', 'yards', 'tr_game', 'tr_half', 'distance', 'drive_time', 'plays', 'tpp', 'yard_line', 'l10_dist', 'GTG', 'UTM']


If you remove the game id from the play id, or the drive id, then you usually get a numbered list. if you don't, the 100th play will be sorted behind the 99th because it starts with 1

In [73]:
df = df.sort_values(by=['game_id','drive_id','id'],ascending=False)
df['id_len'] = df.id.astype(str).str.len()
# print(df[['game_id','drive_id','id','id_len','tr_game']])
gb = df.groupby(['id_len'])['drive_id'].count()
gb

id_len
10         2
12    971507
13       127
18    663181
Name: drive_id, dtype: int64

In [75]:
df = df.sort_values(by=['game_id','drive_id','id'],ascending=False)
print(df[['game_id','drive_id','id','tr_game']])

           game_id     drive_id                  id  tr_game
1634673  401100159  40110015926  401100159104985607    103.0
1634675  401100159  40110015926  401100159104985605    103.0
1634674  401100159  40110015926  401100159104985604    103.0
1634672  401100159  40110015925  401100159104985601    103.0
1634676  401100159  40110015925  401100159104985201    107.0
1634677  401100159  40110015925  401100159104984801    111.0
1634679  401100159  40110015925  401100159104976702    152.0
1634678  401100159  40110015925  401100159104976701    152.0
1634680  401100159  40110015925  401100159104968502    194.0
1634681  401100159  40110015925  401100159104968501    194.0
1634685  401100159  40110015925  401100159104954306    296.0
1634683  401100159  40110015925  401100159104954305    296.0
1634684  401100159  40110015925  401100159104954304    296.0
1634682  401100159  40110015924  401100159104954301    296.0
1634686  401100159  40110015924  401100159104928805    431.0
1634687  401100159  4011

In [85]:
bad_ids = df.loc[~df['id_len'].isin([12,18])]
bad_ids.play_text

1300025    Alex McGough pass complete to Alex Gardner for...
1152823          Zach Conque run for no gain to the SFAus 20
417104           Tim Curry rush for no gain to the NMxSt 49.
417106      Trevor Walls pass incomplete to William Bullock.
417108       Trevor Walls pass incomplete to Donyae Coleman.
417107       Trevor Walls pass incomplete to Donyae Coleman.
417105     Trevor Walls pass complete to William Bullock ...
417113     Kyle Petersen punt for 36 yards, returned by D...
417112     Michael Farrar rush for no gain to the UTEP 21...
417111     Michael Farrar rush for no gain to the UTEP 21...
417110     Tim Curry pass incomplete to Melvin Stephenson...
417109     Trevor Walls pass complete to Kyle Nelson for ...
417115     Trevor Walls rush for 13 yards to the UTEP 5, ...
417116     Trevor Walls pass complete to Tonny Glynn for ...
417117     Tonny Glynn rush for 1 yard to the UTEP 29, ta...
417118      Trevor Walls pass incomplete to Marcus Anderson.
417119     Tonny Glynn r

In [65]:
# fix yard_line, it's w.r.t the home team
df = df.rename(columns={'yard_line':'wrong_yardline'})

df['yard_line'] = np.where(df['offense']==df['home'],df['wrong_yardline'],100-df['wrong_yardline'])
# print(df[['home','offense','yard_line','wrong_yardline']].head(50))
df = df.drop(columns=['wrong_yardline'])

In [66]:
# get log 10 of distance
df['l10_dist'] = np.log10(df['distance'])

In [67]:
# goal to go
df['GTG'] = np.where((df['yard_line']+df['distance']>=100),1,0)


# under two min in half
df['UTM'] = np.where(df['tr_half']<=120,1,0)

# Save

In [68]:
PATH = './output/processed.csv'
df.to_csv(PATH,index=False)