In [1]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
import datetime
gc.collect()

from tqdm import tqdm

# Load Data

In [2]:
# load play by play & drive data
years = list(range(2004, int(datetime.datetime.now().year)))

df = pd.DataFrame()
for year in tqdm(years):
    path = './output/'+str(year)+'/'+str(year)+'_pbp.csv'
    sea_df = pd.read_csv(path)
    
    drive_path = './output/'+str(year)+'/'+str(year)+'_drives.csv'
    drive_df = pd.read_csv(drive_path)
    
    drive_df = drive_df.rename(columns={'id':'drive_id'})
    
    sea_df = pd.merge(left=sea_df, right=drive_df, how='left', on=['drive_id','drive_id'])
    df = pd.concat([df,sea_df])

num_plays = len(df)
print(df.shape)
print(str(num_plays) + " plays were loaded")

100%|██████████| 15/15 [00:48<00:00,  4.87s/it]

(2122188, 41)
2122188 plays were loaded





# Clean Data 

In [3]:
# fix clock data first so drives can be figured out
time_cols = ['clock.minutes','clock.seconds','start_time.minutes','start_time.seconds',
            'end_time.minutes','end_time.seconds']
for tc in time_cols:
    df[tc] = df[tc].fillna(0)

# get time remaining in game
df['tr_game'] = (4-df['period']) * 900 + (df['clock.minutes'] * 60) + df['clock.seconds']
df['tr_half'] = np.where(df['period']>2,df['tr_game'], df['tr_game']-1800)

df = df.drop(columns=['clock.minutes','clock.seconds'])

In [4]:
# fix uncategorized
uncat = df.loc[df['play_type']=='Uncategorized']
print(len(uncat))
print(uncat.play_text.head(10))

5580
27            Extra point by Ryan Killeen (USC) is good.
38      35 yard field goal by Brandon Pace (VT) is good.
46     35 yard field goal by Ryan Killeen (USC) is no...
61                             Start of the 2nd quarter.
71             Extra point by Brandon Pace (VT) is good.
117           Extra point by Ryan Killeen (USC) is good.
122                            Start of the 4th quarter.
132     42 yard field goal by Brandon Pace (VT) is good.
148           Extra point by Ryan Killeen (USC) is good.
166    40 yard field goal by Ryan Killeen (USC) is good.
Name: play_text, dtype: object


In [5]:
print(df.play_type.unique())
def fix_uncat(play_type, play_text):
    if play_type != 'Uncategorized':
        return play_type
    else:
        if isinstance(play_text,str):
            if "Start of the 2nd quarter." in play_text:
                return "End Period"
            elif "Start of the 3rd quarter." in play_text:
                return "End of Half"
            elif "Start of the 4th quarter." in play_text:
                return "End Period"
            elif "Start of overtime." in play_text:
                return "End Period"
            elif "End of the game." in play_text:
                return "End of Game"
            elif "Extra point" in play_text:
                if "is good" in play_text:
                    return "Extra Point Good"
                elif "is no good." in play_text[-13:]:
                    return "Extra Point Missed"
                else:
                    return play_type
            elif "field goal" in play_text:
                if "is good" in play_text:
                    return "Field Goal Good"
                elif "is no good." in play_text[-13:]:
                    return "Field Goal Missed"
                else:
                    print(play_text)
                    return play_type
            elif "missed PAT returned." in play_text:
                return "Extra Point Missed"
            elif "took lateral and rushed" in play_text:
                return "Rush"
            # mostly fumbled snaps recovered by own team
            elif "fumbled" in play_text:
                return "Fumble Recovery (Own)"
            elif "return for" in play_text:
                return "Punt Return"
            elif "End of" in play_text:
                return "End Period"
            elif "run for" in play_text:
                return "Rush"
            elif "SAFETY" in play_text:
                return "Safety"
            elif "Penalty" in play_text:
                return "Penalty"
            else:
                return play_type
    return play_type

df['play_type'] = df.apply(lambda row: fix_uncat(row['play_type'], row['play_text']),axis=1)

# uncat = df.loc[df.play_type=='Uncategorized']
# print(len(uncat))

['Rush' 'Pass Incompletion' 'Timeout' 'Penalty' 'Punt Return'
 'Pass Interception' 'Pass Completion' 'Uncategorized'
 'Kickoff Return (Offense)' 'End Period' 'Fumble Recovery (Own)' 'Sack'
 'Fumble Recovery (Opponent)' 'Interception Return Touchdown'
 'Blocked Punt' 'Safety' 'Two Point Pass' 'Kickoff Return Touchdown'
 'Two Point Rush' 'Blocked Field Goal' 'Blocked Punt Touchdown'
 'Blocked PAT' 'Punt Return Touchdown' 'Fumble Return Touchdown'
 'Kickoff Return (Defense)' 'Blocked Field Goal Touchdown' 'Punt' 'Pass'
 'Kickoff' 'Extra Point Good' 'Field Goal Good' 'Field Goal Missed'
 'Extra Point Missed' '2pt Conversion' 'Offensive 1pt Safety'
 'Pass Reception' 'Passing Touchdown' 'Rushing Touchdown'
 'Pass Interception Return' 'End of Half' 'End of Game'
 'Defensive 2pt Conversion' 'Missed Field Goal Return' 'Interception'
 'Missed Field Goal Return Touchdown']


In [6]:
# isolate extra point attempts
xp_cats = ['Two Point Pass','Two Point Rush','Blocked PAT','Extra Point Good','Extra Point Missed', '2pt Conversion',
          'Offensive 1pt Safety','Defensive 2pt Conversion']
xps = df.loc[df['play_type'].isin(xp_cats)]
df = df.loc[~df['play_type'].isin(xp_cats)]

In [7]:
# isolate kickoffs
kickoffs_cats = ['Kickoff Return (Offense)', 'Kickoff Return Touchdown', 'Kickoff Return (Defense)', 'Kickoff']
kickoff_penalty_plays = ['KICKOFF', 'KICKOFF RETURN TD']
kickoffs = df.loc[(df['play_type'].isin(kickoffs_cats)) | (df['drive_result']).isin(kickoff_penalty_plays)]
df = df.loc[~df['play_type'].isin(kickoffs_cats)]


In [8]:
# isolate OT
ot = df.loc[(df['period'] > 4) | (df['drive_result'] == 'POSSESSION (FOR OT DRIVES)')]
df = df.loc[(df['period'] <= 4) & (df['period'] >0)]
df = df.loc[df.drive_result != 'POSSESSION (FOR OT DRIVES)']


In [9]:
# drop end of period plays

eop = ['End of Game','End of Half','End Period']
df = df.loc[~df['play_type'].isin(eop)]


In [10]:
# validate
# bad = df.loc[df['offense_x']!=df['offense_y']]
# print(bad[['tr_game','play_text','offense_x','offense_y']].head(25))
# offense_x seems to be correct while offense_y is not

df = df.drop(columns=['defense_y','defense_conference_y','offense_y','offense_conference_y'])

df = df.rename(columns={'defense_x':'defense','defense_conference_x':'defense_conference','offense_x':'offense','offense_conference_x':'offense_conference'})


In [11]:
# fix bad distances

zeros = df.loc[df['distance'] == 0]
# print(len(zeros))
# print(zeros.groupby(['play_type'])['distance'].count())
# print(zeros.play_text.tail(50))

# drop negative distances. change 0 distances to 0.5 yard
df = df.loc[df['distance']>=0]

df = df.rename(columns={'distance':'wrong_distance'})
df['distance'] = np.where(df['wrong_distance']>0,df['wrong_distance'],0.5)
df = df.drop(columns=['wrong_distance'])


In [12]:
# fix bad downs
zero_down = df.loc[df['down']==0]
print(len(zero_down))

# impute down + 1 from previous play, to max of 4
df['down'] = np.where(df['down']>0,df['down'],df['down'].shift()+1)
# still 18 bad

df = df.loc[(df['down']>0)&(df['down']<5)]

441


## Clock
Also, sometimes clock data is wrong. I messed with trying to predict time per play based on play type, but the data was prohibitively messy. Instead I'll just assume every play takes up the same percentage of drive time. The worst effect this will have is it will make incompletions look worse in late game situations, because incompletions will have the same time elapsed as completion. I guess this matters less in college football, because the clock stops on a first down.

In [13]:
# fix negative drive times first, they mostly contain actual plays
df['elapsed.minutes'] = df['elapsed.minutes'].copy().fillna(0)
df['elapsed.seconds'] = df['elapsed.seconds'].copy().fillna(0)
df['drive_time'] = 60*df['elapsed.minutes'] + df['elapsed.seconds']
cols = list(df)
print(cols)

['away', 'defense', 'defense_conference', 'defense_score', 'down', 'drive_id', 'home', 'id', 'offense', 'offense_conference', 'offense_score', 'period', 'play_text', 'play_type', 'yard_line', 'yards_gained', 'season', 'week', 'season_type', 'drive_result', 'elapsed.minutes', 'elapsed.seconds', 'end_period', 'end_time.minutes', 'end_time.seconds', 'end_yardline', 'game_id', 'plays', 'scoring', 'start_period', 'start_time.minutes', 'start_time.seconds', 'start_yardline', 'yards', 'tr_game', 'tr_half', 'distance', 'drive_time']


In [14]:
# can't find systematic error, just going to reverse start and end time. the reversed data
# seems to make sense


negs = df.loc[df.drive_time < 0].copy()
negs = negs[cols]
neg_ids = list(negs.drive_id.unique())
df = df.loc[~df.drive_id.isin(neg_ids)]

negs = negs.rename(columns={'elapsed.minutes':'wrong_em','elapsed.seconds':'wrong_es',
                            'start_time.minutes':'wrong_sm','start_time.seconds':'wrong_ss',
                            'end_time.minutes':'wrong_etm','end_time.seconds':'wrong_ets',
                            'drive_time':'wrong_dt'
                           })

negs['start_time.minutes'] = negs['wrong_etm'].copy()
negs['start_time.seconds'] = negs['wrong_ets'].copy()

negs['end_time.minutes'] = negs['wrong_sm'].copy()
negs['end_time.seconds'] = negs['wrong_ss'].copy()

negs['elapsed.minutes'] = negs['wrong_em'].copy().abs()
negs['elapsed.seconds'] = negs['wrong_es'].copy().abs()
negs['drive_time'] = negs['wrong_dt'].copy().abs()

drop_cols = ['wrong_em','wrong_es','wrong_sm','wrong_ss','wrong_etm','wrong_ets','wrong_dt']
negs = negs.drop(columns=drop_cols)

negs = negs[cols]
df = pd.concat([df,negs],sort=False)
df = df.sort_values(by=['game_id','tr_game','drive_id'])

In [15]:
# now work on fixing nan drive timea
df['drive_time'] = 60*df['elapsed.minutes'] + df['elapsed.seconds']


In [29]:
zeros = df.loc[df.drive_time == 0]
zgb = zeros.groupby(['plays'])['start_time.minutes'].count()

# zero play drives are fumble recoveries by the other team or punt returns
# fumble recoveries are on the first play of the drive and are actually correct
# need to fix punt returns

# print(zplay.drive_id.head())

zpunts = zeros.loc[(zeros.plays==0)&(zeros.play_type=='Punt Return')]

zp_dids = list(zpunts.drive_id.unique())
zp_mo = [(x-1) for x in zp_dids]

if len(zp_dids) > 0:
    df.loc[df.drive_id.isin(zp_mo), 'drive_result'] = 'Punt'
    df.loc[df.drive_id.isin(zp_dids), 'drive_time'] = 500
    print(df.loc[df['drive_id'].isin(zp_dids)].head())
    df.loc[df['drive_id'].isin(zp_dids), 'drive_id'] = df['drive_id'].subtract(1)
    print(df.loc[df['drive_id'].isin(zp_dids)].head())

Below, I try to fix drive times of zero seconds with more than zero plays. My idea was get the time remaining of the next drive, and subtract that from the time remaining of the current drive. Unfortunately that only worked for about half the drives. And, most of the drives it did work on, only resulted in drive times of under a minute. That seemed off and I didn't see an obvious reason why. I decided just to drop these drives. That's about 3% of the data.

In [17]:
# # now fix drives with more than zero plays
# mplays = zeros.loc[zeros.plays>0]
# # print(len(mplays))

# # get the time remaining for the next drive
# mp_dids = list(mplays.drive_id.unique())
# mp_next = [(x+1) for x in mp_dids]
# # print(len(mp_dids))

# next_df = df.loc[df.drive_id.isin(mp_next)]

# ngb = next_df.groupby(['drive_id'])['tr_game'].max().reset_index()
# ngb['drive_id'] = ngb['drive_id'] - 1
# ngb = ngb.rename(columns={'tr_game':'end_drive_time'})

# mgb = mplays.groupby(['drive_id'])['tr_game'].min().reset_index()
# mgb = pd.merge(left=mgb, right=ngb, on=['drive_id','drive_id'], how='left')
# mgb = mgb.rename(columns={'tr_game':'start_drive_time'})

# mgb['new_elapsed'] = mgb['start_drive_time'] - mgb['end_drive_time']

# new_dt = mgb.loc[mgb['new_elapsed'] > 0]

# ndt_ids = list(new_dt.drive_id.unique())

# test = df.loc[df.drive_id.isin(ndt_ids)]
# print(test.groupby(['play_type'])['down'].count())

# # mplays = pd.merge(left=mplays, right=ngb, on=['drive_id','drive_id'], how='left')

# # mplays['new_elapsed'] = mplays['tr_game'] - mplays['end_drive_time']

# # new = mplays.loc[mplays['new_elapsed'] >0]

In [18]:
print(len(df))
df = df.loc[df['drive_time']>0]
print(len(df))

1927091
1878447


## Fix Bad Drive Results, Standardize Others

In [19]:
# fix drive result kickoffs
kos = df.loc[(df['drive_result']=='KICKOFF')]
dids = list(kos.drive_id.unique())

fgs = df.loc[(df['drive_id'].isin(dids))&df['play_text'].str.contains('Field Goal')]
fg_ids = list(fgs.drive_id.unique())

df.loc[df.drive_id.isin(fg_ids), 'drive_result'] = 'FG GOOD'


punts = [4005483434]
end_of_half = [40054770815]

df.loc[df.drive_id.isin(punts), 'drive_result'] = 'PUNT'
df.loc[df.drive_id.isin(end_of_half), 'drive_result'] = 'END OF HALF'

# special cases

df.loc[df.drive_id == 24269009911, 'drive_result'] = 'RUSHING TD'
df.loc[df.drive_id == 24269009911, 'drive_id'] = 24269009912

In [20]:
# fix Fg missed TDs
brt = df.loc[(df['drive_result']=='FG MISSED TD')&df['play_text'].str.contains('BLOCKED')]
brt_ids = list(brt.drive_id.unique())

df.loc[df['drive_id'].isin(brt_ids), 'drive_result'] = 'BLOCKED FG (TD) TD'

brt = df.loc[(df['drive_result']=='FG MISSED TD')&df['play_text'].str.contains('blocked')]
brt_ids = list(brt.drive_id.unique())

df.loc[df['drive_id'].isin(brt_ids), 'drive_result'] = 'BLOCKED FG (TD) TD'

brt = df.loc[(df['drive_result']=='FG MISSED TD')&df['play_text'].str.contains('blocked,')]
brt_ids = list(brt.drive_id.unique())

df.loc[df['drive_id'].isin(brt_ids), 'drive_result'] = 'BLOCKED FG (TD) TD'

In [21]:
# merge 'FG' and 'FG GOOD'
df.loc[df['drive_result']=='FG', 'drive_result'] = 'FG GOOD' 
df.loc[df['drive_result']=='MADE FG', 'drive_result'] = 'FG GOOD' 

df.loc[df['drive_result']=='MISSED FG', 'drive_result'] = 'FG MISSED' 

In [22]:
# fix 'end of half TD'
end_of_half = [29290230615,29304013515,32259006215,32308025415,32315015219,33250230613,33285263815,33327000821,40060388027]
fumble_tds = [30279211614]
int_tds = [30282006815,40079088219]
rush_tds = [30247263316,30268006217,32243211716,32301000917,40054794315]
pass_tds = [30324002112,32329023512]
block_fg_td = [40054834612]

df.loc[df['drive_id'].isin(end_of_half), 'drive_result'] = 'END OF HALF'
df.loc[df['drive_id'].isin(fumble_tds), 'drive_result'] = 'FUMBLE RETURN TD'
df.loc[df['drive_id'].isin(int_tds), 'drive_result'] = 'INT TD'
df.loc[df['drive_id'].isin(rush_tds), 'drive_result'] = 'RUSHING TD'
df.loc[df['drive_id'].isin(pass_tds), 'drive_result'] = 'PASSING TD'
df.loc[df['drive_id'].isin(block_fg_td), 'drive_result'] = 'BLOCKED FG (TD) TD'

In [23]:
# fix "end of game TD"
int_tds = [30324020406,40087609227]
fumble_tds = [40076354226,40086912120]
end_of_game = [40078746229,40094526122]

df.loc[df['drive_id'].isin(int_tds), 'drive_result'] = 'INT TD'
df.loc[df['drive_id'].isin(fumble_tds), 'drive_result'] = 'FUMBLE RETURN TD'
df.loc[df['drive_id'].isin(end_of_game), 'drive_result'] = 'END OF GAME'

In [24]:
# fix "downs TD"
passing_tds = [40054825721,4007635338,40076343011,40086953320,4010133465,4010128564]
rushing_tds = [40054786022,40060392118,40086953316]
interception_tds = [4005482704,40075690213,40076355220,4008696135,40086963817]

df.loc[df['drive_id'].isin(passing_tds), 'drive_result'] = 'PASSING TD'
df.loc[df['drive_id'].isin(rushing_tds), 'drive_result'] = 'RUSHING TD'
df.loc[df['drive_id'].isin(interception_tds), 'drive_result'] = 'INT TD'

In [25]:
# fix 'FG TD' drive result
df.loc[((df.drive_id == 40054786811)&(df.offense=='Baylor')), 'drive_result'] = 'FG GOOD'
df.loc[((df.drive_id == 40054786811)&(df.offense=='Baylor')), 'drive_id'] = 4005478681100

df.loc[(df.drive_id == 40054786811), 'drive_result'] = 'RUSHING TD'

In [26]:
# fix "end of 4th"
df.loc[df['drive_result']=='END OF 4TH QUARTER', 'drive_result'] = 'END OF GAME'

In [31]:
# more blocked field goals
bfg_ids = [242550275,242760259,243250150,243250344]
bfg_dids = [24255027512,24276025916,24325015008,24325034420]
to_fix = [(x-1) for x in bfg_dids]

df.loc[df['drive_id'].isin(to_fix), 'drive_result'] = 'BLOCKED FG (TD) TD' 
df.loc[df['drive_id'].isin(bfg_dids), 'drive_id'] = df['drive_id'].copy().subtract(1)

ValueError: cannot reindex from a duplicate axis

In [None]:
# blocked punts
bps = list(df.loc[df['drive_result']=='BLOCKED PUNT TD'].drive_id.unique())
to_fix = [(x-1) for x in bps]

df.loc[df['drive_id'].isin(to_fix), 'drive_result'] = 'BLOCKED PUNT TD' 
df.loc[df['drive_id'].isin(bps), 'drive_id'] = df.drive_id.copy() - 1

In [None]:
# the drive data in these games seems very off
bad_ids = [252602572,272560120,272562005]
df = df.loc[~df.game_id.isin(bad_ids)]

In [None]:
## attempt to predict clock, can ignore

# going to try to build a model to approximate off good data

# drop drives when they have the same clock for more than 8 plays
# gb = df[['game_id','drive_id','tr_game','down','plays']]
# mode = gb.groupby(['drive_id','tr_game'])['plays'].count().reset_index()
# mode = mode.sort_values(by='plays',ascending=False)

# bad_drives = mode.loc[mode['plays']>=8]
# print(len(bad_drives))
# bd_list = list(bad_drives.drive_id.unique())


# drop bad drives from data
# clock = df.loc[~df['drive_id'].isin(bd_list)]
# print(len(clock))

# only use long drives in model (this also drops small drives with still clock)
# clock = clock.loc[clock.plays>=10]
# print(len(clock))

# need time elapsed
# clock = clock.sort_values(by=['game_id','drive_id','tr_game'],ascending=False)
# clock['tr_game_next'] = clock.groupby(['drive_id'])['tr_game'].transform(lambda x:x.shift(-1))

# print(clock[['drive_id','tr_game','tr_game_next']])

# clock['tr_game_next'] = clock['tr_game_next'].fillna(3600)
# clock = clock.dropna(subset=['tr_game_next'])

# clock['play_time'] = clock['tr_game'] - clock['tr_game_next']

# # drop values that are still bad
# clock = clock.loc[(clock.play_time>=1)&(clock.play_time <= 80)]

# print(clock[['tr_game','tr_game_next','play_time']].head(50))

# print(clock.groupby(['play_type'])['play_time'].mean())
# print(clock.groupby(['play_type'])['play_time'].count())

# Feature Engineering

### Need 6 Features:
-Down (check)  
-Seconds left in half (check)  
-Yards to go for touchdown (log?)  
-Yards to go for first down (log?)    
-Goal to goal indicator  
-Under 2 minutes indicator  

In [None]:
print(list(df))

In [None]:
# fix yard_line, it's w.r.t the home team
df = df.rename(columns={'yard_line':'wrong_yardline'})
df['yard_line'] = np.where(df['offense']==df['home'],df['wrong_yardline'],100-df['wrong_yardline'])
# print(df[['home','offense','yard_line','wrong_yardline']].head(50))
df = df.drop(columns=['wrong_yardline'])

In [None]:
# get log 10 of distance
df['l10_dist'] = np.log10(df['distance'])

In [None]:
# goal to go
df['GTG'] = np.where((df['yard_line']+df['distance']>=100),1,0)


# under two min in half
df['UTM'] = np.where(df['tr_half']<=120,1,0)

# Save

In [None]:
PATH = './output/processed.csv'
df.to_csv(PATH,index=False)