In [19]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
import datetime
gc.collect()

from tqdm import tqdm

# Load Data

In [20]:
# load play by play
years = list(range(2004, int(datetime.datetime.now().year)))

df = pd.DataFrame()
for year in tqdm(years):
    path = './output/'+str(year)+'/'+str(year)+'_pbp.csv'
    sea_df = pd.read_csv(path)
    
    drive_path = './output/'+str(year)+'/'+str(year)+'_drives.csv'
    drive_df = pd.read_csv(drive_path)
    
    drive_df = drive_df.rename(columns={'id':'drive_id'})
    
    sea_df = pd.merge(left=sea_df, right=drive_df, how='left', on=['drive_id','drive_id'])
    df = pd.concat([df,sea_df])

num_plays = len(df)
print(df.shape)
print(str(num_plays) + " plays were loaded")

100%|██████████| 15/15 [00:35<00:00,  3.43s/it]

(2122188, 41)
2122188 plays were loaded





# Clean Data 

In [44]:
# fix clock data first so drives can be figured out


time_cols = ['clock.minutes','clock.seconds','start_time.minutes','start_time.seconds',
            'end_time.minutes','end_time.seconds']
for tc in time_cols:
    df[tc] = df[tc].fillna(0)

# get time remaining in game
df['tr_game'] = (4-df['period']) * 900 + (df['clock.minutes'] * 60) + df['clock.seconds']
df['tr_half'] = np.where(df['period']>2,df['tr_game'], df['tr_game']-1800)

df = df.drop(columns=['clock.minutes','clock.seconds'])

In [55]:
# isolate extra point attempts
xp_cats = ['Two Point Pass','Two Point Rush','Blocked PAT','Extra Point Good','Extra Point Missed', '2pt Conversion',
          'Offensive 1pt Safety','Defensive 2pt Conversion']
xps = df.loc[df['play_type'].isin(xp_cats)]
df = df.loc[~df['play_type'].isin(xp_cats)]


In [53]:
kickoffs_cats = ['Kickoff Return (Offense)', 'Kickoff Return Touchdown', 'Kickoff Return (Defense)', 'Kickoff']
kickoffs = df.loc[df['play_type'].isin(kickoffs_cats)]
df = df.loc[~df['play_type'].isin(kickoffs_cats)]


In [57]:
# isolate OT
ot = df.loc[df['period'] > 4]
df = df.loc[(df['period'] <= 4) & (df['period'] >0)]


In [64]:
# validate
bad = df.loc[df['offense_x']!=df['offense_y']]
# print(bad[['tr_game','play_text','offense_x','offense_y']].head(25))
# offense_x seems to be correct while offense_y is not

df = df.drop(columns=['defense_y','defense_conference_y','offense_y','offense_conference_y'])

df = df.rename(columns={'defense_x':'defense','defense_conference_x':'defense_conference','offense_x':'offense','offense_conference_x':'offense_conference'})


['away', 'defense_x', 'defense_conference_x', 'defense_score', 'distance', 'down', 'drive_id', 'home', 'id', 'offense_x', 'offense_conference_x', 'offense_score', 'period', 'play_text', 'play_type', 'yard_line', 'yards_gained', 'season', 'week', 'season_type', 'defense_y', 'defense_conference_y', 'drive_result', 'elapsed.minutes', 'elapsed.seconds', 'end_period', 'end_time.minutes', 'end_time.seconds', 'end_yardline', 'game_id', 'offense_y', 'offense_conference_y', 'plays', 'scoring', 'start_period', 'start_time.minutes', 'start_time.seconds', 'start_yardline', 'yards', 'tr_game', 'tr_half']
      tr_game                                          play_text  \
252    2355.0  Jordan Palmer (UTEP) pass left side intercepte...   
403     264.0  Sam Keller (ASU) pass right side complete to J...   
404      20.0  Sam Keller (ASU) pass right side complete to J...   
623    3543.0                                     Eagles safety.   
1243      0.0                            End of the 4th quart

In [65]:
print(list(df))

['away', 'defense', 'defense_conference', 'defense_score', 'distance', 'down', 'drive_id', 'home', 'id', 'offense', 'offense_conference', 'offense_score', 'period', 'play_text', 'play_type', 'yard_line', 'yards_gained', 'season', 'week', 'season_type', 'drive_result', 'elapsed.minutes', 'elapsed.seconds', 'end_period', 'end_time.minutes', 'end_time.seconds', 'end_yardline', 'game_id', 'plays', 'scoring', 'start_period', 'start_time.minutes', 'start_time.seconds', 'start_yardline', 'yards', 'tr_game', 'tr_half']


In [66]:
uncat = df.loc[df['play_type']=='Uncategorized']
print(len(uncat))
print(uncat.play_text.head(10))

5507
['Extra point by Ryan Killeen (USC) is good.'
 '35 yard field goal by Brandon Pace (VT) is good.'
 '35 yard field goal by Ryan Killeen (USC) is no good.' ...
 'Dayton Furuta fumbled,  Cole McDonald run for 2 yds to the UNLV 30'
 'Jason Shelley fumbled,  Devin Brumfield run for no gain to the Colo 35'
 'Nick Pickett return for no gain OREGON Penalty, unsportsmanlike conduct (Nick Pickett)']


In [69]:
print(df.play_type.unique())

['Rush' 'Pass Incompletion' 'Timeout' 'Penalty' 'Punt Return'
 'Pass Interception' 'Pass Completion' 'Uncategorized' 'End Period'
 'Fumble Recovery (Own)' 'Sack' 'Fumble Recovery (Opponent)'
 'Interception Return Touchdown' 'Blocked Punt' 'Safety'
 'Blocked Punt Touchdown' 'Blocked Field Goal' 'Punt Return Touchdown'
 'Fumble Return Touchdown' 'Blocked Field Goal Touchdown' 'Punt' 'Pass'
 'Field Goal Good' 'Field Goal Missed' 'Pass Reception'
 'Passing Touchdown' 'Rushing Touchdown' 'Pass Interception Return'
 'End of Half' 'End of Game' 'Missed Field Goal Return' 'Interception'
 'Missed Field Goal Return Touchdown']


In [74]:
# fix uncategorized
print(len(df.loc[df.play_type=='Uncategorized']))
df['XP'] = np.where((df['play_type']=='Uncategorized')&(df['play_text'].str.contains('Extra point by')),1,0)
df['FG'] = np.where((df['play_type']=='Uncategorized')&(df['play_text'].str.contains('yard field goal by')),1,0)
df['FG/XP'] = df['XP'] + df['FG']

df['FG/XP Good'] = np.where((df['FG/XP']==1)&(df['play_text'].str.contains('is good')),1,0)
df['FG/XP Not Good'] = np.where((df['FG/XP']==1)&(df['FG/XP Good']==0),1,0)

df.loc[(df['FG'] == 1) & (df['FG/XP Good']==1), ['play_type']] =['Field Goal Good']
df.loc[(df['FG'] == 1) & (df['FG/XP Good']==1), ['play_type']] =['Field Goal Missed']

print(len(df.loc[df.play_type=='Uncategorized']))

4551
4551
