In [1]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
import datetime
gc.collect()

from tqdm import tqdm

# Load Data

In [2]:
# load play by play & drive data
# Seasons run from 2004 up to, but not including, the current calendar year
# (range() excludes its upper bound).
years = list(range(2004, int(datetime.datetime.now().year)))

# Collect one merged frame per season and concatenate once at the end;
# growing `df` with pd.concat inside the loop re-copies every earlier season
# on each iteration.
season_frames = []
for year in tqdm(years):
    path = './output/'+str(year)+'/'+str(year)+'_pbp.csv'
    sea_df = pd.read_csv(path)

    drive_path = './output/'+str(year)+'/'+str(year)+'_drives.csv'
    drive_df = pd.read_csv(drive_path)

    # the drives file calls its key 'id'; rename it so it matches the play-by-play key
    drive_df = drive_df.rename(columns={'id':'drive_id'})

    # left join keeps every play even when its drive row is missing;
    # columns present in both files come back with _x (play) / _y (drive) suffixes
    sea_df = pd.merge(left=sea_df, right=drive_df, how='left', on='drive_id')
    season_frames.append(sea_df)

# pd.concat raises on an empty list, so fall back to an empty frame like the
# original accumulator did. NOTE: row labels are not reset here, so index
# labels repeat across seasons.
df = pd.concat(season_frames) if season_frames else pd.DataFrame()

num_plays = len(df)
print(df.shape)
print(str(num_plays) + " plays were loaded")

100%|██████████| 15/15 [00:34<00:00,  3.42s/it]

(2122188, 41)
2122188 plays were loaded





# Clean Data 

In [3]:
# fix clock data first so drives can be figured out
# Missing clock components are treated as zero.
time_cols = [
    'clock.minutes', 'clock.seconds',
    'start_time.minutes', 'start_time.seconds',
    'end_time.minutes', 'end_time.seconds',
]
df = df.fillna(value={col: 0 for col in time_cols})

# get time remaining in game: 900 seconds for every period still to come
# after the current one, plus what is left on the clock
periods_after_this = 4 - df['period']
df['tr_game'] = periods_after_this * 900 + (df['clock.minutes'] * 60) + df['clock.seconds']

# time remaining in the half: periods after the 2nd already equal tr_game,
# earlier periods need the 1800 seconds of the second half removed
df['tr_half'] = df['tr_game'] - np.where(df['period'] > 2, 0, 1800)

# the raw clock columns are no longer needed once the totals exist
df = df.drop(columns=['clock.minutes', 'clock.seconds'])

In [26]:
# fix uncategorized
# Pull out the plays still labelled 'Uncategorized' and show how many there
# are plus a sample of their descriptions.
uncat = df[df['play_type'].eq('Uncategorized')]
print(uncat.shape[0])
print(uncat['play_text'].head(10))

0
Series([], Name: play_text, dtype: object)


In [6]:
# list every distinct play_type label present before relabelling
print(df['play_type'].unique())
def fix_uncat(play_type, play_text):
    """Re-label an 'Uncategorized' play from phrases found in its description.

    Parameters
    ----------
    play_type
        Current label of the play. Any value other than 'Uncategorized' is
        returned unchanged.
    play_text
        Free-text description of the play. Non-string values (e.g. NaN)
        leave the label unchanged.

    Returns
    -------
    The label attached to the first matching phrase, or ``play_type`` itself
    when no phrase matches.
    """
    if play_type != 'Uncategorized' or not isinstance(play_text, str):
        return play_type

    # Rules are tried top to bottom and the first hit wins, so the order of
    # every table below is significant.
    period_rules = (
        ("Start of the 2nd quarter.", "End Period"),
        ("Start of the 3rd quarter.", "End of Half"),
        ("Start of the 4th quarter.", "End Period"),
        ("Start of overtime.", "End Period"),
        ("End of the game.", "End of Game"),
    )
    for phrase, label in period_rules:
        if phrase in play_text:
            return label

    # Kicks: once the kick phrase is present the outcome is decided here and
    # no later rule is consulted. The miss marker must sit in the last 13
    # characters of the text.
    kick_rules = (
        ("Extra point", "Extra Point Good", "Extra Point Missed", False),
        ("field goal", "Field Goal Good", "Field Goal Missed", True),
    )
    for phrase, made_label, missed_label, echo_unresolved in kick_rules:
        if phrase not in play_text:
            continue
        if "is good" in play_text:
            return made_label
        if "is no good." in play_text[-13:]:
            return missed_label
        if echo_unresolved:
            # unresolved field goals are printed for manual inspection
            print(play_text)
        return play_type

    remaining_rules = (
        ("missed PAT returned.", "Extra Point Missed"),
        ("took lateral and rushed", "Rush"),
        # mostly fumbled snaps recovered by own team
        ("fumbled", "Fumble Recovery (Own)"),
        ("return for", "Punt Return"),
        ("End of", "End Period"),
        ("run for", "Rush"),
        ("SAFETY", "Safety"),
        ("Penalty", "Penalty"),
    )
    for phrase, label in remaining_rules:
        if phrase in play_text:
            return label

    return play_type

# Re-label the 'Uncategorized' plays with fix_uncat.
# fix_uncat returns every other label unchanged, so only rows currently
# labelled 'Uncategorized' need a call. The update works on positional arrays
# rather than df.apply(axis=1), which builds a Series per row for the whole
# frame; positional access also does not depend on row labels being unique.
play_types = df['play_type'].values.copy()
play_texts = df['play_text'].values
needs_fix = (df['play_type'] == 'Uncategorized').values
for pos in np.flatnonzero(needs_fix):
    play_types[pos] = fix_uncat(play_types[pos], play_texts[pos])
df['play_type'] = play_types

# uncat = df.loc[df.play_type=='Uncategorized']
# print(len(uncat))

['Rush' 'Pass Incompletion' 'Timeout' 'Penalty' 'Punt Return'
 'Pass Interception' 'Pass Completion' 'Uncategorized'
 'Kickoff Return (Offense)' 'End Period' 'Fumble Recovery (Own)' 'Sack'
 'Fumble Recovery (Opponent)' 'Interception Return Touchdown'
 'Blocked Punt' 'Safety' 'Two Point Pass' 'Kickoff Return Touchdown'
 'Two Point Rush' 'Blocked Field Goal' 'Blocked Punt Touchdown'
 'Blocked PAT' 'Punt Return Touchdown' 'Fumble Return Touchdown'
 'Kickoff Return (Defense)' 'Blocked Field Goal Touchdown' 'Punt' 'Pass'
 'Kickoff' 'Extra Point Good' 'Field Goal Good' 'Field Goal Missed'
 'Extra Point Missed' '2pt Conversion' 'Offensive 1pt Safety'
 'Pass Reception' 'Passing Touchdown' 'Rushing Touchdown'
 'Pass Interception Return' 'End of Half' 'End of Game'
 'Defensive 2pt Conversion' 'Missed Field Goal Return' 'Interception'
 'Missed Field Goal Return Touchdown']
35


In [7]:
# isolate extra point attempts
# Conversion-attempt plays are moved into `xps` and removed from `df`.
xp_cats = [
    'Two Point Pass',
    'Two Point Rush',
    'Blocked PAT',
    'Extra Point Good',
    'Extra Point Missed',
    '2pt Conversion',
    'Offensive 1pt Safety',
    'Defensive 2pt Conversion',
]
is_xp = df['play_type'].isin(xp_cats)
xps = df[is_xp]
df = df[~is_xp]

In [8]:
# isolate kickoffs
# Kickoff plays are moved into `kickoffs` and removed from `df`.
kickoffs_cats = [
    'Kickoff Return (Offense)',
    'Kickoff Return Touchdown',
    'Kickoff Return (Defense)',
    'Kickoff',
]
is_kickoff = df['play_type'].isin(kickoffs_cats)
kickoffs = df[is_kickoff]
df = df[~is_kickoff]


In [9]:
# isolate OT
# Periods above 4 go to `ot`; `df` keeps only periods in (0, 4], which also
# drops rows whose period is 0, negative or missing.
ot = df[df['period'].gt(4)]
df = df[df['period'].gt(0) & df['period'].le(4)]


In [10]:
# drop end of period plays
# These labels mark clock events rather than plays, so they are filtered out.
eop = ['End of Game', 'End of Half', 'End Period']
is_period_marker = df['play_type'].isin(eop)
df = df[~is_period_marker]


In [11]:
# validate
# Rows where the play-level offense (_x) and the drive-level offense (_y)
# disagree are kept in `bad` for inspection.
bad = df[df['offense_x'] != df['offense_y']]
# print(bad[['tr_game','play_text','offense_x','offense_y']].head(25))
# offense_x seems to be correct while offense_y is not

# Keep the play-level (_x) team columns under their plain names and discard
# the drive-level (_y) copies.
drive_level_cols = [
    'defense_y',
    'defense_conference_y',
    'offense_y',
    'offense_conference_y',
]
play_level_names = {
    'defense_x': 'defense',
    'defense_conference_x': 'defense_conference',
    'offense_x': 'offense',
    'offense_conference_x': 'offense_conference',
}
df = df.drop(columns=drive_level_cols).rename(columns=play_level_names)


In [None]:
# fix bad distances

# plays recorded with exactly zero yards to go, kept for inspection
zeros = df[df['distance'] == 0]
# print(len(zeros))
# print(zeros.groupby(['play_type'])['distance'].count())
# print(zeros.play_text.tail(50))

# drop negative distances. change 0 distances to 0.5 yard
# (the >= 0 comparison is False for missing values, so those rows go too)
df = df[df['distance'] >= 0]

# build the corrected values first, then swap the column out; the new
# 'distance' column ends up as the last column of the frame
corrected_distance = np.where(df['distance'] > 0, df['distance'], 0.5)
df = df.drop(columns=['distance'])
df['distance'] = corrected_distance


In [34]:
# fix bad downs
zero_down = df.loc[df['down']==0]
print(len(zero_down))

# Impute (previous play's down + 1) for plays whose down is not positive.
# The previous play is looked up within the same drive only: a plain
# df['down'].shift() reads whatever row happens to sit above, which can be
# the last play of a different drive or game.
# A previous play that is itself recorded as down 0 is treated as unknown
# rather than turned into a 1st down.
known_down = df['down'].where(df['down'] > 0)
# group by the raw drive_id values so the grouping is positional and does not
# depend on row labels being unique
prev_down_in_drive = known_down.groupby(df['drive_id'].values).shift()
df['down'] = np.where(df['down'] > 0, df['down'], prev_down_in_drive + 1)

# Anything that could not be imputed (NaN) or that landed outside 1-4 is
# dropped; imputed values are not capped at 4.
df = df.loc[(df['down']>0)&(df['down']<5)]

0


# Feature Engineering

### Need 6 Features:
- Down (check)
- Seconds left in half (check)
- Yards to go for touchdown (log?)
- Yards to go for first down (log?)
- Goal-to-go indicator
- Under 2 minutes indicator

In [12]:
# show the column names available for feature engineering
print(df.columns.tolist())

['away', 'defense', 'defense_conference', 'defense_score', 'distance', 'down', 'drive_id', 'home', 'id', 'offense', 'offense_conference', 'offense_score', 'period', 'play_text', 'play_type', 'yard_line', 'yards_gained', 'season', 'week', 'season_type', 'drive_result', 'elapsed.minutes', 'elapsed.seconds', 'end_period', 'end_time.minutes', 'end_time.seconds', 'end_yardline', 'game_id', 'plays', 'scoring', 'start_period', 'start_time.minutes', 'start_time.seconds', 'start_yardline', 'yards', 'tr_game', 'tr_half']


In [13]:
# fix yard_line, it's w.r.t the home team
# The raw value is kept as 'wrong_yardline'; the new 'yard_line' keeps the raw
# value when the home team has the ball and mirrors it (100 - value) otherwise.
df = df.rename(columns={'yard_line': 'wrong_yardline'})
home_has_ball = df['offense'] == df['home']
df['yard_line'] = np.where(home_has_ball, df['wrong_yardline'], 100 - df['wrong_yardline'])
# print(df[['home','offense','yard_line','wrong_yardline']].head(50))

0


In [25]:

play_type 

                  away           defense defense_conference  defense_score  \
55                 USC     Virginia Tech                ACC            3.0   
136                USC               USC             Pac-10           14.0   
161                USC               USC             Pac-10           21.0   
404               UTEP              UTEP   Western Athletic            9.0   
1835  Northern Arizona  Northern Arizona                NaN            0.0   

      down     drive_id           home             id        offense  \
55     0.0  24241025908  Virginia Tech  2424102590810            USC   
136    0.0  24241025922  Virginia Tech  2424102592207  Virginia Tech   
161    0.0  24241025928  Virginia Tech  2424102592804  Virginia Tech   
404    0.0  24246000946  Arizona State  2424600094607  Arizona State   
1835   0.0  24248001204        Arizona  2424800120403        Arizona   

     offense_conference  ...  scoring  start_period start_time.minutes  \
55               Pac-10 