In [1]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
import datetime
gc.collect()

from tqdm import tqdm

# Load Data

In [2]:
# load play by play & drive data
years = list(range(2005, int(datetime.datetime.now().year)))

df = pd.DataFrame()
for year in tqdm(years):
    path = './output/'+str(year)+'/'+str(year)+'_pbp.csv'
    sea_df = pd.read_csv(path)
    
    drive_path = './output/'+str(year)+'/'+str(year)+'_drives.csv'
    drive_df = pd.read_csv(drive_path)
    
    drive_df = drive_df.rename(columns={'id':'drive_id'})
    
    sea_df = pd.merge(left=sea_df, right=drive_df, how='left', on=['drive_id','drive_id'])
    df = pd.concat([df,sea_df])

num_plays = len(df)
print(df.shape)
print(str(num_plays) + " plays were loaded")

100%|██████████| 14/14 [00:28<00:00,  2.78s/it]

(2031893, 41)
2031893 plays were loaded





In [3]:
# offense_x seems to be correct while offense_y is not

df = df.drop(columns=['defense_y','defense_conference_y','offense_y','offense_conference_y'])

df = df.rename(columns={'defense_x':'defense','defense_conference_x':'defense_conference','offense_x':'offense','offense_conference_x':'offense_conference'})


## Special Teams

I'll do special teams and overtimes in future work. Right now, keeping it simple.

In [4]:
print(len(df))
nah = ['2pt Conversion','Kickoff','Kickoff Return (Offense)','Kickoff Return Touchdown',
      'Offensive 1pt Safety','Defensive 2pt Conversion','Extra Point Good','Extra Point Missed']

df = df.loc[~df.play_type.isin(nah)]
print(len(df))

2031893
1871043


In [5]:
# also drop overtime

df = df.loc[(df.period > 0) & (df.period <= 4)]
print(len(df))


1864121


## Clock

Clock data is unreliable because maybe 25% of the games have only have one time for each play, and that time is when the drive started. I played with trying to predict time per play based on play type, but the data was very messy. So I decided to get the total time of each drive, and then assume each play took the same amount of time. EPA shouldn't be significantly affected most of the time, i.e. a 70 yard pass will be considered a good play no matter what. The only time it might have an adverse effect is toward the end of a game, when seconds matter. I think that in college football, when the clock stops for a first down, and incompletions, that all pass plays probably do take a somewhat similar amount of time. Drives in this situation will consist mostly of the same play type, and plays of the same play type likely take similar amounts of time. I'll compare it to the clock data I do have to make sure.

In [6]:
# fix clock data first so drives can be figured out
time_cols = ['clock.minutes','clock.seconds','start_time.minutes','start_time.seconds',
            'end_time.minutes','end_time.seconds']
for tc in time_cols:
    df[tc] = df[tc].fillna(0)

# get time remaining in game
df['tr_game'] = (4-df['period']) * 900 + (df['clock.minutes'] * 60) + df['clock.seconds']
df['tr_half'] = np.where(df['period']>2,df['tr_game'], df['tr_game']-1800)

df = df.drop(columns=['clock.minutes','clock.seconds'])

In [7]:
# fill empties
df['elapsed.minutes'] = df['elapsed.minutes'].copy().fillna(0)
df['elapsed.seconds'] = df['elapsed.seconds'].copy().fillna(0)
df['drive_time'] = 60*df['elapsed.minutes'] + df['elapsed.seconds']

In [8]:
# a lot of those drive times are negative... and other problems. so here's an alt drive time
# alt clock

# these get the start and end time of every drive
maxs = df.groupby(['game_id','drive_id'])['tr_game'].max().reset_index()
mins = df.groupby(['game_id','drive_id'])['tr_game'].min().reset_index()
maxs = maxs.rename(columns={'tr_game':'drive_start'})
mins = mins.rename(columns={'tr_game':'drive_end'})

# sometimes the drive end time is the same as the drive start. in that case, I use the next drive start
maxs = maxs.sort_values(by=['game_id','drive_start'],ascending=False)
next_max = maxs.groupby(['game_id'])['drive_start'].shift(-1)
next_max = pd.Series(next_max, name='next_drive_start')
new_max = pd.concat([maxs, next_max], axis=1)
new_max['next_drive_start'] = new_max['next_drive_start'].fillna(0)

# sometimes (rarely, 2%ish of the time) both the next drive start and the drive end are the same as the drive start
# in that case, as a last resort, i use the next drive end time. 
# i'm fairly sure most of the time it's when a timeout or something divides the same drive into two.
# i can explore this more in future work
mins = mins.sort_values(by=['game_id','drive_end'],ascending=False)
next_min = mins.groupby(['game_id'])['drive_end'].shift(-1)
next_min = pd.Series(next_min, name='next_drive_end')
new_min = pd.concat([mins, next_min], axis=1)
new_min['next_drive_end'] = new_min['next_drive_end'].fillna(0)
new_min = new_min.drop(columns='game_id')
times = pd.merge(left=new_max,right=new_min,on=['drive_id','drive_id'],how='left')


# attempt 1 (works on ~95.5% of data)
times['drive_time_1'] = times['drive_start']-times['next_drive_start']
# plan B (95.8% of data)
times['drive_time'] = np.where(times['drive_time_1']>0,times['drive_time_1'],(times['drive_start']-times['drive_end']))
# last resort (works on 99.3% of data)
# times['drive_time'] = np.where(times['drive_time_2']>0,times['drive_time_2'],(times['drive_start']-times['next_drive_end']))

not_good = times.loc[times.drive_time<=0]
print(len(not_good))

good = times.loc[times.drive_time>0]
print(len(good))

print(good.drive_time.mean())

times = times[['drive_id','drive_time']]
times = times.rename(columns={'drive_time':'alt_drive_time'})


8969
275461
146.12538980109707


In [9]:
df = pd.merge(left=df,right=times,how='left',on=['drive_id','drive_id'])


In [10]:
# longest drive in CFB history is 882. so need to drop anything above 900
# also drop anything below or equal to 0

df['correct_time_1'] = np.where(df['drive_time'] > 0, df['drive_time'], df['alt_drive_time'])

df = df.loc[df['correct_time_1'] > 0]

df['correct_drive_time'] = np.where(df['drive_time'] < 900, df['drive_time'], df['alt_drive_time'])

df = df.loc[df['correct_drive_time'] < 900]

print(len(df))

print(df[['drive_time','alt_drive_time']].corr())

df = df.drop(columns=['drive_time','alt_drive_time','correct_time_1'])
df = df.rename(columns={'correct_drive_time':'drive_time'})

1844449
                drive_time  alt_drive_time
drive_time        1.000000        0.855643
alt_drive_time    0.855643        1.000000


# some objectives

1) fix "uncategorized" play type  
2) aggregate and clean all play types  
3) aggregate and clean all drive results  

In [11]:
gb = df.groupby(['play_type'])['down'].count()
gb

play_type
Blocked Field Goal                       446
Blocked Field Goal Touchdown              24
Blocked Punt                             347
Blocked Punt Touchdown                    18
End Period                              9205
End of Game                             3414
End of Half                             4303
Field Goal Good                        24021
Field Goal Missed                       8758
Fumble Recovery (Opponent)              3665
Fumble Recovery (Own)                   4313
Fumble Return Touchdown                  146
Interception                               2
Interception Return Touchdown            822
Missed Field Goal Return                  20
Missed Field Goal Return Touchdown         3
Pass                                   41599
Pass Completion                       225742
Pass Incompletion                     246884
Pass Interception                      11406
Pass Interception Return                6680
Pass Reception                        144382


In [14]:
uncat = df.loc[df['play_type']=='Uncategorized']
for pt in list(uncat.play_text.values):
    print(pt)
    


N/A fumbled, recovered by UTEP Aaron Jones , return for 0 yards Aaron Jones sacked for a loss of 4 yards to the UTEP 12 UTEP Penalty, intentional grounding (-3 Yards) to the UTEP 3
TEAM fumbled, recovered by Clem N/A  Deshaun Watson pass incomplete to N/A
Alex Kelley fumbled, recovered by Colo N/A  Sefo Liufau pass incomplete to Bryce Bobo
End of 2nd Quarter
End of 3rd Quarter
End of 3rd Quarter
Joshua Dobbs fumbled,  Joshua Dobbs pass intercepted Torren McGaster return for 6 yds to the Tenn 24
Michael Gordon run for 35 yds for a TD, (Logan Spry PAT MISSED)
Malik Smith return for 9 yds to the SDSt 49
Josh Burton return for 8 yds for a TD, (Josh Kealamakia KICK)
Roland Jenkins return for no gain to the Tulsa 28 UL MONROE Penalty, illegal block (10 Yards) to the Tulsa 43
Demetrius Kemp return for 7 yds to the NCaro 30
Jamauri Bogan run for 9 yds to the Nwest 13 for a 1ST down
, Devonte Williams return for 24 yds to the Ind 24
TEAM fumbled,  Brandon Wright punt for 21 yds
Treston Decoud r