In [1]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
import datetime
import random
gc.collect()

from tqdm import tqdm

# Load Data

In [2]:
# load play by play & drive data
years = list(range(2005, int(datetime.datetime.now().year)))

df = pd.DataFrame()
for year in tqdm(years):
    path = './output/'+str(year)+'/'+str(year)+'_pbp.csv'
    sea_df = pd.read_csv(path)
    
    drive_path = './output/'+str(year)+'/'+str(year)+'_drives.csv'
    drive_df = pd.read_csv(drive_path)
    
    drive_df = drive_df.rename(columns={'id':'drive_id'})
    
    sea_df = pd.merge(left=sea_df, right=drive_df, how='left', on=['drive_id','drive_id'])
    df = pd.concat([df,sea_df])

num_plays = len(df)
print(df.shape)
print(str(num_plays) + " plays were loaded")

100%|██████████| 14/14 [00:29<00:00,  2.96s/it]

(2031893, 41)
2031893 plays were loaded





In [3]:
# offense_x seems to be correct while offense_y is not

df = df.drop(columns=['defense_y','defense_conference_y','offense_y','offense_conference_y'])

df = df.rename(columns={'defense_x':'defense','defense_conference_x':'defense_conference','offense_x':'offense','offense_conference_x':'offense_conference'})

In [4]:
# fix yard_line, it's w.r.t the home team
df = df.rename(columns={'yard_line':'wrong_yardline'})

df['yard_line'] = np.where(df['offense']==df['home'],df['wrong_yardline'],100-df['wrong_yardline'])
# print(df[['home','offense','yard_line','wrong_yardline']].head(50))
df = df.drop(columns=['wrong_yardline'])

In [5]:
df['alt_game_id'] = df['game_id'].copy().astype(str)
df['alt_drive_id'] = df['drive_id'].copy().astype(str)

def replace_id(x,y):
    return x.replace(y,'')
df['alt_drive_id'] = df.apply(lambda row: replace_id(row['alt_drive_id'], row['alt_game_id']), axis=1)

#  strip leading zeros from 1-9 drive numbers
df['alt_drive_id'] = df['alt_drive_id'].str.lstrip("0")

# this also eliminates drive "zeros", so replace empty space with zero
df['alt_drive_id'] = df['alt_drive_id'].replace(r'^\s*$', '0', regex=True)

## Special Teams

Overtimes are for future work, doing point afters last here (or maybe not at all)

In [6]:
# print(len(df))
pat = ['2pt Conversion','Offensive 1pt Safety','Defensive 2pt Conversion','Extra Point Good','Extra Point Missed']

df = df.loc[~df.play_type.isin(pat)]

In [7]:
# also drop overtime

df = df.loc[(df.period > 0) & (df.period <= 4)]
len(df)


1979243

In [8]:
# and plays with no play text
df = df.dropna(subset=['play_text'])
len(df)


1978969

## Clock

Clock data is unreliable because maybe 25% of the games have only have one time for each play, and that time is when the drive started. I played with trying to predict time per play based on play type, but the data was very messy. So I decided to get the total time of each drive, and then assume each play took the same amount of time. EPA shouldn't be significantly affected most of the time, i.e. a 70 yard pass will be considered a good play no matter what. The only time it might have an adverse effect is toward the end of a game, when seconds matter. I think that in college football, when the clock stops for a first down, and incompletions, that all pass plays probably do take a somewhat similar amount of time. Drives in this situation will consist mostly of the same play type, and plays of the same play type likely take similar amounts of time. I'll compare it to the clock data I do have to make sure.

In [9]:
# fix clock data first so drives can be figured out
time_cols = ['clock.minutes','clock.seconds','start_time.minutes','start_time.seconds',
            'end_time.minutes','end_time.seconds']
for tc in time_cols:
    df[tc] = df[tc].fillna(0)

# get time remaining in game
df['tr_game'] = (4-df['period']) * 900 + (df['clock.minutes'] * 60) + df['clock.seconds']
df['tr_half'] = np.where(df['period']>2,df['tr_game'], df['tr_game']-1800)

df = df.drop(columns=['clock.minutes','clock.seconds'])

In [10]:
# fill empties
df['elapsed.minutes'] = df['elapsed.minutes'].copy().fillna(0)
df['elapsed.seconds'] = df['elapsed.seconds'].copy().fillna(0)
df['drive_time'] = 60*df['elapsed.minutes'] + df['elapsed.seconds']

In [11]:
# a lot of those drive times are negative... and other problems. so here's an alt drive time
# alt clock

# these get the start and end time of every drive
maxs = df.groupby(['game_id','drive_id'])['tr_game'].max().reset_index()
mins = df.groupby(['game_id','drive_id'])['tr_game'].min().reset_index()
maxs = maxs.rename(columns={'tr_game':'drive_start'})
mins = mins.rename(columns={'tr_game':'drive_end'})

# sometimes the drive end time is the same as the drive start. in that case, I use the next drive start
maxs = maxs.sort_values(by=['game_id','drive_start'],ascending=False)
next_max = maxs.groupby(['game_id'])['drive_start'].shift(-1)
next_max = pd.Series(next_max, name='next_drive_start')
new_max = pd.concat([maxs, next_max], axis=1)
new_max['next_drive_start'] = new_max['next_drive_start'].fillna(0)

# sometimes (rarely, 2%ish of the time) both the next drive start and the drive end are the same as the drive start
# i can explore this more in future work
# i'm fairly sure most of the time it's when a timeout or something divides the same drive into two.

mins = mins.sort_values(by=['game_id','drive_end'],ascending=False)
next_min = mins.groupby(['game_id'])['drive_end'].shift(-1)
next_min = pd.Series(next_min, name='next_drive_end')
new_min = pd.concat([mins, next_min], axis=1)
new_min['next_drive_end'] = new_min['next_drive_end'].fillna(0)
new_min = new_min.drop(columns='game_id')
times = pd.merge(left=new_max,right=new_min,on=['drive_id','drive_id'],how='left')


# attempt 1 (works on ~95.5% of data)
times['drive_time_1'] = times['drive_start']-times['next_drive_start']
# plan B (95.8% of data)
times['drive_time'] = np.where(times['drive_time_1']>0,times['drive_time_1'],(times['drive_start']-times['drive_end']))
# last resort (didn't implement)
# times['drive_time'] = np.where(times['drive_time_2']>0,times['drive_time_2'],(times['drive_start']-times['next_drive_end']))

not_good = times.loc[times.drive_time<=0]
print(len(not_good))

good = times.loc[times.drive_time>0]
print(len(good))

print(good.drive_time.mean())

times = times[['drive_id','drive_time']]
times = times.rename(columns={'drive_time':'alt_drive_time'})

11341
277555
145.282859253121


In [12]:
df = pd.merge(left=df,right=times,how='left',on=['drive_id','drive_id'])

In [13]:
# longest drive in CFB history is 882. so need to drop anything above 900
# also drop anything below or equal to 0

df['correct_time_1'] = np.where(df['drive_time'] > 0, df['drive_time'], df['alt_drive_time'])

df = df.loc[df['correct_time_1'] > 0]

df['correct_drive_time'] = np.where(df['drive_time'] < 900, df['drive_time'], df['alt_drive_time'])

df = df.loc[df['correct_drive_time'] < 900]

print(len(df))

print("correlation between primary and approximate drive time")
print(df[['drive_time','alt_drive_time','correct_drive_time']].corr())

df = df.drop(columns=['drive_time','alt_drive_time','correct_time_1'])
df = df.rename(columns={'correct_drive_time':'drive_time'})

1957877
correlation between primary and approximate drive time
                    drive_time  alt_drive_time  correct_drive_time
drive_time            1.000000        0.855134            0.991361
alt_drive_time        0.855134        1.000000            0.862633
correct_drive_time    0.991361        0.862633            1.000000


In [14]:
df = df.drop(columns=['start_time.minutes','start_time.seconds','end_time.minutes','end_time.seconds'])

# Scoring Changes

Scoring changes are probably the most reliable data in the dataset. And also the most important.

In [32]:
# from what I can tell, sorting by ID gets the plays in order. Thank sweet baby jesus, because I'm not sure there's another way
df = df.sort_values(by=['game_id','id'],ascending=True)

df['away_score'] = np.where(df['away']==df['offense'], df['offense_score'], df['defense_score'])
df['home_score'] = np.where(df['away']==df['defense'], df['offense_score'], df['defense_score'])

df['prev_home_score'] = df.groupby(['game_id'])['home_score'].shift(1)
df['prev_away_score'] = df.groupby(['game_id'])['away_score'].shift(1)

df['home_score_change'] =  df['home_score'] - df['prev_home_score']
df['away_score_change'] = df['away_score'] - df['prev_away_score'] 




In [34]:
# on kickoffs, sometimes the score doesn't fill in yet, and so you get a lot of -7s and -3s
# there are a bunch of random other positives and negatives because occasionally random plays will have 0-0 scores
tds = [6,7]
df['home_td']=0
df['away_td']=0
df['home_fg']=0
df['away_fg']=0

df.loc[df.home_score_change.isin(tds), 'home_td'] = 1
df.loc[df.home_score_change == 3, 'home_fg'] = 1

df.loc[df.away_score_change.isin(tds), 'away_td'] = 1
df.loc[df.away_score_change == 3, 'away_fg'] = 1



In [35]:
# pt = ['Rushing Touchdown','Passing Touchdown']
# tds = 

# df.groupby(['play_type'])['id'].count()

play_type
Blocked Field Goal                       446
Blocked Field Goal Touchdown              24
Blocked Punt                             347
Blocked Punt Touchdown                    18
End Period                              9206
End of Game                             3414
End of Half                             4303
Field Goal Good                        24014
Field Goal Missed                       8754
Fumble Recovery (Opponent)              3664
Fumble Recovery (Own)                   4313
Fumble Return Touchdown                  144
Interception                               2
Interception Return Touchdown            822
Kickoff                               110779
Kickoff Return (Offense)                2749
Kickoff Return Touchdown                 138
Missed Field Goal Return                  20
Missed Field Goal Return Touchdown         3
Pass                                   41589
Pass Completion                       225699
Pass Incompletion                     246860


In [36]:
df['offensive_td'] = np.where(((df['home_td']==1)&(df['home']==df['offense'])), 1, 0)
df['offensive_td'] = np.where(((df['away_td']==1)&(df['away']==df['offense'])), 1, df['offensive_td'])

df['defensive_td'] = np.where(((df['home_td']==1)&(df['home']==df['defense'])), 1, 0)
df['defensive_td'] = np.where(((df['away_td']==1)&(df['away']==df['defense'])), 1, df['defensive_td'])


72055
12132


12800


In [26]:
# defensive_td = df.loc[df.play_type=='Fumble Return Touchdown']
game_ids = list(weird.game_id.values)
sample = random.choice(game_ids)
               
test_game = df.loc[df.game_id==sample]

test_game.to_csv('sample.csv',index=False)

# Hard Part

Need to make sure drive result, play text, and play type are all consistent. 

In [15]:
# identify each play type by play text

df['fumble'] = np.where(df['play_text'].str.contains('fumble'),1,0)
df['interception'] = np.where(df['play_text'].str.contains('interception'),1,0)
df['completion'] = np.where(df['play_text'].str.contains('pass complete'),1,0)
df['incompletion'] = np.where(df['play_text'].str.contains('incomplete'),1,0)
df['sack'] = np.where(df['play_text'].str.contains('sack'),1,0)
df['touchdown'] = np.where(df['play_text'].str.contains('Touchdown|TOUCHDOWN|touchdown'),1,0)
df['touchdown'] = np.where(df['play_text'].str.contains('/^TD$/'),1,df['touchdown'])
df['rush'] = np.where(df['play_text'].str.contains('run|rush'),1,0)
df['punt'] = np.where(df['play_text'].str.contains('punt'),1,0)
# safties fixed now
df['safety'] = np.where(df['play_text'].str.contains('safety|SAFETY|Safety'),1,0)
df['punt_block'] = np.where(df['play_text'].str.contains('punt blocked'),1,0)
base = r'^{}'
expr = '(?=.*{})'
words = ['FG', 'GOOD']
fg_made = base.format(''.join(expr.format(w) for w in words))
df['fg_made'] = np.where(df.play_text.str.contains(fg_made,regex=True),1,0)
words = ['Field Goal', 'GOOD']
df['fg_made'] = np.where(df.play_text.str.contains(fg_made,regex=True),1,df['fg_made'])
words = ['FG', 'MISSED']
fg_missed = base.format(''.join(expr.format(w) for w in words))
df['fg_missed'] = np.where(df.play_text.str.contains(fg_missed,regex=True),1,0)
words = ['Field Goal', 'MISSED']
df['fg_missed'] = np.where(df.play_text.str.contains(fg_made,regex=True),1,df['fg_missed'])



In [40]:
# where everything agrees, let's go ahead and standardize
print(df.rush.sum())
print(df.completion.sum())

AttributeError: 'DataFrame' object has no attribute 'rush'

In [20]:

# from what I can tell, sorting by ID gets the plays in order. Thank sweet baby jesus, because I'm not sure there's another way
df = df.sort_values(by=['game_id','id'],ascending=True)

defensive_td = df.loc[df.play_type=='Fumble Return Touchdown']
game_ids = list(defensive_td.game_id.values)
sample = random.choice(game_ids)
               
test_game = df.loc[df.game_id==sample]

test_game.to_csv('sample.csv',index=False)




In [None]:
kickoffs = ['Kickoff','Kickoff Return (Offense)','Kickoff Return Touchdown']

nok = df.loc[~df['play_type'].isin(kickoffs)]

# adding columns to markt the first and last play of drives to help with categorizing things like fumbles
firsts = nok.groupby(['game_id','drive_id'])['id'].first().reset_index()

lasts = nok.groupby(['game_id','drive_id'])['id'].last().reset_index()

first_ids = list(firsts.id.values)
last_ids = list(lasts.id.values)

df['first_play'] = 0
df['last_play'] = 0
df.loc[df.id.isin(first_ids),'first_play'] = 1
df.loc[df.id.isin(last_ids),'last_play'] = 1

del nok 
gc.collect()

In [18]:
df.groupby(['play_type'])['id'].count()

play_type
Blocked Field Goal                       446
Blocked Field Goal Touchdown              24
Blocked Punt                             325
Blocked Punt Touchdown                    18
End Period                              9206
End of Game                             3414
End of Half                             4303
Field Goal Good                        24014
Field Goal Missed                       8754
Fumble Recovery (Opponent)              3664
Fumble Recovery (Own)                   4313
Fumble Return Touchdown                  144
Interception                               2
Interception Return Touchdown            822
Kickoff                               110779
Kickoff Return (Offense)                2749
Kickoff Return Touchdown                 138
Missed Field Goal Return                  20
Missed Field Goal Return Touchdown         3
Pass                                   41588
Pass Completion                       225699
Pass Incompletion                     246860


In [None]:
# fix fumbles
['Fumble','Fumble Recovery (Opponent)', 'Fumble Recovery (Own)', 'Fumble Return Touchdown']
# fumbles = 
wrong = df.loc[(df.play_type=='Fumble')&(df.fumble==0)]

In [17]:
# fix safeties

df.loc[(df.play_type=='Safety') & (df.play_text.str.contains('intentional grounding')), 'safety'] = 1


df.loc[(df['play_type']=='Safety')&(df['safety']==0), 'play_type'] = 'Uncategorized'


df.loc[(df['play_type']!='Safety')&(df['safety']==1), 'play_type'] = 'Safety'