In [2]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
import requests
gc.collect()

from tqdm import tqdm

# Load Data

In [3]:
# Seasons to load: 2004 up to, but not including, the current calendar year.
years = list(range(2004, int(datetime.datetime.now().year)))

# Read each season's play-by-play CSV and collect the frames in a list so they
# are concatenated once. Concatenating inside the loop re-copies every
# previously loaded row on each iteration.
season_frames = []
for year in tqdm(years):
    path = './output/'+str(year)+'/'+str(year)+'_pbp.csv'
    sea_df = pd.read_csv(path)
    season_frames.append(sea_df)

# Row labels are kept exactly as read from each file (no ignore_index).
# An empty season list still yields an empty frame rather than an error.
df = pd.concat(season_frames) if season_frames else pd.DataFrame()

num_plays = len(df)
print(str(num_plays) + " plays were loaded")

100%|██████████| 15/15 [00:17<00:00,  1.70s/it]

2122188 plays were loaded





In [15]:
year = 2018

# Fetch regular-season drive records for `year`. The query parameters go
# through `params` so they are URL-encoded, the request cannot hang forever,
# and a non-2xx response raises here instead of being parsed as drive data.
response = requests.get(
    'https://api.collegefootballdata.com/drives',
    params={'seasonType': 'regular', 'year': year},
    timeout=60,
)
response.raise_for_status()
drive_data = pd.DataFrame(response.json())


NameError: name 'requests' is not defined

In [None]:
# Preview the first rows of the drive_data frame built in the previous cell.
print(drive_data.head())

# Clean Data

In [4]:
# print(list(df))
# List every distinct play_type label present in the loaded plays.
print(list(df.play_type.unique()))

['Rush', 'Pass Incompletion', 'Timeout', 'Penalty', 'Punt Return', 'Pass Interception', 'Pass Completion', 'Uncategorized', 'Kickoff Return (Offense)', 'End Period', 'Fumble Recovery (Own)', 'Sack', 'Fumble Recovery (Opponent)', 'Interception Return Touchdown', 'Blocked Punt', 'Safety', 'Two Point Pass', 'Kickoff Return Touchdown', 'Two Point Rush', 'Blocked Field Goal', 'Blocked Punt Touchdown', 'Blocked PAT', 'Punt Return Touchdown', 'Fumble Return Touchdown', 'Kickoff Return (Defense)', 'Blocked Field Goal Touchdown', 'Punt', 'Pass', 'Kickoff', 'Extra Point Good', 'Field Goal Good', 'Field Goal Missed', 'Extra Point Missed', '2pt Conversion', 'Offensive 1pt Safety', 'Pass Reception', 'Passing Touchdown', 'Rushing Touchdown', 'Pass Interception Return', 'End of Half', 'End of Game', 'Defensive 2pt Conversion', 'Missed Field Goal Return', 'Interception', 'Missed Field Goal Return Touchdown']


In [5]:
# Check for plays with no usable play text.
# play_text itself is left untouched: Series.to_string() renders the whole
# column as ONE string, so assigning it back would overwrite every row's text.
# Missing text is counted as length 0.
text_length = df['play_text'].fillna('').astype(str).str.len()

# nans = df.loc[df['play_text']=='NaN']
# Compare per row; len() of the column is just the row count (a scalar), which
# would give every row the same flag.
df['empties'] = np.where(text_length < 4, 1, 0)
empties = df.loc[df['empties']==1]

print(empties.play_text)

In [None]:
# Inspect the plays labelled 'Uncategorized' before fixing them
# (5,580 such rows when this cell was last run).

uncat = df[df['play_type'].eq('Uncategorized')]
print(uncat.shape[0])
print(uncat['play_text'].values)

5580


In [23]:
# Move kickoff plays out of df into their own frame.
kickoffs_cats = [
    'Kickoff Return (Offense)',
    'Kickoff Return Touchdown',
    'Kickoff Return (Defense)',
    'Kickoff',
]
is_kickoff = df['play_type'].isin(kickoffs_cats)
kickoffs = df[is_kickoff]
df = df[~is_kickoff]

In [25]:
# Move extra-point / conversion attempts out of df into their own frame.
xp_cats = [
    'Two Point Pass',
    'Two Point Rush',
    'Blocked PAT',
    'Extra Point Good',
    'Extra Point Missed',
    '2pt Conversion',
    'Offensive 1pt Safety',
    'Defensive 2pt Conversion',
]
is_xp = df['play_type'].isin(xp_cats)
xps = df[is_xp]
df = df[~is_xp]

In [27]:
# Look at the plays recorded with a distance of exactly 0.
zeros = df[df['distance'].eq(0)]
print(zeros.shape[0])
print(zeros['play_text'].values)

96193
['Extra point by Ryan Killeen (USC) is good.'
 'Trojans fumble by Alex Holmes (USC), recovered by Alex Holmes (USC), advanced for no gain.'
 'Extra point by Brandon Pace (VT) is good.' ...
 'Riley Lees 4 Yd pass from Clayton Thorson (Charlie Kuhbander Kick)'
 'Jared McGee 82 Yd Fumble Return (Charlie Kuhbander Kick)'
 'Riley Lees 8 Yd Run (Charlie Kuhbander Kick)']


In [4]:
def fix_uncat(play_type, play_text):
    """Infer a play type for an 'Uncategorized' play from its text.

    Parameters
    ----------
    play_type : the play's current type label.
    play_text : the play's free-text description (may be a non-string,
        e.g. a missing value).

    Returns
    -------
    The inferred label when the text matches a known pattern; otherwise
    ``play_type`` unchanged. Plays that are not 'Uncategorized', or whose
    text is not a string, are always returned unchanged.
    """
    if play_type != 'Uncategorized' or not isinstance(play_text, str):
        return play_type

    # Period / game boundary announcements, checked in this order.
    boundary_labels = (
        ("Start of the 1st quarter.", "End Period"),
        ("Start of the 2nd quarter.", "End Period"),
        ("Start of the 3rd quarter.", "End of Half"),
        ("Start of the 4th quarter.", "End Period"),
        ("Start of overtime.", "End Period"),
        ("End of the game.", "End of Game"),
    )
    for marker, label in boundary_labels:
        if marker in play_text:
            return label

    # "is no good." only counts when it appears in the last 13 characters.
    missed_at_end = "is no good." in play_text[-13:]

    if "Extra point" in play_text:
        if "is good" in play_text:
            return "Extra Point Good"
        if missed_at_end:
            return "Extra Point Missed"
        return play_type

    if "field goal" in play_text:
        if "is good" in play_text:
            return "Field Goal Good"
        if missed_at_end:
            return "Field Goal Missed"
        # Unrecognised field-goal wording: echo it for manual inspection.
        print(play_text)
        return play_type

    if "missed PAT returned." in play_text:
        return "Extra Point Missed"

    return play_type

# Re-label 'Uncategorized' plays from their text. Iterating the two columns
# directly avoids DataFrame.apply(axis=1), which builds a Series for every
# one of the ~2M rows.
df['play_type'] = [
    fix_uncat(play_type, play_text)
    for play_type, play_text in zip(df['play_type'], df['play_text'])
]

# Show what is still uncategorized after the fix.
uncat = df.loc[df.play_type=='Uncategorized']
print(len(uncat))
print(uncat.play_text.head(50))

119
3940     DeAngelo Hall (VT) took lateral and rushed for...
11077    Terrence Biggers (MSU) took lateral and rushed...
11092    Derek Abney (UK) took lateral and rushed for 1...
13313    Terrance Phillips (PSU) took lateral and rushe...
28227    Andrico Hines (MTSU) took lateral and rushed f...
30739    Lance Pendleton (BYU) took lateral and rushed ...
31087    Chris Bruhn (WSU) took lateral and rushed for ...
33853    Tim Blackwell (USM) took lateral and rushed fo...
36684    Bruce Gradkowski (TOL) took lateral and rushed...
39523    Michael Turner (NIU) took lateral and rushed f...
43241    Jason Samples (TSU) took lateral and rushed fo...
48186    Steve Breaston (MICH) took lateral and rushed ...
52925    Duane Coleman (CLEM) took lateral and rushed f...
53764    Scott Lunde (WSU) took lateral and rushed for ...
63806    Aric Williams (OSU) took lateral and rushed fo...
75692    Garrett Lepisto (UCLA) took lateral and rushed...
83122    Sean Taylor (MIA) took lateral and rushed f

# Need Separate Model for XP, Kickoffs

In [5]:
# Play types set aside from the main frame: matching rows are moved from df
# into sep_df. Row counts are printed before the split, for the removed
# part, and for what remains.
separate = [
    'End Period', 'Kickoff Return (Offense)', 'Extra Point Good', 'Timeout',
    'End of Half', 'End of Game', 'Two Point Pass', 'Two Point Rush',
    'Kickoff Return (Defense)', 'Uncategorized', 'Kickoff Return Touchdown',
    'Blocked PAT', 'Kickoff', 'Extra Point Missed', '2pt Conversion',
    'Defensive 2pt Conversion', 'Offensive 1pt Safety',
]

print(len(df))
is_separate = df['play_type'].isin(separate)
sep_df = df[is_separate]
print(len(sep_df))
df = df[~is_separate]
print(len(df))


2223578
272728
1950850


In [6]:
# Keep regulation play only (periods 1-4); this removes overtime plays and
# the 61 rows recorded with period 0.
print(len(df))
in_regulation = df['period'].isin([1, 2, 3, 4])
df = df[in_regulation]
print(len(df))

1950850
1944000


In [7]:
# The clock columns hold NaN where a zero is meant; fill both with 0.
for clock_col in ('clock.seconds', 'clock.minutes'):
    df[clock_col] = df[clock_col].fillna(0)

print(list(df['clock.seconds'].unique()))

# Number of plays per clock.seconds value, most frequent first.
gb = df.groupby(['clock.seconds'])['id'].count().sort_values(ascending=False)
print(gb.head(10))

[0.0, 25.0, 45.0, 30.0, 11.0, 15.0, 40.0, 55.0, 18.0, 22.0, 54.0, 23.0, 33.0, 44.0, 20.0, 34.0, 4.0, 10.0, 53.0, 56.0, 51.0, 21.0, 6.0, 16.0, 46.0, 3.0, 58.0, 7.0, 47.0, 27.0, 57.0, 17.0, 48.0, 37.0, 24.0, 14.0, 50.0, 5.0, 35.0, 43.0, 39.0, 52.0, 26.0, 36.0, 42.0, 12.0, 2.0, 32.0, 28.0, 8.0, 31.0, 19.0, 9.0, 29.0, 13.0, 41.0, 59.0, 38.0, 49.0, 1.0]
clock.seconds
0.0     198033
30.0     68195
45.0     55273
50.0     49608
15.0     49477
20.0     48352
40.0     47525
55.0     46692
10.0     44460
25.0     39756
Name: id, dtype: int64


In [8]:
# Time remaining in the current half, in seconds.
def tr_half(period, minutes, seconds):
    """Return the seconds left in the half for a given period and clock.

    The clock reading (``minutes``, ``seconds``) is converted to seconds; when
    ``period`` is 1 or 3, a further 900 seconds are added for the quarter
    that follows it in the same half.
    """
    following_quarter = 900 if period in [1, 3] else 0
    return following_quarter + (60 * minutes + seconds)

def tr_game(period, minutes, seconds):
    """Return the seconds left in regulation for a given period and clock.

    Adds 15 * 60 seconds for each of the ``4 - period`` quarters still to
    come to the clock reading converted to seconds.
    """
    return 60 * minutes + seconds + 15 * 60 * (4 - period)

# Derive both time-remaining features from the period and clock columns.
# Iterating the three columns directly avoids DataFrame.apply(axis=1), which
# builds a Series for every row.
df['tr_half'] = [
    tr_half(period, minutes, seconds)
    for period, minutes, seconds in zip(df['period'], df['clock.minutes'], df['clock.seconds'])
]
df['tr_game'] = [
    tr_game(period, minutes, seconds)
    for period, minutes, seconds in zip(df['period'], df['clock.minutes'], df['clock.seconds'])
]

print(df[['period','clock.minutes','clock.seconds','tr_half','tr_game']].head())

   period  clock.minutes  clock.seconds  tr_half  tr_game
2       1           14.0            0.0   1740.0   3540.0
3       1           14.0           25.0   1765.0   3565.0
4       1           14.0           45.0   1785.0   3585.0
5       1           13.0           30.0   1710.0   3510.0
6       1           11.0           11.0   1571.0   3371.0


In [9]:
# The raw clock columns are no longer needed once tr_half and tr_game have
# been derived from them.
df = df.drop(columns=['clock.minutes','clock.seconds'])

# Get Desired Features

We need six feature variables, two weighting variables, and one target.

6 features:  
-Down  
-Seconds left in half  
-Yards to go for touchdown (log?)  
-Yards to go for first down (log?)  
-Goal-to-go indicator  
-Under 2 minutes indicator  

2 weights:  
-Number of drives to next score  
-Absolute score differential  

We also need a target variable: the next score, relative to the current offense.


In [10]:
# Discard plays with no play text; print the row count before and after.
print(len(df))
has_text = df['play_text'].notna()
df = df[has_text]
print(len(df))

1944000
1943642


In [11]:
def add_tds(play_type, play_text):
    """Return 1 if the play looks like a touchdown, else 0.

    Penalties and plays whose text is not a string are never flagged.
    Otherwise the play is flagged when its type contains 'Touchdown' or its
    text contains 'touchdown' or 'for a TD'.
    """
    if play_type == 'Penalty' or not isinstance(play_text, str):
        return 0

    looks_like_td = (
        'Touchdown' in play_type
        or 'touchdown' in play_text
        or 'for a TD' in play_text
    )
    return 1 if looks_like_td else 0
    
# Flag touchdown plays. Iterating the two columns directly avoids
# DataFrame.apply(axis=1), which builds a Series for every row.
df['touchdown'] = [
    add_tds(play_type, play_text)
    for play_type, play_text in zip(df['play_type'], df['play_text'])
]
    


In [12]:
# List the play types that occur among plays flagged as touchdowns, to decide
# how each type should be classified.
td_plays = df.loc[df['touchdown']==1]
print(list(td_plays.play_type.unique()))

['Pass Completion', 'Blocked Punt Touchdown', 'Rush', 'Fumble Recovery (Own)', 'Interception Return Touchdown', 'Punt Return Touchdown', 'Fumble Return Touchdown', 'Blocked Field Goal Touchdown', 'Missed Field Goal Return Touchdown', 'Sack', 'Pass Incompletion', 'Passing Touchdown', 'Rushing Touchdown', 'Punt', 'Fumble Recovery (Opponent)', 'Pass Reception', 'Blocked Punt', 'Pass Interception Return', 'Blocked Field Goal']


In [13]:
# Play types whose flagged touchdowns are treated as defensive touchdowns.
defensive_tds = [
    'Blocked Punt Touchdown',
    'Interception Return Touchdown',
    'Punt Return Touchdown',
    'Fumble Return Touchdown',
    'Blocked Field Goal Touchdown',
    'Missed Field Goal Return Touchdown',
    'Sack',
]

# Play types excluded from offensive touchdowns even when flagged.
not_touchdowns = ['Pass Incompletion']

# Everything that must not be counted as an offensive touchdown.
dtd_nt = [*defensive_tds, *not_touchdowns]

# Listed for reference; not used by the masks below.
offensive_tds = [
    'Pass Completion',
    'Rush',
    'Fumble Recovery (Own)',
    'Rushing Touchdown',
    'Passing Touchdown',
]

# Split flagged touchdowns into offensive and defensive indicators.
is_td = df['touchdown'].eq(1)

df['offensive_TD'] = np.where(~df['play_type'].isin(dtd_nt) & is_td, 1, 0)

df['defensive_TD'] = np.where(df['play_type'].isin(defensive_tds) & is_td, 1, 0)



In [14]:
# Indicator columns for made field goals and safeties, taken straight from
# the play type.

play_kind = df['play_type']
df['fg'] = np.where(play_kind.eq('Field Goal Good'), 1, 0)
df['safety'] = np.where(play_kind.eq('Safety'), 1, 0)


In [15]:
## Remove play 322660059036 (original note: 32266005905, a safety that was
## somehow recorded on a kickoff).
df = df[df['id'].ne(322660059036)]

## Two plays filed under 4010320813 are reassigned to drive 4010320812.
misfiled_play_ids = [401032081101874002, 401032081101907203]
df.loc[df['id'].isin(misfiled_play_ids), 'drive_id'] = 4010320812

## Drive id 40054786811 covers two drives; Baylor's plays get their own id.
baylor_plays = df['drive_id'].eq(40054786811) & df['offense'].eq('Baylor')
df.loc[baylor_plays, 'drive_id'] = 4005478681100

In [16]:
# Collapse plays to one row per drive: each scoring flag is 1 if any play on
# the drive set it. The columns are selected with a list, because selecting
# several groupby columns with a bare tuple is no longer accepted by pandas.
score_flags = ['offensive_TD', 'defensive_TD', 'fg', 'safety']
drive_gb = df.groupby('drive_id')[score_flags].max().reset_index()

# Net points of the drive: +7 offensive TD, -7 defensive TD, +3 field goal,
# -2 safety.
drive_gb['drive_score'] = (
    7 * drive_gb['offensive_TD']
    - 7 * drive_gb['defensive_TD']
    + 3 * drive_gb['fg']
    - 2 * drive_gb['safety']
).astype(int)
drive_gb = drive_gb[['drive_id','drive_score']]

# Attach the drive score to every play; the join key is named once.
df = pd.merge(left=df, right=drive_gb, how='left', on='drive_id')

In [17]:
# Clock values are not consistent for some plays, so drives are ordered by
# the average time remaining (tr_game) across all plays on the drive.

tr = (
    df.groupby('drive_id')['tr_game']
    .mean()
    .reset_index()
    .rename(columns={'tr_game':'avg_drive_time'})
)
print(tr.head(5))

# Attach the per-drive average to every play; the join key is named once
# rather than listed twice.
df = pd.merge(left=df, right=tr, how='left', on='drive_id')

     drive_id  avg_drive_time
0  4005476401         3518.75
1  4005476402         3361.00
2  4005476403         3255.25
3  4005476404         3145.60
4  4005476405         3064.50


In [20]:
# Periods below 3 belong to half 1, everything else to half 2.
df['half'] = np.where(df['period'].lt(3), 1, 2)
# A drive is a scoring drive when its drive_score is non-zero.
df['is_scoring_drive'] = np.where(df['drive_score'].ne(0), 1, 0)

In [4]:
# games=df.groupby(['game_id'])
# print(len(games))

# del df
# gc.collect()

# new_df = pd.DataFrame()

# counter = 0
# for game, game_plays in games:
#     counter += 1
#     if counter % 1000 == 0:
#         print(counter)
#     # sort by time remaining to order them
#     ordered = game_plays.sort_values(by=['avg_drive_time'],ascending=False)
#     # label drive numbers
#     i = ordered.drive_id
#     ordered['drive_no'] = i.ne(i.shift()).cumsum()
    
#     scoring_drives_1H = ordered.loc[(ordered['is_scoring_drive']==1)&(ordered['half']==1)].copy()
#     scoring_drives_2H = ordered.loc[(ordered['is_scoring_drive']==1)&(ordered['half']==2)].copy()
    
#     # get last score of half drive number
#     last_score_1H = scoring_drives_1H.drive_no.max()
#     last_score_2H = scoring_drives_2H.drive_no.max()
    
#     # need drive numbers, drive scores, and drive offenses for each scoring drive for each half
#     sdn_1H = scoring_drives_1H.drive_no.values
#     sdn_2H = scoring_drives_2H.drive_no.values
    
#     ds_1H = scoring_drives_1H.drive_score.values
#     ds_2H = scoring_drives_2H.drive_score.values
    
#     sdo_1H = scoring_drives_1H.offense.values
#     sdo_2H = scoring_drives_2H.offense.values
    
#     # also need to split plays into first and second half
#     drives_1H = ordered.loc[ordered['half']==1].copy()
#     drives_2H = ordered.loc[ordered['half']==2].copy()
    
#     if len(drives_1H) < 1:
#         continue
#     if len(drives_2H) < 1:
#         continue
    
#     # drive numbers for first half
#     dn_1H = drives_1H.drive_no.values
#     dn_2H = drives_2H.drive_no.values
    
#     # if last drive of half is not scoring drive, add dummy scoring drive with zeros
#     # also treat cases where there are no scoring drives
#     if len(sdn_1H) == 0:
#         sdn_1H = np.append(sdn_1H, dn_1H[-1])
#         ds_1H = np.append(ds_1H, 0)
#         sdo_1H = np.append(sdo_1H, "Dummy Offense")
#     elif sdn_1H[-1] < dn_1H[-1]:
#         sdn_1H = np.append(sdn_1H, dn_1H[-1])
#         ds_1H = np.append(ds_1H, 0)
#         sdo_1H = np.append(sdo_1H, sdo_1H[-1])

#     if len(sdn_2H) == 0:
#         sdn_2H = np.append(sdn_2H, dn_2H[-1])
#         ds_2H = np.append(ds_2H, 0)
#         sdo_2H = np.append(sdo_2H, "Dummy Offense")
#     elif sdn_2H[-1] < dn_2H[-1]:
#         sdn_2H = np.append(sdn_2H, dn_2H[-1])
#         ds_2H = np.append(ds_2H, 0)
#         sdo_2H = np.append(sdo_2H, sdo_2H[-1])
    
#     # get index to lookup drive numbers, drive scores, and drive offenses
#     drives_1H['next_idx'] = np.searchsorted(sdn_1H,dn_1H,'left')
#     drives_2H['next_idx'] = np.searchsorted(sdn_2H,dn_2H,'left')
    
#     drives_1H['next_sd'] = sdn_1H[drives_1H.next_idx.values]
#     drives_1H['dtns'] = drives_1H['next_sd'].astype(int) - drives_1H['drive_no'].astype(int)
    
#     drives_2H['next_sd'] = sdn_2H[drives_2H.next_idx.values]
#     drives_2H['dtns'] = drives_2H['next_sd'].astype(int) - drives_2H['drive_no'].astype(int)
    
#     drives_1H['next_score'] = ds_1H[drives_1H.next_idx.values]
#     drives_1H['ns_offense'] = sdo_1H[drives_1H.next_idx.values]
    
#     drives_2H['next_score'] = ds_2H[drives_2H.next_idx.values]
#     drives_2H['ns_offense'] = sdo_2H[drives_2H.next_idx.values]
    
#     new_df = pd.concat([new_df,drives_1H])
#     new_df = pd.concat([new_df,drives_2H])
    
    

# print(len(new_df))
# PATH = './output/master.csv'
# new_df.to_csv(PATH,index=False)
# del new_df
# gc.collect()

# Load the play frame from master.csv, the same path the commented-out block
# above writes with to_csv. This replaces whatever df held before.
PATH = './output/master.csv'

df= pd.read_csv(PATH)



In [5]:
# Show the column names of the reloaded frame.
print(list(df))

['away', 'defense', 'defense_conference', 'defense_score', 'distance', 'down', 'drive_id', 'home', 'id', 'offense', 'offense_conference', 'offense_score', 'period', 'play_text', 'play_type', 'yard_line', 'yards_gained', 'game_id', 'tr_half', 'tr_game', 'touchdown', 'offensive_TD', 'defensive_TD', 'fg', 'safety', 'drive_score', 'avg_drive_time', 'half', 'is_scoring_drive', 'drive_no', 'next_idx', 'next_sd', 'dtns', 'next_score', 'ns_offense']


In [6]:
# fix/validate yardline log?
# Keep yard_line unchanged when the home team is on offense; otherwise mirror
# it as 100 - yard_line. The raw column is dropped afterwards.
home_on_offense = df['home'].eq(df['offense'])
mirrored_yard_line = 100 - df['yard_line']
df['offense_yard_line'] = np.where(home_on_offense, df['yard_line'], mirrored_yard_line)

df = df.drop(columns=['yard_line'])
                                       

In [7]:
# Goal to go: 1 when offense_yard_line plus distance is exactly 100.
reaches_goal_line = df['offense_yard_line'].add(df['distance']).eq(100)
df['GTG'] = np.where(reaches_goal_line, 1, 0)

In [8]:
# Under two minutes: 1 when 120 seconds or fewer remain in the half.
df['UTM'] = np.where(df['tr_half'].le(120), 1, 0)

In [9]:
# Drop plays whose text contains either marker below, printing the row count
# before and after.
print(len(df))
for marker in ("kneel", "TEAM run for a loss"):
    # regex=False matches the marker literally. na=False makes rows with
    # missing text count as non-matching, so the mask stays boolean and ~
    # works even when play_text contains NaN.
    df = df[~df['play_text'].str.contains(marker, regex=False, na=False)]

print(len(df))

1941622
1936696


In [10]:
# Weight by absolute score differential: scaled linearly from 1 at a tie
# down to 0 at a 30-point margin, and 0 for anything beyond 30.

df['score_diff'] = df['offense_score'].sub(df['defense_score'])
df['abs_SD'] = df['score_diff'].abs()
df['SD_weight'] = ((30 - df['abs_SD']) / 30).clip(lower=0)

In [11]:
# Weight by drives until the next score (dtns): (5 - dtns) / 5, floored at 0,
# so 5 or more drives away carries no weight.
df['dtns_weight'] = ((5 - df['dtns']) / 5).clip(lower=0)

print(df[['dtns','dtns_weight','next_score']].head(75))

    dtns  dtns_weight  next_score
0      8          0.0           7
1      8          0.0           7
2      8          0.0           7
3      8          0.0           7
4      8          0.0           7
5      8          0.0           7
6      8          0.0           7
7      8          0.0           7
8      8          0.0           7
9      7          0.0           7
10     7          0.0           7
11     7          0.0           7
12     7          0.0           7
13     6          0.0           7
14     6          0.0           7
15     5          0.0           7
16     5          0.0           7
17     5          0.0           7
18     5          0.0           7
19     4          0.2           7
20     4          0.2           7
21     4          0.2           7
22     4          0.2           7
23     4          0.2           7
24     4          0.2           7
25     4          0.2           7
26     4          0.2           7
27     3          0.4           7
28     3      

In [12]:
# Drop bad data: keep only plays with a distance of at least 0.5, printing
# the row count before and after.
print(len(df))
has_valid_distance = df['distance'].ge(0.5)
df = df[has_valid_distance]
print(len(df))

1936696
1854667


In [13]:
# 1st down YTG log?
# field YTG log?

# Base-10 log of the distance column. The previous cell keeps only rows with
# distance >= 0.5, so the argument is always positive here.
df['FD_YTG_log'] = np.log10(df['distance'])

In [16]:
# Count non-null dtns / drive_score values per drive. The columns are
# selected with a list, because selecting several groupby columns with a bare
# tuple is no longer accepted by pandas.
drives = df.groupby('drive_id')[['dtns', 'drive_score']].count().reset_index()

# Drives that consist of exactly one counted play.
drive_ids = drives.loc[drives['dtns']==1]

di_ones = list(drive_ids.drive_id.values)

one_play = df.loc[df['drive_id'].isin(di_ones)]

# Show the text of those single-play drives.
print(one_play.play_text.values)

# gb = drives.groupby(['drive_score']).count().reset_index()
# gb['rate'] = gb['drive_id']/(gb['drive_id'].sum())
# print(gb)
# print(drives)

['Larry Johnson (PSU) rushed left side for 17 yards.'
 'Horned Frogs fumble by Sean Stilley (TCU), recovered by Drew Wood (CSU), 7 yard return.'
 'Fabian Walker (FSU) pass right side intercepted by Bruce Thornton (GA). Returned for a 71 yard touchdown.'
 ... 'Napoleon Maxwell run for 1 yd to the FlaIn 36'
 'LeVante Bellamy run for 11 yds to the WMich 37 LeVante Bellamy fumbled, recovered by Toled Jamal Hines'
 'Charlie Brewer pass intercepted Shea Campbell return for 2 yds to the Bayl 33']
