In [19]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
gc.collect()

from tqdm import tqdm

# Load Data

In [20]:
years = list(range(2002, int(datetime.datetime.now().year)))

df = pd.DataFrame()
for year in tqdm(years):
    path = './output/'+str(year)+'/'+str(year)+'_pbp.csv'
    sea_df = pd.read_csv(path)
    df = pd.concat([df,sea_df])

num_plays = len(df)
print(str(num_plays) + " plays were loaded")

100%|██████████| 17/17 [00:18<00:00,  1.71s/it]

2223578 plays were loaded





In [21]:
print(list(df))
print(list(df.play_type.unique()))

['away', 'clock.minutes', 'clock.seconds', 'defense', 'defense_conference', 'defense_score', 'distance', 'down', 'drive_id', 'home', 'id', 'offense', 'offense_conference', 'offense_score', 'period', 'play_text', 'play_type', 'yard_line', 'yards_gained', 'game_id']
['Uncategorized', 'Kickoff Return (Offense)', 'Sack', 'Rush', 'Punt Return', 'Penalty', 'Pass Completion', 'Pass Incompletion', 'Safety', 'End Period', 'Pass Interception', 'Blocked Punt Touchdown', 'Fumble Recovery (Own)', 'Timeout', 'Fumble Recovery (Opponent)', 'Two Point Pass', 'Two Point Rush', 'Interception Return Touchdown', 'Blocked Punt', 'Punt Return Touchdown', 'Blocked Field Goal', 'Kickoff Return (Defense)', 'Fumble Return Touchdown', 'Kickoff Return Touchdown', 'Blocked PAT', 'Blocked Field Goal Touchdown', 'Missed Field Goal Return Touchdown', 'Punt', 'Pass', 'Kickoff', 'Extra Point Good', 'Field Goal Good', 'Field Goal Missed', 'Extra Point Missed', '2pt Conversion', 'Offensive 1pt Safety', 'Pass Reception', '

In [22]:
def fix_uncat(play_type, play_text):
    if play_type != 'Uncategorized':
        return play_type
    else:
        if isinstance(play_text,str):
            if "Start of the 1st quarter." in play_text:
                return "End Period"
            elif "Start of the 2nd quarter." in play_text:
                return "End Period"
            elif "Start of the 3rd quarter." in play_text:
                return "End of Half"
            elif "Start of the 4th quarter." in play_text:
                return "End Period"
            elif "Start of overtime." in play_text:
                return "End Period"
            elif "End of the game." in play_text:
                return "End of Game"
            elif "Extra point" in play_text:
                if "is good" in play_text:
                    return "Extra Point Good"
                elif "is no good." in play_text[-13:]:
                    return "Extra Point Missed"
                else:
                    return play_type
            elif "field goal" in play_text:
                if "is good" in play_text:
                    return "Field Goal Good"
                elif "is no good." in play_text[-13:]:
                    return "Field Goal Missed"
                else:
                    print(play_text)
                    return play_type
            elif "missed PAT returned." in play_text:
                return "Extra Point Missed"
            else:
                return play_type
    return play_type

df['play_type'] = df.apply(lambda row: fix_uncat(row['play_type'], row['play_text']),axis=1)

uncat = df.loc[df.play_type=='Uncategorized']
print(len(uncat))
print(uncat.play_text.head(50))

119
3940     DeAngelo Hall (VT) took lateral and rushed for...
11077    Terrence Biggers (MSU) took lateral and rushed...
11092    Derek Abney (UK) took lateral and rushed for 1...
13313    Terrance Phillips (PSU) took lateral and rushe...
28227    Andrico Hines (MTSU) took lateral and rushed f...
30739    Lance Pendleton (BYU) took lateral and rushed ...
31087    Chris Bruhn (WSU) took lateral and rushed for ...
33853    Tim Blackwell (USM) took lateral and rushed fo...
36684    Bruce Gradkowski (TOL) took lateral and rushed...
39523    Michael Turner (NIU) took lateral and rushed f...
43241    Jason Samples (TSU) took lateral and rushed fo...
48186    Steve Breaston (MICH) took lateral and rushed ...
52925    Duane Coleman (CLEM) took lateral and rushed f...
53764    Scott Lunde (WSU) took lateral and rushed for ...
63806    Aric Williams (OSU) took lateral and rushed fo...
75692    Garrett Lepisto (UCLA) took lateral and rushed...
83122    Sean Taylor (MIA) took lateral and rushed f

# Need Separate Model for XP, Kickoffs

In [23]:
# drop_cols
separate = ['End Period', 'Kickoff Return (Offense)', 'Extra Point Good', 'Timeout',
 'End of Half', 'End of Game', 'Two Point Pass', 'Two Point Rush', 
 'Kickoff Return (Defense)', 'Uncategorized', 'Kickoff Return Touchdown', 'Blocked PAT','Kickoff', 
 'Extra Point Missed', '2pt Conversion', 'Defensive 2pt Conversion', 'Offensive 1pt Safety']

print(len(df))
sep_df = df.loc[df.play_type.isin(separate)]
print(len(sep_df))
df = df.loc[~df.play_type.isin(separate)]
print(len(df))


2223578
272728
1950850


In [24]:
# drop overtime and 61 period 0 entries
print(len(df))
df = df.loc[df.period.isin([1,2,3,4])]
print(len(df))

1950850
1944000


In [25]:
# instead of zero its nan for clock.seconds and clock.minutes
df['clock.seconds'] = df['clock.seconds'].fillna(0)
df['clock.minutes'] = df['clock.minutes'].fillna(0)

print(list(df['clock.seconds'].unique()))

gb = df.groupby(['clock.seconds'])['id'].count()
gb = gb.sort_values(ascending=False)
print(gb.head(10))

[0.0, 25.0, 45.0, 30.0, 11.0, 15.0, 40.0, 55.0, 18.0, 22.0, 54.0, 23.0, 33.0, 44.0, 20.0, 34.0, 4.0, 10.0, 53.0, 56.0, 51.0, 21.0, 6.0, 16.0, 46.0, 3.0, 58.0, 7.0, 47.0, 27.0, 57.0, 17.0, 48.0, 37.0, 24.0, 14.0, 50.0, 5.0, 35.0, 43.0, 39.0, 52.0, 26.0, 36.0, 42.0, 12.0, 2.0, 32.0, 28.0, 8.0, 31.0, 19.0, 9.0, 29.0, 13.0, 41.0, 59.0, 38.0, 49.0, 1.0]
clock.seconds
0.0     198033
30.0     68195
45.0     55273
50.0     49608
15.0     49477
20.0     48352
40.0     47525
55.0     46692
10.0     44460
25.0     39756
Name: id, dtype: int64


In [26]:
# # calculate time remaining in half
def tr_half(period, minutes, seconds):
    tr = 0
    if period in [1,3]:
        # add a quarter of time remaining
        tr += 900
    tr += (60 * minutes + seconds)
    return tr

def tr_game(period, minutes, seconds):
    quarters_left = 4-period
    added_secs = 15*60*quarters_left
    return (60*minutes + seconds + added_secs)

df['tr_half'] = df.apply(lambda row: tr_half(row['period'],row['clock.minutes'],row['clock.seconds']),axis=1)
df['tr_game'] = df.apply(lambda row: tr_game(row['period'],row['clock.minutes'],row['clock.seconds']),axis=1)

print(df[['period','clock.minutes','clock.seconds','tr_half','tr_game']].head())

   period  clock.minutes  clock.seconds  tr_half  tr_game
2       1           14.0            0.0   1740.0   3540.0
3       1           14.0           25.0   1765.0   3565.0
4       1           14.0           45.0   1785.0   3585.0
5       1           13.0           30.0   1710.0   3510.0
6       1           11.0           11.0   1571.0   3371.0


In [27]:
# drop clock numbers, not needed anymore 
df = df.drop(columns=['clock.minutes','clock.seconds'])

# Get Desired Features

Need 6 feature variables. 2 weighting variables. and one target.

6 features:  
-Down  
-Seconds left in half  
-Yards to go for touchdown (log?)  
-Yards to go for first down (log?)  
-Goal to goal indicator  
-Under 2 minutes indicator  

2 weights:  
-number of drives to next score
-absolute score differential

Also need target variable, next score relative to current offense.


In [28]:
print(len(df))
df = df.dropna(subset=['play_text'])
print(len(df))

1944000
1943642


In [29]:
def add_tds(play_type, play_text):
    if play_type != 'Penalty':
        if isinstance(play_text, str):
            if 'Touchdown' in play_type:
                return 1
            elif 'touchdown' in play_text:
                return 1
            elif 'for a TD' in play_text:
                return 1
    return 0
    
df['touchdown'] = df.apply(lambda row: add_tds(row['play_type'],row['play_text']), axis=1)
    


In [30]:
td_plays = df.loc[df['touchdown']==1]
print(list(td_plays.play_type.unique()))

['Pass Completion', 'Blocked Punt Touchdown', 'Rush', 'Fumble Recovery (Own)', 'Interception Return Touchdown', 'Punt Return Touchdown', 'Fumble Return Touchdown', 'Blocked Field Goal Touchdown', 'Missed Field Goal Return Touchdown', 'Sack', 'Pass Incompletion', 'Passing Touchdown', 'Rushing Touchdown', 'Punt', 'Fumble Recovery (Opponent)', 'Pass Reception', 'Blocked Punt', 'Pass Interception Return', 'Blocked Field Goal']


In [31]:
defensive_tds = ['Blocked Punt Touchdown', 'Interception Return Touchdown','Punt Return Touchdown',
            'Fumble Return Touchdown','Blocked Field Goal Touchdown','Missed Field Goal Return Touchdown',
             'Sack']

not_touchdowns = ['Pass Incompletion']

# create list for faster comparison
dtd_nt = defensive_tds + not_touchdowns

offensive_tds = ['Pass Completion','Rush','Fumble Recovery (Own)','Rushing Touchdown','Passing Touchdown']

# split into offensive and defensive touchdowns

df['offensive_TD'] = np.where(((~df['play_type'].isin(dtd_nt)) & (df['touchdown']==1)),1,0)

df['defensive_TD'] = np.where(((df['play_type'].isin(defensive_tds)) & (df['touchdown']==1)),1,0)



In [32]:
# add field goals and safeties

df['fg'] = np.where(df['play_type'] == 'Field Goal Good',1,0)
df['safety'] = np.where(df['play_type'] == 'Safety',1,0)


In [34]:
## 32266005905 remove safety, it was on kickoff somehow
df = df.loc[df['id']!=322660059036]

## 4010320813 has two plays from 4010320812
df.loc[df['id'] == 401032081101874002, ['drive_id']] = 4010320812
df.loc[df['id'] == 401032081101907203, ['drive_id']] = 4010320812

## 40054786811 has two drives
df.loc[(df['drive_id']==40054786811) & (df['offense']=='Baylor'), ['drive_id']] = 4005478681100

In [35]:
drive_gb = df.groupby(['drive_id'])['offensive_TD','defensive_TD','fg','safety'].max().reset_index()

drive_gb['drive_score'] = 7 * drive_gb['offensive_TD'] + -7 * drive_gb['defensive_TD'] + 3 * drive_gb['fg'] + -2 * drive_gb['safety']
drive_gb['drive_score'] = drive_gb['drive_score'].astype(int)
drive_gb = drive_gb[['drive_id','drive_score']]

df = pd.merge(left=df, right=drive_gb, how='left', on=['drive_id','drive_id'])

In [36]:
# since clock numbers aren't consistent for some plays, I am sorting drives by average time remaining 
# of all plays on the drive

tr = df.groupby(['drive_id'])['tr_game'].mean().reset_index()

tr = tr.rename(columns={'tr_game':'avg_drive_time'})
print(tr.head(5))

df = pd.merge(left=df, right=tr, how='left', on=['drive_id','drive_id'])

     drive_id  avg_drive_time
0  4005476401         3518.75
1  4005476402         3361.00
2  4005476403         3255.25
3  4005476404         3145.60
4  4005476405         3064.50


In [37]:
df['half'] = np.where(df['period'] < 3, 1, 2)
df['is_scoring_drive'] = np.where(df['drive_score'] != 0, 1, 0)

In [38]:
print(list(df))
print(df.head())
raise ValueError

['away', 'defense', 'defense_conference', 'defense_score', 'distance', 'down', 'drive_id', 'home', 'id', 'offense', 'offense_conference', 'offense_score', 'period', 'play_text', 'play_type', 'yard_line', 'yards_gained', 'game_id', 'tr_half', 'tr_game', 'touchdown', 'offensive_TD', 'defensive_TD', 'fg', 'safety', 'drive_score', 'avg_drive_time', 'half', 'is_scoring_drive']
         away     defense defense_conference  defense_score  distance  down  \
0  Louisville  Louisville     Conference USA              0         5     3   
1  Louisville  Louisville     Conference USA              0         9     2   
2  Louisville  Louisville     Conference USA              0        10     1   
3  Louisville  Louisville     Conference USA              0        14     4   
4  Louisville        Duke                ACC              0         5     4   

      drive_id  home             id     offense  ... tr_game  touchdown  \
0  22250015001  Duke  2225001500103        Duke  ...  3540.0          0   


ValueError: 

In [71]:
# test
    
scoring_drives = np.array([[1,3],[5,7],[6,-7]])

test = pd.DataFrame([[1],[2],[3],[4],[5],[6],[7]],columns=['drive_no'])

scoring_drives = np.vstack([scoring_drives, [test.drive_no.max(),0]])

drive_numbers = scoring_drives[:,0]
drive_scores = scoring_drives[:,1]

test['next_sd'] = drive_numbers[np.searchsorted(scoring_drives[:,0],test.drive_no.values,'left')]
test['dtns'] = test['next_sd'] - test['drive_no']

test['next_score'] = drive_scores[np.searchsorted(scoring_drives[:,0],test.drive_no.values,'left')]

test

Unnamed: 0,drive_no,next_sd,dtns,next_score
0,1,1,0,3
1,2,5,3,7
2,3,5,2,7
3,4,5,1,7
4,5,5,0,7
5,6,6,0,-7
6,7,7,0,0


In [None]:
games=df.groupby(['game_id'])

new_df = []

counter = 0
for game, game_plays in games:
    counter += 1
    if counter % 50 == 0:
        print(counter)
    # sort by time remaining to order them
    ordered = game_plays.sort_values(by=['avg_drive_time'],ascending=False)
    # label drive numbers
    i = ordered.drive_id
    ordered['drive_no'] = i.ne(i.shift()).cumsum()
    
    scoring_drives_1H = ordered.loc[(ordered['is_scoring_drive']==1)&(ordered['half']==1)]
    scoring_drives_2H = ordered.loc[(ordered['is_scoring_drive']==1)&(ordered['half']==2)]
    
    np_sd_1H = scoring_drives_1H.drive_no.values
    np_sd_2H = scoring_drives_1H.drive_no.values
    
    
        
cols = list(df)
cols.append('target')
cols.append('drives_tns')

new_df = pd.DataFrame(new_df,columns=cols)

print(new_df.head())


In [None]:
print(len(new_df))
print(len(df))
    