In [1]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
gc.collect()

from tqdm import tqdm

# Load Data

In [2]:
# Seasons to load: 2002 up to, but not including, the current calendar year
# (range() excludes its upper bound).
years = list(range(2002, int(datetime.datetime.now().year)))

# Collect each season's play-by-play frame and concatenate once at the end.
# Calling pd.concat inside the loop re-copies every previously loaded row on
# each iteration, which is quadratic in the number of plays.
season_frames = []
for year in tqdm(years):
    path = './output/'+str(year)+'/'+str(year)+'_pbp.csv'
    sea_df = pd.read_csv(path)
    season_frames.append(sea_df)

# pd.concat raises on an empty list, so fall back to an empty frame
# (what the loop-based version produced when there were no seasons).
df = pd.concat(season_frames) if season_frames else pd.DataFrame()

num_plays = len(df)
print(str(num_plays) + " plays were loaded")

100%|██████████| 17/17 [00:16<00:00,  1.59s/it]

2223578 plays were loaded





In [3]:
# Show the column labels, then every distinct play_type value.
column_names = df.columns.tolist()
print(column_names)
play_type_values = df['play_type'].unique()
print(list(play_type_values))

['away', 'clock.minutes', 'clock.seconds', 'defense', 'defense_conference', 'defense_score', 'distance', 'down', 'drive_id', 'home', 'id', 'offense', 'offense_conference', 'offense_score', 'period', 'play_text', 'play_type', 'yard_line', 'yards_gained']
['Uncategorized', 'Kickoff Return (Offense)', 'Sack', 'Rush', 'Punt Return', 'Penalty', 'Pass Completion', 'Pass Incompletion', 'Safety', 'End Period', 'Pass Interception', 'Blocked Punt Touchdown', 'Fumble Recovery (Own)', 'Timeout', 'Fumble Recovery (Opponent)', 'Two Point Pass', 'Two Point Rush', 'Interception Return Touchdown', 'Blocked Punt', 'Punt Return Touchdown', 'Blocked Field Goal', 'Kickoff Return (Defense)', 'Fumble Return Touchdown', 'Kickoff Return Touchdown', 'Blocked PAT', 'Blocked Field Goal Touchdown', 'Missed Field Goal Return Touchdown', 'Punt', 'Pass', 'Kickoff', 'Extra Point Good', 'Field Goal Good', 'Field Goal Missed', 'Extra Point Missed', '2pt Conversion', 'Offensive 1pt Safety', 'Pass Reception', 'Passing Tou

In [4]:
def fix_uncat(play_type, play_text):
    """Infer a play type for an 'Uncategorized' play from its description.

    Parameters
    ----------
    play_type : str
        The play's current type label.
    play_text : object
        The play's description. Anything that is not a ``str`` (for example a
        missing value) leaves the label unchanged.

    Returns
    -------
    The replacement label when the text matches one of the known phrases,
    otherwise ``play_type`` unchanged. Plays whose type is not
    'Uncategorized' are always returned unchanged.
    """
    # Only uncategorized plays that carry a text description can be relabelled.
    if play_type != 'Uncategorized' or not isinstance(play_text, str):
        return play_type

    # Period / game boundary phrases, tested in order; the first match wins.
    boundary_labels = (
        ("Start of the 1st quarter.", "End Period"),
        ("Start of the 2nd quarter.", "End Period"),
        ("Start of the 3rd quarter.", "End of Half"),
        ("Start of the 4th quarter.", "End Period"),
        ("Start of overtime.", "End Period"),
        ("End of the game.", "End of Game"),
    )
    for phrase, label in boundary_labels:
        if phrase in play_text:
            return label

    # A miss is only recognised when the phrase sits in the last 13 characters.
    ends_no_good = "is no good." in play_text[-13:]

    if "Extra point" in play_text:
        if "is good" in play_text:
            return "Extra Point Good"
        if ends_no_good:
            return "Extra Point Missed"
        return play_type

    if "field goal" in play_text:
        if "is good" in play_text:
            return "Field Goal Good"
        if ends_no_good:
            return "Field Goal Missed"
        # Echo field-goal texts matching neither outcome so they can be inspected.
        print(play_text)
        return play_type

    if "missed PAT returned." in play_text:
        return "Extra Point Missed"

    return play_type

# Re-label uncategorized plays whose text identifies what they are.
df['play_type'] = df.apply(lambda row: fix_uncat(row['play_type'], row['play_text']),axis=1)

# Plays that are still uncategorized after the pass above.
uncat = df.loc[df.play_type=='Uncategorized']
# Search the play *text*: every row in `uncat` has play_type 'Uncategorized',
# so matching the phrase against play_type could never select anything.
# na=False treats missing text as "no match" instead of producing a NaN mask.
mpr = uncat.loc[uncat.play_text.str.contains('missed PAT returned', na=False)]
print(len(uncat))
print(uncat.play_text.head(50))

119
3940     DeAngelo Hall (VT) took lateral and rushed for...
11077    Terrence Biggers (MSU) took lateral and rushed...
11092    Derek Abney (UK) took lateral and rushed for 1...
13313    Terrance Phillips (PSU) took lateral and rushe...
28227    Andrico Hines (MTSU) took lateral and rushed f...
30739    Lance Pendleton (BYU) took lateral and rushed ...
31087    Chris Bruhn (WSU) took lateral and rushed for ...
33853    Tim Blackwell (USM) took lateral and rushed fo...
36684    Bruce Gradkowski (TOL) took lateral and rushed...
39523    Michael Turner (NIU) took lateral and rushed f...
43241    Jason Samples (TSU) took lateral and rushed fo...
48186    Steve Breaston (MICH) took lateral and rushed ...
52925    Duane Coleman (CLEM) took lateral and rushed f...
53764    Scott Lunde (WSU) took lateral and rushed for ...
63806    Aric Williams (OSU) took lateral and rushed fo...
75692    Garrett Lepisto (UCLA) took lateral and rushed...
83122    Sean Taylor (MIA) took lateral and rushed f

# Need Separate Model for XP, Kickoffs

In [5]:
# Play types that are split off from the main frame into sep_df.
separate = [
    'End Period', 'Kickoff Return (Offense)', 'Extra Point Good', 'Timeout',
    'End of Half', 'End of Game', 'Two Point Pass', 'Two Point Rush',
    'Kickoff Return (Defense)', 'Uncategorized', 'Kickoff Return Touchdown',
    'Blocked PAT', 'Kickoff', 'Extra Point Missed', '2pt Conversion',
    'Defensive 2pt Conversion',
]

# Build the membership mask once and use it for both halves of the split.
is_separate = df['play_type'].isin(separate)

print(len(df))
sep_df = df.loc[is_separate]
print(len(sep_df))
df = df.loc[~is_separate]
print(len(df))


2223578
272727
1950851


In [6]:
# Keep only periods 1-4; this drops overtime and the 61 period-0 entries.
print(len(df))
regulation_periods = [1, 2, 3, 4]
in_regulation = df['period'].isin(regulation_periods)
df = df.loc[in_regulation]
print(len(df))

1950851
1944001


In [14]:
seconds_seen = df['clock.seconds'].unique()
print(list(seconds_seen))

# The clock columns hold NaN where a zero is meant, so replace NaN with 0.
for clock_col in ('clock.seconds', 'clock.minutes'):
    df[clock_col] = df[clock_col].fillna(0)

[0.0, 25.0, 45.0, 30.0, 11.0, 15.0, 40.0, 55.0, 18.0, 22.0, 54.0, 23.0, 33.0, 44.0, 20.0, 34.0, 4.0, 10.0, 53.0, 56.0, 51.0, 21.0, 6.0, 16.0, 46.0, 3.0, 58.0, 7.0, 47.0, 27.0, 57.0, 17.0, 48.0, 37.0, 24.0, 14.0, 50.0, 5.0, 35.0, 43.0, 39.0, 52.0, 26.0, 36.0, 42.0, 12.0, 2.0, 32.0, 28.0, 8.0, 31.0, 19.0, 9.0, 29.0, 13.0, 41.0, 59.0, 38.0, 49.0, 1.0]


In [16]:
# # calculate time remaining in half
def tr_half(period, minutes, seconds):
    """Return the seconds remaining in the current half.

    ``minutes`` and ``seconds`` are the clock reading within ``period``.
    Periods 1 and 3 get an extra 900 seconds added for the quarter that
    still follows them in the same half.
    """
    following_quarter = 900 if period in (1, 3) else 0
    return following_quarter + (60 * minutes + seconds)

def tr_game(period, minutes, seconds):
    """Return the seconds remaining in a four-period game.

    Adds the clock reading within ``period`` to 15 minutes for each of the
    ``4 - period`` periods still to come.
    """
    seconds_per_quarter = 15 * 60
    later_quarters = seconds_per_quarter * (4 - period)
    return 60 * minutes + seconds + later_quarters

# Seconds remaining in the half, computed row by row from period and clock.
df['tr_half'] = df.apply(
    lambda play: tr_half(play['period'], play['clock.minutes'], play['clock.seconds']),
    axis=1,
)
# Seconds remaining in the game, from the same three inputs.
df['tr_game'] = df.apply(
    lambda play: tr_game(play['period'], play['clock.minutes'], play['clock.seconds']),
    axis=1,
)

# Show the inputs next to the derived columns as a quick sanity check.
preview_cols = ['period', 'clock.minutes', 'clock.seconds', 'tr_half', 'tr_game']
print(df[preview_cols].head(10))

    period  clock.minutes  clock.seconds  tr_half  tr_game
2        1           14.0            0.0   1740.0   3540.0
3        1           14.0           25.0   1765.0   3565.0
4        1           14.0           45.0   1785.0   3585.0
5        1           13.0           30.0   1710.0   3510.0
6        1           11.0           11.0   1571.0   3371.0
7        1           11.0            0.0   1560.0   3360.0
8        1           13.0           15.0   1695.0   3495.0
9        1           12.0           45.0   1665.0   3465.0
10       1           12.0           25.0   1645.0   3445.0
11       1           12.0            0.0   1620.0   3420.0


In [19]:
# The raw clock columns are no longer needed now that tr_half / tr_game exist.
clock_cols = ['clock.minutes', 'clock.seconds']
df = df.drop(clock_cols, axis=1)

# Get Desired Features

We need six feature variables — eight in total, counting the two used to weight observations:

Down  
Seconds left in half  
Yards to go for touchdown (log?)  
Yards to go for first down (log?)  
Goal to goal indicator  
Under 2 minutes indicator  

Two additional variables are used to weight the observations.

We also need a target variable: the next score.


In [33]:
# game_id is drive_id with its last two characters removed, parsed back to int.
drive_id_text = df['drive_id'].astype(str)
df['game_id'] = drive_id_text.str.slice(stop=-2).astype(int)

12660


In [40]:
# test: spot-check one Kentucky home game and one of its drives

UK = df.loc[df['home']=='Kentucky']
print(list(UK))

game_ids = list(UK.game_id.unique())

print(len(game_ids))

# test game
test_game = UK.loc[UK['game_id']==232430096]

# sort by time remaining game
test_game = test_game.sort_values(by='tr_game',ascending=False)
# Originally read `test.drive_id`; `test` is not defined in any cell shown in
# this section, so use the frame built just above.
print(list(test_game.drive_id.unique()))

sample_drive = test_game.loc[test_game['drive_id']==23243009601]

print(sample_drive.head(50))
print(sample_drive[['play_text']].tail(1).values)

['away', 'defense', 'defense_conference', 'defense_score', 'distance', 'down', 'drive_id', 'home', 'id', 'offense', 'offense_conference', 'offense_score', 'period', 'play_text', 'play_type', 'yard_line', 'yards_gained', 'tr_half', 'tr_game', 'game_id']
144
[23243009601, 23243009603, 23243009604, 23243009606, 23243009607, 23243009608, 23243009610, 23243009611, 23243009612, 23243009614, 23243009615, 23243009617, 23243009619, 23243009621, 23243009623, 23243009625, 23243009626, 23243009628, 23243009630, 23243009631]
            away   defense defense_conference  defense_score  distance  down  \
7467  Louisville  Kentucky                SEC              0        10     1   
7466  Louisville  Kentucky                SEC              0        10     2   
7465  Louisville  Kentucky                SEC              0        10     3   
7464  Louisville  Kentucky                SEC              0        10     1   
7463  Louisville  Kentucky                SEC              0        10     1   
74

In [43]:
def add_tds(play_type, play_text):
    """Flag whether a play's description mentions a touchdown.

    Parameters
    ----------
    play_type : str
        The play's type label; 'Penalty' plays are never flagged.
    play_text : object
        The play's description. Anything that is not a ``str`` (for example a
        missing value) is treated as containing no touchdown.

    Returns
    -------
    int
        1 when the play is not a penalty and its text contains 'touchdown'
        (case-sensitive), otherwise 0.
    """
    # Always return an explicit 0/1: the earlier if/else layout fell through
    # and returned None for non-penalty plays without a touchdown.
    if play_type == 'Penalty' or not isinstance(play_text, str):
        return 0
    return 1 if 'touchdown' in play_text else 0
    
# Flag every play of the test game, then list the text of the flagged plays.
test_game['touchdown'] = test_game.apply(
    lambda play: add_tds(play['play_type'], play['play_text']),
    axis=1,
)

is_td = test_game['touchdown'] == 1
td_plays = test_game.loc[is_td]

td_play_text = td_plays['play_text'].tolist()
for tpt in td_play_text:
    print(tpt)
    


Eric Shelton (UL) rushed up the middle for a 6 yard touchdown.
Stefan LeFors (UL) pass left side complete to D.J. Kamer (UL) for a 3 yard touchdown.
Wildcats punt blocked, recovered by Jonathan Jackerson (UL), advanced for a touchdown.
Jared Lorenzen (UK) pass left side complete to Glenn Holt Jr. (UK) for a 13 yard touchdown.
Draak Davis (UK) rushed up the middle for a 1 yard touchdown.
Jared Lorenzen (UK) pass right side complete to Keenan Burton (UK) for a 31 yard touchdown.
Eric Shelton (UL) rushed right side for a 22 yard touchdown.
Lionel Gates (UL) rushed left side for a 2 yard touchdown.
