In [1]:
import pandas as pd
import numpy as np 
import datetime
import math
import gc
gc.collect()

from tqdm import tqdm

# Load Data

In [2]:
# Seasons to load: 2002 up to, but not including, the current calendar year.
years = list(range(2002, int(datetime.datetime.now().year)))

# Read every season's play-by-play CSV first and concatenate once afterwards.
# Concatenating inside the loop re-copies all previously loaded rows on each
# iteration, which gets slower with every season added.
season_frames = []
for year in tqdm(years):
    path = './output/'+str(year)+'/'+str(year)+'_pbp.csv'
    sea_df = pd.read_csv(path)
    season_frames.append(sea_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
# Row labels are kept as read (no ignore_index), so they repeat across seasons.
df = pd.concat(season_frames) if season_frames else pd.DataFrame()

num_plays = len(df)
print(str(num_plays) + " plays were loaded")

100%|██████████| 17/17 [00:18<00:00,  1.72s/it]

2223578 plays were loaded





In [3]:
# Show the available column names, then every distinct play type label.
print(df.columns.tolist())
print(df['play_type'].unique().tolist())

['away', 'clock.minutes', 'clock.seconds', 'defense', 'defense_conference', 'defense_score', 'distance', 'down', 'drive_id', 'home', 'id', 'offense', 'offense_conference', 'offense_score', 'period', 'play_text', 'play_type', 'yard_line', 'yards_gained', 'game_id']
['Uncategorized', 'Kickoff Return (Offense)', 'Sack', 'Rush', 'Punt Return', 'Penalty', 'Pass Completion', 'Pass Incompletion', 'Safety', 'End Period', 'Pass Interception', 'Blocked Punt Touchdown', 'Fumble Recovery (Own)', 'Timeout', 'Fumble Recovery (Opponent)', 'Two Point Pass', 'Two Point Rush', 'Interception Return Touchdown', 'Blocked Punt', 'Punt Return Touchdown', 'Blocked Field Goal', 'Kickoff Return (Defense)', 'Fumble Return Touchdown', 'Kickoff Return Touchdown', 'Blocked PAT', 'Blocked Field Goal Touchdown', 'Missed Field Goal Return Touchdown', 'Punt', 'Pass', 'Kickoff', 'Extra Point Good', 'Field Goal Good', 'Field Goal Missed', 'Extra Point Missed', '2pt Conversion', 'Offensive 1pt Safety', 'Pass Reception', '

In [4]:
def fix_uncat(play_type, play_text):
    """Re-label an 'Uncategorized' play from phrases found in its play text.

    Parameters
    ----------
    play_type : the play's current type label.
    play_text : the play's free-text description; may be a non-string
        (e.g. missing), in which case nothing is re-labelled.

    Returns
    -------
    The replacement label when a known phrase matches, otherwise
    ``play_type`` unchanged.
    """
    # Plays that already have a category, or have no usable text, pass through.
    if play_type != 'Uncategorized' or not isinstance(play_text, str):
        return play_type

    # Period-boundary phrases, tested in this order before any kick phrases.
    period_markers = (
        ("Start of the 1st quarter.", "End Period"),
        ("Start of the 2nd quarter.", "End Period"),
        ("Start of the 3rd quarter.", "End of Half"),
        ("Start of the 4th quarter.", "End Period"),
        ("Start of overtime.", "End Period"),
        ("End of the game.", "End of Game"),
    )
    for phrase, label in period_markers:
        if phrase in play_text:
            return label

    made = "is good" in play_text
    # A miss only counts when the phrase sits at the very end of the text.
    missed = "is no good." in play_text[-13:]

    if "Extra point" in play_text:
        if made:
            return "Extra Point Good"
        if missed:
            return "Extra Point Missed"
        return play_type

    if "field goal" in play_text:
        if made:
            return "Field Goal Good"
        if missed:
            return "Field Goal Missed"
        # Surface field-goal texts that match neither outcome for inspection.
        print(play_text)
        return play_type

    if "missed PAT returned." in play_text:
        return "Extra Point Missed"

    return play_type

# Re-label 'Uncategorized' plays. Walking the two columns in lockstep avoids
# building a Series for every row, which is what DataFrame.apply(axis=1) does.
df['play_type'] = [
    fix_uncat(play_type, play_text)
    for play_type, play_text in zip(df['play_type'], df['play_text'])
]

# Whatever is still uncategorized after the text-based fix.
uncat = df.loc[df.play_type=='Uncategorized']
# Search the play text: every row of `uncat` has play_type 'Uncategorized', so
# matching against play_type can never find the phrase. na=False treats rows
# with missing text as non-matches instead of producing NaN in the mask.
mpr = uncat.loc[uncat.play_text.str.contains('missed PAT returned', na=False)]
print(len(uncat))
print(uncat.play_text.head(50))

119
3940     DeAngelo Hall (VT) took lateral and rushed for...
11077    Terrence Biggers (MSU) took lateral and rushed...
11092    Derek Abney (UK) took lateral and rushed for 1...
13313    Terrance Phillips (PSU) took lateral and rushe...
28227    Andrico Hines (MTSU) took lateral and rushed f...
30739    Lance Pendleton (BYU) took lateral and rushed ...
31087    Chris Bruhn (WSU) took lateral and rushed for ...
33853    Tim Blackwell (USM) took lateral and rushed fo...
36684    Bruce Gradkowski (TOL) took lateral and rushed...
39523    Michael Turner (NIU) took lateral and rushed f...
43241    Jason Samples (TSU) took lateral and rushed fo...
48186    Steve Breaston (MICH) took lateral and rushed ...
52925    Duane Coleman (CLEM) took lateral and rushed f...
53764    Scott Lunde (WSU) took lateral and rushed for ...
63806    Aric Williams (OSU) took lateral and rushed fo...
75692    Garrett Lepisto (UCLA) took lateral and rushed...
83122    Sean Taylor (MIA) took lateral and rushed f

# Need Separate Model for XP, Kickoffs

In [5]:
# Play types set aside in sep_df rather than kept in the main frame
# (rows are split off here; no columns are dropped).
separate = [
    'End Period', 'Kickoff Return (Offense)', 'Extra Point Good', 'Timeout',
    'End of Half', 'End of Game', 'Two Point Pass', 'Two Point Rush',
    'Kickoff Return (Defense)', 'Uncategorized', 'Kickoff Return Touchdown',
    'Blocked PAT', 'Kickoff', 'Extra Point Missed', '2pt Conversion',
    'Defensive 2pt Conversion', 'Offensive 1pt Safety',
]

# One membership mask drives both halves of the split.
is_separate = df['play_type'].isin(separate)

print(len(df))
sep_df = df[is_separate]
print(len(sep_df))
df = df[~is_separate]
print(len(df))


2223578
272727
1950851


In [6]:
# Keep only plays from periods 1-4; this removes overtime plays and the
# handful of rows recorded with period 0.
print(len(df))
regulation_periods = [1, 2, 3, 4]
in_regulation = df['period'].isin(regulation_periods)
df = df[in_regulation]
print(len(df))

1950851
1944001


In [7]:
print(list(df['clock.seconds'].unique()))

# The listing above contains NaN but no 0.0: a zero on the clock is stored as
# NaN in both clock columns, so replace the NaNs with 0.
for clock_col in ('clock.seconds', 'clock.minutes'):
    df[clock_col] = df[clock_col].fillna(0)

[nan, 25.0, 45.0, 30.0, 11.0, 15.0, 40.0, 55.0, 18.0, 22.0, 54.0, 23.0, 33.0, 44.0, 20.0, 34.0, 4.0, 10.0, 53.0, 56.0, 51.0, 21.0, 6.0, 16.0, 46.0, 3.0, 58.0, 7.0, 47.0, 27.0, 57.0, 17.0, 48.0, 37.0, 24.0, 14.0, 50.0, 5.0, 35.0, 43.0, 39.0, 52.0, 26.0, 36.0, 42.0, 12.0, 2.0, 32.0, 28.0, 8.0, 31.0, 19.0, 9.0, 29.0, 13.0, 41.0, 59.0, 38.0, 49.0, 1.0]


In [8]:
# # calculate time remaining in half
def tr_half(period, minutes, seconds):
    """Seconds remaining in the current half.

    ``minutes`` and ``seconds`` are the clock reading within ``period``.
    In periods 1 and 3 a further full 15-minute quarter (900 s) is still
    to be played in the half, so it is added to the clock reading.
    """
    quarter_still_to_come = 900 if period in [1, 3] else 0
    return quarter_still_to_come + (60 * minutes + seconds)

def tr_game(period, minutes, seconds):
    """Seconds remaining in regulation.

    Adds the clock reading of the current ``period`` to 15 minutes for
    every quarter after it (``4 - period`` of them).
    """
    later_quarters = 4 - period
    return 60 * minutes + seconds + 15 * 60 * later_quarters

# Compute both time-remaining columns by walking the three source columns in
# lockstep. DataFrame.apply(axis=1) builds a Series per row, which is far
# slower on a frame this size and awkward on an empty frame; the values
# produced here are the same.
df['tr_half'] = [
    tr_half(period, minutes, seconds)
    for period, minutes, seconds in zip(df['period'], df['clock.minutes'], df['clock.seconds'])
]
df['tr_game'] = [
    tr_game(period, minutes, seconds)
    for period, minutes, seconds in zip(df['period'], df['clock.minutes'], df['clock.seconds'])
]

print(df[['period','clock.minutes','clock.seconds','tr_half','tr_game']].head(10))

    period  clock.minutes  clock.seconds  tr_half  tr_game
2        1           14.0            0.0   1740.0   3540.0
3        1           14.0           25.0   1765.0   3565.0
4        1           14.0           45.0   1785.0   3585.0
5        1           13.0           30.0   1710.0   3510.0
6        1           11.0           11.0   1571.0   3371.0
7        1           11.0            0.0   1560.0   3360.0
8        1           13.0           15.0   1695.0   3495.0
9        1           12.0           45.0   1665.0   3465.0
10       1           12.0           25.0   1645.0   3445.0
11       1           12.0            0.0   1620.0   3420.0


In [9]:
# The raw clock readings are now captured by tr_half / tr_game, so remove them.
clock_columns = ['clock.minutes', 'clock.seconds']
df = df.drop(clock_columns, axis=1)

# Get Desired Features

Need 6 feature variables (8 in total, counting the two weighting variables noted below).

Down  
Seconds left in half  
Yards to go for touchdown (log?)  
Yards to go for first down (log?)  
Goal-to-go indicator  
Under 2 minutes indicator  

Using two others to weight observations

Also need target variable. Next Score.


In [11]:
# test

# UK = df.loc[df['home']=='Kentucky']
# print(list(UK))

# game_ids = list(UK.game_id.unique())

# print(len(game_ids))

# # test game
# test_game = UK.loc[UK['game_id']==232430096]

# # sort by time remaining game
# test_game = test_game.sort_values(by='tr_game',ascending=False)
# print(list(test.drive_id.unique()))

# sample_drive = test_game.loc[test_game['drive_id']==23243009601]

# print(sample_drive.head(50))
# print(sample_drive[['play_text']].tail(1).values)

In [12]:
# Discard plays that have no play text; row counts are printed before and after.
print(len(df))
has_play_text = df['play_text'].notna()
df = df[has_play_text]
print(len(df))

1944001
1943643


In [13]:
def add_tds(play_type, play_text):
    """Return 1 when a play looks like a touchdown, otherwise 0.

    Penalties and plays whose text is not a string are never counted.
    Otherwise the play counts when its type contains 'Touchdown', or its
    text contains 'touchdown' or 'for a TD' (all matches are case-sensitive).
    """
    if play_type == 'Penalty' or not isinstance(play_text, str):
        return 0
    scored = (
        'Touchdown' in play_type
        or 'touchdown' in play_text
        or 'for a TD' in play_text
    )
    return 1 if scored else 0
    
# Flag touchdown plays. Iterating the two columns directly avoids the per-row
# Series construction of DataFrame.apply(axis=1) and also works on an empty frame.
df['touchdown'] = [
    add_tds(play_type, play_text)
    for play_type, play_text in zip(df['play_type'], df['play_text'])
]
    


In [14]:
# Inspect which play types ended up flagged as touchdowns.
td_plays = df[df['touchdown'] == 1]
print(td_plays['play_type'].unique().tolist())

['Pass Completion', 'Blocked Punt Touchdown', 'Rush', 'Fumble Recovery (Own)', 'Interception Return Touchdown', 'Punt Return Touchdown', 'Fumble Return Touchdown', 'Blocked Field Goal Touchdown', 'Missed Field Goal Return Touchdown', 'Sack', 'Pass Incompletion', 'Passing Touchdown', 'Rushing Touchdown', 'Punt', 'Fumble Recovery (Opponent)', 'Pass Reception', 'Blocked Punt', 'Pass Interception Return', 'Blocked Field Goal']


In [15]:
# Play types whose touchdown is credited against the offense.
# The second group holds the plain-named counterparts of the "... Touchdown"
# types above (e.g. 'Pass Interception Return' vs 'Interception Return
# Touchdown'). They show up among the play types flagged as touchdowns, and
# because cat_offense treats every type not listed here as an offensive score,
# leaving them out credited those return scores to the offense.
# NOTE(review): 'Punt' is included by analogy with 'Punt Return Touchdown';
# confirm against a sample of flagged punt plays.
defensive_tds = ['Blocked Punt Touchdown', 'Interception Return Touchdown','Punt Return Touchdown',
            'Fumble Return Touchdown','Blocked Field Goal Touchdown','Missed Field Goal Return Touchdown',
             'Sack',
             'Blocked Punt', 'Blocked Field Goal', 'Pass Interception Return',
             'Fumble Recovery (Opponent)', 'Punt']

# Play types that can mention a touchdown in their text without being one.
not_touchdowns = ['Pass Incompletion']

# Reference list of offensive scoring play types (not consulted by the
# functions below, which treat "not defensive and not excluded" as offensive).
offensive_tds = ['Pass Completion','Rush','Fumble Recovery (Own)','Rushing Touchdown','Passing Touchdown']

# split into offensive and defensive touchdowns

def cat_offense(play_type, touchdown):
    """Return 1 if a flagged touchdown counts for the offense, else 0.

    A play counts when ``touchdown`` is 1 and ``play_type`` is neither in
    ``defensive_tds`` nor in ``not_touchdowns``.
    """
    if touchdown != 1:
        return 0
    if play_type in defensive_tds or play_type in not_touchdowns:
        return 0
    return 1

def cat_defense(play_type, touchdown):
    """Return 1 if a flagged touchdown counts for the defense, else 0.

    A play counts when ``touchdown`` is 1 and ``play_type`` is in
    ``defensive_tds``.
    """
    if touchdown == 1 and play_type in defensive_tds:
        return 1
    return 0

# Split the touchdown flag into offensive and defensive indicator columns.
# The two source columns are iterated directly instead of going through
# DataFrame.apply(axis=1), which builds a Series for every row.
df['offensive_TD'] = [
    cat_offense(play_type, touchdown)
    for play_type, touchdown in zip(df['play_type'], df['touchdown'])
]

df['defensive_TD'] = [
    cat_defense(play_type, touchdown)
    for play_type, touchdown in zip(df['play_type'], df['touchdown'])
]

print(list(df))



['away', 'defense', 'defense_conference', 'defense_score', 'distance', 'down', 'drive_id', 'home', 'id', 'offense', 'offense_conference', 'offense_score', 'period', 'play_text', 'play_type', 'yard_line', 'yards_gained', 'game_id', 'tr_half', 'tr_game', 'touchdown', 'offensive_TD', 'defensive_TD']


In [26]:
# add field goals and safeties
def add_fgs(play_type):
    """Return 1 for a made field goal ('Field Goal Good'), otherwise 0."""
    return 1 if play_type == 'Field Goal Good' else 0

def add_safeties(play_type):
    """Return 1 when the play type is 'Safety', otherwise 0."""
    return 1 if play_type == 'Safety' else 0

# Both indicators depend on play_type alone, so iterate that single column
# rather than constructing a Series per row with DataFrame.apply(axis=1).
df['fg'] = [add_fgs(play_type) for play_type in df['play_type']]

df['safety'] = [add_safeties(play_type) for play_type in df['play_type']]


# weird = non_tds.loc[non_tds['play_type'] == 'Interception Return Touchdown']
# for w in list(weird.play_text.values):
#     print(w)

['Sack', 'Rush', 'Punt Return', 'Penalty', 'Pass Completion', 'Pass Incompletion', 'Safety', 'Field Goal Good', 'Pass Interception', 'Fumble Recovery (Own)', 'Field Goal Missed', 'Fumble Recovery (Opponent)', 'Blocked Punt', 'Blocked Field Goal', 'Punt', 'Pass', 'Pass Reception', 'Pass Interception Return', 'Missed Field Goal Return', 'Interception']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [33]:
# add score for each drive

# Per drive, take the max of each scoring indicator: 1 if any play in the
# drive carried that flag. The columns are selected with a list; selecting
# several GroupBy columns with a bare tuple is deprecated and fails on
# current pandas.
score_flags = ['offensive_TD', 'defensive_TD', 'fg', 'safety']
drive_gb = df.groupby(['drive_id'])[score_flags].max().reset_index()

# Points from the offense's point of view: +7 offensive TD, -7 defensive TD,
# +3 field goal, -2 safety.
drive_gb['drive_score'] = (
    7 * drive_gb['offensive_TD']
    - 7 * drive_gb['defensive_TD']
    + 3 * drive_gb['fg']
    - 2 * drive_gb['safety']
)
drive_gb['drive_score'] = drive_gb['drive_score'].astype(int)

# A drive should produce exactly one of these outcomes; any other total means
# more than one scoring flag was set within the same drive.
possible = [0, 3, 7, -2, -7]

not_possible = drive_gb.loc[~drive_gb['drive_score'].isin(possible)]
print(not_possible)
print(len(not_possible))

drive_ids = list(df.drive_id.unique())
# print(len(drive_ids))

           drive_id  offensive_TD  defensive_TD  fg  safety  drive_score
0        4005476401             0             0   0       0            0
1        4005476402             0             0   0       0            0
2        4005476403             0             0   0       0            0
3        4005476404             0             0   0       0            0
4        4005476405             0             0   0       0            0
5        4005476406             0             0   0       0            0
6        4005476407             1             0   0       0            7
7        4005476408             0             0   0       0            0
8        4005476409             0             0   0       0            0
9        4005476411             1             0   0       0            7
10       4005476412             0             0   0       0            0
11       4005476413             1             0   0       0            7
12       4005476414             0             0   0

In [44]:
## 32266005905 remove safety, it was on kickoff somehow
# df = df.loc[df['id']!=322660059036]

## 4010320813 possible duplicate (seems one number short of normal ID)

## 40054786811 seems like a duplicate (seems one number short of normal ID)

# Pull every play of the suspicious drive for manual inspection.
# Deliberately not named `np`: that name is the numpy alias imported in the
# first cell, and rebinding it to a DataFrame would break any later numpy use.
suspect_drive = df.loc[df['drive_id']==4010320813]

print(suspect_drive.head(50))
# Full play texts, one per line (the frame printout truncates them).
for w in list(suspect_drive.play_text.values):
    print(w)
    
# Matching play types, in the same row order.
for w in list(suspect_drive.play_type.values):
    print(w)
    


            away         defense defense_conference  defense_score  distance  \
155264  Missouri  Oklahoma State             Big 12              7        10   
155266  Missouri  Oklahoma State             Big 12              7         6   
155267  Missouri  Oklahoma State             Big 12              7         7   
155268  Missouri  Oklahoma State             Big 12              7         7   
155269  Missouri  Oklahoma State             Big 12              7        10   
155270  Missouri  Oklahoma State             Big 12              7         0   
155271  Missouri        Missouri                SEC              0        15   
155272  Missouri        Missouri                SEC              0        10   
155273  Missouri  Oklahoma State             Big 12              7         8   

        down    drive_id            home                  id         offense  \
155264     1  4010320813  Oklahoma State  401032081101917401        Missouri   
155266     1  4010320813  Oklahoma Stat

In [46]:
# Print the first 15 plays of every drive recorded under game_id 40103208,
# in order of first appearance.
mo = df[df['game_id'] == 40103208]
mo_dis = list(mo['drive_id'].unique())
for md in mo_dis:
    drive_plays = mo[mo['drive_id'] == md]
    print(drive_plays.head(15))

              away     defense defense_conference  defense_score  distance  \
155164  Cincinnati  Cincinnati  American Athletic              0         4   
155165  Cincinnati  Cincinnati  American Athletic              0         1   
155166  Cincinnati  Cincinnati  American Athletic              0        14   
155167  Cincinnati  Cincinnati  American Athletic              0        10   
155168  Cincinnati  Cincinnati  American Athletic              0         1   
155169  Cincinnati  Cincinnati  American Athletic              0         4   
155170  Cincinnati  Cincinnati  American Athletic              0        10   
155171  Cincinnati  Cincinnati  American Athletic              0         9   
155172  Cincinnati  Cincinnati  American Athletic              0         9   
155174  Cincinnati  Cincinnati  American Athletic              0        10   
155175  Cincinnati  Cincinnati  American Athletic              0         2   

        down    drive_id           home                  id    

[10 rows x 25 columns]
            away         defense defense_conference  defense_score  distance  \
155327  Missouri  Oklahoma State             Big 12             14         5   
155329  Missouri  Oklahoma State             Big 12             14        10   
155330  Missouri  Oklahoma State             Big 12             14        10   
155331  Missouri  Oklahoma State             Big 12             14         4   
155332  Missouri  Oklahoma State             Big 12             14         6   
155333  Missouri  Oklahoma State             Big 12             14        10   

        down    drive_id            home                  id   offense  ...  \
155327     1  4010320819  Oklahoma State  401032081102966401  Missouri  ...   
155329     1  4010320819  Oklahoma State  401032081102965901  Missouri  ...   
155330     1  4010320819  Oklahoma State  401032081102964101  Missouri  ...   
155331     3  4010320819  Oklahoma State  401032081102957801  Missouri  ...   
155332     2  4010320

                     away defense defense_conference  defense_score  distance  \
155448  Mississippi State    Iowa            Big Ten              3        10   
155449  Mississippi State    Iowa            Big Ten              3        10   
155451  Mississippi State    Iowa            Big Ten              3         2   
155452  Mississippi State    Iowa            Big Ten              3        11   
155453  Mississippi State    Iowa            Big Ten              3        11   
155454  Mississippi State    Iowa            Big Ten              3        10   

        down    drive_id  home                  id            offense  ...  \
155448     2  4010320837  Iowa  401032083102914901  Mississippi State  ...   
155449     1  4010320837  Iowa  401032083102904701  Mississippi State  ...   
155451     2  4010320837  Iowa  401032083102907401  Mississippi State  ...   
155452     3  4010320837  Iowa  401032083102918801  Mississippi State  ...   
155453     4  4010320837  Iowa  4010320831

              away     defense defense_conference  defense_score  distance  \
155581  Washington  Washington             Pac-12              3         8   
155582  Washington  Washington             Pac-12              3         3   
155583  Washington  Washington             Pac-12              3        10   
155584  Washington  Washington             Pac-12              3        10   
155586  Washington  Washington             Pac-12              3         3   
155587  Washington  Washington             Pac-12              3         3   
155588  Washington  Washington             Pac-12              3        10   
155590  Washington  Washington             Pac-12              3         8   
155591  Washington  Washington             Pac-12              3        10   
155592  Washington  Washington             Pac-12              3        13   
155593  Washington  Washington             Pac-12              3        15   
155594  Washington  Washington             Pac-12              3

            away   defense defense_conference  defense_score  distance  down  \
155754  Kentucky  Kentucky                SEC              3        10     1   
155755  Kentucky  Kentucky                SEC              3        13     4   
155756  Kentucky  Kentucky                SEC              3         6     3   
155757  Kentucky  Kentucky                SEC              3        10     2   
155758  Kentucky  Kentucky                SEC              3        10     1   
155759  Kentucky  Kentucky                SEC              3        10     1   
155760  Kentucky  Kentucky                SEC              3        10     2   
155761  Kentucky  Kentucky                SEC              3         1     4   
155762  Kentucky  Kentucky                SEC              3        10     3   
155763  Kentucky  Kentucky                SEC              3        10     2   

          drive_id        home                  id     offense  ... yard_line  \
155754  4010320885  Penn State  401032

In [None]:
# Group the remaining plays by game so each game can be walked on its own.
games = df.groupby(['game_id'])
# Printing a GroupBy shows only the object's repr, not the grouped rows.
print(games)

# Placeholder traversal: visits every play of every game but does no work yet.
for game, game_plays in games:
    for index, row in game_plays.iterrows():
        pass