## Importing Data

In [1]:
import pandas as pd
import numpy as np
import plotly_express as px

In [2]:
# Seasons covered by the per-year CSV exports (2009 through 2019 inclusive).
years = list(range(2009, 2020))

# PBP Data: one DataFrame per season for each phase, keyed by year.
preseason_pbp = {}
regseason_pbp = {}
pstseason_pbp = {}
for year in years:
    preseason_pbp[year] = pd.read_csv(f"../cleaned_data/pbp_data/pre_season/pre_pbp_{year}.csv", low_memory=False)
    regseason_pbp[year] = pd.read_csv(f"../cleaned_data/pbp_data/regular_season/reg_pbp_{year}.csv", low_memory=False)
    pstseason_pbp[year] = pd.read_csv(f"../cleaned_data/pbp_data/post_season/post_pbp_{year}.csv", low_memory=False)

# Stack the seasons of each phase, then stack the three phases together.
all_pre_pbp = pd.concat(preseason_pbp)
all_reg_pbp = pd.concat(regseason_pbp)
all_pst_pbp = pd.concat(pstseason_pbp)

df_pbp = pd.concat([all_pre_pbp, all_reg_pbp, all_pst_pbp])

# Clean-up: drop the unneeded columns, fill NaNs with 0, remove duplicate rows.
# The shape is printed before and after so the effect is visible.
print(df_pbp.shape)
df_pbp = (
    df_pbp
    .drop(columns=['Unnamed: 0', 'play_id'])
    # fillna returns a new frame; its result has to be kept, otherwise the
    # NaNs silently survive the clean-up.
    .fillna(value=0)
    .drop_duplicates()
)
print(df_pbp.shape)

(614497, 257)
(614496, 255)


In [3]:
# GAME DATA
# One DataFrame per season for each phase, keyed by year.
preseason_games = {
    year: pd.read_csv(f"../data/games_data/pre_season/pre_games_{year}.csv")
    for year in years
}
regseason_games = {
    year: pd.read_csv(f"../data/games_data/regular_season/reg_games_{year}.csv")
    for year in years
}
pstseason_games = {
    year: pd.read_csv(f"../data/games_data/post_season/post_games_{year}.csv")
    for year in years
}

# Stack the seasons of each phase, then stack the three phases together.
all_pre_games = pd.concat(preseason_games)
all_reg_games = pd.concat(regseason_games)
all_pst_games = pd.concat(pstseason_games)

df_games = pd.concat([all_pre_games, all_reg_games, all_pst_games])

# Clean-up: drop the unneeded column, discard rows containing NaNs, remove
# duplicate rows. The shape is printed before and after for comparison.
print(df_games.shape)
df_games = (
    df_games
    .drop(columns='Unnamed: 0')
    .dropna()
    .drop_duplicates()
)
print(df_games.shape)

(3661, 11)
(3660, 10)


## Preprocessing

In [4]:
# Restrict the play-by-play frame to its numeric columns.
df_pbp_numeric = df_pbp.select_dtypes(include="number")

# Names of the numeric columns as a plain Python list.
features = list(df_pbp_numeric.columns)

# Summary statistics of the numeric columns (displayed as the cell output).
df_pbp_numeric.describe()

Unnamed: 0,season,game_id,yardline_100,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,quarter_end,drive,sp,qtr,...,lateral_recovery,fumble_recovery_1_yards,fumble_recovery_2_yards,return_yards,penalty_yards,replay_or_challenge,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv
count,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,...,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0
mean,2014.006993,2014162000.0,50.135991,424.722428,825.88045,1722.409313,0.0,12.119815,0.071548,2.563719,...,0.001941,0.030913,0.000474,1.021745,0.656053,0.008492,6.3e-05,1.6e-05,0.0,0.0
std,3.152465,3148374.0,24.980828,273.033294,547.359029,1038.468246,0.0,7.010383,0.257738,1.120667,...,0.044019,1.078881,0.139259,5.398683,2.715012,0.091757,0.007966,0.004034,0.0,0.0
min,2009.0,2009081000.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,-34.0,-3.0,-16.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2011.0,2011121000.0,31.0,174.0,317.0,821.0,0.0,6.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2014.0,2014102000.0,53.0,412.0,812.0,1800.0,0.0,12.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2017.0,2017092000.0,71.0,661.0,1299.0,2591.0,0.0,18.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2019.0,2020020000.0,99.0,900.0,1800.0,3600.0,0.0,38.0,1.0,6.0,...,1.0,106.0,77.0,109.0,66.0,1.0,1.0,1.0,0.0,0.0


In [13]:
# Print each numeric column name next to its positional index, which is handy
# when slicing the frame by position with iloc.
for position, column_name in enumerate(df_pbp_numeric.columns):
    print(column_name, position)

season 0
game_id 1
yardline_100 2
quarter_seconds_remaining 3
half_seconds_remaining 4
game_seconds_remaining 5
quarter_end 6
drive 7
sp 8
qtr 9
down 10
goal_to_go 11
ydstogo 12
ydsnet 13
yards_gained 14
shotgun 15
no_huddle 16
qb_dropback 17
qb_kneel 18
qb_spike 19
qb_scramble 20
air_yards 21
yards_after_catch 22
kick_distance 23
home_timeouts_remaining 24
away_timeouts_remaining 25
timeout 26
posteam_timeouts_remaining 27
defteam_timeouts_remaining 28
total_home_score 29
total_away_score 30
posteam_score 31
defteam_score 32
score_differential 33
posteam_score_post 34
defteam_score_post 35
score_differential_post 36
no_score_prob 37
opp_fg_prob 38
opp_safety_prob 39
opp_td_prob 40
fg_prob 41
safety_prob 42
td_prob 43
extra_point_prob 44
two_point_conversion_prob 45
ep 46
epa 47
total_home_epa 48
total_away_epa 49
total_home_rush_epa 50
total_away_rush_epa 51
total_home_pass_epa 52
total_away_pass_epa 53
air_epa 54
yac_epa 55
comp_air_epa 56
comp_yac_epa 57
total_home_comp_air_epa 58
t

In [6]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Only the columns from position 7 onward are scaled; the cutoff is arbitrary
# and leaves out the first seven numeric columns.
scaled_columns = df_pbp_numeric.columns[7:]

# Fit the scaler on those columns and transform them in one step, then wrap
# the resulting array back into a DataFrame carrying the original column names.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_pbp_numeric.iloc[:, 7:]),
    columns=scaled_columns,
)

df_scaled.describe()

Unnamed: 0,drive,sp,qtr,down,goal_to_go,ydstogo,ydsnet,yards_gained,shotgun,no_huddle,...,lateral_recovery,fumble_recovery_1_yards,fumble_recovery_2_yards,return_yards,penalty_yards,replay_or_challenge,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv
count,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,...,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0,614496.0
mean,0.300536,0.071548,0.312744,0.442223,0.049649,0.15842,0.622102,0.306668,0.416894,0.065166,...,0.001941,0.243078,0.037506,0.136174,0.00994,0.008492,6.3e-05,1.6e-05,0.0,0.0
std,0.18947,0.257738,0.224133,0.286526,0.217219,0.099276,0.140905,0.057439,0.493046,0.246818,...,0.044019,0.007706,0.001741,0.043189,0.041137,0.091757,0.007966,0.004034,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.135135,0.0,0.2,0.25,0.0,0.083333,0.5,0.277372,0.0,0.0,...,0.0,0.242857,0.0375,0.128,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.297297,0.0,0.4,0.5,0.0,0.208333,0.586022,0.277372,0.0,0.0,...,0.0,0.242857,0.0375,0.128,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.459459,0.0,0.6,0.75,0.0,0.208333,0.725806,0.321168,1.0,0.0,...,0.0,0.242857,0.0375,0.128,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0


## Game Length

In [7]:
# Number of play-by-play rows recorded for each game, keyed by game_id.
#
# Grouping on game_id counts every row of every game exactly once and stores
# the total under that game's own id. A hand-written row-by-row counter is
# easy to get wrong at the boundaries (first game, last game, and which id a
# finished count belongs to) and is far slower on a frame of this size.
# sort=False keeps the games in the order they first appear in the frame.
game_length = (
    df_pbp_numeric
    .groupby('game_id', sort=False)
    .size()
    .to_dict()
)

In [11]:
# One row per game; the single column holds that game's play count.
play_counts = game_length.values()
df_hist = pd.DataFrame({'num_plays': play_counts})

# Histogram of how many plays a game contains.
fig = px.histogram(
    df_hist,
    x='num_plays',
    labels={'num_plays': '# of plays'},
    title='Distribution of Game Duration in # of Plays',
)
fig.show()