In [19]:
import pandas as pd
import numpy as np
import plotly_express as px

In [20]:
# Metadata: the seasons covered by the play-by-play files, 2009 through 2019 inclusive
years = list(range(2009, 2020))

## Data Importing

In [21]:
# Load the play-by-play CSV for every season in `years`, split by phase of the
# season (pre / regular / post). Each dict maps a year to that year's DataFrame.
preseason_pbp, regseason_pbp, pstseason_pbp = {}, {}, {}

for year in years:
    pre_csv = f"../data/pbp_data/pre_season/pre_pbp_{year}.csv"
    reg_csv = f"../data/pbp_data/regular_season/reg_pbp_{year}.csv"
    post_csv = f"../data/pbp_data/post_season/post_pbp_{year}.csv"

    # low_memory=False: parse each file whole instead of in chunks, so the
    # dtype of every column is inferred from the full column.
    preseason_pbp[year] = pd.read_csv(pre_csv, low_memory=False)
    regseason_pbp[year] = pd.read_csv(reg_csv, low_memory=False)
    pstseason_pbp[year] = pd.read_csv(post_csv, low_memory=False)

In [22]:
# Stack each phase's per-year frames into one frame per phase. Concatenating a
# dict uses its keys (the years) as the outer level of the resulting index.
all_preseason, all_regseason, all_pstseason = (
    pd.concat(frames_by_year)
    for frames_by_year in (preseason_pbp, regseason_pbp, pstseason_pbp)
)

# Combine the three phases (pre, regular, post, in that order) into a single frame
all_pbp = pd.concat([all_preseason, all_regseason, all_pstseason])

## Data Cleaning

In [23]:
# Turn the two index levels into ordinary columns named 'season' and
# 'old_index', leaving a fresh 0..n-1 row index, then display the frame.
all_pbp = all_pbp.reset_index(names=['season', 'old_index'])
all_pbp

Unnamed: 0.1,season,old_index,play_id,game_id,home_team,away_team,posteam,posteam_type,defteam,side_of_field,...,penalty_player_name,penalty_yards,replay_or_challenge,replay_or_challenge_result,penalty_type,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv,Unnamed: 0
0,2009,0,37,2009080950,TEN,BUF,TEN,home,BUF,BUF,...,,,0,,,0.0,0.0,0.0,0.0,
1,2009,1,59,2009080950,TEN,BUF,TEN,home,BUF,TEN,...,,,0,,,0.0,0.0,0.0,0.0,
2,2009,2,83,2009080950,TEN,BUF,TEN,home,BUF,TEN,...,,,0,,,0.0,0.0,0.0,0.0,
3,2009,3,104,2009080950,TEN,BUF,TEN,home,BUF,TEN,...,,,0,,,0.0,0.0,0.0,0.0,
4,2009,4,128,2009080950,TEN,BUF,TEN,home,BUF,TEN,...,,,0,,,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
645298,2019,2074,4036,2020020200,KC,SF,KC,home,SF,KC,...,,,0,,,0.0,0.0,0.0,0.0,2075.0
645299,2019,2075,4053,2020020200,KC,SF,KC,home,SF,KC,...,,,0,,,0.0,0.0,0.0,0.0,2076.0
645300,2019,2076,4074,2020020200,KC,SF,KC,home,SF,KC,...,,,0,,,0.0,0.0,0.0,0.0,2077.0
645301,2019,2077,4091,2020020200,KC,SF,KC,home,SF,KC,...,,,0,,,0.0,0.0,0.0,0.0,2078.0


In [24]:
# drop some columns we don't need:
#   'old_index'  - the former inner index level turned into a column by reset_index
#   'Unnamed: 0' - presumably a row index that was saved into some of the CSV files
#                  (it is NaN for the rows where it is absent) -- TODO confirm
all_pbp.drop(columns=['Unnamed: 0', 'old_index'], inplace=True)

In [29]:
# drop games with teams that have only played a few games
# Every play whose home_team or away_team is one of the codes in odd_teams is removed.
print("# Games before dropping games with odd teams:", len(all_pbp['game_id'].unique()) )
odd_teams = ['APR', 'NPR', 'SAN', 'CRT', 'RIC', 'NFC']

# A single vectorized membership test over both team columns replaces the
# per-team loops (twelve full-frame scans and twelve separate in-place drops).
# Missing team values compare as "not in odd_teams", same as the == comparison did.
involves_odd_team = all_pbp['home_team'].isin(odd_teams) | all_pbp['away_team'].isin(odd_teams)

# Drop in place, by row label, so the existing all_pbp object is the one updated.
all_pbp.drop(index=all_pbp.index[involves_odd_team.to_numpy()], inplace=True)
print("# Games after dropping games with odd teams:", len(all_pbp['game_id'].unique()) )

# Games before dropping games with odd teams: 3653
# Games after dropping games with odd teams: 3647


In [63]:
# Handling Missing Data
# Discard every row (play) that has no yardline_100 value.
all_pbp.dropna(axis='index', subset=['yardline_100'], inplace=True)

# Removing Duplicates
# Rows must match on every column to count as duplicates; the first occurrence is kept.
all_pbp.drop_duplicates(keep='first', inplace=True)

In [64]:
# Report the frame's dimensions, then list every column name on its own line
features = list(all_pbp.columns)

print("Dataframe shape:", all_pbp.shape)
print("Features:")

for column_name in features:
    print(column_name)

Dataframe shape: (624084, 257)
Features:
season
play_id
game_id
home_team
away_team
posteam
posteam_type
defteam
side_of_field
yardline_100
game_date
quarter_seconds_remaining
half_seconds_remaining
game_seconds_remaining
game_half
quarter_end
drive
sp
qtr
down
goal_to_go
time
yrdln
ydstogo
ydsnet
desc
play_type
yards_gained
shotgun
no_huddle
qb_dropback
qb_kneel
qb_spike
qb_scramble
pass_length
pass_location
air_yards
yards_after_catch
run_location
run_gap
field_goal_result
kick_distance
extra_point_result
two_point_conv_result
home_timeouts_remaining
away_timeouts_remaining
timeout
timeout_team
td_team
posteam_timeouts_remaining
defteam_timeouts_remaining
total_home_score
total_away_score
posteam_score
defteam_score
score_differential
posteam_score_post
defteam_score_post
score_differential_post
no_score_prob
opp_fg_prob
opp_safety_prob
opp_td_prob
fg_prob
safety_prob
td_prob
extra_point_prob
two_point_conversion_prob
ep
epa
total_home_epa
total_away_epa
total_home_rush_epa
total_awa

## Data Exploration

In [58]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
summary_stats = all_pbp.describe()
summary_stats

Unnamed: 0,season,play_id,game_id,yardline_100,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,quarter_end,drive,sp,...,lateral_sack_player_name,fumble_recovery_1_yards,fumble_recovery_2_yards,return_yards,penalty_yards,replay_or_challenge,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv
count,643501.0,643501.0,643501.0,624084.0,643302.0,643302.0,643302.0,643501.0,643501.0,643501.0,...,0.0,8826.0,70.0,643480.0,48246.0,643501.0,624362.0,624362.0,624362.0,624362.0
mean,2014.01665,2142.256999,2014169000.0,49.990025,413.534844,811.37194,1706.282262,0.017888,12.223423,0.070452,...,,2.21278,4.1,1.047902,8.516374,0.008207,6.4e-05,1.6e-05,0.0,0.0
std,3.156491,1240.38285,3152193.0,24.976083,279.369008,555.022738,1051.971632,0.132545,7.092778,0.255908,...,,8.941436,12.784898,5.476557,5.324839,0.090218,0.008004,0.004002,0.0,0.0
min,2009.0,35.0,2009081000.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,,-34.0,-16.0,-18.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2011.0,1075.0,2011121000.0,31.0,151.0,286.0,797.0,0.0,6.0,0.0,...,,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
50%,2014.0,2127.0,2014103000.0,52.0,396.0,799.0,1800.0,0.0,12.0,0.0,...,,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
75%,2017.0,3182.0,2017092000.0,71.0,656.0,1289.0,2588.0,0.0,18.0,0.0,...,,0.0,2.75,0.0,10.0,0.0,0.0,0.0,0.0,0.0
max,2019.0,5921.0,2020020000.0,99.0,900.0,1800.0,3600.0,1.0,38.0,1.0,...,,106.0,77.0,109.0,66.0,1.0,1.0,1.0,0.0,0.0


In [27]:
# Distinct game ids and distinct home-team codes present in the data
game_ids = all_pbp['game_id'].unique()
home_teams = all_pbp['home_team'].unique()

print('Number of games recorded in PBP data:', len(game_ids))
print('Teams in PBP data:', home_teams)

Number of games recorded in PBP data: 3653
Teams in PBP data: ['TEN' 'PHI' 'BAL' 'PIT' 'OAK' 'NYJ' 'IND' 'NO' 'SF' 'DET' 'BUF' 'GB' 'KC'
 'SD' 'MIA' 'NYG' 'NE' 'STL' 'DAL' 'MIN' 'WAS' 'CLE' 'JAC' 'HOU' 'CHI'
 'ARI' 'SEA' 'CIN' 'TB' 'CAR' 'ATL' 'DEN' 'LA' 'LAC' 'JAX' 'AFC' 'NFC']


## Data Visualization

In [30]:
# pick one game (the very first one)
pbp_game1 = all_pbp[all_pbp['game_id'] == 2009080950]
pbp_game1

Unnamed: 0,season,play_id,game_id,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,...,penalty_player_id,penalty_player_name,penalty_yards,replay_or_challenge,replay_or_challenge_result,penalty_type,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv
0,2009,37,2009080950,TEN,BUF,TEN,home,BUF,BUF,30.0,...,,,,0,,,0.0,0.0,0.0,0.0
1,2009,59,2009080950,TEN,BUF,TEN,home,BUF,TEN,74.0,...,,,,0,,,0.0,0.0,0.0,0.0
2,2009,83,2009080950,TEN,BUF,TEN,home,BUF,TEN,64.0,...,,,,0,,,0.0,0.0,0.0,0.0
3,2009,104,2009080950,TEN,BUF,TEN,home,BUF,TEN,67.0,...,,,,0,,,0.0,0.0,0.0,0.0
4,2009,128,2009080950,TEN,BUF,TEN,home,BUF,TEN,55.0,...,,,,0,,,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,2009,3845,2009080950,TEN,BUF,TEN,home,BUF,TEN,64.0,...,,,,0,,,0.0,0.0,0.0,0.0
159,2009,3862,2009080950,TEN,BUF,TEN,home,BUF,TEN,66.0,...,,,,0,,,0.0,0.0,0.0,0.0
160,2009,3883,2009080950,TEN,BUF,TEN,home,BUF,TEN,68.0,...,,,,0,,,0.0,0.0,0.0,0.0
161,2009,3904,2009080950,TEN,BUF,TEN,home,BUF,TEN,68.0,...,,,,0,,,0.0,0.0,0.0,0.0


In [62]:
# Scatter of yardline_100 for every play of the selected game, colored by td_prob.
# Each play is placed on the x axis at its row label in pbp_game1.
fig = px.scatter(
    pbp_game1,
    x=pbp_game1.index,
    y='yardline_100',
    color='td_prob',
    title='yardline_100 by play, colored by td_prob',
)
# The index is passed as a bare array, so name the x axis explicitly instead of
# relying on an auto-generated label.
fig.update_xaxes(title_text='Row index of play')
fig.show()

In [52]:
print("Plays of Interest:")
# Walk the game's plays in order, numbering them from 0, and report only the
# ones flagged with sp == 1 (presumably scoring plays: the printed results are
# touchdowns, extra points and field goals).
for idx, row in enumerate(pbp_game1.itertuples()):
    if row.sp != 1:
        continue
    # Possessing team: the home side when posteam_type is "home", otherwise the away side
    team = row.home_team if row.posteam_type == "home" else row.away_team
    print(f"Index: {idx}\n\tPoss. Team: {team} Desc: {row.desc}")

Plays of Interest:
Index: 8
	Poss. Team: TEN Desc: (15:00) (Punt formation) A.Trapasso left end for 40 yards, TOUCHDOWN.
Index: 9
	Poss. Team: TEN Desc: R.Bironas extra point is GOOD, Center-K.Amato, Holder-A.Trapasso.
Index: 30
	Poss. Team: TEN Desc: (6:55) L.White left guard for 3 yards, TOUCHDOWN.
Index: 31
	Poss. Team: TEN Desc: R.Bironas extra point is GOOD, Center-K.Amato, Holder-A.Trapasso.
Index: 61
	Poss. Team: BUF Desc: (8:21) (Field Goal formation) R.Lindell 20 yard field goal is GOOD, Center-R.Neill, Holder-B.Moorman.
Index: 75
	Poss. Team: TEN Desc: (1:12) V.Young pass short right to P.Williams for 5 yards, TOUCHDOWN.
Index: 76
	Poss. Team: TEN Desc: R.Bironas extra point is GOOD, Center-K.Amato, Holder-A.Trapasso.
Index: 84
	Poss. Team: BUF Desc: (:10) (Field Goal formation) R.Lindell 47 yard field goal is GOOD, Center-R.Neill, Holder-B.Moorman.
Index: 103
	Poss. Team: BUF Desc: (7:17) R.Lindell 52 yard field goal is GOOD, Center-G.Sanborn, Holder-B.Moorman.
Index: 132
	P