In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
# Seasons covered by the analysis: 2009 through 2019 inclusive.
years = list(range(2009, 2020))

In [3]:
# Read one play-by-play CSV per season for each phase of the year
# (pre-season, regular season, post-season), keyed by season year.
preseason_dfs, regseason_dfs, pstseason_dfs = {}, {}, {}
for year in years:
    preseason_dfs[year] = pd.read_csv(
        f"../cleaned_data/pbp_data/pre_season/pre_pbp_{year}.csv",
        low_memory=False,
    )
    regseason_dfs[year] = pd.read_csv(
        f"../cleaned_data/pbp_data/regular_season/reg_pbp_{year}.csv",
        low_memory=False,
    )
    pstseason_dfs[year] = pd.read_csv(
        f"../cleaned_data/pbp_data/post_season/post_pbp_{year}.csv",
        low_memory=False,
    )

# Concatenating a dict stacks its frames and uses the dict keys (the years)
# as the outer index level.
all_pre_pbp = pd.concat(preseason_dfs)
all_reg_pbp = pd.concat(regseason_dfs)
all_pst_pbp = pd.concat(pstseason_dfs)

# Single frame holding every play from all three phases.
df = pd.concat([all_pre_pbp, all_reg_pbp, all_pst_pbp])

print(df.shape)

(614497, 257)


In [4]:
# Drop the leftover 'Unnamed: 0' column that came in with the CSVs.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

## Data Exploration

In [5]:
# Render the combined play-by-play frame; the notebook output is a truncated
# preview showing only the first and last rows and a subset of the columns.
df

Unnamed: 0,Unnamed: 1,season,play_id,game_id,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,...,penalty_player_name,penalty_yards,replay_or_challenge,replay_or_challenge_result,penalty_type,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv,season_type
2009,0,2009,59,2009080950,TEN,BUF,TEN,home,BUF,TEN,74.0,...,,0.0,0,,,0.0,0.0,0.0,0.0,pre
2009,1,2009,83,2009080950,TEN,BUF,TEN,home,BUF,TEN,64.0,...,,0.0,0,,,0.0,0.0,0.0,0.0,pre
2009,2,2009,104,2009080950,TEN,BUF,TEN,home,BUF,TEN,67.0,...,,0.0,0,,,0.0,0.0,0.0,0.0,pre
2009,3,2009,128,2009080950,TEN,BUF,TEN,home,BUF,TEN,55.0,...,,0.0,0,,,0.0,0.0,0.0,0.0,pre
2009,4,2009,152,2009080950,TEN,BUF,TEN,home,BUF,BUF,40.0,...,,0.0,0,,,0.0,0.0,0.0,0.0,pre
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,1972,2019,3998,2020020200,KC,SF,KC,home,SF,KC,59.0,...,,0.0,0,,,0.0,0.0,0.0,0.0,post
2019,1973,2019,4015,2020020200,KC,SF,KC,home,SF,KC,64.0,...,,0.0,0,,,0.0,0.0,0.0,0.0,post
2019,1974,2019,4036,2020020200,KC,SF,KC,home,SF,KC,64.0,...,,0.0,0,,,0.0,0.0,0.0,0.0,post
2019,1975,2019,4053,2020020200,KC,SF,KC,home,SF,KC,67.0,...,,0.0,0,,,0.0,0.0,0.0,0.0,post


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,season,play_id,game_id,yardline_100,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,quarter_end,drive,sp,...,lateral_recovery,fumble_recovery_1_yards,fumble_recovery_2_yards,return_yards,penalty_yards,replay_or_challenge,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv
count,614497.0,614497.0,614497.0,614497.0,614497.0,614497.0,614497.0,614497.0,614497.0,614497.0,...,614497.0,614497.0,614497.0,614497.0,614497.0,614497.0,614497.0,614497.0,614497.0,614497.0
mean,2014.006986,2121.765107,2014162000.0,50.136071,424.721916,825.88075,1722.408153,0.0,12.119817,0.071548,...,0.001941,0.030913,0.000474,1.021743,0.656052,0.008491,6.3e-05,1.6e-05,0.0,0.0
std,3.152467,1223.330103,3148375.0,24.980885,273.033367,547.358634,1038.467798,0.0,7.010377,0.257738,...,0.044019,1.07888,0.139258,5.398679,2.715009,0.091757,0.007966,0.004034,0.0,0.0
min,2009.0,35.0,2009081000.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,-34.0,-3.0,-16.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2011.0,1070.0,2011121000.0,31.0,174.0,317.0,821.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2014.0,2111.0,2014102000.0,53.0,412.0,812.0,1800.0,0.0,12.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2017.0,3147.0,2017092000.0,71.0,661.0,1299.0,2591.0,0.0,18.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2019.0,5876.0,2020020000.0,99.0,900.0,1800.0,3600.0,0.0,38.0,1.0,...,1.0,106.0,77.0,109.0,66.0,1.0,1.0,1.0,0.0,0.0


In [7]:
# Number of distinct game ids in the data (missing values, if any, count as one id).
n_games = df['game_id'].nunique(dropna=False)
print("Games available in data:", n_games)

Games available in data: 3658


In [8]:
from myFuncs import nanalysis

In [9]:
# Run the project-local `nanalysis` helper (imported from myFuncs) on the full frame
# and keep whatever it returns.
# NOTE(review): the helper's implementation is not visible here; judging by the message
# it prints, it presumably reports on NaNs in the numeric columns -- confirm.
nan_results = nanalysis(df)

No NaNs remaining in numeric data! Well Done!


In [52]:
# Plot field position (yardline_100) play by play, coloured by expected points (ep),
# for the first few games in the data.
# The number of games is a named constant rather than a manual counter + break, and the
# loop variable is `game_id` so the builtin `id` is no longer shadowed for later cells.
N_GAMES_TO_PLOT = 3
for game_id in df['game_id'].unique()[:N_GAMES_TO_PLOT]:
    game = df[df['game_id'] == game_id]
    # x is simply the play's ordinal position within the game.
    fig = px.scatter(game, x=range(len(game)),
                     y='yardline_100', color='ep', title=f'Game ID: {game_id}',
                     width=1200, height=400)
    fig.show()

In [13]:
# Collect the numeric columns and list their names, one per line.
df_numeric = df.select_dtypes(include="number")
numeric_features = list(df_numeric.columns)
if numeric_features:
    print(*numeric_features, sep="\n")

season
play_id
game_id
yardline_100
quarter_seconds_remaining
half_seconds_remaining
game_seconds_remaining
quarter_end
drive
sp
qtr
down
goal_to_go
ydstogo
ydsnet
yards_gained
shotgun
no_huddle
qb_dropback
qb_kneel
qb_spike
qb_scramble
air_yards
yards_after_catch
kick_distance
home_timeouts_remaining
away_timeouts_remaining
timeout
posteam_timeouts_remaining
defteam_timeouts_remaining
total_home_score
total_away_score
posteam_score
defteam_score
score_differential
posteam_score_post
defteam_score_post
score_differential_post
no_score_prob
opp_fg_prob
opp_safety_prob
opp_td_prob
fg_prob
safety_prob
td_prob
extra_point_prob
two_point_conversion_prob
ep
epa
total_home_epa
total_away_epa
total_home_rush_epa
total_away_rush_epa
total_home_pass_epa
total_away_pass_epa
air_epa
yac_epa
comp_air_epa
comp_yac_epa
total_home_comp_air_epa
total_away_comp_air_epa
total_home_comp_yac_epa
total_away_comp_yac_epa
total_home_raw_air_epa
total_away_raw_air_epa
total_home_raw_yac_epa
total_away_raw_yac_

## Team Offense Stats

In [17]:
# Distinct home-team codes found in the data.
# NOTE(review): the printed list contains pairs such as 'JAC'/'JAX' and codes such as
# 'AFC'/'NFC' -- presumably franchise aliases and all-star games; confirm whether they
# should be merged or excluded before computing per-team statistics.
home_teams = df['home_team']
teams = home_teams.unique()
print(teams)

['TEN' 'PHI' 'BAL' 'PIT' 'OAK' 'NYJ' 'IND' 'NO' 'SF' 'DET' 'BUF' 'GB' 'KC'
 'SD' 'MIA' 'NYG' 'NE' 'STL' 'DAL' 'MIN' 'WAS' 'CLE' 'JAC' 'HOU' 'CHI'
 'ARI' 'SEA' 'CIN' 'TB' 'CAR' 'ATL' 'DEN' 'LA' 'LAC' 'JAX' 'AFC' 'NFC'
 'SAN' 'CRT' 'RIC' 'NPR' 'APR']


In [35]:
# Per-team offensive averages, kept in the same order as `teams`.
avg_yds_gained, avg_epa, avg_wpa = [], [], []
for team in teams:
    # Plays on which this team had possession.
    filtered_df = df.loc[df['posteam'] == team]
    team_yards = filtered_df['yards_gained']

    avg_yds_gained.append(team_yards.mean())
    avg_epa.append(filtered_df['epa'].mean())
    avg_wpa.append(filtered_df['wpa'].mean())

    # Show the yards-gained distribution for the team.
    print(team)
    print(team_yards.describe(), "\n")

TEN
count    18168.000000
mean         4.005724
std          8.144682
min        -19.000000
25%          0.000000
50%          0.000000
75%          6.000000
max         99.000000
Name: yards_gained, dtype: float64 

PHI
count    19919.000000
mean         4.186054
std          8.080012
min        -24.000000
25%          0.000000
50%          0.000000
75%          6.000000
max         91.000000
Name: yards_gained, dtype: float64 

BAL
count    20408.000000
mean         3.944973
std          7.637706
min        -22.000000
25%          0.000000
50%          1.000000
75%          6.000000
max         95.000000
Name: yards_gained, dtype: float64 

PIT
count    19204.000000
mean         4.172256
std          8.283921
min        -19.000000
25%          0.000000
50%          0.000000
75%          6.000000
max         97.000000
Name: yards_gained, dtype: float64 

OAK
count    18420.000000
mean         3.905537
std          7.914302
min        -18.000000
25%          0.000000
50%          0.000

In [39]:
# Assemble the per-team averages into one table, one row per team.
team_stats = pd.DataFrame(
    {
        'team': teams,
        'avg_yds_per_play': avg_yds_gained,
        'avg_epa': avg_epa,
        'avg_wpa': avg_wpa,
    }
)

# Bar chart of average yards gained per play for each team.
fig = px.bar(
    data_frame=team_stats,
    x='team',
    y='avg_yds_per_play',
    title='Average Yards Gained Per Play by Team',
    width=1200,
    height=400,
)
fig.show()

In [43]:
# Bar chart of each team's average EPA on plays where it had possession.
fig = px.bar(
    data_frame=team_stats,
    x='team',
    y='avg_epa',
    title='Average EPA by Team (Offensive Plays)',
    width=1200,
    height=400,
)
fig.show()

In [44]:
# Bar chart of each team's average WPA on plays where it had possession.
fig = px.bar(
    data_frame=team_stats,
    x='team',
    y='avg_wpa',
    title='Average WPA by Team (Offensive Plays)',
    width=1200,
    height=400,
)
fig.show()

In [49]:
from collections import defaultdict

# filter to touchdown plays
# NOTE(review): `touchdown == 1` may also include defensive/return touchdowns on pass or
# run plays -- confirm whether those should be credited to the passer/receiver/rusher.
touchdown_plays = df[df['touchdown'] == 1]

# Touchdown tallies keyed by player name; unseen names start at 0.
passer_td_counts = defaultdict(int)
receiver_td_counts = defaultdict(int)
rusher_td_counts = defaultdict(int)

# Iterating a DataFrame directly yields its column labels, not its rows, so the rows
# are walked with itertuples(). Only the columns the tally reads are selected, which
# keeps the row tuples small and guarantees attribute access by column name.
td_columns = ['play_type', 'passer_player_name', 'receiver_player_name', 'rusher_player_name']
for row in touchdown_plays[td_columns].itertuples(index=False):
    if row.play_type == "pass":
        # A missing name is skipped so NaN never becomes a dictionary key.
        if pd.notna(row.passer_player_name):
            passer_td_counts[row.passer_player_name] += 1
        if pd.notna(row.receiver_player_name):
            receiver_td_counts[row.receiver_player_name] += 1
    elif row.play_type == "run":
        if pd.notna(row.rusher_player_name):
            rusher_td_counts[row.rusher_player_name] += 1

17333
