In [1]:
import pandas as pd
import plotly_express as px

In [2]:
# Let's store some metadata
years = [year for year in range(2009,2020)]

## Data Importing

In [3]:
# Collect the data on all pre/reg/post season games
preseason_games = {}
regseason_games = {}
pstseason_games = {}

for year in years:
    preseason_games[year] = pd.read_csv(f"../data/games_data/pre_season/pre_games_{year}.csv")
    regseason_games[year] = pd.read_csv(f"../data/games_data/regular_season/reg_games_{year}.csv")
    pstseason_games[year] = pd.read_csv(f"../data/games_data/post_season/post_games_{year}.csv")

In [4]:
# Collect the play-by-play data from pre/reg/post season
preseason_pbp = {}
regseason_pbp = {}
pstseason_pbp = {}

for year in years:
    preseason_pbp[year] = pd.read_csv(f"../data/pbp_data/pre_season/pre_pbp_{year}.csv", low_memory=False)
    regseason_pbp[year] = pd.read_csv(f"../data/pbp_data/regular_season/reg_pbp_{year}.csv", low_memory=False)
    pstseason_pbp[year] = pd.read_csv(f"../data/pbp_data/post_season/post_pbp_{year}.csv", low_memory=False)

## Feature Engineering

In [5]:
# Combine all game data into one frame
all_preseason = pd.concat(preseason_games)
all_regseason = pd.concat(regseason_games)
all_pstseason = pd.concat(pstseason_games)
all_games = pd.concat([all_preseason, all_regseason, all_pstseason])

# remove columns which I find useless
all_games.drop(['Unnamed: 0', 'game_url'], axis=1, inplace=True)

# add diff column
all_games['diff'] = all_games['home_score'] - all_games['away_score']

# drop nan rows
print("# Games before dropping NaN rows:", len(all_games))
all_games.dropna(inplace=True)
print("# Games after dropping NaN rows:", len(all_games))

# drop games with teams that have only played a few games
odd_teams = ['APR', 'NPR', 'SAN', 'CRT', 'RIC']
for team in odd_teams:
    indexToDrop = all_games[ (all_games['home_team'] == team)].index
    all_games.drop(index=indexToDrop, inplace=True)
for team in odd_teams:
    indexToDrop = all_games[ (all_games['away_team'] == team)].index
    all_games.drop(index=indexToDrop, inplace=True)
print("# Games agyer dropping games with odd teams:", len(all_games))

features = all_games.columns.tolist()
print("Features:", features)

# Games before dropping NaN rows: 3661
# Games after dropping NaN rows: 3660
# Games agyer dropping games with odd teams: 3630
Features: ['type', 'game_id', 'home_team', 'away_team', 'week', 'season', 'state_of_game', 'home_score', 'away_score', 'diff']


In [6]:
pd.set_option('display.width',500)
print(all_games)

         type     game_id home_team away_team  week  season state_of_game  home_score  away_score  diff
2009 0    pre  2009080950       TEN       BUF     0    2009          POST        21.0        18.0   3.0
     1    pre  2009081351       PHI        NE     1    2009          POST        25.0        27.0  -2.0
     2    pre  2009081352       BAL       WAS     1    2009          POST        23.0         0.0  23.0
     3    pre  2009081350       PIT       ARI     1    2009          POST        20.0        10.0  10.0
     4    pre  2009081353       OAK       DAL     1    2009          POST        31.0        10.0  21.0
...       ...         ...       ...       ...   ...     ...           ...         ...         ...   ...
2019 7   post  2020011200        KC       HOU    18    2019          POST        51.0        31.0  20.0
     8   post  2020011201        GB       SEA    18    2019          POST        28.0        23.0   5.0
     9   post  2020011900        KC       TEN    18    2019     

## Data Exploration

In [7]:
teams = all_games['home_team'].unique()
print(teams)

for team in teams:
    print(team)
    print("Num home games:", len(all_games[all_games['home_team'] == team]))
    print("Num away games:", len(all_games[all_games['away_team'] == team]))

['TEN' 'PHI' 'BAL' 'PIT' 'OAK' 'NYJ' 'IND' 'NO' 'SF' 'DET' 'GB' 'KC' 'SD'
 'MIA' 'NYG' 'NE' 'STL' 'DAL' 'MIN' 'WAS' 'CLE' 'JAC' 'HOU' 'CHI' 'ARI'
 'SEA' 'CIN' 'TB' 'CAR' 'ATL' 'DEN' 'BUF' 'LA' 'JAX' 'LAC']
TEN
Num home games: 111
Num away games: 114
PHI
Num home games: 114
Num away games: 113
BAL
Num home games: 115
Num away games: 120
PIT
Num home games: 115
Num away games: 116
OAK
Num home games: 110
Num away games: 110
NYJ
Num home games: 110
Num away games: 116
IND
Num home games: 116
Num away games: 116
NO
Num home games: 118
Num away games: 116
SF
Num home games: 114
Num away games: 114
DET
Num home games: 110
Num away games: 113
GB
Num home games: 115
Num away games: 120
KC
Num home games: 117
Num away games: 113
SD
Num home games: 81
Num away games: 82
MIA
Num home games: 108
Num away games: 111
NYG
Num home games: 111
Num away games: 115
NE
Num home games: 128
Num away games: 115
STL
Num home games: 70
Num away games: 70
DAL
Num home games: 116
Num away games: 113
MIN
Num home

In [8]:
# let's rank the performance of each team at home by their avg diff
avg_home_diffs = []
for team in teams:
    home_games = all_games[all_games['home_team'] == team]
    avg_home_diffs.append(home_games['diff'].mean())
    
df_teams = pd.DataFrame({'team': teams, 'avg_home_diff': avg_home_diffs})
df_teams['pos_avg_diff'] = df_teams['avg_home_diff'] > 0

In [10]:
fig = px.bar(df_teams, x='team', y='avg_home_diff', 
             color='pos_avg_diff', color_discrete_map={False:"red", True:"green"},
             title="Team Performance at Home",
             labels={"team":"Team", "avg_home_diff":"Avg. Diff @ Home"})
fig.update_layout(showlegend=False)
fig.show()

In [12]:
fig = px.histogram(all_games, x='diff',
                   title='Differential Distribution - All Games 2009-2019',
                   labels={'diff':'Differential'})
fig.show()