In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

df = pd.read_csv('data/team_stats.csv')
cols_to_move = ['season','date','team','team_opp','won','home']
new_col_order = cols_to_move + [col for col in df.columns if col not in cols_to_move]
df = df[new_col_order].copy()
df = df.drop(columns=['mp','mp.1','standings','standings_opp'])

# drop all columns with more than 100 missing values
df = df.dropna(thresh=100, axis=1).copy()

nulls = df.isnull().sum()
nulls = nulls[nulls > 0]

# fill null columns with mean of the column
for col in nulls.index:
    df[col] = df[col].fillna(df[col].mean())
    
# Create a unique identifier for each game
df['game_identifier'] = df.apply(lambda row: '_'.join(sorted([row['date'], row['team'], row['team_opp']])), axis=1)

# Assign a unique game ID based on the game_identifier
df['game_id'] = df.groupby('game_identifier').ngroup()
df = df.drop(columns=['game_identifier'])

df = df.sort_values(by=['date','game_id'])

df = df.set_index('game_id')
df = df.reset_index()

df['target'] = df.groupby(['team'])['won'].shift(-1)
df = df.dropna(subset=['target'])

df.insert(6, 'target', df.pop('target'))


# turn won and target into integers
df['won'] = df['won'].astype(int)
df['target'] = df['target'].astype(int)
df

Unnamed: 0,game_id,season,date,team,team_opp,won,target,home,fg,fga,...,trb%_max_opp,ast%_max_opp,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp
0,0,2018,2017-10-17,BOS,CLE,0,0,0,36.0,88.0,...,19.4,43.7,3.8,6.0,31.6,27.3,138.0,107.0,102,1
1,0,2018,2017-10-17,CLE,BOS,1,1,1,38.0,83.0,...,13.7,46.5,9.9,4.5,34.7,29.9,129.0,112.0,99,0
2,1,2018,2017-10-17,HOU,GSW,1,1,0,47.0,97.0,...,22.7,57.1,6.1,9.1,31.7,32.6,250.0,127.0,121,1
3,1,2018,2017-10-17,GSW,HOU,0,1,1,43.0,80.0,...,13.8,42.9,2.8,6.5,18.2,31.2,152.0,126.0,122,0
4,2,2018,2017-10-18,ATL,DAL,1,0,0,48.0,94.0,...,29.7,56.6,3.5,9.7,40.0,30.4,219.0,126.0,111,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16834,8417,2024,2024-02-23,MIL,MIN,1,1,0,41.0,87.0,...,24.2,27.3,3.6,6.3,42.9,35.5,163.0,125.0,107,1
16835,8417,2024,2024-02-23,MIN,MIL,0,1,1,43.0,98.0,...,15.3,40.7,10.3,4.8,50.0,32.4,176.0,118.0,112,0
16836,8418,2024,2024-02-23,WAS,OKC,0,0,0,41.0,104.0,...,23.8,37.5,3.8,8.9,100.0,32.0,163.0,108.0,147,1
16837,8418,2024,2024-02-23,OKC,WAS,1,1,1,55.0,92.0,...,32.4,63.9,8.1,5.9,60.0,40.6,130.0,143.0,106,0


In [2]:

def remove_playoffs(df):
    df['gp'] = df.groupby(['season','team']).cumcount() 
    df_len1 = len(df)
    df = df[df['gp'] <= 82].copy()
    df_len2 = len(df)
    print(f'Removed {df_len1 - df_len2} playoff games')
    return df

df = remove_playoffs(df)

df


Removed 690 playoff games


Unnamed: 0,game_id,season,date,team,team_opp,won,target,home,fg,fga,...,ast%_max_opp,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,gp
0,0,2018,2017-10-17,BOS,CLE,0,0,0,36.0,88.0,...,43.7,3.8,6.0,31.6,27.3,138.0,107.0,102,1,0
1,0,2018,2017-10-17,CLE,BOS,1,1,1,38.0,83.0,...,46.5,9.9,4.5,34.7,29.9,129.0,112.0,99,0,0
2,1,2018,2017-10-17,HOU,GSW,1,1,0,47.0,97.0,...,57.1,6.1,9.1,31.7,32.6,250.0,127.0,121,1,0
3,1,2018,2017-10-17,GSW,HOU,0,1,1,43.0,80.0,...,42.9,2.8,6.5,18.2,31.2,152.0,126.0,122,0,0
4,2,2018,2017-10-18,ATL,DAL,1,0,0,48.0,94.0,...,56.6,3.5,9.7,40.0,30.4,219.0,126.0,111,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16834,8417,2024,2024-02-23,MIL,MIN,1,1,0,41.0,87.0,...,27.3,3.6,6.3,42.9,35.5,163.0,125.0,107,1,56
16835,8417,2024,2024-02-23,MIN,MIL,0,1,1,43.0,98.0,...,40.7,10.3,4.8,50.0,32.4,176.0,118.0,112,0,55
16836,8418,2024,2024-02-23,WAS,OKC,0,0,0,41.0,104.0,...,37.5,3.8,8.9,100.0,32.0,163.0,108.0,147,1,55
16837,8418,2024,2024-02-23,OKC,WAS,1,1,1,55.0,92.0,...,63.9,8.1,5.9,60.0,40.6,130.0,143.0,106,0,55


# Create Win Pct features

In [3]:
def get_standings(df):  
    df['games_played'] = df.groupby(['season','team']).cumcount() 
    df['wins'] = (df.groupby(['season','team'])['won'].cumsum())
    df['wins'] = df.groupby(['season','team'])['wins'].shift(1)
    df['win_pct'] = df['wins'] / df['games_played'] 
    df['wins'] = df['wins'].fillna(0)
    df['win_pct'] = df['win_pct'].fillna(0.5)
    df = df.drop(columns=['games_played','wins']).copy()
    
    # get opponent win pct,
    opp_df = df[['game_id','team','win_pct']].copy()
    opp_df.columns = ['game_id','team_opp','win_pct_opp']
    
    return df.merge(opp_df, on=['game_id','team_opp'], how='left')

df = get_standings(df)
   

In [4]:


df = df.sort_values(['team','team_opp','date'])

matchup_count_season = df.groupby(['season','team','team_opp']).cumcount() 
matchup_count_all = df.groupby(['team','team_opp']).cumcount() 

matchup_wins_season = df.groupby(['season','team','team_opp'])['won'].cumsum().shift(1)
matchup_wins_all = df.groupby(['team','team_opp'])['won'].cumsum().shift(1)

df['matchup_win_pct_season'] = matchup_wins_season / matchup_count_season
df['matchup_win_pct_all'] = matchup_wins_all / matchup_count_all

# fill nulls with 0.5
df['matchup_win_pct_season'] = df['matchup_win_pct_season'].fillna(0.5)

# fill inf with .5
df['matchup_win_pct_season'] = df['matchup_win_pct_season'].replace([float('inf')], 0.5)

df['matchup_win_pct_all'] = df['matchup_win_pct_all'].fillna(0.5)

# fill inf with .5
df['matchup_win_pct_all'] = df['matchup_win_pct_all'].replace([float('inf')], 0.5)

df = df.sort_values(by='game_id')

# df.to_parquet('data/team_stats/non_elo.parquet', index=False)



# Find Home Advantage

In [5]:
df_short = df[df['home'] == 1].copy()
data = df_short.copy()

value_counts_data = data['won'].value_counts()

# find the home advantage
home_advantage = value_counts_data[1] / (value_counts_data[0] + value_counts_data[1])

home_advantage

0.5653627135429562

In [6]:

df = df.sort_values(by=['team','date'])


non_rolling_cols = ['target','team_opp','date']

rolling_df = df.drop(columns=non_rolling_cols).copy()
rolling_df = rolling_df.set_index('game_id')

rolling_df = rolling_df.groupby(['season','team']).rolling(10, 1).mean()

rolling_df.columns = [f'{col}_rolling' for col in rolling_df.columns]

rolling_df = rolling_df.reset_index()

df = df.merge(rolling_df, on=['game_id','team','season'], how='left')

df = df.sort_values(by='game_id')

In [7]:
df

Unnamed: 0,game_id,season,date,team,team_opp,won,target,home,fg,fga,...,usg%_max_opp_rolling,ortg_max_opp_rolling,drtg_max_opp_rolling,total_opp_rolling,home_opp_rolling,gp_rolling,win_pct_rolling,win_pct_opp_rolling,matchup_win_pct_season_rolling,matchup_win_pct_all_rolling
536,0,2018,2017-10-17,BOS,CLE,0,0,0,36.0,88.0,...,27.30,138.0,107.0,102.0,1.0,0.0,0.500000,0.500000,0.500000,0.500000
2676,0,2018,2017-10-17,CLE,BOS,1,1,1,38.0,83.0,...,29.90,129.0,112.0,99.0,0.0,0.0,0.500000,0.500000,0.500000,0.500000
4819,1,2018,2017-10-17,GSW,HOU,0,1,1,43.0,80.0,...,31.20,152.0,126.0,122.0,0.0,0.0,0.500000,0.500000,0.500000,0.500000
5345,1,2018,2017-10-17,HOU,GSW,1,1,0,47.0,97.0,...,32.60,250.0,127.0,121.0,1.0,0.0,0.500000,0.500000,0.500000,0.500000
3199,2,2018,2017-10-18,DAL,ATL,0,0,1,38.0,86.0,...,35.60,279.0,124.0,117.0,0.0,0.0,0.500000,0.500000,0.500000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9165,8417,2024,2024-02-23,MIL,MIN,1,1,0,41.0,87.0,...,43.63,174.6,121.3,112.4,0.6,51.5,0.651368,0.504791,0.550000,0.590152
9688,8417,2024,2024-02-23,MIN,MIL,0,1,1,43.0,98.0,...,34.18,194.6,130.6,103.2,0.6,50.5,0.698948,0.523866,0.800000,0.456405
11286,8418,2024,2024-02-23,OKC,WAS,1,1,1,55.0,92.0,...,34.86,191.3,129.9,114.9,0.3,50.5,0.687257,0.498089,0.683333,0.472638
16145,8418,2024,2024-02-23,WAS,OKC,0,0,0,41.0,104.0,...,33.48,165.1,115.1,126.3,0.5,50.5,0.178798,0.628262,0.050000,0.412823
