In [2]:
import pandas as pd
import numpy as np
import os
from nba_api.stats.static import teams
from nba_api.stats.endpoints import scoreboard
from nba_api.stats.endpoints import leaguegamefinder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime, timedelta


In [4]:
def get_basic_boxscores(date="2018-09-01"):
    nba_teams = teams.get_teams()
    team_names = [team['full_name'] for team in nba_teams]
    team_names.sort()
    team_ids = [team['id'] for team in nba_teams]

    games = None
    for ids in team_ids:
        if games is None:
            gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=ids)
            games = gamefinder.get_data_frames()[0]
        else:
            gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=ids)
            games = pd.concat([games, gamefinder.get_data_frames()[0]])

    games.GAME_ID = pd.to_numeric(games.GAME_ID, downcast='integer')
    games.GAME_DATE = pd.to_datetime(games.GAME_DATE, infer_datetime_format=True)

    today = (datetime.utcnow() - timedelta(hours=9)).strftime('%Y-%m-%d')
    games = games[games['GAME_DATE'] < today]
    games = games[games['GAME_DATE'] > date].sort_values(by='GAME_DATE', ascending=False)

    games.reset_index(drop=True, inplace=True)

    games.sort_values(by='GAME_ID', ascending=False)
    games['HOME_TEAM'] = 0
    games.loc[games['MATCHUP'].str.contains('vs.'), ['HOME_TEAM']] = 1
    games.loc[games['HOME_TEAM'] == 1, :]
    games.sort_values(by=['GAME_DATE', 'GAME_ID'], ascending=False).reset_index(drop=True)
    return games

In [5]:
def roll(df, roll_number = 10, procedure = '', suff = '_Roll', selected_columns=[]):
    df_rolling = df[selected_columns + ["TEAM_ABBREVIATION"]]
    df_rolling = df_rolling.groupby(["TEAM_ABBREVIATION"], group_keys=False)

    def find_team_averages(team):
        return team.rolling(roll_number, closed='left').mean()

    def find_team_medians(team):
        return team.rolling(roll_number, closed='left').median()

    def find_team_stds(team):
        return team.rolling(roll_number, closed='left').std()

    if procedure == 'median':
        df_rolling = df_rolling.apply(find_team_medians)
    elif procedure == 'std':
        df_rolling = df_rolling.apply(find_team_stds)
    else:
        procedure = 'mean'
        df_rolling = df_rolling.apply(find_team_averages)

    df_rolling = df_rolling[selected_columns]
    df_rolling = df_rolling.sort_index()

    new_column_names = {}
    for col in df_rolling.columns:
        new_column_names[col] = col + suff + '_' + procedure

    df_rolling = df_rolling.rename(columns=new_column_names)
    return df_rolling

In [7]:
#get basic boxscore data to add columns to the advanced boxscore
date = datetime.now() - timedelta(days=60)
date_str = date.strftime('%Y-%m-%d')

basic = get_basic_boxscores(date=date_str)
games_df = basic[['TEAM_ID', 'TEAM_ABBREVIATION', 'GAME_ID', 'GAME_DATE', 'HOME_TEAM', 'PLUS_MINUS']].copy()

In [8]:
#get advanced boxscore data from pickle
# advanced = pd.read_pickle(f'data/pkl/{adv_pickle_filename}')
advanced = pd.read_pickle('../data/pkl/boxscores_advanced_team_all.pkl')

In [10]:
############################################################################
# Get today's date
today = datetime.now().strftime('%Y-%m-%d')

# Get scoreboard for today's games
scoreboard_today = scoreboard.Scoreboard(game_date=today)
games = scoreboard_today.game_header.get_data_frame()

# Get all NBA teams
nba_teams = teams.get_teams()

In [11]:
# Create an empty list to store the team data
team_data = []
# Loop through each game and add team data to the list
for index, game in games.iterrows():
    home_team_id = game["HOME_TEAM_ID"]
    away_team_id = game["VISITOR_TEAM_ID"]

    home_team = next((team for team in nba_teams if team["id"] == home_team_id), None)
    away_team = next((team for team in nba_teams if team["id"] == away_team_id), None)

    if home_team is not None and away_team is not None:
        team_data.append({
            "game_id": game["GAME_ID"],
            "home_team_id": home_team["id"],
            "home_team": home_team["abbreviation"],
            "away_team_id": away_team["id"],
            "away_team": away_team["abbreviation"]
        })

# Convert the list of team data to a DataFrame
team_df = pd.DataFrame(team_data)

In [12]:
team_df

Unnamed: 0,game_id,home_team_id,home_team,away_team_id,away_team
0,22201042,1610612765,DET,1610612743,DEN
1,22201043,1610612751,BKN,1610612758,SAC
2,22201044,1610612761,TOR,1610612760,OKC
3,22201045,1610612749,MIL,1610612754,IND
4,22201046,1610612756,PHX,1610612753,ORL


In [13]:
df1 = team_df[['home_team_id', 'home_team', 'game_id']]
df1.rename(columns={'game_id': 'GAME_ID', 'home_team': 'TEAM_ABBREVIATION', 'home_team_id': 'TEAM_ID'}, inplace=True)
df1['GAME_DATE'] = today
df1['HOME_TEAM'] = 1
df1['PLUS_MINUS'] = 0
df2 = team_df[['away_team_id', 'away_team', 'game_id']]
df2.rename(columns={'game_id': 'GAME_ID', 'away_team': 'TEAM_ABBREVIATION', 'away_team_id': 'TEAM_ID'}, inplace=True)
df2['GAME_DATE'] = today
df2['HOME_TEAM'] = 0
df2['PLUS_MINUS'] = 0
games_today_df = pd.concat([df1, df2], ignore_index=True, sort=False)
games_today_df.GAME_DATE = pd.to_datetime(games_today_df.GAME_DATE)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.rename(columns={'game_id': 'GAME_ID', 'home_team': 'TEAM_ABBREVIATION', 'home_team_id': 'TEAM_ID'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['GAME_DATE'] = today
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.rename(columns={'game_id': 'GAME_ID', 'away_team': 'TEAM_ABBREVIATION', 'away_team_id': 'TEAM_ID'}, inplace=True)
A value is trying to be set on a copy of a slice from a Da

In [14]:
games_today_df

Unnamed: 0,TEAM_ID,TEAM_ABBREVIATION,GAME_ID,GAME_DATE,HOME_TEAM,PLUS_MINUS
0,1610612765,DET,22201042,2023-03-16,1,0
1,1610612751,BKN,22201043,2023-03-16,1,0
2,1610612761,TOR,22201044,2023-03-16,1,0
3,1610612749,MIL,22201045,2023-03-16,1,0
4,1610612756,PHX,22201046,2023-03-16,1,0
5,1610612743,DEN,22201042,2023-03-16,0,0
6,1610612758,SAC,22201043,2023-03-16,0,0
7,1610612760,OKC,22201044,2023-03-16,0,0
8,1610612754,IND,22201045,2023-03-16,0,0
9,1610612753,ORL,22201046,2023-03-16,0,0


In [15]:
games_df = pd.concat([games_today_df, games_df], ignore_index=True, sort=False)
games_df

Unnamed: 0,TEAM_ID,TEAM_ABBREVIATION,GAME_ID,GAME_DATE,HOME_TEAM,PLUS_MINUS
0,1610612765,DET,0022201042,2023-03-16,1,0.0
1,1610612751,BKN,0022201043,2023-03-16,1,0.0
2,1610612761,TOR,0022201044,2023-03-16,1,0.0
3,1610612749,MIL,0022201045,2023-03-16,1,0.0
4,1610612756,PHX,0022201046,2023-03-16,1,0.0
...,...,...,...,...,...,...
763,1610612764,WAS,22200660,2023-01-16,1,-9.0
764,1610612756,PHX,22200663,2023-01-16,0,-30.0
765,1610612761,TOR,22200659,2023-01-16,0,2.0
766,1610612762,UTA,22200662,2023-01-16,0,1.0


In [21]:
advanced_today_df = games_today_df

columns = ['TEAM_NAME', 'TEAM_CITY',
'MIN', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO',
'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TM_TOV_PCT', 'TM_TOV_PCT',
'EFG_PCT', 'TS_PCT', 'USG_PCT', 'E_USG_PCT', 'E_PACE', 'PACE',
'PACE_PER40', 'POSS', 'PIE']

for column in columns:
    advanced_today_df[column] = 0

advanced_today_df = advanced_today_df.reindex(columns=['GAME_ID', 'TEAM_ID', 'TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CITY',
                                                    'MIN', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
                                                    'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO',
                                                    'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TM_TOV_PCT', 'TM_TOV_PCT',
                                                    'EFG_PCT', 'TS_PCT', 'USG_PCT', 'E_USG_PCT', 'E_PACE', 'PACE',
                                                    'PACE_PER40', 'POSS', 'PIE'])

In [22]:
advanced_today_df

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,E_OFF_RATING,OFF_RATING,E_DEF_RATING,DEF_RATING,...,TM_TOV_PCT,EFG_PCT,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE
0,22201042,1610612765,0,DET,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,22201043,1610612751,0,BKN,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22201044,1610612761,0,TOR,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,22201045,1610612749,0,MIL,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,22201046,1610612756,0,PHX,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,22201042,1610612743,0,DEN,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,22201043,1610612758,0,SAC,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,22201044,1610612760,0,OKC,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,22201045,1610612754,0,IND,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,22201046,1610612753,0,ORL,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
advanced = pd.concat([advanced_today_df, advanced], ignore_index=True, sort=False)
############################################################################

In [17]:
#drop unecessary columns
columns_to_drop = ['TEAM_CITY', 'MIN', 'E_OFF_RATING', 'E_DEF_RATING',
               'E_NET_RATING', 'AST_RATIO', 'E_TM_TOV_PCT', 'USG_PCT',
               'E_USG_PCT', 'E_PACE', 'PACE_PER40', 'PIE']
advanced = advanced.drop(columns=columns_to_drop)

In [20]:
advanced.head(20)

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,OFF_RATING,DEF_RATING,NET_RATING,AST_PCT,AST_TOV,OREB_PCT,DREB_PCT,REB_PCT,TM_TOV_PCT,EFG_PCT,TS_PCT,PACE,POSS,GAME_DATE,HOME_TEAM,PLUS_MINUS
0,22201042,1610612765,0,DET,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,NaT,,
1,22201043,1610612751,0,BKN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,NaT,,
2,22201044,1610612761,0,TOR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,NaT,,
3,22201045,1610612749,0,MIL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,NaT,,
4,22201046,1610612756,0,PHX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,NaT,,
5,22201042,1610612743,0,DEN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,NaT,,
6,22201043,1610612758,0,SAC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,NaT,,
7,22201044,1610612760,0,OKC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,NaT,,
8,22201045,1610612754,0,IND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,NaT,,
9,22201046,1610612753,0,ORL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,NaT,,


In [None]:
#change game_id type to match between the 2 data frames
games_df['GAME_ID'] = games_df['GAME_ID'].astype('int32')
advanced['GAME_ID'] = advanced['GAME_ID'].astype('int32')

#merge the needed columns from basic to advanced
advanced = advanced.merge(games_df.drop(columns=['TEAM_ID']), on=['GAME_ID', 'TEAM_ABBREVIATION'])

advanced = advanced.drop_duplicates()

#drop rows that only have 1 team for the game id
value_counts = advanced['GAME_ID'].value_counts()
unique_values = value_counts[value_counts == 1].index.tolist()
advanced = advanced[~advanced['GAME_ID'].isin(unique_values)]
advanced = advanced.reset_index(drop=True)

advanced_desc = advanced.sort_values(by=['GAME_DATE'], ascending=True).copy()

#define features to engineer
non_eng_features = ['TEAM_ABBREVIATION', 'GAME_ID', 'TEAM_ID', 'TEAM_NAME',
                    'GAME_DATE', 'HOME_TEAM', 'PLUS_MINUS']
eng_features = advanced_desc.drop(columns=non_eng_features).columns.tolist()

#caluculate rolling metrics
if 'mean' in roll_methods:
    df_temp = roll(df = advanced_desc, roll_number=4, procedure='mean', selected_columns=eng_features)
    advanced = advanced.merge(df_temp, left_index=True, right_index=True)
if 'median' in roll_methods:
    df_temp = roll(df = advanced_desc, roll_number=4, procedure='median', selected_columns=eng_features)
    advanced = advanced.merge(df_temp, left_index=True, right_index=True)
if 'std' in roll_methods:
    df_temp = roll(df = advanced_desc, roll_number=4, procedure='std', selected_columns=eng_features)
    advanced = advanced.merge(df_temp, left_index=True, right_index=True)

#drop original columns to prevent data leakage
drop_columns = ['OFF_RATING', 'DEF_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'OREB_PCT', 'DREB_PCT',
    'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT', 'PACE', 'POSS']
advanced.drop(columns=drop_columns, inplace=True)

#split data frame between the home teams and the away teams
advanced = advanced.sort_values(by=['GAME_DATE', 'GAME_ID', 'HOME_TEAM'], ascending=False).reset_index(drop=True)
adv_home = advanced.iloc[::2].copy()
adv_away = advanced.iloc[1::2].copy()

#change the column names based on home or away
columns_away = {}
columns_home = {}
columns_to_merge = []
for column in advanced.columns:
    if column == 'GAME_ID' or column == 'PLUS_MINUS' or column == 'GAME_DATE':
        continue
    columns_to_merge.append(column + '_a')
    columns_away[column] = column + '_a'
    columns_home[column] = column + '_h'
#merge the home and away data frames on to the same game id
adv_away.rename(columns=columns_away, inplace=True)
adv_home.rename(columns=columns_home, inplace=True)
columns_to_merge.append('GAME_ID')
merged_df = adv_home.merge(adv_away[columns_to_merge], on=['GAME_ID'])
merged_df = merged_df.dropna()

#get elo and raptor scores
elo_past = pd.read_pickle('data/pkl/elo_past.pkl')


merged_df = merged_df.merge(elo_past, left_on=['GAME_DATE', 'TEAM_ABBREVIATION_h'], right_on=['date', 'team1'])
merged_df.drop(columns=['date', 'team1', 'team2'], inplace=True)
merged_df.rename(columns={'elo1_pre': 'elo_h', 'elo2_pre': 'elo_a', 'raptor1_pre': 'raptor_h', 'raptor2_pre': 'raptor_a'},
      inplace=True)

#make lists of feature column names
X_features_num = [col for col in merged_df.columns if 'GAME_ID' not in col
                 and 'GAME_DATE' not in col
                 and 'TEAM_ID' not in col
                 and 'TEAM_NAME' not in col
                 and 'TEAM_ABBREVIATION' not in col
                 and 'PLUS_MINUS' not in col
                 and 'HOME_TEAM' not in col]

X_features_cat = ['TEAM_ABBREVIATION_h', 'TEAM_ABBREVIATION_a']
#scale the numerical features
preproc_data = merged_df.copy()
if scaled == True:
    scaler = MinMaxScaler()
    preproc_data[X_features_num] = scaler.fit_transform(preproc_data[X_features_num])

#one hot encode the teams
if ohe == True:
    ohe = OneHotEncoder(sparse=False)
    ohe.fit(preproc_data[X_features_cat])
    cols = [str(team) +'_h' for team in ohe.categories_[0]] + [str(team) +'_a' for team in ohe.categories_[1]]
    preproc_data[cols]=ohe.transform(preproc_data[X_features_cat])
    X_features = [col for col in preproc_data.columns if 'GAME_ID' not in col
                 and 'GAME_DATE' not in col
                 and 'TEAM_ID' not in col
                 and 'TEAM_NAME' not in col
                 and 'TEAM_ABBREVIATION' not in col
                 and 'PLUS_MINUS' not in col
                 and 'HOME_TEAM' not in col]
else:
    X_features = [col for col in preproc_data.columns if 'GAME_ID' not in col
                 and 'GAME_DATE' not in col
                 and 'TEAM_ID' not in col
                 and 'TEAM_NAME' not in col
                 and 'PLUS_MINUS' not in col
                 and 'HOME_TEAM' not in col]