In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from datetime import datetime, timedelta

import os
os.chdir('/Users/juliayu001/code/alecmatt5/nba_betting_analysis/backend/logic')
from today_games_preprocessor import preprocess_advanced, get_basic_boxscores, roll
os.chdir('/Users/juliayu001/code/alecmatt5/nba_betting_analysis/backend')

from nba_api.stats.endpoints import scoreboard
from nba_api.stats.static import teams

In [None]:
preproc_part1, X_features = preprocess_advanced('boxscores_advanced_team_all.pkl',
                                        roll_methods=['mean'],
                                        ohe=True,
                                        scaled=False)

In [None]:
preproc_part1.head(20)

In [None]:
#get basic boxscore data to add columns to the advanced boxscore
date = datetime.now() - timedelta(days=60)
date_str = date.strftime('%Y-%m-%d')

basic = get_basic_boxscores(date=date_str)
games_df = basic[['TEAM_ID', 'TEAM_ABBREVIATION', 'GAME_ID', 'GAME_DATE', 'HOME_TEAM', 'PLUS_MINUS']].copy()

#get advanced boxscore data from pickle
# advanced = pd.read_pickle(f'data/pkl/{adv_pickle_filename}')
advanced = pd.read_pickle('data/pkl/boxscores_advanced_team_all.pkl')

In [None]:
pd.concat([games_today_df, games_df], ignore_index=True, sort=False)

In [None]:
pd.concat([advanced_today_df, advanced], ignore_index=True, sort=False)

# Today's games

In [None]:
# Get today's date
today = datetime.now().strftime('%Y-%m-%d')

# Get scoreboard for today's games
scoreboard_today = scoreboard.Scoreboard(game_date=today)
games = scoreboard_today.game_header.get_data_frame()

# Get all NBA teams
nba_teams = teams.get_teams()

# Create an empty list to store the team data
team_data = []

# Loop through each game and add team data to the list
for index, game in games.iterrows():
    home_team_id = game["HOME_TEAM_ID"]
    away_team_id = game["VISITOR_TEAM_ID"]
    
    home_team = next((team for team in nba_teams if team["id"] == home_team_id), None)
    away_team = next((team for team in nba_teams if team["id"] == away_team_id), None)
    
    if home_team is not None and away_team is not None:
        team_data.append({
            "game_id": game["GAME_ID"],
            "home_team_id": home_team["id"],
            "home_team": home_team["abbreviation"],
            "away_team_id": away_team["id"],
            "away_team": away_team["abbreviation"]
        })

# Convert the list of team data to a DataFrame
team_df = pd.DataFrame(team_data)

# Print the DataFrame
print(team_df)


In [None]:
df1 = team_df[['home_team_id', 'home_team', 'game_id']]
df1.rename(columns={'game_id': 'GAME_ID', 'home_team': 'TEAM_ABBREVIATION', 'home_team_id': 'TEAM_ID'}, inplace=True)
df1['GAME_DATE'] = pd.to_datetime(today)
df1['HOME_TEAM'] = 1
df1['PLUS_MINUS'] = 0
df1

In [None]:
df2 = team_df[['away_team_id', 'away_team', 'game_id']]
df2.rename(columns={'game_id': 'GAME_ID', 'away_team': 'TEAM_ABBREVIATION', 'away_team_id': 'TEAM_ID'}, inplace=True)
df2['GAME_DATE'] = today
df2['HOME_TEAM'] = 0
df2['PLUS_MINUS'] = 0
df2

In [None]:
games_today_df = pd.concat([df1, df2], ignore_index=True, sort=False)

In [None]:
games_today_df.GAME_DATE = pd.to_datetime(games_today_df.GAME_DATE)

In [None]:
advanced_today_df = games_today_df
advanced_today_df

columns = ['TEAM_NAME', 'TEAM_CITY',
'MIN', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO',
'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TM_TOV_PCT', 'TM_TOV_PCT',
'EFG_PCT', 'TS_PCT', 'USG_PCT', 'E_USG_PCT', 'E_PACE', 'PACE',
'PACE_PER40', 'POSS', 'PIE']

for column in columns:
    advanced_today_df[column] = 0

advanced_today_df = advanced_today_df.reindex(columns=['GAME_ID', 'TEAM_ID', 'TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CITY',
                                                       'MIN', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
                                                       'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO',
                                                       'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TM_TOV_PCT', 'TM_TOV_PCT',
                                                       'EFG_PCT', 'TS_PCT', 'USG_PCT', 'E_USG_PCT', 'E_PACE', 'PACE',
                                                       'PACE_PER40', 'POSS', 'PIE'])


# Debug

In [None]:
from datetime import datetime, timedelta
date = datetime.now() - timedelta(days=50)
date_str = date.strftime('%Y-%m-%d')

In [None]:
date_str

In [None]:
basic = get_basic_boxscores(date=date_str)
games_df = basic[['TEAM_ID', 'TEAM_ABBREVIATION', 'GAME_ID', 'GAME_DATE', 'HOME_TEAM', 'PLUS_MINUS']].copy()

In [None]:
games_df

In [None]:
games_df.head(600)['TEAM_ABBREVIATION'].value_counts()

In [None]:
#get advanced boxscore data from pickle
advanced = pd.read_pickle('data/pkl/boxscores_advanced_team_all.pkl')

In [None]:
advanced = advanced.head(600)

In [None]:
#drop unecessary columns
columns_to_drop = ['TEAM_CITY', 'MIN', 'E_OFF_RATING', 'E_DEF_RATING',
               'E_NET_RATING', 'AST_RATIO', 'E_TM_TOV_PCT', 'USG_PCT',
               'E_USG_PCT', 'E_PACE', 'PACE_PER40', 'PIE']
advanced = advanced.drop(columns=columns_to_drop)

In [None]:
#change game_id type to match between the 2 data frames
advanced['GAME_ID'] = advanced['GAME_ID'].astype('int32')

In [None]:
#merge the needed columns from basic to advanced
advanced = advanced.merge(games_df.drop(columns=['TEAM_ID']), on=['GAME_ID', 'TEAM_ABBREVIATION'])

In [None]:
advanced

In [None]:
advanced = advanced.sort_values(by=['GAME_DATE', 'GAME_ID', 'HOME_TEAM'], ascending=False).reset_index(drop=True)

In [None]:
advanced = advanced.drop_duplicates()

In [None]:
#drop rows that only have 1 team for the game id
value_counts = advanced['GAME_ID'].value_counts()
unique_values = value_counts[value_counts == 1].index.tolist()
advanced = advanced[~advanced['GAME_ID'].isin(unique_values)]
advanced = advanced.reset_index(drop=True)

In [None]:
advanced_desc = advanced.sort_values(by=['GAME_DATE'], ascending=True).copy()
advanced_desc

In [None]:
#define features to engineer
non_eng_features = ['TEAM_ABBREVIATION', 'GAME_ID', 'TEAM_ID', 'TEAM_NAME',
                    'GAME_DATE', 'HOME_TEAM', 'PLUS_MINUS']
eng_features = advanced_desc.drop(columns=non_eng_features).columns.tolist()

In [None]:
roll_methods=['mean']

In [None]:
#caluculate rolling metrics
if 'mean' in roll_methods:
    df_temp = roll(df = advanced_desc, roll_number=4, procedure='mean', selected_columns=eng_features)
    advanced = advanced.merge(df_temp, left_index=True, right_index=True)
if 'median' in roll_methods:
    df_temp = roll(df = advanced_desc, roll_number=4, procedure='median', selected_columns=eng_features)
    advanced = advanced.merge(df_temp, left_index=True, right_index=True)
if 'std' in roll_methods:
    df_temp = roll(df = advanced_desc, roll_number=4, procedure='std', selected_columns=eng_features)
    advanced = advanced.merge(df_temp, left_index=True, right_index=True)

In [None]:
advanced

In [None]:
#drop original columns to prevent data leakage
drop_columns = ['OFF_RATING', 'DEF_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'OREB_PCT', 'DREB_PCT',
    'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT', 'PACE', 'POSS']
advanced.drop(columns=drop_columns, inplace=True)

In [None]:
advanced

In [None]:
#split data frame between the home teams and the away teams
advanced = advanced.sort_values(by=['GAME_DATE', 'GAME_ID', 'HOME_TEAM'], ascending=False).reset_index(drop=True)
adv_home = advanced.iloc[::2].copy()
adv_away = advanced.iloc[1::2].copy()

In [None]:
advanced.head(30)

# Basic boxscores

In [None]:
from nba_api.stats.static import teams
from nba_api.stats.endpoints import scoreboard

# Get yesterday's date
from datetime import datetime, timedelta
yesterday = datetime.now() - timedelta(days=1)
yesterday_str = yesterday.strftime('%m/%d/%Y')

scoreboard_ = scoreboard.Scoreboard(game_date=yesterday_str, league_id='00', day_offset=0)
games = scoreboard_.game_header.get_data_frame()
if not games.empty:
    game_ids = list(games['GAME_ID'])
    print(team['full_name'], game_ids)

# Advanced boxscores

The command above/below gets the advanced boxscore from a given game id

boxscores_advanced.get_data_frames()[0] returns the advanced boxscore broken up per player

boxscores_advanced.get_data_frames()[1] returns the advanced boxscore broken up per team

In [None]:
from nba_api.stats.endpoints import boxscoreadvancedv2

boxscores_advanced_player = None
boxscores_advanced_team = None

for game_id in game_ids:
    if boxscores_advanced_team is None:
        gamefinder = boxscoreadvancedv2.BoxScoreAdvancedV2(game_id=game_id)
        boxscores_advanced_team = gamefinder.get_data_frames()[1]
        boxscores_advanced_player = gamefinder.get_data_frames()[0]
    else:
        gamefinder = boxscoreadvancedv2.BoxScoreAdvancedV2(game_id=game_id)
        boxscores_advanced_team = pd.concat([boxscores_advanced_team, gamefinder.get_data_frames()[1]])
        boxscores_advanced_player = pd.concat([boxscores_advanced_player, gamefinder.get_data_frames()[0]])

In [None]:
boxscores_advanced_team.columns

In [None]:
# boxscores_advanced_team_yesterday.to_pickle('boxscores_advanced_team_part2.pkl')

In [None]:
# boxscores_advanced_player_yesterday.to_pickle('boxscores_advanced_player_part2.pkl')

In [None]:
df = pd.read_pickle('../data/pkl/raw_games_5yrs.pkl')

In [None]:
df['GAME_DATE']

# Debug

In [None]:
#get basic boxscore data to add columns to the advanced boxscore
date = datetime.now() - timedelta(days=60)
date_str = date.strftime('%Y-%m-%d')

basic = get_basic_boxscores(date=date_str)
games_df = basic[['TEAM_ID', 'TEAM_ABBREVIATION', 'GAME_ID', 'GAME_DATE', 'HOME_TEAM', 'PLUS_MINUS']].copy()

#get advanced boxscore data from pickle
# advanced = pd.read_pickle(f'data/pkl/{adv_pickle_filename}')
advanced = pd.read_pickle('data/pkl/boxscores_advanced_team_all.pkl')

In [None]:
home_team

In [None]:
############################################################################
# Get today's date
today = datetime.now().strftime('%Y-%m-%d')

# Get scoreboard for today's games
scoreboard_today = scoreboard.Scoreboard(game_date=today)
games = scoreboard_today.game_header.get_data_frame()

# Get all NBA teams
nba_teams = teams.get_teams()

# Create an empty list to store the team data
team_data = []

# Loop through each game and add team data to the list
for index, game in games.iterrows():
    home_team_id = game["HOME_TEAM_ID"]
    away_team_id = game["VISITOR_TEAM_ID"]

    home_team = next((team for team in nba_teams if team["id"] == home_team_id), None)
    away_team = next((team for team in nba_teams if team["id"] == away_team_id), None)

    if home_team is not None and away_team is not None:
        team_data.append({
            "game_id": game["GAME_ID"],
            "home_team_id": home_team["id"],
            "home_team_name": home_team["abbreviation"],
            "home_team": home_team["abbreviation"],
            "away_team_id": away_team["id"],
            "away_team": away_team["abbreviation"],
            "away_team_name": away_team["abbreviation"]
        })

# Convert the list of team data to a DataFrame
team_df = pd.DataFrame(team_data)

df1 = team_df[['home_team_id', 'home_team', 'game_id']]
df1.rename(columns={'game_id': 'GAME_ID', 'home_team': 'TEAM_ABBREVIATION', 'home_team_id': 'TEAM_ID'}, inplace=True)
df1['GAME_DATE'] = today
df1['HOME_TEAM'] = 1
df1['PLUS_MINUS'] = 0
df2 = team_df[['away_team_id', 'away_team', 'game_id']]
df2.rename(columns={'game_id': 'GAME_ID', 'away_team': 'TEAM_ABBREVIATION', 'away_team_id': 'TEAM_ID'}, inplace=True)
df2['GAME_DATE'] = today
df2['HOME_TEAM'] = 0
df2['PLUS_MINUS'] = 0
games_today_df = pd.concat([df1, df2], ignore_index=True, sort=False)
games_today_df.GAME_DATE = pd.to_datetime(games_today_df.GAME_DATE)

In [None]:
games_today_df

In [None]:
games_df = pd.concat([games_today_df, games_df], ignore_index=True, sort=False)

advanced_today_df = games_today_df

columns = ['TEAM_NAME', 'TEAM_CITY',
'MIN', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO',
'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TM_TOV_PCT', 'TM_TOV_PCT',
'EFG_PCT', 'TS_PCT', 'USG_PCT', 'E_USG_PCT', 'E_PACE', 'PACE',
'PACE_PER40', 'POSS', 'PIE']

for column in columns:
    advanced_today_df[column] = 0

advanced_today_df = advanced_today_df.reindex(columns=['GAME_ID', 'TEAM_ID', 'TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CITY',
                                                    'MIN', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
                                                    'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO',
                                                    'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TM_TOV_PCT', 'TM_TOV_PCT',
                                                    'EFG_PCT', 'TS_PCT', 'USG_PCT', 'E_USG_PCT', 'E_PACE', 'PACE',
                                                    'PACE_PER40', 'POSS', 'PIE'])

advanced = pd.concat([advanced_today_df, advanced], ignore_index=True, sort=False)

# Debug

In [37]:
#get basic boxscore data to add columns to the advanced boxscore
date = datetime.now() - timedelta(days=60)
date_str = date.strftime('%Y-%m-%d')

basic = get_basic_boxscores(date=date_str)
games_df = basic[['TEAM_ID', 'TEAM_ABBREVIATION', 'GAME_ID', 'GAME_DATE', 'HOME_TEAM', 'PLUS_MINUS']].copy()

#get advanced boxscore data from pickle
# advanced = pd.read_pickle(f'data/pkl/{adv_pickle_filename}')
advanced = pd.read_pickle('data/pkl/boxscores_advanced_team_all.pkl')

############################################################################
# Get today's date
today = datetime.now().strftime('%Y-%m-%d')

# Get scoreboard for today's games
scoreboard_today = scoreboard.Scoreboard(game_date=today)
games = scoreboard_today.game_header.get_data_frame()

# Get all NBA teams
nba_teams = teams.get_teams()

# Create an empty list to store the team data
team_data = []

# Loop through each game and add team data to the list
for index, game in games.iterrows():
    home_team_id = game["HOME_TEAM_ID"]
    away_team_id = game["VISITOR_TEAM_ID"]

    home_team = next((team for team in nba_teams if team["id"] == home_team_id), None)
    away_team = next((team for team in nba_teams if team["id"] == away_team_id), None)

    if home_team is not None and away_team is not None:
        team_data.append({
            "game_id": game["GAME_ID"],
            "home_team_id": home_team["id"],
            "home_team": home_team["abbreviation"],
            "away_team_id": away_team["id"],
            "away_team": away_team["abbreviation"]
        })

# Convert the list of team data to a DataFrame
team_df = pd.DataFrame(team_data)

df1 = team_df[['home_team_id', 'home_team', 'game_id']]
df1.rename(columns={'game_id': 'GAME_ID', 'home_team': 'TEAM_ABBREVIATION', 'home_team_id': 'TEAM_ID'}, inplace=True)
df1['GAME_DATE'] = today
df1['HOME_TEAM'] = 1
df1['PLUS_MINUS'] = 0
df2 = team_df[['away_team_id', 'away_team', 'game_id']]
df2.rename(columns={'game_id': 'GAME_ID', 'away_team': 'TEAM_ABBREVIATION', 'away_team_id': 'TEAM_ID'}, inplace=True)
df2['GAME_DATE'] = today
df2['HOME_TEAM'] = 0
df2['PLUS_MINUS'] = 0
games_today_df = pd.concat([df1, df2], ignore_index=True, sort=False)
games_today_df.GAME_DATE = pd.to_datetime(games_today_df.GAME_DATE)

games_df = pd.concat([games_today_df, games_df], ignore_index=True, sort=False)

advanced_today_df = games_today_df

columns = ['TEAM_NAME', 'TEAM_CITY',
'MIN', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO',
'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TM_TOV_PCT', 'TM_TOV_PCT',
'EFG_PCT', 'TS_PCT', 'USG_PCT', 'E_USG_PCT', 'E_PACE', 'PACE',
'PACE_PER40', 'POSS', 'PIE']

for column in columns:
    advanced_today_df[column] = 0

advanced_today_df = advanced_today_df.reindex(columns=['GAME_ID', 'TEAM_ID', 'TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CITY',
                                                    'MIN', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
                                                    'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO',
                                                    'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TM_TOV_PCT', 'TM_TOV_PCT',
                                                    'EFG_PCT', 'TS_PCT', 'USG_PCT', 'E_USG_PCT', 'E_PACE', 'PACE',
                                                    'PACE_PER40', 'POSS', 'PIE'])

advanced = pd.concat([advanced_today_df, advanced], ignore_index=True, sort=False)
############################################################################

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.rename(columns={'game_id': 'GAME_ID', 'home_team': 'TEAM_ABBREVIATION', 'home_team_id': 'TEAM_ID'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['GAME_DATE'] = today
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.rename(columns={'game_id': 'GAME_ID', 'away_team': 'TEAM_ABBREVIATION', 'away_team_id': 'TEAM_ID'}, inplace=True)
A value is trying to be set on a copy of a slice from a Da

In [38]:
#drop unecessary columns
columns_to_drop = ['TEAM_CITY', 'MIN', 'E_OFF_RATING', 'E_DEF_RATING',
               'E_NET_RATING', 'AST_RATIO', 'E_TM_TOV_PCT', 'USG_PCT',
               'E_USG_PCT', 'E_PACE', 'PACE_PER40', 'PIE']
advanced = advanced.drop(columns=columns_to_drop)

#change game_id type to match between the 2 data frames
games_df['GAME_ID'] = games_df['GAME_ID'].astype('int32')
advanced['GAME_ID'] = advanced['GAME_ID'].astype('int32')

#merge the needed columns from basic to advanced
advanced = advanced.merge(games_df.drop(columns=['TEAM_ID']), on=['GAME_ID', 'TEAM_ABBREVIATION'])

In [39]:
advanced = advanced.drop_duplicates()

#drop rows that only have 1 team for the game id
value_counts = advanced['GAME_ID'].value_counts()
unique_values = value_counts[value_counts == 1].index.tolist()
advanced = advanced[~advanced['GAME_ID'].isin(unique_values)]
advanced = advanced.reset_index(drop=True)

In [40]:
advanced = advanced.sort_values(by=['GAME_DATE', 'GAME_ID'], ascending=False).copy()

In [41]:
advanced_desc = advanced.sort_values(by=['GAME_DATE', 'GAME_ID'], ascending=True).copy()

#define features to engineer
non_eng_features = ['TEAM_ABBREVIATION', 'GAME_ID', 'TEAM_ID', 'TEAM_NAME',
                    'GAME_DATE', 'HOME_TEAM', 'PLUS_MINUS']
eng_features = advanced_desc.drop(columns=non_eng_features).columns.tolist()

In [43]:
test_advanced = advanced.copy()

In [44]:
# roll_methods=['mean']
# #caluculate rolling metrics
# if 'mean' in roll_methods:
#     df_temp = roll(df = advanced_desc, roll_number=4, procedure='mean', selected_columns=eng_features)
#     advanced = advanced.merge(df_temp, left_index=True, right_index=True)
# if 'median' in roll_methods:
#     df_temp = roll(df = advanced_desc, roll_number=4, procedure='median', selected_columns=eng_features)
#     advanced = advanced.merge(df_temp, left_index=True, right_index=True)
# if 'std' in roll_methods:
#     df_temp = roll(df = advanced_desc, roll_number=4, procedure='std', selected_columns=eng_features)
#     advanced = advanced.merge(df_temp, left_index=True, right_index=True)
    
# advanced.head(20)

In [47]:
test_advanced.head(30)

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,OFF_RATING,DEF_RATING,NET_RATING,AST_PCT,AST_TOV,OREB_PCT,DREB_PCT,REB_PCT,TM_TOV_PCT,EFG_PCT,TS_PCT,PACE,POSS,GAME_DATE,HOME_TEAM,PLUS_MINUS
6,22201041,1610612746,0,LAC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2023-03-15,1,0.0
13,22201041,1610612744,0,GSW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2023-03-15,0,0.0
5,22201040,1610612759,0,SAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2023-03-15,1,0.0
12,22201040,1610612742,0,DAL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2023-03-15,0,0.0
4,22201039,1610612750,0,MIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2023-03-15,1,0.0
11,22201039,1610612738,0,BOS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2023-03-15,0,0.0
3,22201038,1610612745,0,HOU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2023-03-15,1,0.0
10,22201038,1610612747,0,LAL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2023-03-15,0,0.0
2,22201037,1610612741,0,CHI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2023-03-15,1,0.0
9,22201037,1610612758,0,SAC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2023-03-15,0,0.0


In [None]:
test_advanced.groupby('TEAM_ABBREVIATION')['x'].rolling(2).mean()

In [None]:
advanced = advanced.drop_duplicates()

#drop rows that only have 1 team for the game id
value_counts = advanced['GAME_ID'].value_counts()
unique_values = value_counts[value_counts == 1].index.tolist()
advanced = advanced[~advanced['GAME_ID'].isin(unique_values)]
advanced = advanced.reset_index(drop=True)

advanced_desc = advanced.sort_values(by=['GAME_DATE'], ascending=True).copy()

#define features to engineer
non_eng_features = ['TEAM_ABBREVIATION', 'GAME_ID', 'TEAM_ID', 'TEAM_NAME',
                    'GAME_DATE', 'HOME_TEAM', 'PLUS_MINUS']
eng_features = advanced_desc.drop(columns=non_eng_features).columns.tolist()


roll_methods=['mean']
#caluculate rolling metrics
if 'mean' in roll_methods:
    df_temp = roll(df = advanced_desc, roll_number=4, procedure='mean', selected_columns=eng_features)
    advanced = advanced.merge(df_temp, left_index=True, right_index=True)
if 'median' in roll_methods:
    df_temp = roll(df = advanced_desc, roll_number=4, procedure='median', selected_columns=eng_features)
    advanced = advanced.merge(df_temp, left_index=True, right_index=True)
if 'std' in roll_methods:
    df_temp = roll(df = advanced_desc, roll_number=4, procedure='std', selected_columns=eng_features)
    advanced = advanced.merge(df_temp, left_index=True, right_index=True)
    
advanced.head(20)

In [None]:
df_rolling = df[selected_columns + ["TEAM_ABBREVIATION"]]
    df_rolling = df_rolling.groupby(["TEAM_ABBREVIATION"], group_keys=False)

    def find_team_averages(team):
        return team.rolling(roll_number, closed='left').mean()

    def find_team_medians(team):
        return team.rolling(roll_number, closed='left').median()

    def find_team_stds(team):
        return team.rolling(roll_number, closed='left').std()

    if procedure == 'median':
        df_rolling = df_rolling.apply(find_team_medians)
    elif procedure == 'std':
        df_rolling = df_rolling.apply(find_team_stds)
    else:
        procedure = 'mean'
        df_rolling = df_rolling.apply(find_team_averages)

    df_rolling = df_rolling[selected_columns]
    df_rolling = df_rolling.sort_index()

    new_column_names = {}
    for col in df_rolling.columns:
        new_column_names[col] = col + suff + '_' + procedure

    df_rolling = df_rolling.rename(columns=new_column_names)