In [151]:
from xgboost import XGBClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV
from sklearn.metrics import log_loss


# Helper Functions

For Data Preprocessing

In [152]:
def is_home(matchup):
    return int("@" not in matchup)

def is_win(result):
    return int("L" not in result)

def sort_by_date(df):
    return df.sort_values(by='GAME_DATE')

# Data Preprocessing

In [153]:
nba_player_data_2025_26 = pd.read_csv('out/player_stats_2025-26.csv')
nba_player_data_2024_25 = pd.read_csv('out/player_stats_2024-25.csv')
nba_players = pd.read_csv('out/players.csv')
nba_teams = pd.read_csv('out/teams.csv')
nba_season_games_2024_25 = pd.read_csv('out/season_games_2024-25.csv')
nba_season_games_2025_26 = pd.read_csv('out/season_games_2025-26.csv')


In [154]:
game_data_list = [nba_player_data_2025_26, nba_player_data_2024_25, nba_season_games_2024_25, nba_season_games_2025_26]
team_data_list = [nba_season_games_2024_25, nba_season_games_2025_26]

Add another feature for home game and non-home game

In [155]:
for data in game_data_list:
    data["IS_HOME"] = data["MATCHUP"].apply(is_home)

Sort each game by date

In [156]:
sort_by_list = ["TEAM_ID", "GAME_DATE"]
for data in game_data_list:
    data.sort_values(by=sort_by_list, inplace=True)

In [157]:
for data in game_data_list:
    data["WL"] = data["MATCHUP"].apply(is_win)

    #Effective field goal percentage
    data["EFG"] = (data["FGM"] + 0.5 * data["FG3M"]) / data["FGA"]

    #True shooting percentage
    data["TS"] = data["PTS"] / (2 * (data["FGA"] + 0.44 * data["FTA"]))

    #Assist to turnover ratio
    data["ast_tov"] = data["AST"] / (data["TOV"] + 1e-8)  # Add small value to avoid division by zero)

    #Turnover rate
    data["tov_rate"] = data["TOV"] / (data["FGA"] + 0.44 * data["FTA"] + 1e-8) # Add small value to avoid division by zero

    #offensive rebound rate
    data["oreb_rate"] = data["OREB"] / (data["OREB"] + data["DREB"] + 1e-8) # Add small value to avoid division by zero

    #stealing and blocks
    data["stocks"] = data["STL"] + data["BLK"]

    #Personal fouls rate
    data["pf_rate"] = data["PF"] / (data["MIN"] + 1e-8) # Add small value to avoid division by zero


Calculates win percentages from previous games and add a column for it

In [158]:
nba_season_games_2024_25["wins_so_far"] = (
    nba_season_games_2024_25.groupby("TEAM_ID")["WL"]
      .shift(1)
      .cumsum()
)

nba_season_games_2024_25["games_so_far"] = (
    nba_season_games_2024_25.groupby("TEAM_ID")["WL"]
      .cumcount()
)

nba_season_games_2024_25["win_percentage"] = (
    nba_season_games_2024_25["wins_so_far"] / nba_season_games_2024_25["games_so_far"]
)


nba_season_games_2025_26["wins_so_far"] = (
    nba_season_games_2025_26.groupby("TEAM_ID")["WL"]
      .shift(1)
      .cumsum()
)

nba_season_games_2025_26["games_so_far"] = (
    nba_season_games_2025_26.groupby("TEAM_ID")["WL"]
      .cumcount()
)


nba_season_games_2025_26["win_percentage"] = (
    nba_season_games_2025_26["wins_so_far"] / nba_season_games_2025_26["games_so_far"]
)

Gets winrate of last 5 and 10 games

In [159]:
nba_season_games_2024_25["win_percentage_last_5"] = (
    nba_season_games_2024_25.groupby("TEAM_ID")["WL"]
      .shift(1)
      .rolling(window=5, min_periods=1)
      .mean()
)

nba_season_games_2025_26["win_percentage_last_5"] = (
    nba_season_games_2025_26.groupby("TEAM_ID")["WL"]
      .shift(1)
      .rolling(window=5, min_periods=1)
      .mean()
)

nba_season_games_2024_25["win_percentage_last_10"] = (
    nba_season_games_2024_25.groupby("TEAM_ID")["WL"]
      .shift(1)
      .rolling(window=10, min_periods=1)
      .mean()
)

nba_season_games_2025_26["win_percentage_last_10"] = (
    nba_season_games_2025_26.groupby("TEAM_ID")["WL"]
      .shift(1)
      .rolling(window=10, min_periods=1)
      .mean()
)



Gets the EWM (Exponential weighted moving average)

In [160]:
stat_list = ["WL", "PTS", "EFG", "TS", "ast_tov", "tov_rate", "oreb_rate", "stocks", "pf_rate"]
for data in team_data_list:
    for stat in stat_list:
        data[f"{stat}_ewm_5"] = (
            data.groupby("TEAM_ID")[stat]
                .shift(1)
                .ewm(span=5, min_periods=1)
                .mean()
        )
        data[f"{stat}_ewm_10"] = (
            data.groupby("TEAM_ID")[stat]
                .shift(1)
                .ewm(span=10, min_periods=1)
                .mean()
        )


In [161]:
for data in team_data_list:
    data["winstreak"] = (
        data.groupby("TEAM_ID")["WL"]
          .shift(1)
          .groupby(data["TEAM_ID"])
          .transform(lambda x: x.groupby((x != 1).cumsum()).cumcount())

    )

Add opponent stats for column

In [162]:
for i, data in enumerate(team_data_list):
    opp = data[[
        "GAME_ID",
        "GAME_DATE",
        "TEAM_ID",
        "WL_ewm_10",
        "PTS_ewm_10",
        "EFG_ewm_10",
        "TS_ewm_10",
        "ast_tov_ewm_10",
        "tov_rate_ewm_10",
        "oreb_rate_ewm_10",
        "stocks_ewm_10",
        "pf_rate_ewm_10"
    ]].copy()

    opp = opp.rename(columns={
        "TEAM_ID": "OPP_TEAM_ID",
        "WL_ewm_10": "OPP_WL_ewm_10",
        "PTS_ewm_10": "OPP_PTS_ewm_10",
        "EFG_ewm_10": "OPP_EFG_ewm_10",
        "TS_ewm_10": "OPP_TS_ewm_10",
        "ast_tov_ewm_10": "OPP_ast_tov_ewm_10",
        "tov_rate_ewm_10": "OPP_tov_rate_ewm_10",
        "oreb_rate_ewm_10": "OPP_oreb_rate_ewm_10",
        "stocks_ewm_10": "OPP_stocks_ewm_10",
        "pf_rate_ewm_10": "OPP_pf_rate_ewm_10"
    })

    merged = data.merge(
        opp,
        on=["GAME_ID", "GAME_DATE"]
    )


    # remove self-joins
    merged = merged[merged["TEAM_ID"] != merged["OPP_TEAM_ID"]]

    team_data_list[i] = merged

In [163]:
team_data_list[0]

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,OPP_TEAM_ID,OPP_WL_ewm_10,OPP_PTS_ewm_10,OPP_EFG_ewm_10,OPP_TS_ewm_10,OPP_ast_tov_ewm_10,OPP_tov_rate_ewm_10,OPP_oreb_rate_ewm_10,OPP_stocks_ewm_10,OPP_pf_rate_ewm_10
1,12024,15020,NZB,New Zealand Breakers,12400002,2024-10-04,NZB @ UTA,1,239,87,...,1610612762,8.091118e-01,112.194586,0.514353,0.535922,2.584232,0.123254,0.309704,13.717537,0.077591
3,12024,15020,NZB,New Zealand Breakers,12400011,2024-10-07,NZB @ PHI,1,241,84,...,1610612755,9.159657e-01,111.087163,0.536319,0.578076,1.842358,0.156219,0.228867,15.958701,0.096228
5,12024,15020,NZB,New Zealand Breakers,12400029,2024-10-10,NZB @ OKC,1,241,89,...,1610612760,8.064396e-01,111.168289,0.539402,0.557824,2.082285,0.141404,0.219202,13.173587,0.080705
7,12024,15025,ULM,Ratiopharm Ulm,12400060,2024-10-16,ULM @ POR,0,240,100,...,1610612757,7.404152e-01,106.916484,0.521748,0.552890,1.859084,0.153963,0.268965,13.857684,0.074961
9,12024,1610612737,ATL,Atlanta Hawks,12400018,2024-10-08,ATL vs. IND,0,240,131,...,1610612754,1.159409e-08,102.864793,0.511793,0.542944,2.085918,0.121904,0.255449,12.758887,0.084848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5595,32024,1610616855,TMC,Team C,32400004,2025-02-14,TMC vs. TMT,1,73,40,...,1610616856,9.794662e-01,52.813069,0.583797,0.603161,2.802200,0.135642,0.322162,5.659690,0.025405
5596,32024,1610616856,TMT,Team T,32400004,2025-02-14,TMT @ TMC,1,73,34,...,1610616855,9.794662e-01,52.813069,0.583797,0.603161,2.802200,0.135642,0.322162,5.659690,0.025405
5599,32024,1610616857,TMM,Team M,32400005,2025-02-14,TMM vs. TMG,1,72,39,...,1610616858,9.794662e-01,52.813069,0.583797,0.603161,2.802200,0.135642,0.322162,5.659690,0.025405
5600,32024,1610616858,TMG,Team G League,32400005,2025-02-14,TMG @ TMM,1,73,40,...,1610616857,9.794662e-01,52.813069,0.583797,0.603161,2.802200,0.135642,0.322162,5.659690,0.025405


Calculate the differences and add them to the columns

In [164]:
subtract_stat_list = ["WL_ewm_10", "PTS_ewm_10", "EFG_ewm_10", "TS_ewm_10", "ast_tov_ewm_10", "tov_rate_ewm_10", "oreb_rate_ewm_10", "stocks_ewm_10", "pf_rate_ewm_10"]
for data in team_data_list:
    for stat in subtract_stat_list:
        data[f"{stat}_diff"] = data[stat] - data[f"OPP_{stat}"]

In [165]:
team_data_list[0]

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,OPP_pf_rate_ewm_10,WL_ewm_10_diff,PTS_ewm_10_diff,EFG_ewm_10_diff,TS_ewm_10_diff,ast_tov_ewm_10_diff,tov_rate_ewm_10_diff,oreb_rate_ewm_10_diff,stocks_ewm_10_diff,pf_rate_ewm_10_diff
1,12024,15020,NZB,New Zealand Breakers,12400002,2024-10-04,NZB @ UTA,1,239,87,...,0.077591,,,,,,,,,
3,12024,15020,NZB,New Zealand Breakers,12400011,2024-10-07,NZB @ PHI,1,241,84,...,0.096228,0.084034,-24.087163,-0.111588,-0.137436,-0.404858,0.005856,0.142562,0.041299,-0.012546
5,12024,15020,NZB,New Zealand Breakers,12400029,2024-10-10,NZB @ OKC,1,241,89,...,0.080705,0.193560,-25.818289,-0.134384,-0.095233,-1.132910,0.057388,0.107619,1.176413,0.011724
7,12024,15025,ULM,Ratiopharm Ulm,12400060,2024-10-16,ULM @ POR,0,240,100,...,0.074961,0.259585,-21.566484,-0.116730,-0.090300,-0.909709,0.044829,0.057856,0.492316,0.017468
9,12024,1610612737,ATL,Atlanta Hawks,12400018,2024-10-08,ATL vs. IND,0,240,131,...,0.084848,1.000000,-17.514793,-0.106775,-0.080354,-1.136543,0.076888,0.071371,1.591113,0.007581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5595,32024,1610616855,TMC,Team C,32400004,2025-02-14,TMC vs. TMT,1,73,40,...,0.025405,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5596,32024,1610616856,TMT,Team T,32400004,2025-02-14,TMT @ TMC,1,73,34,...,0.025405,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5599,32024,1610616857,TMM,Team M,32400005,2025-02-14,TMM vs. TMG,1,72,39,...,0.025405,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5600,32024,1610616858,TMG,Team G League,32400005,2025-02-14,TMG @ TMM,1,73,40,...,0.025405,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [None]:
training_data = []
for data in team_data_list:
    features = [
        "WL_ewm_10_diff",
        "PTS_ewm_10_diff",
        "EFG_ewm_10_diff",
        "TS_ewm_10_diff",
        "ast_tov_ewm_10_diff",
        "tov_rate_ewm_10_diff",
        "oreb_rate_ewm_10_diff",
        "stocks_ewm_10_diff",
        "pf_rate_ewm_10_diff",
        "winstreak",
        "WL_ewm_10",
        "PTS_ewm_10",
        "EFG_ewm_10",
        "TS_ewm_10",
        "ast_tov_ewm_10",
        "tov_rate_ewm_10",
        "oreb_rate_ewm_10",
        "stocks_ewm_10",
        "pf_rate_ewm_10",
        "IS_HOME",
    ]
    target = "WL"

    X = data[features]
    y = data[target]
        
    training_data.append((X, y))


# Model Training

In [169]:
#Using 2024-25 season data for training
X_train = training_data[0][0]
y_train = training_data[0][1]

#Using 2025-26 season data for validation and testing
mid = len(training_data[1][0]) // 2
X_test = training_data[1][0][:mid]
y_test = training_data[1][1][:mid]
X_val = training_data[1][0][mid:]
y_val = training_data[1][1][mid:]