In [108]:
from xgboost import XGBClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV
from sklearn.metrics import log_loss


# Helper Functions

For Data Preprocessing

In [109]:
def is_home(matchup):
    return int("@" not in matchup)

def is_win(result):
    return int("L" not in result)

def sort_by_date(df):
    return df.sort_values(by='GAME_DATE')

# Data Preprocessing

In [110]:
nba_player_data_2025_26 = pd.read_csv('out/player_stats_2025-26.csv')
nba_player_data_2024_25 = pd.read_csv('out/player_stats_2024-25.csv')
nba_players = pd.read_csv('out/players.csv')
nba_teams = pd.read_csv('out/teams.csv')
nba_season_games_2024_25 = pd.read_csv('out/season_games_2024-25.csv')
nba_season_games_2025_26 = pd.read_csv('out/season_games_2025-26.csv')


In [111]:
game_data_list = [nba_player_data_2025_26, nba_player_data_2024_25, nba_season_games_2024_25, nba_season_games_2025_26]
team_data_list = [nba_season_games_2024_25, nba_season_games_2025_26]

Add another feature for home game and non-home game

In [112]:
for data in game_data_list:
    data["IS_HOME"] = data["MATCHUP"].apply(is_home)

Sort each game by date

In [113]:
sort_by_list = ["TEAM_ID", "GAME_DATE"]
for data in game_data_list:
    data.sort_values(by=sort_by_list, inplace=True)

In [None]:
for data in game_data_list:
    data["WL"] = data["MATCHUP"].apply(is_win)

    #Effective field goal percentage
    data["EFG"] = (data["FGM"] + 0.5 * data["FG3M"]) / data["FGA"]

    #True shooting percentage
    data["TS"] = data["PTS"] / (2 * (data["FGA"] + 0.44 * data["FTA"]))


    

Calculates win percentages from previous games and add a column for it

In [115]:
nba_season_games_2024_25["wins_so_far"] = (
    nba_season_games_2024_25.groupby("TEAM_ID")["WL"]
      .shift(1)
      .cumsum()
)

nba_season_games_2024_25["games_so_far"] = (
    nba_season_games_2024_25.groupby("TEAM_ID")["WL"]
      .cumcount()
)

nba_season_games_2024_25["win_percentage"] = (
    nba_season_games_2024_25["wins_so_far"] / nba_season_games_2024_25["games_so_far"]
)


nba_season_games_2025_26["wins_so_far"] = (
    nba_season_games_2025_26.groupby("TEAM_ID")["WL"]
      .shift(1)
      .cumsum()
)

nba_season_games_2025_26["games_so_far"] = (
    nba_season_games_2025_26.groupby("TEAM_ID")["WL"]
      .cumcount()
)


nba_season_games_2025_26["win_percentage"] = (
    nba_season_games_2025_26["wins_so_far"] / nba_season_games_2025_26["games_so_far"]
)

Gets winrate of last 5 and 10 games

In [116]:
nba_season_games_2024_25["win_percentage_last_5"] = (
    nba_season_games_2024_25.groupby("TEAM_ID")["WL"]
      .shift(1)
      .rolling(window=5, min_periods=1)
      .mean()
)

nba_season_games_2025_26["win_percentage_last_5"] = (
    nba_season_games_2025_26.groupby("TEAM_ID")["WL"]
      .shift(1)
      .rolling(window=5, min_periods=1)
      .mean()
)

nba_season_games_2024_25["win_percentage_last_10"] = (
    nba_season_games_2024_25.groupby("TEAM_ID")["WL"]
      .shift(1)
      .rolling(window=10, min_periods=1)
      .mean()
)

nba_season_games_2025_26["win_percentage_last_10"] = (
    nba_season_games_2025_26.groupby("TEAM_ID")["WL"]
      .shift(1)
      .rolling(window=10, min_periods=1)
      .mean()
)

Gets the EWM (Exponential weighted moving average)

In [None]:
for data in team_data_list:

    data["win_pct_ewm_10"] = (
        data.groupby("TEAM_ID")["WL"]
            .shift(1)
            .ewm(span=10, adjust=False)
            .mean()
    )

    data["pts_ewm_10"] = (
        data.groupby("TEAM_ID")["PTS"]
            .shift(1)
            .ewm(span=10, adjust=False)
            .mean()
    )

    data["win_pct_ewm_20"] = (
        data.groupby("TEAM_ID")["WL"]
            .shift(1)
            .ewm(span=20, adjust=False)
            .mean()
    )

    data["pts_ewm_20"] = (
        data.groupby("TEAM_ID")["PTS"]
            .shift(1)
            .ewm(span=20, adjust=False)
            .mean()
    )

    data["efg_ewm_10"] = (
        data.groupby("TEAM_ID")["EFG"]
            .shift(1)
            .ewm(span=10, adjust=False)
            .mean()
    )

    # data["efg_ewm_20"] = (
    #     data.groupby("TEAM_ID")["EFG"]
    #         .shift(1)
    #         .ewm(span=20, adjust=False)
    #         .mean()
    # )

    data["ts_ewm_10"] = (
        data.groupby("TEAM_ID")["TS"]
            .shift(1)
            .ewm(span=10, adjust=False)
            .mean()
    )

    # data["ts_ewm_20"] = (
    #     data.groupby("TEAM_ID")["TS"]
    #         .shift(1)
    #         .ewm(span=20, adjust=False)
    #         .mean()
    # )
    

In [102]:
for data in team_data_list:
    data["winstreak"] = (
        data.groupby("TEAM_ID")["WL"]
          .shift(1)
          .groupby(data["TEAM_ID"])
          .transform(lambda x: x.groupby((x != 1).cumsum()).cumcount())

    )

Add opponent stats for column

In [None]:
for i, data in enumerate(team_data_list):
    opp = data[[
        "GAME_ID",
        "GAME_DATE",
        "TEAM_ID",
        "win_pct_ewm_10"
    ]].copy()

    opp = opp.rename(columns={
        "TEAM_ID": "OPP_TEAM_ID",
        "win_pct_ewm_10": "OPP_win_pct_ewm_10"
    })

    merged = data.merge(
        opp,
        on=["GAME_ID", "GAME_DATE"]
    )

    # remove self-joins
    merged = merged[merged["TEAM_ID"] != merged["OPP_TEAM_ID"]]

    team_data_list[i] = merged

In [105]:
team_data_list[0]

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,IS_HOME,wins_so_far,games_so_far,win_percentage,win_percentage_last_5,win_percentage_last_10,win_pct_ewm_10,winstreak,OPP_TEAM_ID,OPP_win_pct_ewm_10
1,12024,15020,NZB,New Zealand Breakers,12400002,2024-10-04,NZB @ UTA,1,239,87,...,0,,0,,,,,0,1610612762,8.091118e-01
3,12024,15020,NZB,New Zealand Breakers,12400011,2024-10-07,NZB @ PHI,1,241,84,...,0,1.0,1,1.0,1.0,1.0,1.000000,1,1610612755,9.159657e-01
5,12024,15020,NZB,New Zealand Breakers,12400029,2024-10-10,NZB @ OKC,1,241,89,...,0,2.0,2,1.0,1.0,1.0,1.000000,2,1610612760,8.002894e-01
7,12024,15025,ULM,Ratiopharm Ulm,12400060,2024-10-16,ULM @ POR,0,240,100,...,0,,0,,1.0,1.0,1.000000,0,1610612757,7.321672e-01
9,12024,1610612737,ATL,Atlanta Hawks,12400018,2024-10-08,ATL vs. IND,0,240,131,...,1,,0,,1.0,1.0,1.000000,0,1610612754,1.362025e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5595,32024,1610616855,TMC,Team C,32400004,2025-02-14,TMC vs. TMT,1,73,40,...,1,1610.0,1,1610.0,1.0,1.0,0.972508,1,1610616856,9.725077e-01
5596,32024,1610616856,TMT,Team T,32400004,2025-02-14,TMT @ TMC,1,73,34,...,0,,0,,1.0,1.0,0.972508,0,1610616855,9.725077e-01
5599,32024,1610616857,TMM,Team M,32400005,2025-02-14,TMM vs. TMG,1,72,39,...,1,,0,,1.0,1.0,0.972508,0,1610616858,9.725077e-01
5600,32024,1610616858,TMG,Team G League,32400005,2025-02-14,TMG @ TMM,1,73,40,...,0,,0,,1.0,1.0,0.972508,0,1610616857,9.725077e-01


# Model Training

In [None]:
opt = BayesSearchCV(
    XGBClassifier(),
    {
        'n_estimators': (10, 100),
        'max_depth': (3, 10),
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0),
    },
    n_iter=32,
    cv=3
)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], test_size=.2)
# create model instanc
model = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
# fit model
model.fit(X_train, y_train)
# make predictions
preds = model.predict(X_test)