In [None]:
from xgboost import XGBClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV
from sklearn.metrics import log_loss


# Helper Functions

For Data Preprocessing

In [27]:
def is_home(matchup):
    return int("@" not in matchup)

def is_win(result):
    return int("L" not in result)

def sort_by_date(df):
    return df.sort_values(by='GAME_DATE')

# Data Preprocessing

In [23]:
nba_player_data_2025_26 = pd.read_csv('out/player_stats_2025-26.csv')
nba_player_data_2024_25 = pd.read_csv('out/player_stats_2024-25.csv')
nba_players = pd.read_csv('out/players.csv')
nba_teams = pd.read_csv('out/teams.csv')
nba_season_games_2024_25 = pd.read_csv('out/season_games_2024-25.csv')
nba_season_games_2025_26 = pd.read_csv('out/season_games_2025-26.csv')


In [41]:
game_data_list = [nba_player_data_2025_26, nba_player_data_2024_25, nba_season_games_2024_25, nba_season_games_2025_26]
team_data_list = [nba_season_games_2024_25, nba_season_games_2025_26]

Add another feature for home game and non-home game

In [36]:
for data in game_data_list:
    data["IS_HOME"] = data["MATCHUP"].apply(is_home)
# nba_player_data_2025_26["IS_HOME"] = nba_player_data_2025_26["MATCHUP"].apply(is_home)
# nba_player_data_2024_25["IS_HOME"] = nba_player_data_2024_25["MATCHUP"].apply(is_home)
# nba_season_games_2024_25["IS_HOME"] = nba_season_games_2024_25["MATCHUP"].apply(is_home)
# nba_season_games_2025_26["IS_HOME"] = nba_season_games_2025_26["MATCHUP"].apply(is_home)

Sort each game by date

In [38]:
sort_by_list = ["TEAM_ID", "GAME_DATE"]
for data in game_data_list:
    data.sort_values(by=sort_by_list, inplace=True)

# nba_player_data_2025_26 = nba_player_data_2025_26.sort_values(by=sort_by_list)
# nba_player_data_2024_25 = nba_player_data_2024_25.sort_values(by=sort_by_list)
# nba_season_games_2024_25 = nba_season_games_2024_25.sort_values(by=sort_by_list)
# nba_season_games_2025_26 = nba_season_games_2025_26.sort_values(by=sort_by_list)

In [None]:
for data in game_data_list:
    data["WL"] = data["MATCHUP"].apply(is_win)

# nba_player_data_2025_26["WL"] = nba_player_data_2025_26["MATCHUP"].apply(is_win)
# nba_player_data_2024_25["WL"] = nba_player_data_2024_25["MATCHUP"].apply(is_win)
# nba_season_games_2024_25["WL"] = nba_season_games_2024_25["MATCHUP"].apply(is_win)
# nba_season_games_2025_26["WL"] = nba_season_games_2025_26["MATCHUP"].apply(is_win)

Calculates win percentages from previous games and add a column for it

In [None]:
nba_season_games_2024_25["wins_so_far"] = (
    nba_season_games_2024_25.groupby("TEAM_ID")["WL"]
      .shift(1)
      .cumsum()
)

nba_season_games_2024_25["games_so_far"] = (
    nba_season_games_2024_25.groupby("TEAM_ID")["WL"]
      .cumcount()
)

nba_season_games_2024_25["win_percentage"] = (
    nba_season_games_2024_25["wins_so_far"] / nba_season_games_2024_25["games_so_far"].replace(0, np.nan)
)


nba_season_games_2025_26["wins_so_far"] = (
    nba_season_games_2025_26.groupby("TEAM_ID")["WL"]
      .shift(1)
      .cumsum()
)

nba_season_games_2025_26["games_so_far"] = (
    nba_season_games_2025_26.groupby("TEAM_ID")["WL"]
      .cumcount()
)

nba_season_games_2025_26["win_percentage"] = (
    nba_season_games_2025_26["wins_so_far"] / nba_season_games_2025_26["games_so_far"].replace(0, np.nan)
)

In [None]:
nba_season_games_2024_25["win_percentage_last_5"] = (
    nba_season_games_2024_25.groupby("TEAM_ID")["WL"]
      .shift(1)
      .rolling(window=5, min_periods=1)
      .mean()
)

nba_season_games_2025_26["win_percentage_last_5"] = (
    nba_season_games_2025_26.groupby("TEAM_ID")["WL"]
      .shift(1)
      .rolling(window=5, min_periods=1)
      .mean()
)

nba_season_games_2024_25["win_percentage_last_10"] = (
    nba_season_games_2024_25.groupby("TEAM_ID")["WL"]
      .shift(1)
      .rolling(window=10, min_periods=1)
      .mean()
)


nba_season_games_2025_26["win_percentage_last_10"] = (
    nba_season_games_2025_26.groupby("TEAM_ID")["WL"]
      .shift(1)
      .rolling(window=10, min_periods=1)
      .mean()
)

In [44]:
for data in team_data_list:
    data["winstreak"] = (
        data.groupby("TEAM_ID")["WL"]
          .shift(1)
          .groupby(data["TEAM_ID"])
          .transform(lambda x: x.groupby((x != 1).cumsum()).cumcount())

    )

In [46]:
nba_season_games_2024_25["WL"]

2800    1
2774    1
2752    1
2684    0
2768    0
       ..
1022    1
1021    1
1024    1
1023    1
1025    1
Name: WL, Length: 2802, dtype: int64

In [45]:
nba_season_games_2024_25["winstreak"]

2800    0
2774    1
2752    2
2684    0
2768    0
       ..
1022    1
1021    0
1024    0
1023    0
1025    1
Name: winstreak, Length: 2802, dtype: int64

In [None]:
opt = BayesSearchCV(
    XGBClassifier(),
    {
        'n_estimators': (10, 100),
        'max_depth': (3, 10),
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0),
    },
    n_iter=32,
    cv=3
)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], test_size=.2)
# create model instanc
model = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
# fit model
model.fit(X_train, y_train)
# make predictions
preds = model.predict(X_test)