<a href="https://colab.research.google.com/github/samsung-chow/324-Project/blob/main/XGBOOST_kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as torch_data
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:

# Load data
use_cols_games = ['gameId', 'hometeamId', 'awayteamId', 'winner']
games_df = pd.read_csv('/content/drive/MyDrive/nba/Games.csv', usecols=use_cols_games, low_memory=False, nrows=50000)

use_cols_stats = ['teamId', 'seasonWins', 'seasonLosses']
team_stats_df = pd.read_csv('/content/drive/MyDrive/nba/TeamStatistics.csv', usecols=use_cols_stats, low_memory=False, nrows=50000)

use_cols_player_stats = ['personId', 'points', 'assists', 'reboundsTotal']
player_stats_df = pd.read_csv('/content/drive/MyDrive/nba/PlayerStatistics.csv', usecols=use_cols_player_stats, low_memory=False, nrows=50000)

# Handle missing values
games_df.fillna({'hometeamId': -1, 'awayteamId': -1, 'winner': -1}, inplace=True)
team_stats_df.fillna({'teamId': -1, 'seasonWins': 0, 'seasonLosses': 1}, inplace=True)
player_stats_df.fillna(0, inplace=True)

# Convert data types
games_df[['hometeamId', 'awayteamId', 'winner']] = games_df[['hometeamId', 'awayteamId', 'winner']].astype('int32')
team_stats_df[['teamId', 'seasonWins', 'seasonLosses']] = team_stats_df[['teamId', 'seasonWins', 'seasonLosses']].astype('int16')

# Compute Home Team Win Indicator
games_df['HomeTeamWins'] = (games_df['winner'] == games_df['hometeamId']).astype(int)

# Merge additional data
team_stats_df['WinPct'] = team_stats_df['seasonWins'] / (team_stats_df['seasonWins'] + team_stats_df['seasonLosses'])

# Aggregate player statistics per team
player_avg_stats = player_stats_df.groupby('personId')[['points', 'assists', 'reboundsTotal']].mean().reset_index()

# Merge team statistics
games_df = games_df.merge(team_stats_df[['teamId', 'WinPct', 'seasonWins', 'seasonLosses']], left_on='hometeamId', right_on='teamId', how='left')
games_df.rename(columns={'WinPct': 'WinPct_home', 'seasonWins': 'seasonWins_home', 'seasonLosses': 'seasonLosses_home'}, inplace=True)
games_df.drop(columns=['teamId'], inplace=True)

games_df = games_df.merge(team_stats_df[['teamId', 'WinPct', 'seasonWins', 'seasonLosses']], left_on='awayteamId', right_on='teamId', how='left')
games_df.rename(columns={'WinPct': 'WinPct_away', 'seasonWins': 'seasonWins_away', 'seasonLosses': 'seasonLosses_away'}, inplace=True)
games_df.drop(columns=['teamId'], inplace=True)

# Selecting features
games_df.fillna(0, inplace=True)
games_df['WinPct_Diff'] = games_df['WinPct_home'] - games_df['WinPct_away']
games_df['RecentPerformance'] = games_df['WinPct_home'] * 0.6 + games_df['WinPct_away'] * 0.4
games_df['seasonWinRate_home'] = games_df['seasonWins_home'] / (games_df['seasonWins_home'] + games_df['seasonLosses_home'] + 1)
games_df['seasonWinRate_away'] = games_df['seasonWins_away'] / (games_df['seasonWins_away'] + games_df['seasonLosses_away'] + 1)

games_df['AvgPlayerPoints'] = player_avg_stats['points'].mean()
games_df['AvgPlayerAssists'] = player_avg_stats['assists'].mean()
games_df['AvgPlayerRebounds'] = player_avg_stats['reboundsTotal'].mean()

X = games_df[['WinPct_Diff', 'RecentPerformance', 'seasonWinRate_home', 'seasonWinRate_away', 'AvgPlayerPoints', 'AvgPlayerAssists', 'AvgPlayerRebounds']]
y = games_df['HomeTeamWins']

# Convert DataFrame to numpy arrays
X_numpy = X.to_numpy()
y_numpy = y.to_numpy()

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_numpy), y=y_numpy)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_numpy, y_numpy, test_size=0.2, random_state=42, stratify=y_numpy)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)


In [4]:

# Compute class distribution
unique, counts = np.unique(y_numpy, return_counts=True)
# Print class distribution
print("Class Distribution:")
for label, count in zip(unique, counts):
    print(f"Class {label}: {count} samples ({count / len(y_numpy) * 100:.2f}%)")


Class Distribution:
Class 0: 19943 samples (39.89%)
Class 1: 30057 samples (60.11%)


In [5]:

# Print sample input and output
num_input_features = X.shape[1]
print(f"Number of input parameters: {num_input_features}")
print("Input feature labels: [WinPct_Diff, RecentPerformance, seasonWinRate_home, seasonWinRate_away, AvgPlayerPoints, AvgPlayerAssists, AvgPlayerRebounds]")
print("Sample input (features):", X_numpy[0])
print("Sample output (label):", y_numpy[0])
print("Output label 1 -> home team won, 0 -> away team won.")

Number of input parameters: 7
Input feature labels: [WinPct_Diff, RecentPerformance, seasonWinRate_home, seasonWinRate_away, AvgPlayerPoints, AvgPlayerAssists, AvgPlayerRebounds]
Sample input (features): [0.         0.         0.         0.         6.32532707 1.50569392
 2.56189076]
Sample output (label): 0
Output label 1 -> home team won, 0 -> away team won.


In [9]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, log_loss

# Convert to DMatrix
dtrain = xgb.DMatrix(X_tr, label=y_tr)
dval = xgb.DMatrix(X_val, label=y_val)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'tree_method': 'gpu_hist' if device == 'cuda' else 'auto',
}

# Simulate epoch-like training
num_epochs = 20
model = None

for epoch in range(1, num_epochs + 1):
    model = xgb.train(params, dtrain, num_boost_round=1, xgb_model=model)

    # Predictions
    y_train_pred = model.predict(dtrain)
    y_val_pred = model.predict(dval)

    # Compute loss and accuracy
    train_loss = log_loss(y_tr, y_train_pred)
    val_loss = log_loss(y_val, y_val_pred)

    train_acc = accuracy_score(y_tr, (y_train_pred >= 0.5).astype(int)) * 100
    val_acc = accuracy_score(y_val, (y_val_pred >= 0.5).astype(int)) * 100

    # Pretty print
    print(f"Epoch {epoch:2d}/{num_epochs} | "
          f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
          f"Test Loss: {val_loss:.4f} | Test Acc: {val_acc:.2f}%")

Epoch  1/20 | Train Loss: 0.6728 | Train Acc: 60.06% | Test Loss: 0.6717 | Test Acc: 60.32%
Epoch  2/20 | Train Loss: 0.6728 | Train Acc: 60.06% | Test Loss: 0.6717 | Test Acc: 60.32%
Epoch  3/20 | Train Loss: 0.6728 | Train Acc: 60.06% | Test Loss: 0.6717 | Test Acc: 60.32%
Epoch  4/20 | Train Loss: 0.6728 | Train Acc: 60.06% | Test Loss: 0.6717 | Test Acc: 60.32%
Epoch  5/20 | Train Loss: 0.6728 | Train Acc: 60.06% | Test Loss: 0.6717 | Test Acc: 60.32%
Epoch  6/20 | Train Loss: 0.6728 | Train Acc: 60.06% | Test Loss: 0.6717 | Test Acc: 60.32%
Epoch  7/20 | Train Loss: 0.6728 | Train Acc: 60.06% | Test Loss: 0.6717 | Test Acc: 60.32%
Epoch  8/20 | Train Loss: 0.6728 | Train Acc: 60.06% | Test Loss: 0.6717 | Test Acc: 60.32%
Epoch  9/20 | Train Loss: 0.6728 | Train Acc: 60.06% | Test Loss: 0.6717 | Test Acc: 60.32%
Epoch 10/20 | Train Loss: 0.6728 | Train Acc: 60.06% | Test Loss: 0.6717 | Test Acc: 60.32%
Epoch 11/20 | Train Loss: 0.6728 | Train Acc: 60.06% | Test Loss: 0.6717 | Test 