<a href="https://colab.research.google.com/github/samsung-chow/324-Project/blob/main/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as torch_data

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load data
use_cols_games = ['gameId', 'hometeamId', 'awayteamId', 'winner']
games_df = pd.read_csv('/content/drive/MyDrive/nba/Games.csv', usecols=use_cols_games, low_memory=False, nrows=50000)

use_cols_stats = ['teamId', 'seasonWins', 'seasonLosses']
team_stats_df = pd.read_csv('/content/drive/MyDrive/nba/TeamStatistics.csv', usecols=use_cols_stats, low_memory=False, nrows=50000)

use_cols_player_stats = ['personId', 'points', 'assists', 'reboundsTotal']
player_stats_df = pd.read_csv('/content/drive/MyDrive/nba/PlayerStatistics.csv', usecols=use_cols_player_stats, low_memory=False, nrows=50000)

# Handle missing values
games_df.fillna({'hometeamId': -1, 'awayteamId': -1, 'winner': -1}, inplace=True)
team_stats_df.fillna({'teamId': -1, 'seasonWins': 0, 'seasonLosses': 1}, inplace=True)
player_stats_df.fillna(0, inplace=True)

# Convert data types
games_df[['hometeamId', 'awayteamId', 'winner']] = games_df[['hometeamId', 'awayteamId', 'winner']].astype('int32')
team_stats_df[['teamId', 'seasonWins', 'seasonLosses']] = team_stats_df[['teamId', 'seasonWins', 'seasonLosses']].astype('int16')

# Compute Home Team Win Indicator
games_df['HomeTeamWins'] = (games_df['winner'] == games_df['hometeamId']).astype(int)

# Merge additional data
team_stats_df['WinPct'] = team_stats_df['seasonWins'] / (team_stats_df['seasonWins'] + team_stats_df['seasonLosses'])

# Aggregate player statistics per team
player_avg_stats = player_stats_df.groupby('personId')[['points', 'assists', 'reboundsTotal']].mean().reset_index()

# Merge team statistics
games_df = games_df.merge(team_stats_df[['teamId', 'WinPct', 'seasonWins', 'seasonLosses']], left_on='hometeamId', right_on='teamId', how='left')
games_df.rename(columns={'WinPct': 'WinPct_home', 'seasonWins': 'seasonWins_home', 'seasonLosses': 'seasonLosses_home'}, inplace=True)
games_df.drop(columns=['teamId'], inplace=True)

games_df = games_df.merge(team_stats_df[['teamId', 'WinPct', 'seasonWins', 'seasonLosses']], left_on='awayteamId', right_on='teamId', how='left')
games_df.rename(columns={'WinPct': 'WinPct_away', 'seasonWins': 'seasonWins_away', 'seasonLosses': 'seasonLosses_away'}, inplace=True)
games_df.drop(columns=['teamId'], inplace=True)

# Selecting features
games_df.fillna(0, inplace=True)
games_df['WinPct_Diff'] = games_df['WinPct_home'] - games_df['WinPct_away']
games_df['RecentPerformance'] = games_df['WinPct_home'] * 0.6 + games_df['WinPct_away'] * 0.4
games_df['seasonWinRate_home'] = games_df['seasonWins_home'] / (games_df['seasonWins_home'] + games_df['seasonLosses_home'] + 1)
games_df['seasonWinRate_away'] = games_df['seasonWins_away'] / (games_df['seasonWins_away'] + games_df['seasonLosses_away'] + 1)

games_df['AvgPlayerPoints'] = player_avg_stats['points'].mean()
games_df['AvgPlayerAssists'] = player_avg_stats['assists'].mean()
games_df['AvgPlayerRebounds'] = player_avg_stats['reboundsTotal'].mean()

X = games_df[['WinPct_Diff', 'RecentPerformance', 'seasonWinRate_home', 'seasonWinRate_away', 'AvgPlayerPoints', 'AvgPlayerAssists', 'AvgPlayerRebounds']]
y = games_df['HomeTeamWins']

# Convert DataFrame to numpy arrays
X_numpy = X.to_numpy()
y_numpy = y.to_numpy()

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_numpy), y=y_numpy)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_numpy, y_numpy, test_size=0.2, random_state=42, stratify=y_numpy)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)


In [46]:
# Compute class distribution
unique, counts = np.unique(y_numpy, return_counts=True)
# Print class distribution
print("Class Distribution:")
for label, count in zip(unique, counts):
    print(f"Class {label}: {count} samples ({count / len(y_numpy) * 100:.2f}%)")

Class Distribution:
Class 0: 19929 samples (39.86%)
Class 1: 30071 samples (60.14%)


In [41]:
# Print sample input and output
num_input_features = X.shape[1]
print(f"Number of input parameters: {num_input_features}")
print("Input feature labels: [WinPct_Diff, RecentPerformance, seasonWinRate_home, seasonWinRate_away, AvgPlayerPoints, AvgPlayerAssists, AvgPlayerRebounds]")
print("Sample input (features):", X_numpy[0])
print("Sample output (label):", y_numpy[0])
print("Output label 1 -> home team won, 0 -> away team won.")

Number of input parameters: 7
Input feature labels: [WinPct_Diff, RecentPerformance, seasonWinRate_home, seasonWinRate_away, AvgPlayerPoints, AvgPlayerAssists, AvgPlayerRebounds]
Sample input (features): [0.         0.         0.         0.         6.31876056 1.50431959
 2.55590807]
Sample output (label): 0
Output label 1 -> home team won, 0 -> away team won.


In [48]:
class MLPModel(nn.Module):
    def __init__(self, input_size):
        super(MLPModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.BatchNorm1d(512),
            nn.SiLU(),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.SiLU(),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.SiLU(),
            nn.Dropout(0.3),

            nn.Linear(128, 64),
            nn.SiLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# Initialize Model
num_input_features = X_train_tensor.shape[1]
device = "cuda" if torch.cuda.is_available() else "cpu"

model = MLPModel(num_input_features).to(device)


In [50]:
# Focal Loss to handle class imbalance
class FocalLoss(nn.Module):
    def __init__(self, gamma=2, alpha=0.75):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, inputs, targets):
        BCE_loss = nn.BCEWithLogitsLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-BCE_loss)
        loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return loss.mean()

# Initialize optimizer and scheduler
criterion = FocalLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0003)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)

# Training loop
epochs = 25
for epoch in range(epochs):
    model.train()
    total_loss, correct, total = 0, 0, 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch)

        # Compute loss
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Compute training accuracy
        y_pred_class = (torch.sigmoid(y_pred) > 0.5).float()
        correct += (y_pred_class == y_batch).sum().item()
        total += y_batch.size(0)

    train_loss = total_loss / len(train_loader)
    train_accuracy = correct / total * 100
    scheduler.step()

    # Print training stats
    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Train Acc: {train_accuracy:.2f}%")

model.eval()
test_loss, correct, total = 0, 0, 0

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = model(X_batch)

        # Compute test loss
        loss = criterion(y_pred, y_batch)
        test_loss += loss.item()

        # Compute test accuracy
        y_pred_class = (torch.sigmoid(y_pred) > 0.5).float()
        correct += (y_pred_class == y_batch).sum().item()
        total += y_batch.size(0)

test_loss /= len(test_loader)
test_accuracy = correct / total * 100

# Print final test results
print(f"\n=== FINAL TEST RESULTS ===")
print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.2f}%")

# **Check Prediction Distribution**
y_pred = (torch.sigmoid(model(X_test_tensor)) > 0.5).cpu().numpy()
unique, counts = np.unique(y_pred, return_counts=True)
print("Final Prediction Distribution:", dict(zip(unique, counts)))


Epoch 1/25 | Train Loss: 0.1266 | Train Acc: 60.14%
Epoch 2/25 | Train Loss: 0.1266 | Train Acc: 60.14%
Epoch 3/25 | Train Loss: 0.1266 | Train Acc: 60.14%
Epoch 4/25 | Train Loss: 0.1266 | Train Acc: 60.14%
Epoch 5/25 | Train Loss: 0.1266 | Train Acc: 60.14%
Epoch 6/25 | Train Loss: 0.1265 | Train Acc: 60.14%
Epoch 7/25 | Train Loss: 0.1266 | Train Acc: 60.14%
Epoch 8/25 | Train Loss: 0.1265 | Train Acc: 60.14%
Epoch 9/25 | Train Loss: 0.1265 | Train Acc: 60.14%
Epoch 10/25 | Train Loss: 0.1266 | Train Acc: 60.14%
Epoch 11/25 | Train Loss: 0.1265 | Train Acc: 60.14%
Epoch 12/25 | Train Loss: 0.1265 | Train Acc: 60.14%
Epoch 13/25 | Train Loss: 0.1265 | Train Acc: 60.14%
Epoch 14/25 | Train Loss: 0.1266 | Train Acc: 60.14%
Epoch 15/25 | Train Loss: 0.1265 | Train Acc: 60.14%
Epoch 16/25 | Train Loss: 0.1265 | Train Acc: 60.14%
Epoch 17/25 | Train Loss: 0.1265 | Train Acc: 60.14%
Epoch 18/25 | Train Loss: 0.1265 | Train Acc: 60.14%
Epoch 19/25 | Train Loss: 0.1265 | Train Acc: 60.14%
Ep