In [102]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader

In [103]:
class NBADataProcessor:
    def __init__(self, file_path):
        self.file_path = file_path
        self.df = self._load_data()

    def _load_data(self):
        df = pd.read_csv(self.file_path, index_col=0)
        return df

    def _add_target(self, group):
        group = pd.concat([group, group["won"].shift(-1).rename("target")], axis=1)
        return group

    def _scale_data(self, df, selected_columns):
        scaler = MinMaxScaler()
        df[selected_columns] = scaler.fit_transform(df[selected_columns])
        return df

    def _calculate_rolling_averages(self, df, selected_columns):
        rolling = df[list(selected_columns) + ["won", "team", "season"]]
        
        def find_team_averages(team):
            team[selected_columns] = team[selected_columns].rolling(10).mean()
            return team

        rolling = rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

        return rolling

    def _add_future_game_data(self, df):
        def shift_col(team, col_name):
            next_col = team[col_name].shift(-1)
            return next_col

        df["home_next"] = self._add_col(df, "home")
        df["team_opp_next"] = self._add_col(df, "team_opp")
        df["date_next"] = self._add_col(df, "date")

        return df

    def _add_col(self, df, col_name):
        return df.groupby("team", group_keys=False).apply(lambda x: self._shift_col(x, col_name))

    def _shift_col(self, team, col_name):
        next_col = team[col_name].shift(-1)
        return next_col

    def prepare_dataset(self):
        # Sort by date and drop irrelevant columns
        self.df = self.df.sort_values("date")
        self.df = self.df.reset_index(drop=True)
        del self.df["mp.1"]
        del self.df["mp_opp.1"]
        del self.df["index_opp"]

        #Add a target column (Whether or not team won next game)
        self.df = self.df.groupby("team", group_keys=False).apply(self._add_target)
        self.df.loc[pd.isnull(self.df["target"]), "target"] = 2
        self.df["target"] = self.df["target"].astype(int, errors="ignore")
        
        #Create copy dataframe without null values
        nulls = pd.isnull(self.df).sum()
        nulls = nulls[nulls > 0]
        valid_columns = self.df.columns[~self.df.columns.isin(nulls.index)]
        self.df = self.df[valid_columns].copy()
        
        #Convert boolean column to binary
        self.df['won'] = self.df['won'].astype(int)
        
        #Scale stat columns
        removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
        selected_columns = self.df.columns[~self.df.columns.isin(removed_columns)]
        self.df = self._scale_data(self.df, selected_columns)
        rolling = self._calculate_rolling_averages(self.df, selected_columns)
        
        rolling_cols = [f"{col}_10" for col in rolling.columns]
        rolling.columns = rolling_cols
        
        #Concatenate new columns back into dataframe, dropping null and resetting index
        self.df = pd.concat([self.df, rolling], axis=1)
        self.df = self.df.dropna()
        self.df = self.df.reset_index(drop=True)

        #Add future game data to columns
        self.df = self._add_future_game_data(self.df)
        
        self.df = self.df.merge(self.df[rolling_cols + ["team_opp_next", "date_next", "team"]], left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])

        return self.df
    
    def _extract_features(self, df):
        
        # Extract features for team_x and team_opp_next_x
        team_x_cols = [col for col in df.columns if '_10_x' in col and 'opp' not in col]
        team_opp_next_x_cols = [col for col in df.columns if 'opp_10_x' in col]

        # Concatenate features and rolling averages, including home_next
        features_columns = team_x_cols + team_opp_next_x_cols + ["home_next"]
        
        # Exclude columns with specific words
        excluded_words = ["season", "date", "won", "target", "team", "team_opp"]
        features_columns = [col for col in features_columns if not any(word in col for word in excluded_words)]
        
        features_df = df[features_columns].copy()

        return features_df
    
class NBADataset(Dataset):
    def __init__(self, features, target):
        self.features = features
        self.target = target

    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        x = torch.tensor(self.features[idx], dtype=torch.float32)
        y = torch.tensor(self.target[idx], dtype=torch.float32)
        return x, y

In [104]:
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)  # Adjust dropout rate as needed
        self.fc2 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [105]:
# Specify the relative path to the dataset
relative_path = 'datasets/nba_games.csv'
read_file = os.path.join(os.getcwd(), relative_path)

In [106]:
# Initialize NBADataProcessor and prepare the dataset
data_processor = NBADataProcessor(file_path=read_file)
processed_df = data_processor.prepare_dataset()

In [107]:
season_df = processed_df[["season"]].copy()

# Extract features and target using the _extract_features method
features_df = data_processor._extract_features(processed_df)

# Concatenate season and date columns back to features dataframe
features_df = pd.concat([season_df, features_df], axis=1)

# Define features and target
features_columns = features_df.columns.tolist()
target_column = "target"

# Identify the cutoff season (e.g., 2022)
cutoff_season = 2022

# Divide the data into training and testing sets
train_df = features_df[features_df["season"] < cutoff_season]
test_df = features_df[features_df["season"] == cutoff_season]

# Extract features and target for training and testing sets
train_features = train_df[features_columns].values
train_target = train_df[target_column].values

test_features = test_df[features_columns].values
test_target = test_df[target_column].values

# features = features_df.values
# target = processed_df[target_column].values

# # Define dataset and dataloader
# dataset = NBADataset(features, target)
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [108]:
# Initialize and train the model
input_size = len(features_columns)
net = Net(input_size=input_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net.to(device)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

# Training loop
num_epochs = 30

In [110]:
for epoch in range(num_epochs):
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = net(inputs)
        loss = criterion(outputs, labels.unsqueeze(1))

        loss.backward()
        optimizer.step()

        # Accumulate loss for the epoch
        total_loss += loss.item()

        # Calculate accuracy
        predicted = torch.round(outputs)
        correct_predictions += (predicted == labels.unsqueeze(1)).sum().item()
        total_samples += labels.size(0)

    # Calculate overall accuracy for the epoch
    accuracy = correct_predictions / total_samples

    # Print the loss and accuracy for each epoch
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader):.4f}, Accuracy: {accuracy:.4f}")

# After the training loop, print overall accuracy
overall_accuracy = correct_predictions / total_samples
print(f"Overall Accuracy: {overall_accuracy:.4f}")

Epoch 1/30, Loss: 0.6622, Accuracy: 0.6022
Epoch 2/30, Loss: 0.6628, Accuracy: 0.6032
Epoch 3/30, Loss: 0.6620, Accuracy: 0.6035
Epoch 4/30, Loss: 0.6629, Accuracy: 0.6028
Epoch 5/30, Loss: 0.6630, Accuracy: 0.6024
Epoch 6/30, Loss: 0.6635, Accuracy: 0.6047
Epoch 7/30, Loss: 0.6630, Accuracy: 0.6015
Epoch 8/30, Loss: 0.6646, Accuracy: 0.6039
Epoch 9/30, Loss: 0.6629, Accuracy: 0.5977
Epoch 10/30, Loss: 0.6638, Accuracy: 0.6042
Epoch 11/30, Loss: 0.6626, Accuracy: 0.6004
Epoch 12/30, Loss: 0.6635, Accuracy: 0.6064
Epoch 13/30, Loss: 0.6616, Accuracy: 0.6060
Epoch 14/30, Loss: 0.6638, Accuracy: 0.6021
Epoch 15/30, Loss: 0.6609, Accuracy: 0.6056
Epoch 16/30, Loss: 0.6635, Accuracy: 0.6041
Epoch 17/30, Loss: 0.6626, Accuracy: 0.6043
Epoch 18/30, Loss: 0.6614, Accuracy: 0.6053
Epoch 19/30, Loss: 0.6639, Accuracy: 0.6021
Epoch 20/30, Loss: 0.6638, Accuracy: 0.6024
Epoch 21/30, Loss: 0.6629, Accuracy: 0.6030
Epoch 22/30, Loss: 0.6624, Accuracy: 0.6069
Epoch 23/30, Loss: 0.6624, Accuracy: 0.60

In [None]:
# Save the trained model
torch.save(net.state_dict(), "trained_model.pth")