# Predictions for the 2024 NBA Season
### Model: General Neural Network Trained on 4 Seasons of NBA Data
### Process: Predict score of Team1 and Team2. Determine the calculated spread and compare to lines from OddShark for the game
### Evaluation: Calculate the accuracy and ATS percentage for the model

In [177]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [178]:
season_odds_24 = pd.read_csv("data_generation/2024-25_odds_cleaned.csv")
# print(season_odds_24.head(5))
print("Successfully loaded odds for 2024 season")

Successfully loaded odds for 2024 season


In [179]:
season_24 = pd.read_csv("data_generation/output_data/2024-25_data.csv")

# print(season_24.info())
print("Successfully loaded 2024 Season Stats")

Successfully loaded 2024 Season Stats


## Importing 4 Seasons of NBA DAta(2020-2024)

In [180]:
season_20 = pd.read_csv('data_generation/output_data/2020-21_data.csv')
season_21 = pd.read_csv('data_generation/output_data/2021-22_data.csv')
season_22 = pd.read_csv('data_generation/output_data/2022-23_data.csv')
season_23 = pd.read_csv('data_generation/output_data/2023-24_data.csv')

combined_seasons = pd.concat([season_20, season_21, season_22, season_23], ignore_index=True)
combined_season = combined_seasons.drop_duplicates()
combined_seasons = combined_seasons.reset_index(drop=True)

# combined_seasons.head(10)

## Prepare features and target value

In [181]:
# Drop target and prepare input
input = combined_seasons.drop(columns=['PTS'])
target = combined_seasons['PTS']

# Load selected features
spearman_corr = pd.read_csv('data_generation/output_data/spearman_corr_features.csv')
selected_features = spearman_corr['Feature'].tolist()

# Ensure no mismatching features
existing_features = [feature for feature in selected_features if feature in input.columns]

# Create input for general model
input = combined_seasons[existing_features]
input = input.dropna(axis=1)
#print(input.info())  # 126 cols, 9540 rows

# Split into train and test set
input_train, input_test, target_train, target_test = train_test_split(input, target, test_size=0.2, random_state=0)

# Scale features
scaler = StandardScaler()
input_train = scaler.fit_transform(input_train)
input_test = scaler.transform(input_test)

## Define the model

In [182]:
# Initialize sequential model 
model = Sequential([
    Input(shape=(input_train.shape[1],)), # define input shape
    Dense(64, activation='relu'), # first hidden layer with 64 units
    Dropout(0.2), 
    Dense(32, activation='relu'), # second hidden layer with 32 units
    Dense(1) # Output layer with target (PTS)
])

# Compile the model 
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mae'])

# Train the model
history = model.fit(input_train, target_train, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 10105.2256 - mae: 96.9666 - val_loss: 632.5529 - val_mae: 20.0995
Epoch 2/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 952us/step - loss: 712.7801 - mae: 21.5504 - val_loss: 452.7639 - val_mae: 17.0931
Epoch 3/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 921us/step - loss: 540.8569 - mae: 18.4937 - val_loss: 379.7805 - val_mae: 15.5130
Epoch 4/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 987us/step - loss: 492.7523 - mae: 17.8775 - val_loss: 337.8786 - val_mae: 14.6594
Epoch 5/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 847us/step - loss: 433.0249 - mae: 16.7730 - val_loss: 311.5729 - val_mae: 14.2059
Epoch 6/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 901us/step - loss: 392.3181 - mae: 15.8094 - val_loss: 287.4784 - val_mae: 13.6719
Epoch 7/50
[1m191/191[0m [32m━━━━━━━━

#### Toyed around with model parameters to improve model performance and increase the R2 score. Tried increasing the number of epochs and decreasing the learning rate, but that did not improve R2. Tried increasing the number of layers 2 -> 3 but also did not see an increase in R2. 

## Model Evaluation

In [183]:
# Evaluate model on test data
test_loss, test_mae = model.evaluate(input_test, target_test)
print(f"Test Loss (MSE): {test_loss}")
print(f"Test MAE: {test_mae}")

# Target prediction
sample_predictions = model.predict(input_test)
print("Sample Predictions (PTS):", sample_predictions[:10].flatten())
print("Actual Vales (PTS):", target_test[:10].values)

mse = mean_squared_error(target_test, sample_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(target_test, sample_predictions)

print("\n--- Regression Metrics ---")
print(f"Test Loss (MSE from model): {test_loss:.2f}")
print(f"Test MAE (from model): {test_mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-Squared (R²): {r2:.2f}")

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 575us/step - loss: 89.1413 - mae: 7.5536
Test Loss (MSE): 89.19083404541016
Test MAE: 7.588134765625
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 821us/step
Sample Predictions (PTS): [110.03831  108.38754  124.144585 115.92498  111.835495 112.25491
 105.309944 102.265594 113.979256 121.64472 ]
Actual Vales (PTS): [122 123 127 100 111  92  86 123 114 122]

--- Regression Metrics ---
Test Loss (MSE from model): 89.19
Test MAE (from model): 7.59
Mean Squared Error (MSE): 89.19
Root Mean Squared Error (RMSE): 9.44
R-Squared (R²): 0.45


In [184]:
TEAM_CITY_ABBREVIATIONS = {
    1610612737: "ATL",
    1610612738: "BOS",
    1610612739: "CLE",
    1610612740: "NOP",
    1610612741: "CHI",
    1610612742: "DAL",
    1610612743: "DEN",
    1610612744: "GS",
    1610612745: "HOU",
    1610612746: "LAL",
    1610612747: "LAC",
    1610612748: "MIA",
    1610612749: "MIL",
    1610612750: "MIN",
    1610612751: "BKN",
    1610612752: "NY",
    1610612753: "ORL",
    1610612754: "IND",
    1610612755: "PHI",
    1610612756: "PHX",
    1610612757: "POR",
    1610612758: "SAC",
    1610612759: "SAS",
    1610612760: "OKC",
    1610612761: "TOR",
    1610612762: "UTA",
    1610612763: "MEM",
    1610612764: "WAS",
    1610612765: "DET",
    1610612766: "CHA",
}


In [185]:
# Sort both Dfs by date 
# Ensure we process games only up to 12/1/2024
season_24['GAME_DATE_EST'] = pd.to_datetime(season_24['GAME_DATE_EST'])
end_date = datetime(2024, 12, 1)
season_24 = season_24[season_24['GAME_DATE_EST'] <= end_date]
season_24['TEAM_ABBREVIATION'] = season_24['TEAM_ID'].map(TEAM_CITY_ABBREVIATIONS)

# Convert the 'Date' in odds data to datetime format
season_odds_24['Date'] = pd.to_datetime(season_odds_24['Date'], format='%d-%b-%y')

season_odds_24 = season_odds_24.sort_values(by='Date')

# Convert the 'GAME_DATE_EST' in season stats to datetime format
season_24['GAME_DATE_EST'] = pd.to_datetime(season_24['GAME_DATE_EST'])
season_24 = season_24.sort_values(by='GAME_DATE_EST')

features = pd.read_csv('data_generation/output_data/spearman_corr_features.csv')

features_columns = features['Feature'].tolist()

existing_features = [feature for feature in features_columns if feature in season_24.columns]

# print(f"Number of features in existing_features: {len(existing_features)}")
# season_24.head(10)
# season_odds_24.head(10)

In [186]:
merged_data = []

# Iterate through each unique game date
for game_date in season_odds_24['Date'].unique():
    day_odds = season_odds_24[season_odds_24['Date'] == game_date]

    # Filter games that exist in the 2024 Seasons Data
    day_games = season_24[season_24['GAME_DATE_EST'] == game_date]

    for _, game in day_odds.iterrows():
        # Get HOME and AWAY stats
        home_stats = day_games[day_games['TEAM_ABBREVIATION'] == game['Home']]
        away_stats = day_games[day_games['TEAM_ABBREVIATION'] == game['Away']]

        # Check to see if both teams are found
        if not home_stats.empty and not away_stats.empty:
            # Filter stats to include only relevant features
            home_filtered_stats = home_stats.iloc[0][existing_features].to_dict()
            away_filtered_stats = away_stats.iloc[0][existing_features].to_dict()

            # Predict points for each team
            home_features_df = pd.DataFrame([home_filtered_stats])[existing_features]
            away_features_df = pd.DataFrame([away_filtered_stats])[existing_features]

            home_features_scaled = scaler.transform(home_features_df)
            away_features_scaled = scaler.transform(away_features_df)

            home_predicted_points = model.predict(home_features_scaled)[0][0]
            away_predicted_points = model.predict(away_features_scaled)[0][0]

            predicted_spread = home_predicted_points - away_predicted_points

            # Append game entry to merged_data
            merged_data.append({
                'Game_Date': game_date,
                'Home': game['Home'],
                'Away': game['Away'],
                'Home_Stats': home_filtered_stats,
                'Away_Stats': away_filtered_stats,
                'Home_Predicted_Points': home_predicted_points,
                'Away_Predicted_Points': away_predicted_points,
                'Predicted_Spread': predicted_spread,
                'Odds': game.to_dict()
            })

print(f"Finished processing games")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13

In [187]:
final_results = []

unique_dates = sorted({entry['Game_Date'] for entry in merged_data})

# Iterate through each game date
for game_date in unique_dates:
    print(f"Processing games for {game_date}")

    # Filter games for the current day
    day_games = [entry for entry in merged_data if entry["Game_Date"] == game_date]

    # Prepare training data for the current day
    X_train = pd.DataFrame(
        [game['Home_Stats'] for game in day_games] +
        [game['Away_Stats'] for game in day_games]
    )[existing_features].values

    y_train = pd.DataFrame([
        {'Actual_Points': game['Odds']['Score.1']} for game in day_games
    ] + [
        {'Actual_Points': game['Odds']['Score']} for game in day_games
    ]).values

    # Fine-tune the model with the day's data
    model.fit(X_train, y_train, epochs=1, verbose=0)

    # Append games for the day to final results
    final_results.extend(day_games)

    print(f"Finished processing {game_date}")


Processing games for 2024-10-22 00:00:00
Finished processing 2024-10-22 00:00:00
Processing games for 2024-10-23 00:00:00
Finished processing 2024-10-23 00:00:00
Processing games for 2024-10-24 00:00:00
Finished processing 2024-10-24 00:00:00
Processing games for 2024-10-25 00:00:00
Finished processing 2024-10-25 00:00:00
Processing games for 2024-10-26 00:00:00
Finished processing 2024-10-26 00:00:00
Processing games for 2024-10-27 00:00:00
Finished processing 2024-10-27 00:00:00
Processing games for 2024-10-28 00:00:00
Finished processing 2024-10-28 00:00:00
Processing games for 2024-10-29 00:00:00
Finished processing 2024-10-29 00:00:00
Processing games for 2024-10-30 00:00:00
Finished processing 2024-10-30 00:00:00
Processing games for 2024-10-31 00:00:00
Finished processing 2024-10-31 00:00:00
Processing games for 2024-11-01 00:00:00
Finished processing 2024-11-01 00:00:00
Processing games for 2024-11-02 00:00:00
Finished processing 2024-11-02 00:00:00
Processing games for 2024-11

In [188]:
# Convert merged_data to a DataFrame
merged_df = pd.DataFrame(merged_data)

# Save to a CSV file
merged_df.to_csv("2024_predictions.csv", index=False)

# Print a preview of the DataFrame
# print(merged_df.head())

   Game_Date Home Away                                         Home_Stats  \
0 2024-10-22  BOS   NY  {'fieldGoalPercentage_my_player_1': 0.778, 'fi...   
1 2024-10-23  PHI  MIL  {'fieldGoalPercentage_my_player_1': 0.323, 'fi...   
2 2024-10-23  MIA  ORL  {'fieldGoalPercentage_my_player_1': 0.333, 'fi...   
3 2024-10-23  DET  IND  {'fieldGoalPercentage_my_player_1': 0.434, 'fi...   
4 2024-10-23  NOP  CHI  {'fieldGoalPercentage_my_player_1': 0.167, 'fi...   

                                          Away_Stats  Home_Predicted_Points  \
0  {'fieldGoalPercentage_my_player_1': 0.538, 'fi...             137.828476   
1  {'fieldGoalPercentage_my_player_1': 0.473, 'fi...             110.457451   
2  {'fieldGoalPercentage_my_player_1': 0.5, 'fiel...              93.571198   
3  {'fieldGoalPercentage_my_player_1': 0.5, 'fiel...             112.195015   
4  {'fieldGoalPercentage_my_player_1': 0.587, 'fi...             112.367386   

   Away_Predicted_Points  Predicted_Spread  \
0             11

## Run Analytics on the 2024 Predictions Compared to 2024 Season Actual Results

In [193]:
prediction_errors = []
spread_errors = []
ats_hits = 0
total_games = len(merged_data)

for game in merged_data:
    # Predicted vs actual points 
    home_error = abs(game['Home_Predicted_Points'] - game['Odds']['Score.1'])
    away_error = abs(game['Away_Predicted_Points'] - game['Odds']['Score'])
    prediction_errors.extend([home_error, away_error])

    # Spread analysis
    calculated_spread = game['Predicted_Spread']
    actual_spread = game['Odds']['Home Spread']
    spread_error = abs(calculated_spread - actual_spread)
    spread_errors.append(spread_error)

    # ATS Calculations
    actual_result_spread = game['Odds']['Score.1'] - game['Odds']['Score']
    if (calculated_spread > 0 and actual_result_spread > actual_spread) or (calculated_spread < 0 and actual_result_spread < actual_spread):
            ats_hits += 1

# General performance stats
prediction_mae = np.mean(prediction_errors)
prediction_mse = np.mean(np.square(prediction_errors))
prediction_rmse = np.sqrt(prediction_mse)

spread_mae = np.mean(spread_errors)

ats_percentage = (ats_hits/total_games) * 100

# Display Results
print("General Statistics for Predictions:")
print(f"Mean Absolute Error (Points): {prediction_mae:.2f}")
print(f"Mean Squared Error (Points): {prediction_mse:.2f}")
print(f"Root Mean Squared Error (Points): {prediction_rmse:.2f}")

print("\nSpread Analysis:")
print(f"Mean Absolute Error (Spread): {spread_mae:.2f}")

print("\nATS Results:")
print(f"Total Games: {total_games}")
print(f"ATS Hits: {ats_hits}")
print(f"ATS %: {ats_percentage:.2f}%")

General Statistics for Predictions:
Mean Absolute Error (Points): 8.30
Mean Squared Error (Points): 105.59
Root Mean Squared Error (Points): 10.28

Spread Analysis:
Mean Absolute Error (Spread): 13.35

ATS Results:
Total Games: 207
ATS Hits: 138
ATS %: 66.67%
