In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [22]:
# Load 2024-25 odds
season_odds_24 = pd.read_csv("data_generation/2024-25_odds_cleaned.csv")

In [23]:
# Load stats
#season_20 = pd.read_csv('data_generation/output_data/2020-21_data.csv')
#season_21 = pd.read_csv('data_generation/output_data/2021-22_data.csv')
season_22 = pd.read_csv('data_generation/output_data/2022-23_data.csv')
season_23 = pd.read_csv('data_generation/output_data/2023-24_data.csv')
season_24 = pd.read_csv('data_generation/output_data/2024-25_data.csv')

### Data pre-processing

In [24]:
# Combind and clean
combined_seasons = pd.concat([season_22, season_23], ignore_index=True)
combined_seasons = combined_seasons.drop_duplicates()
combined_seasons = combined_seasons.reset_index(drop=True)

In [25]:
# Input and target
input = combined_seasons.drop(columns=['PTS'])
target = combined_seasons['PTS']

# Selected features for model training
spearman_corr = pd.read_csv('data_generation/output_data/spearman_corr_features.csv')
selected_features = spearman_corr['Feature'].tolist()

existing_features = [feature for feature in selected_features if feature in input.columns] # check no mismatching features

input = combined_seasons[existing_features]
input = input.dropna(axis=1)

# Split into train and test set
input_train, input_test, target_train, target_test = train_test_split(input, target, test_size=0.2, random_state=0)

In [26]:
# Define model
rf = RandomForestRegressor()

# Search space of hyperparameters
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# GridSearch
rf_cv = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='r2',n_jobs=-1)
rf_cv.fit(input_train, target_train)

# Output best lambda and score
print("Best Parameters:", rf_cv.best_params_)
print("Best Cross-Validation R2 Score:", rf_cv.best_score_)

Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best Cross-Validation R2 Score: 0.3967263888294036


### Model evaluation

In [35]:
rf = RandomForestRegressor(
    max_depth=20,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=200
)

rf.fit(input_train, target_train)

test_loss = mean_squared_error(target_test, rf.predict(input_test))
test_mae = mean_absolute_error(target_test, rf.predict(input_test))
print(f"Test Loss (MSE): {test_loss}")
print(f"Test MAE: {test_mae}")

# Target prediction
sample_predictions = rf.predict(input_test)
print("\nSample Predictions (PTS):", sample_predictions[:10])
print("Actual Values (PTS):", target_test[:10].values)

mse = mean_squared_error(target_test, sample_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(target_test, sample_predictions)

print("\n--- Regression Metrics ---")
print(f"Test Loss (MSE from model): {test_loss:.2f}")
print(f"Test MAE (from model): {test_mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-Squared (R²): {r2:.2f}")

Test Loss (MSE): 87.25829001202962
Test MAE: 7.437235888885815

Sample Predictions (PTS): [110.99723561 116.56484001 112.90907053 111.40591937 120.16453986
 105.00706334 124.90042361 122.51938925 117.75574344 109.21524459]
Actual Values (PTS): [119 106 130 106 108 106 118 115 109 101]

--- Regression Metrics ---
Test Loss (MSE from model): 87.26
Test MAE (from model): 7.44
Mean Squared Error (MSE): 87.26
Root Mean Squared Error (RMSE): 9.34
R-Squared (R²): 0.41


In [36]:
def get_feature_importance(features, importances):
    importance_df = pd.DataFrame({
        'Feature': features,
        'Importance': importances
    })
    return importance_df.sort_values(by='Importance', ascending=False)
    
rf_importance = rf.feature_importances_
importance_df = get_feature_importance(input_train.columns, rf_importance)
importance_df.to_csv('rf_feature_importance.csv', index=False)

In [37]:
TEAM_CITY_ABBREVIATIONS = {
    1610612737: "ATL",
    1610612738: "BOS",
    1610612739: "CLE",
    1610612740: "NOP",
    1610612741: "CHI",
    1610612742: "DAL",
    1610612743: "DEN",
    1610612744: "GS",
    1610612745: "HOU",
    1610612746: "LAL",
    1610612747: "LAC",
    1610612748: "MIA",
    1610612749: "MIL",
    1610612750: "MIN",
    1610612751: "BKN",
    1610612752: "NY",
    1610612753: "ORL",
    1610612754: "IND",
    1610612755: "PHI",
    1610612756: "PHX",
    1610612757: "POR",
    1610612758: "SAC",
    1610612759: "SAS",
    1610612760: "OKC",
    1610612761: "TOR",
    1610612762: "UTA",
    1610612763: "MEM",
    1610612764: "WAS",
    1610612765: "DET",
    1610612766: "CHA",
}

In [38]:
# Sort both Dfs by date 
# Ensure we process games only up to 12/1/2024
season_24['GAME_DATE_EST'] = pd.to_datetime(season_24['GAME_DATE_EST'])
end_date = datetime(2024, 12, 1)
season_24 = season_24[season_24['GAME_DATE_EST'] <= end_date]
season_24['TEAM_ABBREVIATION'] = season_24['TEAM_ID'].map(TEAM_CITY_ABBREVIATIONS)

# Convert the 'Date' in odds data to datetime format
season_odds_24['Date'] = pd.to_datetime(season_odds_24['Date'], format='%d-%b-%y')

season_odds_24 = season_odds_24.sort_values(by='Date')

# Convert the 'GAME_DATE_EST' in season stats to datetime format
season_24['GAME_DATE_EST'] = pd.to_datetime(season_24['GAME_DATE_EST'])
season_24 = season_24.sort_values(by='GAME_DATE_EST')

features = pd.read_csv('data_generation/output_data/spearman_corr_features.csv')

features_columns = features['Feature'].tolist()

existing_features = [feature for feature in features_columns if feature in season_24.columns]

In [39]:
merged_data = []

# Iterate through each unique game date
for game_date in season_odds_24['Date'].unique():
    day_odds = season_odds_24[season_odds_24['Date'] == game_date]

    # Filter games that exist in the 2024 Seasons Data
    day_games = season_24[season_24['GAME_DATE_EST'] == game_date]

    for _, game in day_odds.iterrows():
        # Get HOME and AWAY stats
        home_stats = day_games[day_games['TEAM_ABBREVIATION'] == game['Home']]
        away_stats = day_games[day_games['TEAM_ABBREVIATION'] == game['Away']]

        # Check to see if both teams are found
        if not home_stats.empty and not away_stats.empty:
            # Filter stats to include only relevant features
            home_filtered_stats = home_stats.iloc[0][existing_features].to_dict()
            away_filtered_stats = away_stats.iloc[0][existing_features].to_dict()

            # Predict points for each team
            home_features_df = pd.DataFrame([home_filtered_stats])[existing_features]
            away_features_df = pd.DataFrame([away_filtered_stats])[existing_features]

            home_predicted_points = rf.predict(home_features_df)
            away_predicted_points = rf.predict(away_features_df)

            predicted_spread = home_predicted_points - away_predicted_points

            # Append game entry to merged_data
            merged_data.append({
                'Game_Date': game_date,
                'Home': game['Home'],
                'Away': game['Away'],
                'Home_Stats': home_filtered_stats,
                'Away_Stats': away_filtered_stats,
                'Home_Predicted_Points': home_predicted_points,
                'Away_Predicted_Points': away_predicted_points,
                'Predicted_Spread': predicted_spread,
                'Odds': game.to_dict()
            })

print(f"Finished processing games")

Finished processing games


In [40]:
final_results = []

unique_dates = sorted({entry['Game_Date'] for entry in merged_data})

# Iterate through each game date
for game_date in unique_dates:
    print(f"Processing games for {game_date}")

    # Filter games for the current day
    day_games = [entry for entry in merged_data if entry["Game_Date"] == game_date]

    # Prepare training data for the current day
    X_train = pd.DataFrame(
        [game['Home_Stats'] for game in day_games] +
        [game['Away_Stats'] for game in day_games]
    )[existing_features].values

    y_train = pd.DataFrame([
        {'Actual_Points': game['Odds']['Score.1']} for game in day_games
    ] + [
        {'Actual_Points': game['Odds']['Score']} for game in day_games
    ]).values

    # Fine-tune the model with the day's data
    rf.fit(X_train, y_train.flatten())

    # Append games for the day to final results
    final_results.extend(day_games)

    print(f"Finished processing {game_date}")

Processing games for 2024-10-22 00:00:00
Finished processing 2024-10-22 00:00:00
Processing games for 2024-10-23 00:00:00
Finished processing 2024-10-23 00:00:00
Processing games for 2024-10-24 00:00:00
Finished processing 2024-10-24 00:00:00
Processing games for 2024-10-25 00:00:00
Finished processing 2024-10-25 00:00:00
Processing games for 2024-10-26 00:00:00
Finished processing 2024-10-26 00:00:00
Processing games for 2024-10-27 00:00:00
Finished processing 2024-10-27 00:00:00
Processing games for 2024-10-28 00:00:00
Finished processing 2024-10-28 00:00:00
Processing games for 2024-10-29 00:00:00
Finished processing 2024-10-29 00:00:00
Processing games for 2024-10-30 00:00:00
Finished processing 2024-10-30 00:00:00
Processing games for 2024-10-31 00:00:00
Finished processing 2024-10-31 00:00:00
Processing games for 2024-11-01 00:00:00
Finished processing 2024-11-01 00:00:00
Processing games for 2024-11-02 00:00:00
Finished processing 2024-11-02 00:00:00
Processing games for 2024-11

In [41]:
# Convert merged_data to a DataFrame
merged_df = pd.DataFrame(merged_data)

# Save to a CSV file
merged_df.to_csv("2024_predictions_by_rf.csv", index=False)

### Run Analytics on the 2024 Predictions Compared to 2024 Season Actual Results

In [42]:
prediction_errors = []
spread_errors = []
ats_hits = 0
total_games = len(merged_data)

for game in merged_data:
    # Predicted vs actual points 
    home_error = abs(game['Home_Predicted_Points'] - game['Odds']['Score.1'])
    away_error = abs(game['Away_Predicted_Points'] - game['Odds']['Score'])
    prediction_errors.extend([home_error, away_error])

    # Spread analysis
    calculated_spread = game['Predicted_Spread']
    actual_spread = game['Odds']['Home Spread']
    spread_error = abs(calculated_spread - actual_spread)
    spread_errors.append(spread_error)

    # ATS Calculations
    actual_result_spread = game['Odds']['Score.1'] - game['Odds']['Score']
    if (calculated_spread > 0 and actual_result_spread > actual_spread) or (calculated_spread < 0 and actual_result_spread < actual_spread):
            ats_hits += 1

# General performance stats
prediction_mae = np.mean(prediction_errors)
prediction_mse = np.mean(np.square(prediction_errors))
prediction_rmse = np.sqrt(prediction_mse)

spread_mae = np.mean(spread_errors)

ats_percentage = (ats_hits/total_games) * 100

# Display Results
print("General Statistics for Predictions:")
print(f"Mean Absolute Error (Points): {prediction_mae:.2f}")
print(f"Mean Squared Error (Points): {prediction_mse:.2f}")
print(f"Root Mean Squared Error (Points): {prediction_rmse:.2f}")

print("\nSpread Analysis:")
print(f"Mean Absolute Error (Spread): {spread_mae:.2f}")

print("\nATS Results:")
print(f"Total Games: {total_games}")
print(f"ATS Hits: {ats_hits}")
print(f"ATS %: {ats_percentage:.2f}%")

General Statistics for Predictions:
Mean Absolute Error (Points): 8.05
Mean Squared Error (Points): 96.81
Root Mean Squared Error (Points): 9.84

Spread Analysis:
Mean Absolute Error (Spread): 10.81

ATS Results:
Total Games: 207
ATS Hits: 139
ATS %: 67.15%
