In [41]:
%pip install pandas scikit-learn
%pip install numpy --upgrade

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [42]:
import pandas as pd
import pickle
import numpy as np

In [43]:
nba_data = pd.read_csv('data/nba_advanced_data.csv')

targets = ['MIN', 'FGM', 'FGA', 'PTS', '3FM', '3FA', 'FTM', 'FTA', 'REB', 'AST', 'STL', 'BLK', 'OREB', 'TO', 'PF']

features = nba_data.columns.difference(targets + ['PlayerID', 'TeamID', 'OpponentID', 'Season'])

In [44]:
X = nba_data[features]
y = nba_data[targets]

In [45]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [46]:
from sklearn.ensemble import RandomForestRegressor

In [47]:
rfr = RandomForestRegressor(random_state=42, n_jobs=-1, verbose=True)

rfr.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   25.0s finished


In [48]:
y_pred = rfr.predict(X_test)
y_pred = pd.DataFrame(y_pred, columns=targets, index=X_test.index)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished


In [49]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [50]:
# Evaluate each target variable
for var in targets:
    mse = mean_squared_error(y_test[var], y_pred[var])
    mae = mean_absolute_error(y_test[var], y_pred[var])
    r2 = r2_score(y_test[var], y_pred[var])
    print(f'Performance metrics for {var}:')
    print(f'  Mean Squared Error (MSE): {mse:.2f}')
    print(f'  Mean Absolute Error (MAE): {mae:.2f}')
    print(f'  R² Score: {r2:.2f}')
    print('---')

Performance metrics for MIN:
  Mean Squared Error (MSE): 26.81
  Mean Absolute Error (MAE): 4.06
  R² Score: 0.78
---
Performance metrics for FGM:
  Mean Squared Error (MSE): 1.59
  Mean Absolute Error (MAE): 0.93
  R² Score: 0.86
---
Performance metrics for FGA:
  Mean Squared Error (MSE): 8.51
  Mean Absolute Error (MAE): 2.23
  R² Score: 0.78
---
Performance metrics for PTS:
  Mean Squared Error (MSE): 6.73
  Mean Absolute Error (MAE): 1.94
  R² Score: 0.92
---
Performance metrics for 3FM:
  Mean Squared Error (MSE): 1.21
  Mean Absolute Error (MAE): 0.78
  R² Score: 0.45
---
Performance metrics for 3FA:
  Mean Squared Error (MSE): 5.06
  Mean Absolute Error (MAE): 1.68
  R² Score: 0.44
---
Performance metrics for FTM:
  Mean Squared Error (MSE): 3.07
  Mean Absolute Error (MAE): 1.25
  R² Score: 0.47
---
Performance metrics for FTA:
  Mean Squared Error (MSE): 4.43
  Mean Absolute Error (MAE): 1.54
  R² Score: 0.46
---
Performance metrics for REB:
  Mean Squared Error (MSE): 2.80
 

In [56]:
params = {
    'n_estimators' : [100, 200],
    'max_depth': [None, 10, 200],
    'min_samples_split' : [2, 5],
    'min_samples_leaf' : [1, 2]
}

In [57]:
from sklearn.model_selection import GridSearchCV

In [58]:
rfr_cv = GridSearchCV(estimator=rfr, cv=3, param_grid=params, scoring="neg_mean_squared_error", n_jobs=-1, verbose=True)

In [59]:
rfr_cv.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  34 tasks     

KeyboardInterrupt: 

In [None]:
with open("models/nba_random_forest_multi_target_advanced_data.pkl", "wb") as file:
    pickle.dump(rfr_cv, file)