# Capstone 2 Modeling
## NBA Salary Predictor and Trade Suggestion
## Austin Cody
#### Models to attempt: Lasso Regression, Random Forest Regressor, XGBoost
#### Tuning techniques: GridsearchCV, Bayesian Optimization

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
def print_metrics(y_test,y_pred):
    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'Mean Squared Error: {rmse}')

    mae = mean_absolute_error(y_test, y_pred)
    print(f'Mean Absolute Error: {mae}')

    r2 = r2_score(y_test, y_pred)
    print(f'R-squared: {r2}')

In [3]:
# reading in csv, splitting data into train/test
df = pd.read_csv('nba_salaries_dummies.csv')
features = ['age','2_pointers_pg','3_pointers_pg','free_throws_pg','assists_pg','points_pg', 'minutes_pg']

X = df[features].values
y = df['salary'].values

# We will do optimization and kfold cross validation on the training set and we don't even touch our test set until the final eval
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [4]:
#scaling from previous notebook
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Linear Regression using Lasso Regularization and hyperparameter tuning using RandomSearchCV

In [5]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold

params = {'alpha': [0.01, 0.1, 1.0, 10.0]}
lasso = Lasso(max_iter=100000)
lasso_gscv = GridSearchCV(lasso, param_grid=params, cv=5, scoring='r2')
lasso_gscv.fit(X_train_scaled,y_train)

best_alpha = lasso_gscv.best_params_

best_lasso = Lasso(alpha=best_alpha['alpha'], max_iter=100000)
best_lasso.fit(X_train_scaled, y_train)
y_pred = best_lasso.predict(X_test_scaled)

In [6]:
print("Optimal Parameters for Lasso Regression")
print_metrics(y_test,y_pred)

Optimal Parameters for Lasso Regression
Mean Squared Error: 6412006.254384973
Mean Absolute Error: 4609206.600191674
R-squared: 0.7155166799046566


In [7]:
#cross validation using the optimized random forest regressor
from sklearn.model_selection import cross_val_score, KFold
kf = KFold(n_splits=5, shuffle=True)
cv_results = cross_val_score(best_lasso, X, y, cv=kf, scoring='r2')
np.mean(cv_results)

0.603088943001665

## Random Forest Regressor Bayesian Optimization 

In [8]:
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score, KFold

#optimizes r2 for our random forest regressor
def rfr_objective_function(n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features,max_leaf_nodes):
    rfr = RandomForestRegressor(n_estimators=int(n_estimators),
                                max_depth=int(max_depth),
                                min_samples_split=int(min_samples_split),
                                min_samples_leaf=int(min_samples_leaf),
                                max_features=int(max_features),
                                max_leaf_nodes=int(max_leaf_nodes))

    # cross validating inside our objective functions avoids overfitting to our individual set of data
    kf = KFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(rfr, X_train_scaled, y_train, cv=kf, scoring='r2')

    return np.mean(cv_scores)

rfr_bo = BayesianOptimization(f = rfr_objective_function,
                                pbounds = {'n_estimators':(10,200),
                                           'max_depth':(1,20),
                                           'min_samples_split':(2,10),
                                           'min_samples_leaf':(1,10),
                                           'max_features':(3,len(features)),
                                           'max_leaf_nodes':(2,50)})

rfr_bo.maximize(n_iter=15, init_points=5)

|   iter    |  target   | max_depth | max_fe... | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.6721   [0m | [0m7.512    [0m | [0m4.365    [0m | [0m14.59    [0m | [0m2.062    [0m | [0m9.264    [0m | [0m173.3    [0m |
| [0m2        [0m | [0m0.5597   [0m | [0m2.798    [0m | [0m3.882    [0m | [0m31.09    [0m | [0m2.266    [0m | [0m7.336    [0m | [0m36.38    [0m |
| [0m3        [0m | [0m0.3554   [0m | [0m7.109    [0m | [0m4.445    [0m | [0m2.041    [0m | [0m5.362    [0m | [0m7.698    [0m | [0m101.4    [0m |
| [0m4        [0m | [0m0.6068   [0m | [0m16.22    [0m | [0m3.157    [0m | [0m41.76    [0m | [0m9.881    [0m | [0m8.475    [0m | [0m34.33    [0m |
| [0m5        [0m | [0m0.6417   [0m | [0m13.97    [0m | [0m4.861    [0m | [0m8.185    [0m | [0m8.006    [0m | [0m4.285    [0m | [0m199.0    

In [9]:
print("Optimal Parameters for Random Forest Regressor")
for key, value in rfr_bo.max['params'].items():
    print("{}: {}".format(key,value))

Optimal Parameters for Random Forest Regressor
max_depth: 20.0
max_features: 3.0
max_leaf_nodes: 39.02008696395414
min_samples_leaf: 1.0
min_samples_split: 10.0
n_estimators: 156.66834376080791


In [10]:
rfr_optimized = RandomForestRegressor(max_depth = int(rfr_bo.max['params']['max_depth']),
                                      max_features = int(rfr_bo.max['params']['max_features']),
                                      max_leaf_nodes = int(rfr_bo.max['params']['max_leaf_nodes']),
                                      min_samples_leaf = int(rfr_bo.max['params']['min_samples_leaf']),
                                      min_samples_split = int(rfr_bo.max['params']['min_samples_split']),
                                      n_estimators = int(rfr_bo.max['params']['n_estimators']))
rfr_optimized.fit(X_train_scaled,y_train)
y_pred = rfr_optimized.predict(X_test_scaled)

print('Bayesian Optimized Random Forest Regressor:')
print_metrics(y_test,y_pred)

Bayesian Optimized Random Forest Regressor:
Mean Squared Error: 6165325.348730452
Mean Absolute Error: 4068783.5999314766
R-squared: 0.7369847487188208


In [11]:
#cross validation using the optimized random forest regressor
from sklearn.model_selection import cross_val_score, KFold
kf = KFold(n_splits=5, shuffle=True)
cv_results = cross_val_score(rfr_optimized, X, y, cv=kf, scoring='r2')
np.mean(cv_results)

0.7218020154618174

## XGBoost with Bayesian Optimization 

In [12]:
from xgboost import XGBRegressor

#optimizes rmse for our linear model
def xgb_objective_function(learning_rate,max_depth,subsample,colsample_bytree,reg_lambda,reg_alpha):
    # instantiating xgbregressor
    xgb = XGBRegressor(objective='reg:squarederror',
                       learning_rate=learning_rate, 
                       max_depth=int(max_depth), 
                       subsample=subsample, 
                       colsample_bytree=colsample_bytree,
                       reg_lambda=reg_lambda, 
                       reg_alpha=reg_alpha)

    # cross validating inside our objective functions avoids overfitting to our individual set of data
    kf = KFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(xgb, X_train_scaled, y_train, cv=kf, scoring='r2')

    return np.mean(cv_scores)

xgb_bo = BayesianOptimization(f = xgb_objective_function,
                                pbounds = {'learning_rate':(0.01,0.3),
                                           'max_depth':(3,10), 
                                           'subsample':(0.5,1.0),
                                           'colsample_bytree':(0.5,1.0),
                                           'reg_lambda':(0.01,10),
                                           'reg_alpha':(0.01,10)})

xgb_bo.maximize(n_iter=50, init_points=5)

|   iter    |  target   | colsam... | learni... | max_depth | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.5912   [0m | [0m0.649    [0m | [0m0.2938   [0m | [0m7.808    [0m | [0m1.758    [0m | [0m1.517    [0m | [0m0.7357   [0m |
| [95m2        [0m | [95m0.6388   [0m | [95m0.7887   [0m | [95m0.1511   [0m | [95m9.613    [0m | [95m1.029    [0m | [95m9.394    [0m | [95m0.5657   [0m |
| [0m3        [0m | [0m0.5025   [0m | [0m0.5287   [0m | [0m0.01317  [0m | [0m8.479    [0m | [0m1.653    [0m | [0m0.6026   [0m | [0m0.9172   [0m |
| [0m4        [0m | [0m0.6103   [0m | [0m0.6153   [0m | [0m0.2624   [0m | [0m9.786    [0m | [0m3.885    [0m | [0m8.559    [0m | [0m0.8429   [0m |
| [0m5        [0m | [0m0.6003   [0m | [0m0.558    [0m | [0m0.08555  [0m | [0m7.795    [0m | [0m9.084    [0m | [0m4.084    [0m | [0m0.

In [13]:
print("Optimal Parameters for XGBRegressor")
for key, value in xgb_bo.max['params'].items():
    print("{}: {}".format(key,value))

Optimal Parameters for XGBRegressor
colsample_bytree: 0.9982992616656767
learning_rate: 0.08073118145558139
max_depth: 4.693932292067143
reg_alpha: 5.457384174235592
reg_lambda: 0.7921501767353445
subsample: 0.9235536582962283


In [14]:
# using the best hyperparameters when creating our XGBRegressor object
xgb_optimized = XGBRegressor(learning_rate = xgb_bo.max['params']['learning_rate'],
                                      max_depth = int(xgb_bo.max['params']['max_depth']),
                                      subsample = xgb_bo.max['params']['subsample'],
                                      colsample_bytree = xgb_bo.max['params']['colsample_bytree'],
                                      reg_lambda = xgb_bo.max['params']['reg_lambda'],
                                      reg_alpha = xgb_bo.max['params']['reg_alpha'])
xgb_optimized.fit(X_train_scaled,y_train)
y_pred = xgb_optimized.predict(X_test_scaled)

print('Bayesian Optimized XGBoost:')
print_metrics(y_test,y_pred)

Bayesian Optimized XGBoost:
Mean Squared Error: 5635339.8217299655
Mean Absolute Error: 3555351.6209016396
R-squared: 0.7802599833357848


In [35]:
#cross validation using the optimized xgbregressor
kf = KFold(n_splits=5, shuffle=True)
cv_results = cross_val_score(xgb_optimized, X, y, cv=kf, scoring='r2')
np.mean(cv_results)

0.7190084592817823

# Best Model
Our Bayesian optimized XGBoost and Random Forest Regressor models seemed to be the strongest and were very similar in performance. Probably our XGBoost model is the best way to go. It has lower error and though it had a higher r2 when evaluated using the test set, it performs less well using cross validation on our data as a whole.

# Identifying overvalued and undervalued players