# Capstone 2 Modeling
## NBA Salary Predictor and Trade Suggestion
## Austin Cody
#### Models to attempt: Linear Regression, Random Forest Regressor, XGBoost, Support Vector Machine
#### Tuning techniques: RandomsearchCV, Bayesian Optimization

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
#defining functions
def print_metrics(y_test,y_pred):
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'Mean Squared Error: {rmse}')

    mae = mean_absolute_error(y_test, y_pred)
    print(f'Mean Absolute Error: {mae}')

    r2 = r2_score(y_test, y_pred)
    print(f'R-squared: {r2}')

In [3]:
# reading in csv, splitting data into train/test
df = pd.read_csv('nba_salaries_dummies.csv')
features = ['age','2_pointers_pg','3_pointers_pg','free_throws_pg','assists_pg','points_pg', 'minutes_pg']

X = df[features].values
y = df['salary'].values

# we don't need to stratify because this data is necessarily balanced
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [4]:
#scaling from previous notebook
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Linear Regression using Lasso Regularization and hyperparameter tuning using Bayesian Optimization 

In [None]:
from sklearn.linear_model import Lasso
from bayes_opt import BayesianOptimization

#optimizes rmse for our linear model
def lasso_eval(alpha):
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train_scaled, y_train)
    y_pred = lasso.predict(X_test_scaled)
    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return -rmse # we want to minimize rmse so we make it negative

In [None]:
lasso_bo = BayesianOptimization(f = lasso_eval,
                                pbounds = {'alpha': (0.01, 1.0)})

lasso_bo.maximize(n_iter=1000, 
                  init_points=5)

In [None]:
#### K FOLD CROSS VALIDATION ON YOUR DATA

## Random Forest Regressor using hyperparameter tuning with Bayesian Optimization 

In [8]:
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

#optimizes rmse for our linear model
def rfr_eval(n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features,max_leaf_nodes):
    rfr = RandomForestRegressor(n_estimators=int(n_estimators),
                                max_depth=int(max_depth),
                                min_samples_split=int(min_samples_split),
                                min_samples_leaf=int(min_samples_leaf),
                                max_features=int(max_features),
                                max_leaf_nodes=int(max_leaf_nodes))
    rfr.fit(X_train_scaled, y_train)
    y_pred = rfr.predict(X_test_scaled)
    
    #rmse = np.sqrt(mean_squared_error(y_test, y_pred)) # we want to minimize rmse so we make it negative
    r2 = r2_score(y_test, y_pred)
    return r2

rfr_bo = BayesianOptimization(f = rfr_eval,
                                pbounds = {'n_estimators':(10,200),
                                           'max_depth':(1,20),
                                           'min_samples_split':(2,10),
                                           'min_samples_leaf':(1,10),
                                           'max_features':(3,len(features)),
                                           'max_leaf_nodes':(2,50)})

rfr_bo.maximize(n_iter=50, 
                  init_points=5)

|   iter    |  target   | max_depth | max_fe... | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.787    [0m | [0m19.74    [0m | [0m3.496    [0m | [0m38.49    [0m | [0m2.623    [0m | [0m8.213    [0m | [0m40.61    [0m |
| [0m2        [0m | [0m0.7013   [0m | [0m2.317    [0m | [0m5.066    [0m | [0m39.29    [0m | [0m1.353    [0m | [0m7.64     [0m | [0m109.5    [0m |
| [95m3        [0m | [95m0.798    [0m | [95m12.86    [0m | [95m6.24     [0m | [95m23.26    [0m | [95m6.852    [0m | [95m7.59     [0m | [95m16.47    [0m |
| [0m4        [0m | [0m0.7412   [0m | [0m7.596    [0m | [0m3.421    [0m | [0m24.4     [0m | [0m8.136    [0m | [0m5.21     [0m | [0m113.2    [0m |
| [0m5        [0m | [0m0.7962   [0m | [0m14.94    [0m | [0m5.619    [0m | [0m19.49    [0m | [0m7.829    [0m | [0m9.656    [0m | [0m88

In [11]:
rfr_bo.max

{'target': 0.8254060412153806,
 'params': {'max_depth': 9.050099131458282,
  'max_features': 7.0,
  'max_leaf_nodes': 20.864830126658855,
  'min_samples_leaf': 1.0,
  'min_samples_split': 2.0,
  'n_estimators': 27.40089684840415}}

In [13]:
rfr_bo.max['params']['max_features']

7.0

In [46]:
rfr_optimized = RandomForestRegressor(max_depth = int(rfr_bo.max['params']['max_depth']),
                                      max_features = int(rfr_bo.max['params']['max_features']),
                                      max_leaf_nodes = int(rfr_bo.max['params']['max_leaf_nodes']),
                                      min_samples_leaf = int(rfr_bo.max['params']['min_samples_leaf']),
                                      min_samples_split = int(rfr_bo.max['params']['min_samples_split']),
                                      n_estimators = int(rfr_bo.max['params']['n_estimators']))
rfr_optimized.fit(X_train_scaled,y_train)
y_pred = rfr_optimized.predict(X_test_scaled)

print('Bayesian Optimized Random Forest Regressor:')
print_metrics(y_test,y_pred)

Bayesian Optimized Random Forest Regressor:
Mean Squared Error: 5119270.354739264
Mean Absolute Error: 3507382.0342028835
R-squared: 0.821973621486979


In [51]:
#cross validation
from sklearn.model_selection import cross_val_score, KFold
kf = KFold(n_splits=5, shuffle=True)
cv_results = cross_val_score(rfr_optimized, X, y, cv=kf)
np.mean(cv_results)

0.6880962665756261