In [1]:
import os

import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

In [2]:
# Load the playoff player-stats dataset into a DataFrame.
# The location can be overridden through the NBA_PLAYOFFS_CSV environment
# variable so the notebook can run on machines other than the author's;
# when the variable is unset, the original absolute path is used unchanged.
DATA_PATH = os.environ.get(
    "NBA_PLAYOFFS_CSV",
    "C:/Users/user/Documents/GitHub/R5D5-Project/model/ML_Model_Dataset_03/complete_data_model_NBA_Player_Stats_Playoffs.csv",
)
data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to sanity-check the loaded columns and values.
data.head(n=5)

Unnamed: 0,Rk,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,...,BLK,TOV,PF,PTS,salary,Pos_C,Pos_PF,Pos_PG,Pos_SF,Pos_SG
0,2,28,7,5,16.3,1.3,3.0,0.429,0.0,0.0,...,0.1,0.6,1.7,3.4,0.639241,1,0,0,0,0
1,3,24,18,18,34.1,5.8,9.7,0.594,0.0,0.1,...,0.7,2.1,3.1,14.8,0.077918,1,0,0,0,0
2,4,23,1,0,5.0,2.0,2.0,1.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.065652,0,0,0,0,1
3,5,26,12,5,25.4,3.1,6.8,0.451,1.6,4.0,...,0.3,0.8,1.8,8.3,0.052227,0,0,0,0,1
4,7,28,12,1,18.4,2.4,4.3,0.569,0.3,1.0,...,0.6,0.8,1.8,6.0,0.218796,0,1,0,0,0


In [4]:
# Show every column label available in the dataset.
column_labels = data.columns
print(column_labels)

Index(['Rk', 'Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
       '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'salary', 'Pos_C', 'Pos_PF',
       'Pos_PG', 'Pos_SF', 'Pos_SG'],
      dtype='object')


In [31]:
# Input columns fed to the models, and the column the models predict.
selected_features = [
    'Age', 'TOV', 'AST', 'PTS',
    'MP', 'Rk', 'DRB', 'BLK',
    '2P%', 'ORB', '3P%', 'eFG%',
]
target_column = 'salary'

In [32]:
# Separate features from the target, then hold out 20% of the rows for
# testing. The fixed random_state makes the split repeatable.
X, y = data[selected_features], data[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Baseline: fit a gradient boosting regressor with library-default
# hyperparameters on the training split. The seed is pinned (the same value
# the grid-search models in this notebook use) so that a Restart & Run All
# reproduces the same baseline score.
gb = GradientBoostingRegressor(random_state=0)
gb.fit(X_train, y_train)


GradientBoostingRegressor()

In [34]:
# Predict the target for the held-out test rows with the baseline model.
y_pred = gb.predict(X_test)


In [35]:
# GradientBoostingRegressor.score returns the coefficient of determination
# (R^2), not a classification accuracy, so the printed label says so.
# The variable keeps its original name in case other cells read it.
accuracy = gb.score(X_test, y_test) * 100
print("R-squared: {:.2f}%".format(accuracy))


Accuracy: 29.72%


In [10]:
# Re-create the same 80/20 train/test split (fixed seed) ahead of the grid
# search. The target is taken from `target_column`, as in the first split,
# instead of repeating the literal column name.
X = data[selected_features]
y = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Base estimator passed to the grid search below; random_state=0 fixes its seed.
gb = GradientBoostingRegressor(random_state=0)

In [12]:
# Candidate values for the exhaustive grid search (3 x 3 x 3 combinations).
hyperparameters = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.05, 0.01],
)

In [13]:
# Exhaustively evaluate every hyperparameter combination with 5-fold
# cross-validation on the training split, ranking by negated MSE.
grid_search = GridSearchCV(
    estimator=gb,
    param_grid=hyperparameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': [0.1, 0.05, 0.01],
                         'max_depth': [3, 4, 5],
                         'n_estimators': [100, 200, 300]},
             scoring='neg_mean_squared_error')

In [14]:
# Report the winning combination and its cross-validated error.
best_grid_params = grid_search.best_params_
# The search scored with 'neg_mean_squared_error', so flip the sign to get MSE.
best_grid_mse = -grid_search.best_score_
print("Best Hyperparameters: ", best_grid_params)
print("Best MSE: ", best_grid_mse)

Best Hyperparameters:  {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
Best MSE:  0.033561182917278234


In [60]:
# Build the final model from the grid search's winning hyperparameters.
# Reading them from `grid_search.best_params_` (rather than retyping the
# values) keeps this cell correct if the grid or the data changes.
gb_final = GradientBoostingRegressor(**grid_search.best_params_, random_state=0)


In [61]:
# Fit the tuned model on the full training split (X_train / y_train).
gb_final.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.05, n_estimators=200, random_state=0)

In [62]:
# Predict the target for the held-out test rows with the tuned model.
y_pred = gb_final.predict(X_test)

In [63]:
# Score the test-set predictions with three regression metrics at once:
# mean squared error, mean absolute error and R^2 (in that order).
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [64]:
# Print each metric on its own line, label first.
for label, value in (
    ("Mean Squared Error: ", mse),
    ("Mean Absolute Error: ", mae),
    ("R-squared: ", r2),
):
    print(label, value)

Mean Squared Error:  0.052197771275375796
Mean Absolute Error:  0.16557790341636489
R-squared:  0.2995889923735212


In [65]:
# Express the test-set R^2 as a percentage with two decimals.
r2 = r2_score(y_test, y_pred)
r2_percent = 100 * r2
r2_message = "R-squared: {:.2f}%".format(r2_percent)
print(r2_message)


R-squared: 29.96%


In [21]:
# Re-create the same 80/20 train/test split (fixed seed) ahead of the
# Bayesian search. The target is taken from `target_column`, as in the first
# split, instead of repeating the literal column name.
X = data[selected_features]
y = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Search space for the Bayesian optimiser: one skopt dimension per
# GradientBoostingRegressor hyperparameter being tuned.
param_grid = dict(
    learning_rate=Real(0.01, 0.1, prior='log-uniform'),
    n_estimators=Integer(100, 500),
    max_depth=Integer(2, 6),
    min_samples_split=Integer(2, 10),
    min_samples_leaf=Integer(1, 10),
    subsample=Real(0.5, 1.0, prior='uniform'),
    max_features=Categorical(['sqrt', 'log2', None]),
)

In [23]:
# Bayesian optimisation over `param_grid` with n_iter=50 and 5-fold
# cross-validation on the training split. No `scoring` is passed, so the
# search ranks candidates with its default metric.
gb = GradientBoostingRegressor(random_state=42)
bayes_search = BayesSearchCV(
    estimator=gb,
    search_spaces=param_grid,
    n_iter=50,
    cv=5,
    random_state=42,
    verbose=2,
)
bayes_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
Fitting 5 folds for each o

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
Fitting 5 folds 

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.1, max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=105, subsample=0.6243250435069779; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=105, subsample=0.6243250435069779; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=105, subsample=0.6243250435069779; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=105, subsample=0.6243250435069779; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=105, subsample=0.6243250435069779; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.1, max_depth

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.06788023920445821, max_depth=3, max_features=log2, min_samples_leaf=3, min_samples_split=3, n_estimators=147, subsample=0.7538210671851298; total time=   0.0s
[CV] END learning_rate=0.06788023920445821, max_depth=3, max_features=log2, min_samples_leaf=3, min_samples_split=3, n_estimators=147, subsample=0.7538210671851298; total time=   0.0s
[CV] END learning_rate=0.06788023920445821, max_depth=3, max_features=log2, min_samples_leaf=3, min_samples_split=3, n_estimators=147, subsample=0.7538210671851298; total time=   0.0s
[CV] END learning_rate=0.06788023920445821, max_depth=3, max_features=log2, min_samples_leaf=3, min_samples_split=3, n_estimators=147, subsample=0.7538210671851298; total time=   0.0s
[CV] END learning_rate=0.06788023920445821, max_depth=3, max_features=log2, min_samples_leaf=3, min_samples_split=3, n_estimators=147, subsample=0.7538210671851298; total time=   0.0s
Fitting 5 folds for e

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.036217081950753474, max_depth=3, max_features=None, min_samples_leaf=5, min_samples_split=8, n_estimators=100, subsample=0.6912714032933461; total time=   0.0s
[CV] END learning_rate=0.036217081950753474, max_depth=3, max_features=None, min_samples_leaf=5, min_samples_split=8, n_estimators=100, subsample=0.6912714032933461; total time=   0.0s
[CV] END learning_rate=0.036217081950753474, max_depth=3, max_features=None, min_samples_leaf=5, min_samples_split=8, n_estimators=100, subsample=0.6912714032933461; total time=   0.0s
[CV] END learning_rate=0.036217081950753474, max_depth=3, max_features=None, min_samples_leaf=5, min_samples_split=8, n_estimators=100, subsample=0.6912714032933461; total time=   0.0s
[CV] END learning_rate=0.036217081950753474, max_depth=3, max_features=None, min_samples_leaf=5, min_samples_split=8, n_estimators=100, subsample=0.6912714032933461; total time=   0.0s
Fitting 5 folds 

[CV] END learning_rate=0.04015175974313044, max_depth=6, max_features=None, min_samples_leaf=3, min_samples_split=10, n_estimators=199, subsample=0.541577222245321; total time=   0.0s
[CV] END learning_rate=0.04015175974313044, max_depth=6, max_features=None, min_samples_leaf=3, min_samples_split=10, n_estimators=199, subsample=0.541577222245321; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.03317153582368152, max_depth=6, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=128, subsample=0.9297463492153211; total time=   0.0s
[CV] END learning_rate=0.03317153582368152, max_depth=6, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=128, subsample=0.9297463492153211; total time=   0.0s
[CV] END learning_rate=0.03317153582368152, max_depth=6, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=128, subsample=0.9297463492153211; total time=   0.0s
[CV] END learning_

BayesSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=42),
              random_state=42,
              search_spaces={'learning_rate': Real(low=0.01, high=0.1, prior='log-uniform', transform='normalize'),
                             'max_depth': Integer(low=2, high=6, prior='uniform', transform='normalize'),
                             'max_features': Categorical(categories=('sqrt', 'log2', None), prior=None),
                             'min_samples_leaf': Integer(low=1, high=10, prior='uniform', transform='normalize'),
                             'min_samples_split': Integer(low=2, high=10, prior='uniform', transform='normalize'),
                             'n_estimators': Integer(low=100, high=500, prior='uniform', transform='normalize'),
                             'subsample': Real(low=0.5, high=1.0, prior='uniform', transform='normalize')},
              verbose=2)

In [67]:
# Evaluate the best estimator found by the Bayesian search on the test split.
# `score` on a regressor is the coefficient of determination (R^2), so it is
# reported under that name rather than as "accuracy". The variable name is
# left unchanged in case other cells read it.
best_model = bayes_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = best_model.score(X_test, y_test) * 100
print("R-squared: {:.2f}%".format(accuracy))

Accuracy: 34.58%


In [36]:
# Show the hyperparameter combination selected by the Bayesian search.
best_bayes_params = bayes_search.best_params_
print("Best hyperparameters: ", best_bayes_params)

Best hyperparameters:  OrderedDict([('learning_rate', 0.04295268354485657), ('max_depth', 6), ('max_features', None), ('min_samples_leaf', 3), ('min_samples_split', 10), ('n_estimators', 225), ('subsample', 0.8204906329264618)])
