In [23]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical



In [2]:
# load the data from a local CSV file
data = pd.read_csv("C:/Users/user/Documents/GitHub/R5D5-Project/model/ML_Model_Dataset_04/complete_data_model_NBA_Player_Stats_Regular.csv")

In [3]:
data.head()

Unnamed: 0,Rk,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,...,Pos_C,Pos_C-PF,Pos_PF,Pos_PF-SF,Pos_PG,Pos_PG-SG,Pos_SF,Pos_SF-SG,Pos_SG,Pos_SG-SF
0,2,28,76,75,26.3,2.8,5.1,0.547,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,3,24,56,56,32.6,7.3,13.0,0.557,0.0,0.1,...,1,0,0,0,0,0,0,0,0,0
2,5,36,47,12,22.3,5.4,9.7,0.55,0.3,1.0,...,1,0,0,0,0,0,0,0,0,0
3,6,23,65,21,22.6,3.9,10.5,0.372,1.6,5.2,...,0,0,0,0,0,0,0,0,1,0
4,6,23,50,19,26.3,4.7,12.6,0.375,1.9,6.1,...,0,0,0,0,0,0,0,0,1,0


In [4]:
# print the column names
print(data.columns)

Index(['Rk', 'Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
       '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'salary', 'Pos_C', 'Pos_C-PF',
       'Pos_PF', 'Pos_PF-SF', 'Pos_PG', 'Pos_PG-SG', 'Pos_SF', 'Pos_SF-SG',
       'Pos_SG', 'Pos_SG-SF'],
      dtype='object')


In [30]:
selected_features = ['Age','2PA','2P','FT','BLK','AST','TRB','DRB','FGA','G','Rk','PTS','PF','ORB','3P%','GS','Pos_PF']
target_column = 'salary'

In [31]:
# split the data into training and testing sets
X = data[selected_features]
y = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# train a gradient boosting regressor on the training set
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)


GradientBoostingRegressor()

In [33]:
# make predictions on the testing set
y_pred = gb.predict(X_test)


In [34]:
accuracy = gb.score(X_test, y_test) * 100
print("Accuracy: {:.2f}%".format(accuracy))


Accuracy: 55.28%


In [35]:
# split the data into training and testing sets
X = data[selected_features]
y = data['salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# create a Gradient Boosting Regressor model
gb = GradientBoostingRegressor(random_state=0)

In [37]:
# define the hyperparameters to tune
hyperparameters = {'n_estimators': [100, 200, 300],
                   'max_depth': [3, 4, 5],
                   'learning_rate': [0.1, 0.05, 0.01]}

In [38]:
# perform a grid search to find the best combination of hyperparameters
grid_search = GridSearchCV(gb, hyperparameters, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': [0.1, 0.05, 0.01],
                         'max_depth': [3, 4, 5],
                         'n_estimators': [100, 200, 300]},
             scoring='neg_mean_squared_error')

In [39]:
# print the best hyperparameters and mean squared error
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best MSE: ", -grid_search.best_score_)

Best Hyperparameters:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best MSE:  0.02435771501765257


In [40]:
# create a new Gradient Boosting Regressor model with the best hyperparameters found
gb_final = GradientBoostingRegressor(n_estimators=200, max_depth=3, learning_rate=0.1, random_state=0)


In [41]:
# train the model on the entire training dataset
gb_final.fit(X_train, y_train)

GradientBoostingRegressor(n_estimators=200, random_state=0)

In [42]:
# make predictions on the testing dataset
y_pred = gb_final.predict(X_test)

In [43]:
# evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [44]:
print("Mean Squared Error: ", mse)
print("Mean Absolute Error: ", mae)
print("R-squared: ", r2)

Mean Squared Error:  0.031583727722899954
Mean Absolute Error:  0.12987304316427262
R-squared:  0.555604933385166


In [45]:
r2 = r2_score(y_test, y_pred)
r2_percent = r2 * 100
print("R-squared: {:.2f}%".format(r2_percent))


R-squared: 55.56%


In [46]:
# define the hyperparameter search space
param_grid = {
    'learning_rate': Real(0.01, 0.1, prior='log-uniform'),
    'n_estimators': Integer(100, 500),
    'max_depth': Integer(2, 6),
    'min_samples_split': Integer(2, 10),
    'min_samples_leaf': Integer(1, 10),
    'subsample': Real(0.5, 1.0, prior='uniform'),
    'max_features': Categorical(['sqrt', 'log2', None])
}

In [47]:
# perform hyperparameter search using Bayesian Optimization
gb = GradientBoostingRegressor(random_state=42)
bayes_search = BayesSearchCV(gb, param_grid, cv=5, n_iter=50, random_state=42, verbose=2)
bayes_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.2s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.1s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.1s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.1s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.1s
Fitting 5 folds for each o

[CV] END learning_rate=0.09025639567934764, max_depth=5, max_features=None, min_samples_leaf=5, min_samples_split=5, n_estimators=185, subsample=0.5000305253763367; total time=   0.0s
[CV] END learning_rate=0.09025639567934764, max_depth=5, max_features=None, min_samples_leaf=5, min_samples_split=5, n_estimators=185, subsample=0.5000305253763367; total time=   0.0s
[CV] END learning_rate=0.09025639567934764, max_depth=5, max_features=None, min_samples_leaf=5, min_samples_split=5, n_estimators=185, subsample=0.5000305253763367; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
[CV] END learning_r

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.05118496077892468, max_depth=6, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=185, subsample=1.0; total time=   0.0s
[CV] END learning_rate=0.05118496077892468, max_depth=6, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=185, subsample=1.0; total time=   0.0s
[CV] END learning_rate=0.05118496077892468, max_depth=6, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=185, subsample=1.0; total time=   0.0s
[CV] END learning_rate=0.05118496077892468, max_depth=6, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=185, subsample=1.0; total time=   0.0s
[CV] END learning_rate=0.05118496077892468, max_depth=6, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=185, subsample=1.0; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.049216152912

[CV] END learning_rate=0.01, max_depth=4, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=500, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=4, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=500, subsample=0.5; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.04473096410350535, max_depth=6, max_features=None, min_samples_leaf=4, min_samples_split=2, n_estimators=231, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.04473096410350535, max_depth=6, max_features=None, min_samples_leaf=4, min_samples_split=2, n_estimators=231, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.04473096410350535, max_depth=6, max_features=None, min_samples_leaf=4, min_samples_split=2, n_estimators=231, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.04473096410350535, max_depth=6, max_features=None, min_samples_leaf=4, min_samples_split=2, n_estim

[CV] END learning_rate=0.030805264974473393, max_depth=6, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=421, subsample=1.0; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.05987417122299561, max_depth=5, max_features=None, min_samples_leaf=3, min_samples_split=8, n_estimators=500, subsample=0.5; total time=   0.2s
[CV] END learning_rate=0.05987417122299561, max_depth=5, max_features=None, min_samples_leaf=3, min_samples_split=8, n_estimators=500, subsample=0.5; total time=   0.2s
[CV] END learning_rate=0.05987417122299561, max_depth=5, max_features=None, min_samples_leaf=3, min_samples_split=8, n_estimators=500, subsample=0.5; total time=   0.2s
[CV] END learning_rate=0.05987417122299561, max_depth=5, max_features=None, min_samples_leaf=3, min_samples_split=8, n_estimators=500, subsample=0.5; total time=   0.2s
[CV] END learning_rate=0.05987417122299561, max_depth=5, max_features=None, min_samples_leaf=3,

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.014606856857460347, max_depth=2, max_features=log2, min_samples_leaf=10, min_samples_split=9, n_estimators=101, subsample=0.7741254515242885; total time=   0.0s
[CV] END learning_rate=0.014606856857460347, max_depth=2, max_features=log2, min_samples_leaf=10, min_samples_split=9, n_estimators=101, subsample=0.7741254515242885; total time=   0.0s
[CV] END learning_rate=0.014606856857460347, max_depth=2, max_features=log2, min_samples_leaf=10, min_samples_split=9, n_estimators=101, subsample=0.7741254515242885; total time=   0.0s
[CV] END learning_rate=0.014606856857460347, max_depth=2, max_features=log2, min_samples_leaf=10, min_samples_split=9, n_estimators=101, subsample=0.7741254515242885; total time=   0.0s
[CV] END learning_rate=0.014606856857460347, max_depth=2, max_features=log2, min_samples_leaf=10, min_samples_split=9, n_estimators=101, subsample=0.7741254515242885; total time=   0.0s
Fitting 5 f

BayesSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=42),
              random_state=42,
              search_spaces={'learning_rate': Real(low=0.01, high=0.1, prior='log-uniform', transform='normalize'),
                             'max_depth': Integer(low=2, high=6, prior='uniform', transform='normalize'),
                             'max_features': Categorical(categories=('sqrt', 'log2', None), prior=None),
                             'min_samples_leaf': Integer(low=1, high=10, prior='uniform', transform='normalize'),
                             'min_samples_split': Integer(low=2, high=10, prior='uniform', transform='normalize'),
                             'n_estimators': Integer(low=100, high=500, prior='uniform', transform='normalize'),
                             'subsample': Real(low=0.5, high=1.0, prior='uniform', transform='normalize')},
              verbose=2)

In [48]:
# evaluate the performance of the best model on the testing set
best_model = bayes_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = best_model.score(X_test, y_test) * 100
print("Accuracy: {:.2f}%".format(accuracy))

Accuracy: 58.10%


In [49]:
# print the best hyperparameters
print("Best hyperparameters: ", bayes_search.best_params_)

Best hyperparameters:  OrderedDict([('learning_rate', 0.0353207336623921), ('max_depth', 5), ('max_features', None), ('min_samples_leaf', 3), ('min_samples_split', 2), ('n_estimators', 486), ('subsample', 0.5)])
