In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

In [2]:
# load the data from a local CSV file
data = pd.read_csv("C:/Users/user/Documents/GitHub/R5D5-Project/model/ML_Model_Dataset_02/complete_data_model_2k_ratings.csv")

In [3]:
data.head()

Unnamed: 0,Close Shot,Mid-Range Shot,Three-Point Shot,Free Throw,Shot IQ,Offensive Consistency,Speed,Acceleration,Strength,Vertical,...,Perimeter Defense,Steal,Block,Lateral Quickness,Help Defense IQ,Pass Perception,Defensive Consistency,Offensive Rebound,Defensive Rebound,salary
0,76,60,74,70,97,85,71,67,72,92,...,79,40,52,74,75,68,70,52,74,0.473311
1,70,90,75,73,86,65,53,48,91,69,...,63,42,65,67,92,70,90,50,68,0.683705
2,76,47,60,64,62,70,60,58,95,79,...,52,87,80,52,84,80,75,90,97,0.660252
3,85,71,77,72,66,95,82,85,57,98,...,66,37,60,82,72,75,55,41,56,0.670896
4,90,72,74,86,90,98,79,78,87,86,...,73,63,89,71,97,82,95,75,81,0.660252


In [4]:
# print the column names
print(data.columns)

Index(['Close Shot', 'Mid-Range Shot', 'Three-Point Shot', 'Free Throw',
       'Shot IQ', 'Offensive Consistency', 'Speed', 'Acceleration', 'Strength',
       'Vertical', 'Stamina', 'Hustle', 'Overall Durability', 'Layup',
       'Standing Dunk', 'Driving Dunk', 'Post Hook', 'Post Fade',
       'Post Control', 'Draw Foul', 'Hands', 'Pass Accuracy', 'Ball Handle',
       'Speed with Ball', 'Pass IQ', 'Pass Vision', 'Interior Defense',
       'Perimeter Defense', 'Steal', 'Block', 'Lateral Quickness',
       'Help Defense IQ', 'Pass Perception', 'Defensive Consistency',
       'Offensive Rebound', 'Defensive Rebound', 'salary'],
      dtype='object')


In [5]:
#Placing selected features which has >90% contribution from gb feature selection
selected_features = ['Free Throw','Post Control','Hands','Ball Handle','Perimeter Defense','Help Defense IQ','Defensive Consistency','Defensive Rebound','Steal','Pass Perception','Post Fade','Overall Durability','Acceleration','Three-Point Shot','Offensive Rebound','Hustle']
target_column = 'salary'


In [8]:
# split the data into training and testing sets
X = data[selected_features]
y = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# train a gradient boosting regressor on the training set
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)


GradientBoostingRegressor()

In [10]:
# make predictions on the testing set
y_pred = gb.predict(X_test)


In [11]:
accuracy = gb.score(X_test, y_test) * 100
print("Accuracy: {:.2f}%".format(accuracy))


Accuracy: 39.16%


In [15]:
# split the data into training and testing sets
X = data[selected_features]
y = data['salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# create a Gradient Boosting Regressor model
gb = GradientBoostingRegressor(random_state=0)

In [17]:
# define the hyperparameters to tune
hyperparameters = {'n_estimators': [100, 200, 300],
                   'max_depth': [3, 4, 5],
                   'learning_rate': [0.1, 0.05, 0.01]}

In [18]:
# perform a grid search to find the best combination of hyperparameters
grid_search = GridSearchCV(gb, hyperparameters, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': [0.1, 0.05, 0.01],
                         'max_depth': [3, 4, 5],
                         'n_estimators': [100, 200, 300]},
             scoring='neg_mean_squared_error')

In [19]:
# print the best hyperparameters and mean squared error
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best MSE: ", -grid_search.best_score_)

Best Hyperparameters:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Best MSE:  0.0737212122028261


In [20]:
# create a new Gradient Boosting Regressor model with the best hyperparameters found
gb_final = GradientBoostingRegressor(n_estimators=200, max_depth=3, learning_rate=0.1, random_state=0)


In [21]:
# train the model on the entire training dataset
gb_final.fit(X_train, y_train)

GradientBoostingRegressor(n_estimators=200, random_state=0)

In [22]:
# make predictions on the testing dataset
y_pred = gb_final.predict(X_test)

In [23]:
# evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [24]:
print("Mean Squared Error: ", mse)
print("Mean Absolute Error: ", mae)
print("R-squared: ", r2)

Mean Squared Error:  0.05888121307041147
Mean Absolute Error:  0.19292732410717287
R-squared:  0.3780940405654334


In [25]:
r2 = r2_score(y_test, y_pred)
r2_percent = r2 * 100
print("R-squared: {:.2f}%".format(r2_percent))


R-squared: 37.81%


In [26]:
# split the data into training and testing sets
X = data[selected_features]
y = data['salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# define the hyperparameter search space
param_grid = {
    'learning_rate': Real(0.01, 0.1, prior='log-uniform'),
    'n_estimators': Integer(100, 500),
    'max_depth': Integer(2, 6),
    'min_samples_split': Integer(2, 10),
    'min_samples_leaf': Integer(1, 10),
    'subsample': Real(0.5, 1.0, prior='uniform'),
    'max_features': Categorical(['sqrt', 'log2', None])
}

In [28]:
# perform hyperparameter search using Bayesian Optimization
gb = GradientBoostingRegressor(random_state=42)
bayes_search = BayesSearchCV(gb, param_grid, cv=5, n_iter=50, random_state=42, verbose=2)
bayes_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
Fitting 5 folds for each o

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
Fitting 5 folds 

[CV] END learning_rate=0.1, max_depth=6, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=347, subsample=0.5338449779171456; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=6, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=347, subsample=0.5338449779171456; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=6, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=347, subsample=0.5338449779171456; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.1, max_depth=6, max_features=sqrt, min_samples_leaf=5, min_samples_split=6, n_estimators=100, subsample=0.794960046314713; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=6, max_features=sqrt, min_samples_leaf=5, min_samples_split=6, n_estimators=100, subsample=0.794960046314713; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=6, max_features=sqrt, min_samples_leaf=5, min_samples_split=6, n_

[CV] END learning_rate=0.1, max_depth=6, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=242, subsample=0.8423093425672756; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.1, max_depth=6, max_features=sqrt, min_samples_leaf=8, min_samples_split=8, n_estimators=298, subsample=0.74631552150492; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=6, max_features=sqrt, min_samples_leaf=8, min_samples_split=8, n_estimators=298, subsample=0.74631552150492; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=6, max_features=sqrt, min_samples_leaf=8, min_samples_split=8, n_estimators=298, subsample=0.74631552150492; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=6, max_features=sqrt, min_samples_leaf=8, min_samples_split=8, n_estimators=298, subsample=0.74631552150492; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=6, max_features=sqrt, min_samples_leaf=8, min_samples_split=8, n_estima

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.019454664774396228, max_depth=6, max_features=sqrt, min_samples_leaf=2, min_samples_split=3, n_estimators=476, subsample=0.6075040699120551; total time=   0.0s
[CV] END learning_rate=0.019454664774396228, max_depth=6, max_features=sqrt, min_samples_leaf=2, min_samples_split=3, n_estimators=476, subsample=0.6075040699120551; total time=   0.0s
[CV] END learning_rate=0.019454664774396228, max_depth=6, max_features=sqrt, min_samples_leaf=2, min_samples_split=3, n_estimators=476, subsample=0.6075040699120551; total time=   0.0s
[CV] END learning_rate=0.019454664774396228, max_depth=6, max_features=sqrt, min_samples_leaf=2, min_samples_split=3, n_estimators=476, subsample=0.6075040699120551; total time=   0.0s
[CV] END learning_rate=0.019454664774396228, max_depth=6, max_features=sqrt, min_samples_leaf=2, min_samples_split=3, n_estimators=476, subsample=0.6075040699120551; total time=   0.0s
Fitting 5 folds 

[CV] END learning_rate=0.07688294617375802, max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=9, n_estimators=498, subsample=0.8429768702756105; total time=   0.0s
[CV] END learning_rate=0.07688294617375802, max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=9, n_estimators=498, subsample=0.8429768702756105; total time=   0.0s
[CV] END learning_rate=0.07688294617375802, max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=9, n_estimators=498, subsample=0.8429768702756105; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.01, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=2, max_features=log2, min_samples_leaf=1, 

BayesSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=42),
              random_state=42,
              search_spaces={'learning_rate': Real(low=0.01, high=0.1, prior='log-uniform', transform='normalize'),
                             'max_depth': Integer(low=2, high=6, prior='uniform', transform='normalize'),
                             'max_features': Categorical(categories=('sqrt', 'log2', None), prior=None),
                             'min_samples_leaf': Integer(low=1, high=10, prior='uniform', transform='normalize'),
                             'min_samples_split': Integer(low=2, high=10, prior='uniform', transform='normalize'),
                             'n_estimators': Integer(low=100, high=500, prior='uniform', transform='normalize'),
                             'subsample': Real(low=0.5, high=1.0, prior='uniform', transform='normalize')},
              verbose=2)

In [29]:
# evaluate the performance of the best model on the testing set
best_model = bayes_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = best_model.score(X_test, y_test) * 100
print("Accuracy: {:.2f}%".format(accuracy))

Accuracy: 42.02%


In [30]:
# print the best hyperparameters
print("Best hyperparameters: ", bayes_search.best_params_)

Best hyperparameters:  OrderedDict([('learning_rate', 0.1), ('max_depth', 6), ('max_features', 'sqrt'), ('min_samples_leaf', 4), ('min_samples_split', 8), ('n_estimators', 323), ('subsample', 0.7625340972035156)])
