In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
import joblib


In [2]:
# load the data from a local CSV file
data = pd.read_csv("C:/Users/user/Documents/GitHub/R5D5-Project/model/ML_Model_Dataset_01/complete_data_model_nba2k_main_data_sheet.csv")

In [3]:
data.head()

Unnamed: 0,rating,height,weight,salary,draft_year,draft_round,draft_peak,TWITTER_FAVORITE_COUNT,TWITTER_RETWEET_COUNT,TWITTER_FOLLOWER_COUNT_MILLIONS,...,team_salary,position_C,position_C-F,position_F,position_F-C,position_F-G,position_G,position_G-F,country_USA,country_not USA
0,81,2.03,99.8,0.433265,2014,1,4,42.5,16.0,2.850071,...,4,0,0,1,0,0,0,0,1,0
1,73,1.83,83.9,0.047871,2018,1,23,583.257282,198.997573,2.850071,...,2,0,0,0,0,0,1,0,1,0
2,70,1.96,102.1,0.034299,2016,2,58,583.257282,198.997573,2.850071,...,6,0,0,1,0,0,0,0,0,1
3,71,1.96,109.3,0.020774,2019,2,42,583.257282,198.997573,2.850071,...,6,0,0,1,0,0,0,0,1,0
4,85,2.06,111.6,0.611183,2007,1,3,136.0,71.0,0.253,...,6,0,1,0,0,0,0,0,0,1


In [4]:
# print the column names
print(data.columns)

Index(['rating', 'height', 'weight', 'salary', 'draft_year', 'draft_round',
       'draft_peak', 'TWITTER_FAVORITE_COUNT', 'TWITTER_RETWEET_COUNT',
       'TWITTER_FOLLOWER_COUNT_MILLIONS', 'Experience', 'Age', 'bmi',
       'team_salary', 'position_C', 'position_C-F', 'position_F',
       'position_F-C', 'position_F-G', 'position_G', 'position_G-F',
       'country_USA', 'country_not USA'],
      dtype='object')


In [5]:
#Placing selected features which has >90% contribution from gb feature selection
selected_features = ['rating','draft_year','TWITTER_FOLLOWER_COUNT_MILLIONS','Experience','Age']
target_column = 'salary'


In [6]:
# split the data into training and testing sets
X = data[selected_features]
y = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# train a gradient boosting regressor on the training set
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)


GradientBoostingRegressor()

In [8]:
# make predictions on the testing set
y_pred = gb.predict(X_test)


In [9]:
accuracy = gb.score(X_test, y_test) * 100
print("Accuracy: {:.2f}%".format(accuracy))


Accuracy: 77.26%


In [10]:
# split the data into training and testing sets
X = data[selected_features]
y = data['salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# create a Gradient Boosting Regressor model
gb = GradientBoostingRegressor(random_state=0)

In [12]:
# define the hyperparameters to tune
hyperparameters = {'n_estimators': [100, 200, 300],
                   'max_depth': [3, 4, 5],
                   'learning_rate': [0.1, 0.05, 0.01]}

In [13]:
# perform a grid search to find the best combination of hyperparameters
grid_search = GridSearchCV(gb, hyperparameters, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': [0.1, 0.05, 0.01],
                         'max_depth': [3, 4, 5],
                         'n_estimators': [100, 200, 300]},
             scoring='neg_mean_squared_error')

In [14]:
# print the best hyperparameters and mean squared error
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best MSE: ", -grid_search.best_score_)

Best Hyperparameters:  {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}
Best MSE:  0.011511420186510381


In [15]:
# create a new Gradient Boosting Regressor model with the best hyperparameters found
gb_final = GradientBoostingRegressor(n_estimators=300, max_depth=3, learning_rate=0.01, random_state=0)


In [16]:
# train the model on the entire training dataset
gb_final.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.01, n_estimators=300, random_state=0)

In [17]:
# make predictions on the testing dataset
y_pred = gb_final.predict(X_test)

In [18]:
# evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [19]:
print("Mean Squared Error: ", mse)
print("Mean Absolute Error: ", mae)
print("R-squared: ", r2)

Mean Squared Error:  0.013017629853768012
Mean Absolute Error:  0.07272390012475434
R-squared:  0.7802066855155128


In [20]:
r2 = r2_score(y_test, y_pred)
r2_percent = r2 * 100
print("R-squared: {:.2f}%".format(r2_percent))


R-squared: 78.02%


In [21]:
# split the data into training and testing sets
X = data[selected_features]
y = data['salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# define the hyperparameter search space
param_grid = {
    'learning_rate': Real(0.01, 0.1, prior='log-uniform'),
    'n_estimators': Integer(100, 500),
    'max_depth': Integer(2, 6),
    'min_samples_split': Integer(2, 10),
    'min_samples_leaf': Integer(1, 10),
    'subsample': Real(0.5, 1.0, prior='uniform'),
    'max_features': Categorical(['sqrt', 'log2', None])
}

In [24]:
# perform hyperparameter search using Bayesian Optimization
gb = GradientBoostingRegressor(random_state=42)
bayes_search = BayesSearchCV(gb, param_grid, cv=5, n_iter=50, random_state=42, verbose=2)
bayes_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
[CV] END learning_rate=0.02571011142608906, max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, n_estimators=266, subsample=0.675465667449572; total time=   0.0s
Fitting 5 folds for each o

[CV] END learning_rate=0.09025639567934764, max_depth=5, max_features=None, min_samples_leaf=5, min_samples_split=5, n_estimators=185, subsample=0.5000305253763367; total time=   0.0s
[CV] END learning_rate=0.09025639567934764, max_depth=5, max_features=None, min_samples_leaf=5, min_samples_split=5, n_estimators=185, subsample=0.5000305253763367; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
[CV] END learning_rate=0.010083941261235433, max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=7, n_estimators=210, subsample=0.6564196531173141; total time=   0.0s
[CV] END learning_

[CV] END learning_rate=0.01, max_depth=6, max_features=sqrt, min_samples_leaf=7, min_samples_split=4, n_estimators=393, subsample=0.530078813407623; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.01, max_depth=6, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=264, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=6, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=264, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=6, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=264, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=6, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=264, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=6, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=264, subsample=0.5; total time=   0.0

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.01, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=296, subsample=0.5673785292573924; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=296, subsample=0.5673785292573924; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=296, subsample=0.5673785292573924; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=296, subsample=0.5673785292573924; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=296, subsample=0.5673785292573924; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.014477860828

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.03914709745786944, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=253, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.03914709745786944, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=253, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.03914709745786944, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=253, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.03914709745786944, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=253, subsample=0.5; total time=   0.0s
[CV] END learning_rate=0.03914709745786944, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=253, subsample=0.5; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.024125005119

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.1, max_depth=2, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=2, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=2, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=2, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=2, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=100, subsample=1.0; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.019632151412067315, max_depth=2, max_features=log2, min_samples_leaf=1, min_sample

BayesSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=42),
              random_state=42,
              search_spaces={'learning_rate': Real(low=0.01, high=0.1, prior='log-uniform', transform='normalize'),
                             'max_depth': Integer(low=2, high=6, prior='uniform', transform='normalize'),
                             'max_features': Categorical(categories=('sqrt', 'log2', None), prior=None),
                             'min_samples_leaf': Integer(low=1, high=10, prior='uniform', transform='normalize'),
                             'min_samples_split': Integer(low=2, high=10, prior='uniform', transform='normalize'),
                             'n_estimators': Integer(low=100, high=500, prior='uniform', transform='normalize'),
                             'subsample': Real(low=0.5, high=1.0, prior='uniform', transform='normalize')},
              verbose=2)

In [25]:
# evaluate the performance of the best model on the testing set
best_model = bayes_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = best_model.score(X_test, y_test) * 100
print("Accuracy: {:.2f}%".format(accuracy))

Accuracy: 80.15%


In [26]:
# print the best hyperparameters
print("Best hyperparameters: ", bayes_search.best_params_)

Best hyperparameters:  OrderedDict([('learning_rate', 0.01), ('max_depth', 2), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 417), ('subsample', 1.0)])


In [2]:
# save the model to a file
joblib.dump(best_model, 'model.joblib')

NameError: name 'best_model' is not defined