Model Reference: https://www.datacamp.com/tutorial/xgboost-in-python and https://www.datatechnotes.com/2019/06/regression-example-with-xgbregressor-in.html


Tuning Reference: https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning

In [1]:
import pandas as pd
import numpy as np
import pickle

# Models
import xgboost as xgb

#Tuning and Cross Validation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

import warnings
warnings.filterwarnings("ignore")

#### Importing Data

In [2]:
x_train = pd.read_csv('../../Data Files/Training Data/x_train.csv')
x_test = pd.read_csv('../../Data Files/Training Data/x_test.csv')
y_train = pd.read_csv('../../Data Files/Training Data/y_train.csv')
y_test = pd.read_csv('../../Data Files/Training Data/y_test.csv')

#### Defining XGBoost Architecture

In [3]:
xgbr = xgb.XGBRegressor(verbosity=0) 

print(xgbr)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)


#### Training XGBoost

In [4]:
xgbr.fit(x_train, y_train)

In [5]:
y_pred = xgbr.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print("mean_squared_error = {:.3}".format(mse)) 

mean_squared_error = 2.22


In [6]:
score = xgbr.score(x_train, y_train)  

print("Training score: ", score)

Training score:  0.5268115671434962


In [7]:
#scores = cross_val_score(xgbr, x_train, y_train, cv=5)
#print("Mean cross-validation score: %.2f" % scores.mean())

#### Tuning for Hyperparameters

In [8]:
xgb_tuning = xgb.XGBRegressor(n_estimators=100, seed=0)

In [9]:
parameters = {'max_depth': [3, 18, 1],
              'gamma': [1,9],
              'reg_alpha' : [40,180,],
              'reg_lambda' : [0,1],
              'colsample_bytree' : [0.5,1],
              'min_child_weight' : [0, 10, 1],
             }

estimator = GridSearchCV(xgb_tuning, parameters, cv=3)

estimator.fit(x_train, y_train)

In [10]:
#trust your CV!
best_parameters = estimator.best_params_
#best_parameters, score, _ = max(estimator.grid_scores_, key=lambda x: x[1])

#print(best_parameters)

#### Generating Predictions

In [11]:
#best_hyperparams["max_depth"] = int(best_hyperparams["max_depth"])

In [12]:
tuned_xgboost = xgb.XGBRegressor(**best_parameters)
tuned_xgboost.fit(x_train, y_train)
y_pred_xgb = tuned_xgboost.predict(x_test)

score = xgbr.score(x_train, y_train)
print("Training score: ", score)

Training score:  0.5268115671434962


In [13]:
scores = cross_val_score(tuned_xgboost, x_train, y_train,cv=5)
print("Mean cross-validation score:                                                                                                                                        %.2f" % scores.mean())

Mean cross-validation score: 0.39


In [16]:
mse_after_tuning = mean_squared_error(y_test, y_pred)
print("mean_squared_error = {:.3}".format(mse_after_tuning)) 
mae_after_tuning = mean_absolute_error(y_test, y_pred)
print("mean_absolute_error = {:.3}".format(mae_after_tuning)) 

mean_squared_error = 2.22


#### Saving Model File and Predictions

In [15]:
save_path = '../../Data Files/'
pickle.dump(tuned_xgboost, open(save_path + 'Model Files/' + 'xgb.pkl', 'wb'))
np.savetxt(save_path + 'Predictions/' + 'xgboost_output.csv', y_pred_xgb, delimiter=",")