Reference: https://www.datacamp.com/tutorial/xgboost-in-python

In [None]:
import pandas as pd
import numpy as np
import pickle

# Models
import xgboost as xgb

#Tuning and Cross Validation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


import warnings
warnings.filterwarnings("ignore")

#### Importing Data

In [None]:
# Read the four pre-split CSV tables (features and targets for the train and
# test partitions) in a single pass, in the same order as they are unpacked.
x_train, x_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../Data Files/Training_Data/x_train.csv',
        '../../Data Files/Training_Data/x_test.csv',
        '../../Data Files/Training_Data/y_train.csv',
        '../../Data Files/Training_Data/y_test.csv',
    )
)

#### Defining XGBoost Architecture

In [9]:
# Wrap each split in XGBoost's DMatrix container, which is what the native
# xgb.train / xgb.cv entry points consume. Both matrices share the same
# construction options so train and test are handled identically.
dmatrix_options = {"enable_categorical": True}

dtrain_reg = xgb.DMatrix(data=x_train, label=y_train, **dmatrix_options)
dtest_reg = xgb.DMatrix(data=x_test, label=y_test, **dmatrix_options)

In [11]:
# Define hyperparameters
#
# NOTE(review): "gpu_hist" is deprecated as of XGBoost 2.0 in favour of
# {"tree_method": "hist", "device": "cuda"}. Left unchanged because the
# installed XGBoost version is not visible from this file -- confirm and
# migrate if running on 2.x.
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}

# XGBoost uses the LAST entry of `evals` as the early-stopping criterion, so
# the held-out set must come last. With the training set last, the monitored
# metric is the training RMSE, which is not a measure of generalisation.
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

# Upper bound on boosting rounds; early stopping may end training sooner.
n = 10000

model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=100,          # log the evaluation metrics every 100 rounds
    early_stopping_rounds=25,  # stop after 25 rounds without validation gain
)

#### Evaluating XGBoost on the Test Set

In [12]:
# Predict on the test matrix using only the trees up to (and including) the
# best early-stopping round. Depending on the XGBoost version, a plain
# predict() may use every tree that was built, including the rounds trained
# after the best score was reached; being explicit is correct on all versions.
preds = model.predict(dtest_reg, iteration_range=(0, model.best_iteration + 1))

In [None]:
# Root-mean-squared error of the test predictions. The `squared=False`
# keyword of mean_squared_error was deprecated in scikit-learn 1.4 and removed
# in 1.6, so take the square root explicitly; this works on every version.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"RMSE of the base model: {rmse:.3f}")

#### Tuning Hyperparameters with Cross-Validation

In [None]:
# cross_val_score clones and re-fits a scikit-learn style estimator on each
# fold. The trained `model` is a native xgb.Booster, which has no fit() and
# cannot be cloned, so build an equivalently configured XGBRegressor instead.
cv_estimator = xgb.XGBRegressor(
    objective="reg:squarederror",
    tree_method="gpu_hist",
    enable_categorical=True,  # same option the DMatrix objects were built with
    # Reuse the number of rounds selected by early stopping for the base model.
    n_estimators=model.best_iteration + 1,
)

# scikit-learn scorers are "greater is better", so MAE comes back negated.
scores = cross_val_score(cv_estimator, x_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print("Mean Absolute Error:", -scores.mean())
print("Standard deviation:", scores.std())

In [20]:
# 5-fold cross-validation through XGBoost's native API, capped at `n` boosting
# rounds with early stopping after 20 rounds without improvement.
params = {
    "objective": "reg:squarederror",
    "tree_method": "gpu_hist",
}
n = 1000

results = xgb.cv(
    params=params,
    dtrain=dtrain_reg,
    nfold=5,
    num_boost_round=n,
    early_stopping_rounds=20,
)

In [None]:
# Pull out the per-round mean test RMSE column of the CV results, then take
# its minimum. The bare name on the last line displays the value in the
# notebook.
test_rmse_by_round = results['test-rmse-mean']
best_rmse = test_rmse_by_round.min()
best_rmse

#### Generating Predictions

#### Saving Model File and Predictions

In [None]:
save_path = '../../Data Files/'

# Persist the trained booster. The context manager guarantees the file handle
# is flushed and closed even if pickling raises.
with open(save_path + 'Model Files/' + 'xgb.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Write the test-set predictions. The original referenced `y_pred_xgb`, which
# is never defined in this notebook (NameError); `preds` is the prediction
# array produced above by model.predict.
np.savetxt(save_path + 'Predictions/' + 'xgboost_output.csv', preds, delimiter=",")