In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint


# Load the full dataset from a CSV file located next to the notebook.
df = pd.read_csv('all_XG_data.csv')
# NOTE(review): a notebook cell only renders its last expression, so this preview is
# computed and then discarded — wrap it in display(...) or move it to its own cell to see it.
df.head()
# Last expression of the cell: rendered as the (rows, columns) tuple.
df.shape

(4339, 73)

In [18]:
# Separate the predictors from the regression target column 'XG'.
variables = df.columns
input_variables = variables[variables != 'XG']  # every column except the target

X = df.loc[:, input_variables]
y = df.loc[:, 'XG']

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base estimator for the search: XGBoost regressor optimising absolute error, with a fixed seed.
xgb = XGBRegressor(objective='reg:absoluteerror', random_state=90)

# Search space handed to RandomizedSearchCV: lists are treated as discrete choices,
# scipy.stats objects are sampled as distributions.
# NOTE(review): 'gblinear' is a linear booster, so the tree-specific settings below
# (max_leaves, max_depth, grow_policy) presumably have no effect for those candidates — confirm.
param_distributions = {
    'booster': ['gblinear', 'dart', 'gbtree'],  # List of discrete values
    'learning_rate': uniform(0.001, 0.1),  # Continuous uniform on [0.001, 0.101]; scipy's uniform takes (loc, scale)
    'max_leaves': randint(20, 200),  # Integer values from 20 to 199 (randint's upper bound is exclusive)
    'n_estimators': randint(50, 400),  # Integer values from 50 to 399 (upper bound exclusive)
    'max_depth': randint(3, 15),  # Integer values from 3 to 14 (upper bound exclusive)
    'grow_policy': ['depthwise', 'lossguide'],  # List of discrete values
}

# Randomized hyperparameter search: 50 sampled candidates x 3 CV folds = 150 fits.
# The scorer is the *negated* MAE (scikit-learn maximises scores), so best_score_ is <= 0.
# fit() returns the fitted search object, so construction and fitting are chained.
random_search = RandomizedSearchCV(
    xgb,
    param_distributions,
    scoring='neg_mean_absolute_error',
    n_iter=50,
    cv=3,
    n_jobs=-1,       # use every available core
    verbose=3,
    random_state=2,  # fixes which candidates get sampled
).fit(X_train, y_train)

print(f"Best Random Search Score: {random_search.best_score_:.5f}")
print(f"Best Random Search Parameters: {random_search.best_params_}")



Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Random Search Score: -0.07259
Best Random Search Parameters: {'booster': 'gbtree', 'grow_policy': 'depthwise', 'learning_rate': np.float64(0.03418737125398759), 'max_depth': 5, 'max_leaves': 77, 'n_estimators': 274}


In [None]:
def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Both inputs are converted to NumPy arrays before subtracting, so values are
    compared strictly position by position. Subtracting two pandas objects
    directly would instead align them on their index labels; with different
    indexes (e.g. a shuffled test split vs. freshly built predictions) that
    yields NaNs which the mean then silently ignores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values (list, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values, in the same order and with the same shape as ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of ``|y_true - y_pred|``. NaNs in either input propagate to the
        result instead of being skipped.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    true_values = np.asarray(y_true, dtype=float)
    predicted_values = np.asarray(y_pred, dtype=float)

    # Refuse mismatched shapes: broadcasting (n,) against (n, 1) would silently
    # produce an n x n matrix and a meaningless average.
    if true_values.shape != predicted_values.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {true_values.shape} and {predicted_values.shape}"
        )

    return np.mean(np.abs(true_values - predicted_values))

# Alternative candidate kept for reference (uncomment to evaluate it instead):
#new_params = {'booster': 'gbtree',
#              'grow_policy': 'lossguide', 
#              'learning_rate': np.float64(0.05185817934340634), 
#              'max_depth': 9, 
#              'max_leaves': 35, 
#              'n_estimators': 283}

# Hyperparameters matching the best combination printed by the randomized search.
new_params = dict(
    booster='gbtree',
    grow_policy='depthwise',
    learning_rate=np.float64(0.03418737125398759),
    max_depth=5,
    max_leaves=77,
    n_estimators=274,
)
# Both parameter sets (the active one and the commented-out one) give a test MAE of about 0.065.

# Refit on the whole training split with the chosen settings, then score the held-out test set.
xgb_best = XGBRegressor(objective='reg:absoluteerror', random_state=30, **new_params)
xgb_best.fit(X_train, y_train)

y_score = xgb_best.predict(X_test)
meow = mae(y_test, y_score)

print(f"Mean Absolute Error (MAE): {meow}")

Mean Absolute Error (MAE): 0.065257855913322
