In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
import shap

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
df = pd.read_csv("./data/model_data.csv")
df = df.drop(columns=["ID", "WS"])
df_xgb = df.dropna()
y_col = "salary_perc"

In [3]:
model = GradientBoostingRegressor()

In [4]:
param_dist = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': np.linspace(0.1, 1.0, 10),
    'min_samples_leaf': np.linspace(0.1, 0.5, 5),
    'subsample': [0.1, 0.25, 0.5, 0.6, 0.75, 0.8, 0.9, 1.0],
    'max_features': [0.5, 0.75, 1, 'sqrt', 'log2', None],
    'loss': ['huber', 'quantile', 'squared_error', 'absolute_error']
}

In [5]:
random_search = RandomizedSearchCV(model, param_dist, cv=5, verbose=2, n_jobs=-1, scoring="neg_mean_absolute_error")

In [6]:
random_search.fit(df_xgb.drop(columns=[y_col]), df_xgb[y_col])

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [7]:
results_random = pd.DataFrame(random_search.cv_results_)
results_random.sort_values(by="rank_test_score", inplace=True)
results_random

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,...,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,4.024167,0.116834,0.014973,0.004221,0.9,300,0.3,0.1,0.75,7,...,0.5,"{'subsample': 0.9, 'n_estimators': 300, 'min_s...",-0.038383,-0.031433,-0.034082,-0.034168,-0.03277,-0.034167,0.002333,1
7,1.010898,0.048819,0.015768,0.008159,0.9,100,0.5,0.3,sqrt,9,...,1.0,"{'subsample': 0.9, 'n_estimators': 100, 'min_s...",-0.04324,-0.035641,-0.037077,-0.038702,-0.036469,-0.038226,0.0027,2
0,3.238413,0.102191,0.016556,0.005104,0.75,500,0.7,0.3,1,7,...,0.01,"{'subsample': 0.75, 'n_estimators': 500, 'min_...",-0.046467,-0.036518,-0.039898,-0.040437,-0.038484,-0.040361,0.003341,3
3,3.556001,0.077513,0.008375,0.000485,0.9,300,0.4,0.4,,10,...,1.0,"{'subsample': 0.9, 'n_estimators': 300, 'min_s...",-0.044994,-0.038858,-0.039831,-0.041546,-0.0392,-0.040885,0.002253,4
6,0.294619,0.028409,0.005579,0.000487,0.8,50,0.2,0.4,1,8,...,0.01,"{'subsample': 0.8, 'n_estimators': 50, 'min_sa...",-0.068423,-0.050947,-0.055344,-0.054503,-0.054039,-0.056652,0.006071,5
4,0.856894,0.044647,0.009781,0.002313,0.25,200,0.2,0.2,sqrt,3,...,0.05,"{'subsample': 0.25, 'n_estimators': 200, 'min_...",-0.073577,-0.054305,-0.058551,-0.060104,-0.058695,-0.061046,0.006559,6
9,1.556275,0.248563,0.007282,0.003448,0.5,200,0.7,0.1,0.75,7,...,0.05,"{'subsample': 0.5, 'n_estimators': 200, 'min_s...",-0.06777,-0.059242,-0.061417,-0.063816,-0.064556,-0.06336,0.002892,7
1,3.386148,0.06143,0.013765,0.011587,0.75,500,1.0,0.3,sqrt,8,...,1.0,"{'subsample': 0.75, 'n_estimators': 500, 'min_...",-0.067773,-0.059304,-0.061556,-0.064055,-0.064473,-0.063432,0.002859,8
8,2.811169,0.159089,0.009383,0.001854,1.0,300,0.4,0.3,log2,4,...,0.1,"{'subsample': 1.0, 'n_estimators': 300, 'min_s...",-0.075055,-0.065174,-0.064,-0.064369,-0.064136,-0.066547,0.004273,9
5,2.636155,0.139077,0.009575,0.001854,0.5,500,0.7,0.3,1,10,...,0.5,"{'subsample': 0.5, 'n_estimators': 500, 'min_s...",-0.116046,-0.151737,-0.146319,-0.140215,-0.14713,-0.140289,0.012664,10


In [8]:
print(f"Best MAE from random search: {random_search.best_score_}")
print(f"Best params from random search: {random_search.best_params_}")

Best MAE from random search: -0.03416732072557566
Best params from random search: {'subsample': 0.9, 'n_estimators': 300, 'min_samples_split': 0.30000000000000004, 'min_samples_leaf': 0.1, 'max_features': 0.75, 'max_depth': 7, 'loss': 'squared_error', 'learning_rate': 0.5}


In [9]:
grid_search = GridSearchCV(model, param_dist, cv=5, verbose=2, n_jobs=-1, scoring="neg_mean_absolute_error")

In [12]:
grid_search.fit(df_xgb.drop(columns=[y_col]), df_xgb[y_col])

Fitting 5 folds for each of 2880000 candidates, totalling 14400000 fits


In [None]:
results_grid = pd.DataFrame(grid_search.cv_results_)
results_grid.sort_values(by="rank_test_score", inplace=True)
results_grid

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [None]:
print(f"Best MAE from grid search: {grid_search.best_score_}")
print(f"Best params from grid search: {grid_search.best_params_}")

In [None]:
df_shap = df.dropna()
y = df_shap["salary_perc"]
X = df_shap.drop(columns="salary_perc")

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
xgb_model = GradientBoostingRegressor()

In [None]:
rf_model.fit(X_train, y_train)

In [None]:
# Feature Importance from XGBoost
importances = xgb_model.feature_importances_
indices = sorted(range(len(importances)), key=lambda i: importances[i], reverse=True)
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print(f"{X_train.columns[indices[f]]}: {importances[indices[f]]}")