In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_regression

from xgboost import *

from shaphypetune import BoostSearch, BoostBoruta, BoostRFE

import warnings
warnings.simplefilter('ignore')

In [2]:
X_clf, y_clf = make_classification(n_samples=8000, n_features=20, n_classes=2, 
                                   n_informative=4, n_redundant=6, random_state=0)

X_clf_train, X_clf_valid, y_clf_train, y_clf_valid = train_test_split(
    X_clf, y_clf, test_size=0.3, shuffle=False)

X_regr, y_regr = make_classification(n_samples=8000, n_features=20,
                                     n_informative=7, random_state=0)

X_regr_train, X_regr_valid, y_regr_train, y_regr_valid = train_test_split(
    X_regr, y_regr, test_size=0.3, shuffle=False)

In [3]:
param_grid = {
    'n_estimators': 150,
    'learning_rate': [0.3, 0.2, 0.1],
    'max_depth': [8, 10, 12, 15]
}

param_dist = {
    'n_estimators': 150,
    'learning_rate': stats.uniform(0.09, 0.25),
    'max_bin': stats.randint(20,40),
    'max_depth': [10, 12]
}


regr_xgb = XGBRegressor(random_state=0, verbosity=0, n_jobs=-1)
clf_xgb = XGBClassifier(random_state=0, verbosity=0, use_label_encoder=False, n_jobs=-1)

# Hyperparameters Tuning

In [4]:
### HYPERPARAM TUNING WITH GRID-SEARCH ###

model = BoostSearch(clf_xgb, param_grid=param_grid)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


12 trials detected for ('n_estimators', 'learning_rate', 'max_depth')

trial: 0001 ### iterations: 00029 ### eval_score: 0.25649
trial: 0002 ### iterations: 00031 ### eval_score: 0.25012
trial: 0003 ### iterations: 00027 ### eval_score: 0.24894
trial: 0004 ### iterations: 00022 ### eval_score: 0.25179
trial: 0005 ### iterations: 00121 ### eval_score: 0.24711
trial: 0006 ### iterations: 00102 ### eval_score: 0.24251
trial: 0007 ### iterations: 00057 ### eval_score: 0.24229
trial: 0008 ### iterations: 00057 ### eval_score: 0.24685
trial: 0009 ### iterations: 00069 ### eval_score: 0.24725
trial: 0010 ### iterations: 00035 ### eval_score: 0.24809
trial: 0011 ### iterations: 00045 ### eval_score: 0.24361
trial: 0012 ### iterations: 00024 ### eval_score: 0.24584


<shaphypetune.BoostSearch>

In [5]:
model.estimator_, model.best_params_, model.best_score_

(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
               importance_type='gain', interaction_constraints='',
               learning_rate=0.1, max_delta_step=0, max_depth=12,
               min_child_weight=1, missing=nan, monotone_constraints='()',
               n_estimators=150, n_jobs=-1, num_parallel_tree=1, random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
               tree_method='exact', use_label_encoder=False,
               validate_parameters=1, verbosity=0),
 {'n_estimators': 150, 'learning_rate': 0.1, 'max_depth': 12},
 0.24229)

In [6]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.predict(X_clf_valid, method='predict_proba').shape)

(0.9083333333333333, (2400,), (2400, 2))

In [7]:
### HYPERPARAM TUNING WITH RANDOM-SEARCH ###

model = BoostSearch(regr_xgb, param_grid=param_dist,
                    n_iter=10, sampling_seed=0)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)


10 trials detected for ('n_estimators', 'learning_rate', 'max_bin', 'max_depth')

trial: 0001 ### iterations: 00046 ### eval_score: 0.24405
trial: 0002 ### iterations: 00018 ### eval_score: 0.24631
trial: 0003 ### iterations: 00031 ### eval_score: 0.25918
trial: 0004 ### iterations: 00024 ### eval_score: 0.25991
trial: 0005 ### iterations: 00037 ### eval_score: 0.24498
trial: 0006 ### iterations: 00043 ### eval_score: 0.24373
trial: 0007 ### iterations: 00031 ### eval_score: 0.25837
trial: 0008 ### iterations: 00081 ### eval_score: 0.25303
trial: 0009 ### iterations: 00016 ### eval_score: 0.24672
trial: 0010 ### iterations: 00044 ### eval_score: 0.25405


<shaphypetune.BoostSearch>

In [8]:
model.estimator_, model.best_params_, model.best_score_

(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.14065173840722262, max_bin=25, max_delta_step=0,
              max_depth=10, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=150, n_jobs=-1,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=0),
 {'n_estimators': 150,
  'learning_rate': 0.14065173840722262,
  'max_bin': 25,
  'max_depth': 10},
 0.243731)

In [9]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.predict(X_regr_valid, method='apply').shape)

(0.7622770212466065, (2400,), (2400, 50))

# Features Selection

In [10]:
### BORUTA ###

model = BoostBoruta(clf_xgb, max_iter=200, perc=100)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)

<shaphypetune.BoostBoruta>

In [11]:
model.estimator_, model.n_features_

(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
               importance_type='gain', interaction_constraints='',
               learning_rate=0.300000012, max_delta_step=0, max_depth=6,
               min_child_weight=1, missing=nan, monotone_constraints='()',
               n_estimators=100, n_jobs=-1, num_parallel_tree=1, random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
               tree_method='exact', use_label_encoder=False,
               validate_parameters=1, verbosity=0), 10)

In [12]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict(X_clf_valid, method='predict_proba').shape)

(0.905, (2400,), (2400, 10), (2400, 2))

In [13]:
### RECURSIVE FEATURE ELIMINATION (RFE) ###

model = BoostRFE(regr_xgb, min_features_to_select=1, step=1)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)

<shaphypetune.BoostRFE>

In [14]:
model.estimator_, model.n_features_

(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=0), 11)

In [15]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, method='apply').shape)

(0.7690769632066736, (2400,), (2400, 11), (2400, 85))

# Features Selection with SHAP

In [16]:
### BORUTA SHAP ###

model = BoostBoruta(clf_xgb, max_iter=200, perc=100,
                    importance_type='shap_importances', train_importance=False)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)

<shaphypetune.BoostBoruta>

In [17]:
model.estimator_, model.n_features_

(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
               importance_type='gain', interaction_constraints='',
               learning_rate=0.300000012, max_delta_step=0, max_depth=6,
               min_child_weight=1, missing=nan, monotone_constraints='()',
               n_estimators=100, n_jobs=-1, num_parallel_tree=1, random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
               tree_method='exact', use_label_encoder=False,
               validate_parameters=1, verbosity=0), 11)

In [18]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict(X_clf_valid, method='predict_proba').shape)

(0.905, (2400,), (2400, 11), (2400, 2))

In [19]:
### RECURSIVE FEATURE ELIMINATION (RFE) SHAP ###

model = BoostRFE(regr_xgb, min_features_to_select=1, step=1,
                 importance_type='shap_importances', train_importance=False)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)

<shaphypetune.BoostRFE>

In [20]:
model.estimator_, model.n_features_

(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=0), 8)

In [21]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, method='apply').shape)

(0.7689351426525531, (2400,), (2400, 8), (2400, 58))

# Hyperparameters Tuning + Features Selection

In [22]:
### HYPERPARAM TUNING WITH GRID-SEARCH + BORUTA ###

model = BoostBoruta(clf_xgb, param_grid=param_grid, max_iter=200, perc=100)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


12 trials detected for ('n_estimators', 'learning_rate', 'max_depth')

trial: 0001 ### iterations: 00034 ### eval_score: 0.24945
trial: 0002 ### iterations: 00025 ### eval_score: 0.23904
trial: 0003 ### iterations: 00020 ### eval_score: 0.23868
trial: 0004 ### iterations: 00024 ### eval_score: 0.24345
trial: 0005 ### iterations: 00107 ### eval_score: 0.24011
trial: 0006 ### iterations: 00097 ### eval_score: 0.23423
trial: 0007 ### iterations: 00053 ### eval_score: 0.24083
trial: 0008 ### iterations: 00058 ### eval_score: 0.23582
trial: 0009 ### iterations: 00058 ### eval_score: 0.23879
trial: 0010 ### iterations: 00029 ### eval_score: 0.23693
trial: 0011 ### iterations: 00044 ### eval_score: 0.23354
trial: 0012 ### iterations: 00028 ### eval_score: 0.23796


<shaphypetune.BoostBoruta>

In [23]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
               importance_type='gain', interaction_constraints='',
               learning_rate=0.2, max_delta_step=0, max_depth=12,
               min_child_weight=1, missing=nan, monotone_constraints='()',
               n_estimators=150, n_jobs=-1, num_parallel_tree=1, random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
               tree_method='exact', use_label_encoder=False,
               validate_parameters=1, verbosity=0),
 {'n_estimators': 150, 'learning_rate': 0.2, 'max_depth': 12},
 0.233541,
 10)

In [24]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict(X_clf_valid, method='predict_proba').shape)

(0.9108333333333334, (2400,), (2400, 10), (2400, 2))

In [25]:
### HYPERPARAM TUNING WITH RANDOM-SEARCH + RECURSIVE FEATURE ELIMINATION (RFE) ###

model = BoostRFE(regr_xgb, param_grid=param_dist, min_features_to_select=1, step=1,
                 n_iter=10, sampling_seed=0)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)


10 trials detected for ('n_estimators', 'learning_rate', 'max_bin', 'max_depth')

trial: 0001 ### iterations: 00107 ### eval_score: 0.23461
trial: 0002 ### iterations: 00022 ### eval_score: 0.23708
trial: 0003 ### iterations: 00041 ### eval_score: 0.24385
trial: 0004 ### iterations: 00028 ### eval_score: 0.24136
trial: 0005 ### iterations: 00064 ### eval_score: 0.23088
trial: 0006 ### iterations: 00045 ### eval_score: 0.23477
trial: 0007 ### iterations: 00019 ### eval_score: 0.24095
trial: 0008 ### iterations: 00092 ### eval_score: 0.24043
trial: 0009 ### iterations: 00022 ### eval_score: 0.23819
trial: 0010 ### iterations: 00039 ### eval_score: 0.23913


<shaphypetune.BoostRFE>

In [26]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2284852556251466, max_bin=31, max_delta_step=0,
              max_depth=10, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=150, n_jobs=-1,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=0),
 {'n_estimators': 150,
  'learning_rate': 0.2284852556251466,
  'max_bin': 31,
  'max_depth': 10},
 0.230878,
 8)

In [27]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, method='apply').shape)

(0.7866884875017205, (2400,), (2400, 8), (2400, 71))

# Hyperparameters Tuning + Features Selection with SHAP

In [28]:
### HYPERPARAM TUNING WITH GRID-SEARCH + BORUTA SHAP ###

model = BoostBoruta(clf_xgb, param_grid=param_grid, max_iter=200, perc=100,
                    importance_type='shap_importances', train_importance=False)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


12 trials detected for ('n_estimators', 'learning_rate', 'max_depth')

trial: 0001 ### iterations: 00034 ### eval_score: 0.24945
trial: 0002 ### iterations: 00025 ### eval_score: 0.23904
trial: 0003 ### iterations: 00022 ### eval_score: 0.24009
trial: 0004 ### iterations: 00016 ### eval_score: 0.23766
trial: 0005 ### iterations: 00107 ### eval_score: 0.24011
trial: 0006 ### iterations: 00079 ### eval_score: 0.23453
trial: 0007 ### iterations: 00053 ### eval_score: 0.24083
trial: 0008 ### iterations: 00056 ### eval_score: 0.23722
trial: 0009 ### iterations: 00058 ### eval_score: 0.23879
trial: 0010 ### iterations: 00045 ### eval_score: 0.24069
trial: 0011 ### iterations: 00028 ### eval_score: 0.23976
trial: 0012 ### iterations: 00028 ### eval_score: 0.23796


<shaphypetune.BoostBoruta>

In [29]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
               importance_type='gain', interaction_constraints='',
               learning_rate=0.1, max_delta_step=0, max_depth=10,
               min_child_weight=1, missing=nan, monotone_constraints='()',
               n_estimators=150, n_jobs=-1, num_parallel_tree=1, random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
               tree_method='exact', use_label_encoder=False,
               validate_parameters=1, verbosity=0),
 {'n_estimators': 150, 'learning_rate': 0.1, 'max_depth': 10},
 0.234531,
 11)

In [30]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict(X_clf_valid, method='predict_proba').shape)

(0.9125, (2400,), (2400, 11), (2400, 2))

In [31]:
### HYPERPARAM TUNING WITH RANDOM-SEARCH + RECURSIVE FEATURE ELIMINATION (RFE) SHAP ###

model = BoostRFE(regr_xgb, param_grid=param_dist, min_features_to_select=1, step=1,
                 n_iter=10, sampling_seed=0,
                 importance_type='shap_importances', train_importance=False)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)


10 trials detected for ('n_estimators', 'learning_rate', 'max_bin', 'max_depth')

trial: 0001 ### iterations: 00107 ### eval_score: 0.23461
trial: 0002 ### iterations: 00022 ### eval_score: 0.23708
trial: 0003 ### iterations: 00062 ### eval_score: 0.24438
trial: 0004 ### iterations: 00028 ### eval_score: 0.24136
trial: 0005 ### iterations: 00064 ### eval_score: 0.23088
trial: 0006 ### iterations: 00045 ### eval_score: 0.23477
trial: 0007 ### iterations: 00019 ### eval_score: 0.24095
trial: 0008 ### iterations: 00083 ### eval_score: 0.24098
trial: 0009 ### iterations: 00018 ### eval_score: 0.23776
trial: 0010 ### iterations: 00055 ### eval_score: 0.24127


<shaphypetune.BoostRFE>

In [32]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2284852556251466, max_bin=31, max_delta_step=0,
              max_depth=10, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=150, n_jobs=-1,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=0),
 {'n_estimators': 150,
  'learning_rate': 0.2284852556251466,
  'max_bin': 31,
  'max_depth': 10},
 0.230878,
 8)

In [33]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, method='apply').shape)

(0.7866884875017205, (2400,), (2400, 8), (2400, 71))

# CUSTOM EVAL METRIC SUPPORT

In [34]:
from sklearn.metrics import roc_auc_score

def AUC(y_hat, dtrain):
    y_true = dtrain.get_label()
    return 'auc', roc_auc_score(y_true, y_hat)

In [35]:
model = BoostRFE(clf_xgb, 
                 param_grid=param_grid, min_features_to_select=1, step=1,
                 greater_is_better=True)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0,
          eval_metric=AUC)


12 trials detected for ('n_estimators', 'learning_rate', 'max_depth')

trial: 0001 ### iterations: 00046 ### eval_score: 0.96727
trial: 0002 ### iterations: 00033 ### eval_score: 0.9668
trial: 0003 ### iterations: 00022 ### eval_score: 0.96586
trial: 0004 ### iterations: 00049 ### eval_score: 0.96637
trial: 0005 ### iterations: 00118 ### eval_score: 0.96669
trial: 0006 ### iterations: 00105 ### eval_score: 0.96753
trial: 0007 ### iterations: 00091 ### eval_score: 0.96737
trial: 0008 ### iterations: 00101 ### eval_score: 0.96755
trial: 0009 ### iterations: 00060 ### eval_score: 0.96673
trial: 0010 ### iterations: 00045 ### eval_score: 0.96641
trial: 0011 ### iterations: 00051 ### eval_score: 0.96716
trial: 0012 ### iterations: 00070 ### eval_score: 0.96683


<shaphypetune.BoostRFE>