In [2]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_regression

from lightgbm import *

from shaphypetune import BoostSearch, BoostBoruta, BoostRFE

import warnings
warnings.simplefilter('ignore')

In [3]:
X_clf, y_clf = make_classification(n_samples=8000, n_features=20, n_classes=2, 
                                   n_informative=4, n_redundant=6, random_state=0)

X_clf_train, X_clf_valid, y_clf_train, y_clf_valid = train_test_split(
    X_clf, y_clf, test_size=0.3, shuffle=False)

X_regr, y_regr = make_classification(n_samples=8000, n_features=20,
                                     n_informative=7, random_state=0)

X_regr_train, X_regr_valid, y_regr_train, y_regr_valid = train_test_split(
    X_regr, y_regr, test_size=0.3, shuffle=False)

In [4]:
param_grid = {
    'n_estimators': 150,
    'learning_rate': [0.2, 0.1],
    'num_leaves': [25, 30, 35],
    'max_depth': [10, 12]
}

param_dist = {
    'n_estimators': 150,
    'learning_rate': stats.uniform(0.09, 0.25),
    'num_leaves': stats.randint(20,40),
    'max_depth': [10, 12]
}


regr_lgbm = LGBMRegressor(random_state=0, n_jobs=-1)
clf_lgbm = LGBMClassifier(random_state=0, n_jobs=-1)

# Hyperparameters Tuning

In [5]:
### HYPERPARAM TUNING WITH GRID-SEARCH ###

model = BoostSearch(clf_lgbm, param_grid=param_grid)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


12 trials detected for ('n_estimators', 'learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00047 ### eval_score: 0.25942
trial: 0002 ### iterations: 00029 ### eval_score: 0.26879
trial: 0003 ### iterations: 00048 ### eval_score: 0.25027
trial: 0004 ### iterations: 00035 ### eval_score: 0.26033
trial: 0005 ### iterations: 00069 ### eval_score: 0.2497
trial: 0006 ### iterations: 00044 ### eval_score: 0.25268
trial: 0007 ### iterations: 00111 ### eval_score: 0.25511
trial: 0008 ### iterations: 00107 ### eval_score: 0.25491
trial: 0009 ### iterations: 00093 ### eval_score: 0.24845
trial: 0010 ### iterations: 00107 ### eval_score: 0.24726
trial: 0011 ### iterations: 00093 ### eval_score: 0.25091
trial: 0012 ### iterations: 00082 ### eval_score: 0.25287


<shaphypetune.BoostSearch>

In [6]:
model.estimator_, model.best_params_, model.best_score_

(LGBMClassifier(max_depth=12, n_estimators=150, num_leaves=35, random_state=0),
 {'n_estimators': 150,
  'learning_rate': 0.1,
  'num_leaves': 35,
  'max_depth': 12},
 0.24725564683166487)

In [7]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.predict(X_clf_valid, method='predict_proba').shape)

(0.90875, (2400,), (2400, 2))

In [8]:
### HYPERPARAM TUNING WITH RANDOM-SEARCH ###

model = BoostSearch(regr_lgbm, param_grid=param_dist,
                    n_iter=10, sampling_seed=0)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)


10 trials detected for ('n_estimators', 'learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00056 ### eval_score: 0.0544
trial: 0002 ### iterations: 00043 ### eval_score: 0.05828
trial: 0003 ### iterations: 00038 ### eval_score: 0.06032
trial: 0004 ### iterations: 00025 ### eval_score: 0.0604
trial: 0005 ### iterations: 00017 ### eval_score: 0.06193
trial: 0006 ### iterations: 00139 ### eval_score: 0.05427
trial: 0007 ### iterations: 00040 ### eval_score: 0.06256
trial: 0008 ### iterations: 00148 ### eval_score: 0.0553
trial: 0009 ### iterations: 00053 ### eval_score: 0.06231
trial: 0010 ### iterations: 00042 ### eval_score: 0.05609


<shaphypetune.BoostSearch>

In [9]:
model.estimator_, model.best_params_, model.best_score_

(LGBMRegressor(learning_rate=0.14065173840722262, max_depth=10, n_estimators=150,
               num_leaves=25, random_state=0),
 {'n_estimators': 150,
  'learning_rate': 0.14065173840722262,
  'num_leaves': 25,
  'max_depth': 10},
 0.05426763799282632)

In [10]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.7828351924698709, (2400,), (2400, 21))

# Features Selection

In [11]:
### BORUTA ###

model = BoostBoruta(clf_lgbm, max_iter=200, perc=100)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)

<shaphypetune.BoostBoruta>

In [12]:
model.estimator_, model.n_features_

(LGBMClassifier(random_state=0), 9)

In [13]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict(X_clf_valid, method='predict_proba').shape)

(0.9108333333333334, (2400,), (2400, 9), (2400, 2))

In [14]:
### RECURSIVE FEATURE ELIMINATION (RFE) ###

model = BoostRFE(regr_lgbm, min_features_to_select=1, step=1)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)

<shaphypetune.BoostRFE>

In [15]:
model.estimator_, model.n_features_

(LGBMRegressor(random_state=0), 8)

In [16]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.8100440630138467, (2400,), (2400, 8), (2400, 9))

# Features Selection with SHAP

In [17]:
### BORUTA SHAP ###

model = BoostBoruta(clf_lgbm, max_iter=200, perc=100,
                    importance_type='shap_importances', train_importance=False)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)

<shaphypetune.BoostBoruta>

In [18]:
model.estimator_, model.n_features_

(LGBMClassifier(random_state=0), 11)

In [19]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict(X_clf_valid, method='predict_proba').shape)

(0.91125, (2400,), (2400, 11), (2400, 2))

In [20]:
### RECURSIVE FEATURE ELIMINATION (RFE) SHAP ###

model = BoostRFE(regr_lgbm, min_features_to_select=1, step=1,
                 importance_type='shap_importances', train_importance=False)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)

<shaphypetune.BoostRFE>

In [21]:
model.estimator_, model.n_features_

(LGBMRegressor(random_state=0), 9)

In [22]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.8089993072642051, (2400,), (2400, 9), (2400, 10))

# Hyperparameters Tuning + Features Selection

In [23]:
### HYPERPARAM TUNING WITH GRID-SEARCH + BORUTA ###

model = BoostBoruta(clf_lgbm, param_grid=param_grid, max_iter=200, perc=100)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


12 trials detected for ('n_estimators', 'learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00066 ### eval_score: 0.23684
trial: 0002 ### iterations: 00052 ### eval_score: 0.23871
trial: 0003 ### iterations: 00049 ### eval_score: 0.24086
trial: 0004 ### iterations: 00048 ### eval_score: 0.2382
trial: 0005 ### iterations: 00049 ### eval_score: 0.23586
trial: 0006 ### iterations: 00053 ### eval_score: 0.23598
trial: 0007 ### iterations: 00126 ### eval_score: 0.23237
trial: 0008 ### iterations: 00126 ### eval_score: 0.23805
trial: 0009 ### iterations: 00101 ### eval_score: 0.23554
trial: 0010 ### iterations: 00100 ### eval_score: 0.23165
trial: 0011 ### iterations: 00084 ### eval_score: 0.23291
trial: 0012 ### iterations: 00087 ### eval_score: 0.23614


<shaphypetune.BoostBoruta>

In [24]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMClassifier(max_depth=12, n_estimators=150, num_leaves=35, random_state=0),
 {'n_estimators': 150,
  'learning_rate': 0.1,
  'num_leaves': 35,
  'max_depth': 12},
 0.23165415918498178,
 9)

In [25]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict(X_clf_valid, method='predict_proba').shape)

(0.91375, (2400,), (2400, 9), (2400, 2))

In [26]:
### HYPERPARAM TUNING WITH RANDOM-SEARCH + RECURSIVE FEATURE ELIMINATION (RFE) ###

model = BoostRFE(regr_lgbm, param_grid=param_dist, min_features_to_select=1, step=1,
                 n_iter=10, sampling_seed=0)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)


10 trials detected for ('n_estimators', 'learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00150 ### eval_score: 0.04544
trial: 0002 ### iterations: 00081 ### eval_score: 0.04838
trial: 0003 ### iterations: 00080 ### eval_score: 0.04734
trial: 0004 ### iterations: 00107 ### eval_score: 0.0511
trial: 0005 ### iterations: 00088 ### eval_score: 0.04701
trial: 0006 ### iterations: 00150 ### eval_score: 0.04602
trial: 0007 ### iterations: 00086 ### eval_score: 0.0488
trial: 0008 ### iterations: 00149 ### eval_score: 0.05066
trial: 0009 ### iterations: 00150 ### eval_score: 0.0516
trial: 0010 ### iterations: 00115 ### eval_score: 0.04599


<shaphypetune.BoostRFE>

In [27]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMRegressor(learning_rate=0.1350674222191923, max_depth=10, n_estimators=150,
               num_leaves=38, random_state=0),
 {'n_estimators': 150,
  'learning_rate': 0.1350674222191923,
  'num_leaves': 38,
  'max_depth': 10},
 0.04544344630965471,
 8)

In [28]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.8181472856318811, (2400,), (2400, 8), (2400, 9))

# Hyperparameters Tuning + Features Selection with SHAP

In [29]:
### HYPERPARAM TUNING WITH GRID-SEARCH + BORUTA SHAP ###

model = BoostBoruta(clf_lgbm, param_grid=param_grid, max_iter=200, perc=100,
                    importance_type='shap_importances', train_importance=False)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


12 trials detected for ('n_estimators', 'learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00063 ### eval_score: 0.23678
trial: 0002 ### iterations: 00070 ### eval_score: 0.23779
trial: 0003 ### iterations: 00042 ### eval_score: 0.24026
trial: 0004 ### iterations: 00052 ### eval_score: 0.23773
trial: 0005 ### iterations: 00036 ### eval_score: 0.24991
trial: 0006 ### iterations: 00062 ### eval_score: 0.23893
trial: 0007 ### iterations: 00133 ### eval_score: 0.23946
trial: 0008 ### iterations: 00099 ### eval_score: 0.24318
trial: 0009 ### iterations: 00106 ### eval_score: 0.23646
trial: 0010 ### iterations: 00090 ### eval_score: 0.24228
trial: 0011 ### iterations: 00125 ### eval_score: 0.23753
trial: 0012 ### iterations: 00101 ### eval_score: 0.24296


<shaphypetune.BoostBoruta>

In [30]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMClassifier(max_depth=10, n_estimators=150, num_leaves=35, random_state=0),
 {'n_estimators': 150,
  'learning_rate': 0.1,
  'num_leaves': 35,
  'max_depth': 10},
 0.23645686796387438,
 11)

In [31]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict(X_clf_valid, method='predict_proba').shape)

(0.9083333333333333, (2400,), (2400, 11), (2400, 2))

In [32]:
### HYPERPARAM TUNING WITH RANDOM-SEARCH + RECURSIVE FEATURE ELIMINATION (RFE) SHAP ###

model = BoostRFE(regr_lgbm, param_grid=param_dist, min_features_to_select=1, step=1,
                 n_iter=10, sampling_seed=0,
                 importance_type='shap_importances', train_importance=False)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)


10 trials detected for ('n_estimators', 'learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00150 ### eval_score: 0.04646
trial: 0002 ### iterations: 00094 ### eval_score: 0.04905
trial: 0003 ### iterations: 00094 ### eval_score: 0.04925
trial: 0004 ### iterations: 00059 ### eval_score: 0.0514
trial: 0005 ### iterations: 00111 ### eval_score: 0.04846
trial: 0006 ### iterations: 00150 ### eval_score: 0.04602
trial: 0007 ### iterations: 00105 ### eval_score: 0.04966
trial: 0008 ### iterations: 00149 ### eval_score: 0.05066
trial: 0009 ### iterations: 00150 ### eval_score: 0.0516
trial: 0010 ### iterations: 00115 ### eval_score: 0.04599


<shaphypetune.BoostRFE>

In [33]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMRegressor(learning_rate=0.17356342265816355, max_depth=12, n_estimators=150,
               num_leaves=33, random_state=0),
 {'n_estimators': 150,
  'learning_rate': 0.17356342265816355,
  'num_leaves': 33,
  'max_depth': 12},
 0.04598749053286013,
 9)

In [34]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.8159701638077123, (2400,), (2400, 9), (2400, 10))

# CUSTOM EVAL METRIC SUPPORT

In [35]:
from sklearn.metrics import roc_auc_score

def AUC(y_true, y_hat):
    return 'auc', roc_auc_score(y_true, y_hat), True

In [36]:
model = BoostRFE(LGBMClassifier(random_state=0, metric="custom"), 
                 param_grid=param_grid, min_features_to_select=1, step=1,
                 greater_is_better=True)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0,
          eval_metric=AUC)


12 trials detected for ('n_estimators', 'learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00068 ### eval_score: 0.96755
trial: 0002 ### iterations: 00072 ### eval_score: 0.96577
trial: 0003 ### iterations: 00067 ### eval_score: 0.96698
trial: 0004 ### iterations: 00041 ### eval_score: 0.96632
trial: 0005 ### iterations: 00078 ### eval_score: 0.96679
trial: 0006 ### iterations: 00068 ### eval_score: 0.96626
trial: 0007 ### iterations: 00126 ### eval_score: 0.9669
trial: 0008 ### iterations: 00133 ### eval_score: 0.96591
trial: 0009 ### iterations: 00115 ### eval_score: 0.96816
trial: 0010 ### iterations: 00106 ### eval_score: 0.96789
trial: 0011 ### iterations: 00101 ### eval_score: 0.96727
trial: 0012 ### iterations: 00108 ### eval_score: 0.96755


<shaphypetune.BoostRFE>

# CATEGORICAL FEATURE SUPPORT

In [37]:
categorical_feature = [0,1,2]

X_clf_train[:,categorical_feature] = (X_clf_train[:,categorical_feature]+100).clip(0).astype(int)
X_clf_valid[:,categorical_feature] = (X_clf_valid[:,categorical_feature]+100).clip(0).astype(int)

In [38]:
### MANUAL PASS categorical_feature WITH NUMPY ARRAYS ###

model = BoostRFE(clf_lgbm, param_grid=param_grid, min_features_to_select=1, step=1)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0,
          categorical_feature=categorical_feature)


12 trials detected for ('n_estimators', 'learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00074 ### eval_score: 0.23709
trial: 0002 ### iterations: 00076 ### eval_score: 0.23733
trial: 0003 ### iterations: 00048 ### eval_score: 0.23524
trial: 0004 ### iterations: 00042 ### eval_score: 0.24027
trial: 0005 ### iterations: 00064 ### eval_score: 0.23977
trial: 0006 ### iterations: 00055 ### eval_score: 0.23727
trial: 0007 ### iterations: 00128 ### eval_score: 0.23446
trial: 0008 ### iterations: 00143 ### eval_score: 0.23278
trial: 0009 ### iterations: 00111 ### eval_score: 0.23298
trial: 0010 ### iterations: 00090 ### eval_score: 0.23798
trial: 0011 ### iterations: 00123 ### eval_score: 0.23489
trial: 0012 ### iterations: 00128 ### eval_score: 0.23242


<shaphypetune.BoostRFE>

In [39]:
X_clf_train = pd.DataFrame(X_clf_train)
X_clf_train[categorical_feature] = X_clf_train[categorical_feature].astype('category')

X_clf_valid = pd.DataFrame(X_clf_valid)
X_clf_valid[categorical_feature] = X_clf_valid[categorical_feature].astype('category')

In [40]:
### PASS category COLUMNS IN PANDAS DF ###

model = BoostRFE(clf_lgbm, param_grid=param_grid, min_features_to_select=1, step=1)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


12 trials detected for ('n_estimators', 'learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00074 ### eval_score: 0.23709
trial: 0002 ### iterations: 00076 ### eval_score: 0.23733
trial: 0003 ### iterations: 00048 ### eval_score: 0.23524
trial: 0004 ### iterations: 00042 ### eval_score: 0.24027
trial: 0005 ### iterations: 00064 ### eval_score: 0.23977
trial: 0006 ### iterations: 00055 ### eval_score: 0.23727
trial: 0007 ### iterations: 00128 ### eval_score: 0.23446
trial: 0008 ### iterations: 00143 ### eval_score: 0.23278
trial: 0009 ### iterations: 00111 ### eval_score: 0.23298
trial: 0010 ### iterations: 00090 ### eval_score: 0.23798
trial: 0011 ### iterations: 00123 ### eval_score: 0.23489
trial: 0012 ### iterations: 00128 ### eval_score: 0.23242


<shaphypetune.BoostRFE>