In [1]:
import numpy as np
import pandas as pd
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_regression

from hyperopt import hp
from hyperopt import Trials

from lightgbm import *

from shaphypetune import BoostSearch, BoostBoruta, BoostRFE, BoostRFA

import warnings
warnings.simplefilter('ignore')

In [2]:
X_clf, y_clf = make_classification(n_samples=8000, n_features=20, n_classes=2, 
                                   n_informative=4, n_redundant=6, random_state=0)

X_clf_train, X_clf_valid, y_clf_train, y_clf_valid = train_test_split(
    X_clf, y_clf, test_size=0.3, shuffle=False)

X_regr, y_regr = make_classification(n_samples=8000, n_features=20,
                                     n_informative=7, random_state=0)

X_regr_train, X_regr_valid, y_regr_train, y_regr_valid = train_test_split(
    X_regr, y_regr, test_size=0.3, shuffle=False)

In [3]:
param_grid = {
    'learning_rate': [0.2, 0.1],
    'num_leaves': [25, 30, 35],
    'max_depth': [10, 12]
}

param_dist = {
    'learning_rate': stats.uniform(0.09, 0.25),
    'num_leaves': stats.randint(20,40),
    'max_depth': [10, 12]
}

param_dist_hyperopt = {
    'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart']),
    'max_depth': 15 + hp.randint('num_leaves', 5), 
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0)
}


regr_lgbm = LGBMRegressor(n_estimators=150, random_state=0, n_jobs=-1)
clf_lgbm = LGBMClassifier(n_estimators=150, random_state=0, n_jobs=-1)

# Hyperparameters Tuning

In [4]:
### HYPERPARAM TUNING WITH GRID-SEARCH ###

model = BoostSearch(clf_lgbm, param_grid=param_grid)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


12 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00047 ### eval_score: 0.25942
trial: 0002 ### iterations: 00029 ### eval_score: 0.26879
trial: 0003 ### iterations: 00048 ### eval_score: 0.25027
trial: 0004 ### iterations: 00035 ### eval_score: 0.26033
trial: 0005 ### iterations: 00069 ### eval_score: 0.2497
trial: 0006 ### iterations: 00044 ### eval_score: 0.25268
trial: 0007 ### iterations: 00111 ### eval_score: 0.25511
trial: 0008 ### iterations: 00107 ### eval_score: 0.25491
trial: 0009 ### iterations: 00093 ### eval_score: 0.24845
trial: 0010 ### iterations: 00107 ### eval_score: 0.24726
trial: 0011 ### iterations: 00093 ### eval_score: 0.25091
trial: 0012 ### iterations: 00082 ### eval_score: 0.25287


BoostSearch(estimator=LGBMClassifier(n_estimators=150, random_state=0),
            param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],
                        'num_leaves': [25, 30, 35]})

In [5]:
model.estimator_, model.best_params_, model.best_score_

(LGBMClassifier(max_depth=12, n_estimators=150, num_leaves=35, random_state=0),
 {'learning_rate': 0.1, 'num_leaves': 35, 'max_depth': 12},
 0.24725564683166487)

In [6]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.predict_proba(X_clf_valid).shape)

(0.90875, (2400,), (2400, 2))

In [7]:
### HYPERPARAM TUNING WITH RANDOM-SEARCH ###

model = BoostSearch(
    regr_lgbm, param_grid=param_dist,
    n_iter=10, sampling_seed=0
)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)


10 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00019 ### eval_score: 0.05974
trial: 0002 ### iterations: 00023 ### eval_score: 0.0625
trial: 0003 ### iterations: 00150 ### eval_score: 0.05638
trial: 0004 ### iterations: 00032 ### eval_score: 0.05881
trial: 0005 ### iterations: 00026 ### eval_score: 0.05976
trial: 0006 ### iterations: 00150 ### eval_score: 0.05593
trial: 0007 ### iterations: 00023 ### eval_score: 0.06166
trial: 0008 ### iterations: 00039 ### eval_score: 0.06223
trial: 0009 ### iterations: 00031 ### eval_score: 0.06256
trial: 0010 ### iterations: 00054 ### eval_score: 0.06231


BoostSearch(estimator=LGBMRegressor(n_estimators=150, random_state=0),
            n_iter=10,
            param_grid={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000018567F78208>,
                        'max_depth': [10, 12],
                        'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000018566F33248>},
            sampling_seed=0)

In [8]:
model.estimator_, model.best_params_, model.best_score_

(LGBMRegressor(learning_rate=0.1132770716679645, max_depth=12, n_estimators=150,
               num_leaves=23, random_state=0),
 {'learning_rate': 0.1132770716679645, 'num_leaves': 23, 'max_depth': 12},
 0.05593005637236878)

In [9]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.7761826315554707, (2400,), (2400, 21))

In [10]:
### HYPERPARAM TUNING WITH HYPEROPT ###

model = BoostSearch(
    regr_lgbm, param_grid=param_dist_hyperopt,
    n_iter=10, sampling_seed=0
)
model.fit(
    X_regr_train, y_regr_train, trials=Trials(), 
    eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0
)


10 trials detected for ('boosting_type', 'max_depth', 'learning_rate', 'colsample_bytree')

trial: 0001 ### iterations: 00150 ### eval_score: 0.05742
trial: 0002 ### iterations: 00119 ### eval_score: 0.05542
trial: 0003 ### iterations: 00036 ### eval_score: 0.05962
trial: 0004 ### iterations: 00150 ### eval_score: 0.10881
trial: 0005 ### iterations: 00150 ### eval_score: 0.16047
trial: 0006 ### iterations: 00150 ### eval_score: 0.05253
trial: 0007 ### iterations: 00052 ### eval_score: 0.05757
trial: 0008 ### iterations: 00150 ### eval_score: 0.05405
trial: 0009 ### iterations: 00108 ### eval_score: 0.05791
trial: 0010 ### iterations: 00150 ### eval_score: 0.09827


BoostSearch(estimator=LGBMRegressor(n_estimators=150, random_state=0),
            n_iter=10,
            param_grid={'boosting_type': <hyperopt.pyll.base.Apply object at 0x000001855ED04288>,
                        'colsample_bytree': <hyperopt.pyll.base.Apply object at 0x0000018567F7E3C8>,
                        'learning_rate': <hyperopt.pyll.base.Apply object at 0x0000018567F78FC8>,
                        'max_depth': <hyperopt.pyll.base.Apply object at 0x0000018567F78B08>},
            sampling_seed=0)

In [11]:
model.estimator_, model.best_params_, model.best_score_

(LGBMRegressor(colsample_bytree=0.7597292534356749,
               learning_rate=0.059836658149176665, max_depth=16,
               n_estimators=150, random_state=0),
 {'boosting_type': 'gbdt',
  'colsample_bytree': 0.7597292534356749,
  'learning_rate': 0.059836658149176665,
  'max_depth': 16},
 0.052526187351329794)

In [12]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.7898040197004537, (2400,), (2400, 21))

# Features Selection

In [13]:
### BORUTA ###

model = BoostBoruta(clf_lgbm, max_iter=200, perc=100)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)

BoostBoruta(estimator=LGBMClassifier(n_estimators=150, random_state=0),
            max_iter=200)

In [14]:
model.estimator_, model.n_features_

(LGBMClassifier(n_estimators=150, random_state=0), 9)

In [15]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict_proba(X_clf_valid).shape)

(0.9108333333333334, (2400,), (2400, 9), (2400, 2))

In [16]:
### RECURSIVE FEATURE ELIMINATION (RFE) ###

model = BoostRFE(regr_lgbm, min_features_to_select=1, step=1)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)

BoostRFE(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         min_features_to_select=1)

In [17]:
model.estimator_, model.n_features_

(LGBMRegressor(n_estimators=150, random_state=0), 9)

In [18]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.8171773316485389, (2400,), (2400, 9), (2400, 10))

In [19]:
### RECURSIVE FEATURE ADDITION (RFA) ###

model = BoostRFA(regr_lgbm, min_features_to_select=1, step=1)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)

BoostRFA(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         min_features_to_select=1)

In [20]:
model.estimator_, model.n_features_

(LGBMRegressor(n_estimators=150, random_state=0), 9)

In [21]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.8171773316485389, (2400,), (2400, 9), (2400, 10))

# Features Selection with SHAP

In [22]:
### BORUTA SHAP ###

model = BoostBoruta(
    clf_lgbm, max_iter=200, perc=100,
    importance_type='shap_importances', train_importance=False
)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)

BoostBoruta(estimator=LGBMClassifier(n_estimators=150, random_state=0),
            importance_type='shap_importances', max_iter=200,
            train_importance=False)

In [23]:
model.estimator_, model.n_features_

(LGBMClassifier(n_estimators=150, random_state=0), 11)

In [24]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict_proba(X_clf_valid).shape)

(0.91125, (2400,), (2400, 11), (2400, 2))

In [25]:
### RECURSIVE FEATURE ELIMINATION (RFE) SHAP ###

model = BoostRFE(
    regr_lgbm, min_features_to_select=1, step=1,
    importance_type='shap_importances', train_importance=False
)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)

BoostRFE(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         importance_type='shap_importances', min_features_to_select=1,
         train_importance=False)

In [26]:
model.estimator_, model.n_features_

(LGBMRegressor(n_estimators=150, random_state=0), 9)

In [27]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.8171773316485389, (2400,), (2400, 9), (2400, 10))

In [28]:
### RECURSIVE FEATURE ADDITION (RFA) SHAP ###

model = BoostRFA(
    regr_lgbm, min_features_to_select=1, step=1,
    importance_type='shap_importances', train_importance=False
)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)

BoostRFA(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         importance_type='shap_importances', min_features_to_select=1,
         train_importance=False)

In [29]:
model.estimator_, model.n_features_

(LGBMRegressor(n_estimators=150, random_state=0), 9)

In [30]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.8171773316485389, (2400,), (2400, 9), (2400, 10))

# Hyperparameters Tuning + Features Selection

In [31]:
### HYPERPARAM TUNING WITH GRID-SEARCH + BORUTA ###

model = BoostBoruta(clf_lgbm, param_grid=param_grid, max_iter=200, perc=100)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


12 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00066 ### eval_score: 0.23684
trial: 0002 ### iterations: 00052 ### eval_score: 0.23871
trial: 0003 ### iterations: 00049 ### eval_score: 0.24086
trial: 0004 ### iterations: 00048 ### eval_score: 0.2382
trial: 0005 ### iterations: 00049 ### eval_score: 0.23586
trial: 0006 ### iterations: 00053 ### eval_score: 0.23598
trial: 0007 ### iterations: 00126 ### eval_score: 0.23237
trial: 0008 ### iterations: 00126 ### eval_score: 0.23805
trial: 0009 ### iterations: 00101 ### eval_score: 0.23554
trial: 0010 ### iterations: 00100 ### eval_score: 0.23165
trial: 0011 ### iterations: 00084 ### eval_score: 0.23291
trial: 0012 ### iterations: 00087 ### eval_score: 0.23614


BoostBoruta(estimator=LGBMClassifier(n_estimators=150, random_state=0),
            max_iter=200,
            param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],
                        'num_leaves': [25, 30, 35]})

In [32]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMClassifier(max_depth=12, n_estimators=150, num_leaves=35, random_state=0),
 {'learning_rate': 0.1, 'num_leaves': 35, 'max_depth': 12},
 0.23165415918498178,
 9)

In [33]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict_proba(X_clf_valid).shape)

(0.91375, (2400,), (2400, 9), (2400, 2))

In [34]:
### HYPERPARAM TUNING WITH RANDOM-SEARCH + RECURSIVE FEATURE ELIMINATION (RFE) ###

model = BoostRFE(
    regr_lgbm, param_grid=param_dist, min_features_to_select=1, step=1,
    n_iter=10, sampling_seed=0
)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)


10 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00052 ### eval_score: 0.04664
trial: 0002 ### iterations: 00123 ### eval_score: 0.04862
trial: 0003 ### iterations: 00150 ### eval_score: 0.04846
trial: 0004 ### iterations: 00073 ### eval_score: 0.04777
trial: 0005 ### iterations: 00148 ### eval_score: 0.04512
trial: 0006 ### iterations: 00150 ### eval_score: 0.04881
trial: 0007 ### iterations: 00057 ### eval_score: 0.04667
trial: 0008 ### iterations: 00148 ### eval_score: 0.05036
trial: 0009 ### iterations: 00080 ### eval_score: 0.04935
trial: 0010 ### iterations: 00150 ### eval_score: 0.04769


BoostRFE(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         min_features_to_select=1, n_iter=10,
         param_grid={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000018567F78208>,
                     'max_depth': [10, 12],
                     'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000018566F33248>},
         sampling_seed=0)

In [35]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMRegressor(learning_rate=0.19192175702007153, max_depth=12, n_estimators=150,
               random_state=0),
 {'learning_rate': 0.19192175702007153, 'num_leaves': 31, 'max_depth': 12},
 0.045118545913448174,
 8)

In [36]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.8194474515248205, (2400,), (2400, 8), (2400, 9))

In [37]:
### HYPERPARAM TUNING WITH HYPEROPT + RECURSIVE FEATURE ADDITION (RFA) ###

model = BoostRFA(
    regr_lgbm, param_grid=param_dist_hyperopt, min_features_to_select=1, step=1,
    n_iter=10, sampling_seed=0
)
model.fit(
    X_regr_train, y_regr_train, trials=Trials(), 
    eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0
)


10 trials detected for ('boosting_type', 'max_depth', 'learning_rate', 'colsample_bytree')

trial: 0001 ### iterations: 00150 ### eval_score: 0.05458
trial: 0002 ### iterations: 00148 ### eval_score: 0.0459
trial: 0003 ### iterations: 00111 ### eval_score: 0.04903
trial: 0004 ### iterations: 00150 ### eval_score: 0.10326
trial: 0005 ### iterations: 00150 ### eval_score: 0.15731
trial: 0006 ### iterations: 00150 ### eval_score: 0.05281
trial: 0007 ### iterations: 00071 ### eval_score: 0.0542
trial: 0008 ### iterations: 00150 ### eval_score: 0.05331
trial: 0009 ### iterations: 00150 ### eval_score: 0.05053
trial: 0010 ### iterations: 00150 ### eval_score: 0.09654


BoostRFA(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         min_features_to_select=1, n_iter=10,
         param_grid={'boosting_type': <hyperopt.pyll.base.Apply object at 0x000001855ED04288>,
                     'colsample_bytree': <hyperopt.pyll.base.Apply object at 0x0000018567F7E3C8>,
                     'learning_rate': <hyperopt.pyll.base.Apply object at 0x0000018567F78FC8>,
                     'max_depth': <hyperopt.pyll.base.Apply object at 0x0000018567F78B08>},
         sampling_seed=0)

In [38]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMRegressor(colsample_bytree=0.8515260655364685,
               learning_rate=0.13520045129619862, max_depth=18, n_estimators=150,
               random_state=0),
 {'boosting_type': 'gbdt',
  'colsample_bytree': 0.8515260655364685,
  'learning_rate': 0.13520045129619862,
  'max_depth': 18},
 0.04590403848291353,
 9)

In [39]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.8163041169524399, (2400,), (2400, 9), (2400, 10))

# Hyperparameters Tuning + Features Selection with SHAP

In [40]:
### HYPERPARAM TUNING WITH GRID-SEARCH + BORUTA SHAP ###

model = BoostBoruta(
    clf_lgbm, param_grid=param_grid, max_iter=200, perc=100,
    importance_type='shap_importances', train_importance=False
)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


12 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00063 ### eval_score: 0.23678
trial: 0002 ### iterations: 00070 ### eval_score: 0.23779
trial: 0003 ### iterations: 00042 ### eval_score: 0.24026
trial: 0004 ### iterations: 00052 ### eval_score: 0.23773
trial: 0005 ### iterations: 00036 ### eval_score: 0.24991
trial: 0006 ### iterations: 00062 ### eval_score: 0.23893
trial: 0007 ### iterations: 00133 ### eval_score: 0.23946
trial: 0008 ### iterations: 00099 ### eval_score: 0.24318
trial: 0009 ### iterations: 00106 ### eval_score: 0.23646
trial: 0010 ### iterations: 00090 ### eval_score: 0.24228
trial: 0011 ### iterations: 00125 ### eval_score: 0.23753
trial: 0012 ### iterations: 00101 ### eval_score: 0.24296


BoostBoruta(estimator=LGBMClassifier(n_estimators=150, random_state=0),
            importance_type='shap_importances', max_iter=200,
            param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],
                        'num_leaves': [25, 30, 35]},
            train_importance=False)

In [41]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMClassifier(max_depth=10, n_estimators=150, num_leaves=35, random_state=0),
 {'learning_rate': 0.1, 'num_leaves': 35, 'max_depth': 10},
 0.23645686796387438,
 11)

In [42]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict_proba(X_clf_valid).shape)

(0.9083333333333333, (2400,), (2400, 11), (2400, 2))

In [43]:
### HYPERPARAM TUNING WITH RANDOM-SEARCH + RECURSIVE FEATURE ELIMINATION (RFE) SHAP ###

model = BoostRFE(
    regr_lgbm, param_grid=param_dist, min_features_to_select=1, step=1,
    n_iter=10, sampling_seed=0,
    importance_type='shap_importances', train_importance=False
)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)


10 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00063 ### eval_score: 0.05079
trial: 0002 ### iterations: 00123 ### eval_score: 0.04862
trial: 0003 ### iterations: 00150 ### eval_score: 0.04846
trial: 0004 ### iterations: 00064 ### eval_score: 0.04951
trial: 0005 ### iterations: 00119 ### eval_score: 0.04796
trial: 0006 ### iterations: 00150 ### eval_score: 0.04898
trial: 0007 ### iterations: 00067 ### eval_score: 0.04754
trial: 0008 ### iterations: 00148 ### eval_score: 0.05036
trial: 0009 ### iterations: 00080 ### eval_score: 0.04935
trial: 0010 ### iterations: 00150 ### eval_score: 0.04769


BoostRFE(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         importance_type='shap_importances', min_features_to_select=1,
         n_iter=10,
         param_grid={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000018567F78208>,
                     'max_depth': [10, 12],
                     'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000018566F33248>},
         sampling_seed=0, train_importance=False)

In [44]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMRegressor(learning_rate=0.2506724789465198, max_depth=12, n_estimators=150,
               num_leaves=35, random_state=0),
 {'learning_rate': 0.2506724789465198, 'num_leaves': 35, 'max_depth': 12},
 0.04753836235238756,
 9)

In [45]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.8097639828746837, (2400,), (2400, 9), (2400, 10))

In [46]:
### HYPERPARAM TUNING WITH HYPEROPT + RECURSIVE FEATURE ADDITION (RFA) SHAP ###

model = BoostRFA(
    regr_lgbm, param_grid=param_dist_hyperopt, min_features_to_select=1, step=1,
    n_iter=10, sampling_seed=0,
    importance_type='shap_importances', train_importance=False
)
model.fit(
    X_regr_train, y_regr_train, trials=Trials(), 
    eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0
)


10 trials detected for ('boosting_type', 'max_depth', 'learning_rate', 'colsample_bytree')

trial: 0001 ### iterations: 00150 ### eval_score: 0.05458
trial: 0002 ### iterations: 00148 ### eval_score: 0.0459
trial: 0003 ### iterations: 00111 ### eval_score: 0.04903
trial: 0004 ### iterations: 00150 ### eval_score: 0.10224
trial: 0005 ### iterations: 00150 ### eval_score: 0.15875
trial: 0006 ### iterations: 00150 ### eval_score: 0.04944
trial: 0007 ### iterations: 00126 ### eval_score: 0.0471
trial: 0008 ### iterations: 00150 ### eval_score: 0.04857
trial: 0009 ### iterations: 00150 ### eval_score: 0.05053
trial: 0010 ### iterations: 00150 ### eval_score: 0.09805


BoostRFA(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         importance_type='shap_importances', min_features_to_select=1,
         n_iter=10,
         param_grid={'boosting_type': <hyperopt.pyll.base.Apply object at 0x000001855ED04288>,
                     'colsample_bytree': <hyperopt.pyll.base.Apply object at 0x0000018567F7E3C8>,
                     'learning_rate': <hyperopt.pyll.base.Apply object at 0x0000018567F78FC8>,
                     'max_depth': <hyperopt.pyll.base.Apply object at 0x0000018567F78B08>},
         sampling_seed=0, train_importance=False)

In [47]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMRegressor(colsample_bytree=0.8515260655364685,
               learning_rate=0.13520045129619862, max_depth=18, n_estimators=150,
               random_state=0),
 {'boosting_type': 'gbdt',
  'colsample_bytree': 0.8515260655364685,
  'learning_rate': 0.13520045129619862,
  'max_depth': 18},
 0.04590403848291353,
 9)

In [48]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.8163041169524399, (2400,), (2400, 9), (2400, 10))

# CUSTOM EVAL METRIC SUPPORT

In [49]:
from sklearn.metrics import roc_auc_score

def AUC(y_true, y_hat):
    return 'auc', roc_auc_score(y_true, y_hat), True

In [50]:
model = BoostRFE(
    LGBMClassifier(n_estimators=150, random_state=0, metric="custom"), 
    param_grid=param_grid, min_features_to_select=1, step=1,
    greater_is_better=True
)
model.fit(
    X_clf_train, y_clf_train, 
    eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0, 
    eval_metric=AUC
)


12 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00006 ### eval_score: 0.53515
trial: 0002 ### iterations: 00004 ### eval_score: 0.5908
trial: 0003 ### iterations: 00007 ### eval_score: 0.53593
trial: 0004 ### iterations: 00007 ### eval_score: 0.52933
trial: 0005 ### iterations: 00014 ### eval_score: 0.58782
trial: 0006 ### iterations: 00011 ### eval_score: 0.53378
trial: 0007 ### iterations: 00007 ### eval_score: 0.58935
trial: 0008 ### iterations: 00007 ### eval_score: 0.53138
trial: 0009 ### iterations: 00003 ### eval_score: 0.53113
trial: 0010 ### iterations: 00017 ### eval_score: 0.58623
trial: 0011 ### iterations: 00009 ### eval_score: 0.58735
trial: 0012 ### iterations: 00007 ### eval_score: 0.58713


BoostRFE(estimator=LGBMClassifier(metric='custom', random_state=0),
         greater_is_better=True, min_features_to_select=1,
         param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],
                     'num_leaves': [25, 30, 35]})

# CATEGORICAL FEATURE SUPPORT

In [51]:
categorical_feature = [0,1,2]

X_clf_train[:,categorical_feature] = (X_clf_train[:,categorical_feature]+100).clip(0).astype(int)
X_clf_valid[:,categorical_feature] = (X_clf_valid[:,categorical_feature]+100).clip(0).astype(int)

In [52]:
### MANUAL PASS categorical_feature WITH NUMPY ARRAYS ###

model = BoostRFE(clf_lgbm, param_grid=param_grid, min_features_to_select=1, step=1)
model.fit(
    X_clf_train, y_clf_train, 
    eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0,
    categorical_feature=categorical_feature
)


12 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00074 ### eval_score: 0.23709
trial: 0002 ### iterations: 00076 ### eval_score: 0.23733
trial: 0003 ### iterations: 00048 ### eval_score: 0.23524
trial: 0004 ### iterations: 00042 ### eval_score: 0.24027
trial: 0005 ### iterations: 00064 ### eval_score: 0.23977
trial: 0006 ### iterations: 00055 ### eval_score: 0.23727
trial: 0007 ### iterations: 00128 ### eval_score: 0.23446
trial: 0008 ### iterations: 00143 ### eval_score: 0.23278
trial: 0009 ### iterations: 00111 ### eval_score: 0.23298
trial: 0010 ### iterations: 00090 ### eval_score: 0.23798
trial: 0011 ### iterations: 00123 ### eval_score: 0.23489
trial: 0012 ### iterations: 00128 ### eval_score: 0.23242


BoostRFE(estimator=LGBMClassifier(n_estimators=150, random_state=0),
         min_features_to_select=1,
         param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],
                     'num_leaves': [25, 30, 35]})

In [53]:
X_clf_train = pd.DataFrame(X_clf_train)
X_clf_train[categorical_feature] = X_clf_train[categorical_feature].astype('category')

X_clf_valid = pd.DataFrame(X_clf_valid)
X_clf_valid[categorical_feature] = X_clf_valid[categorical_feature].astype('category')

In [54]:
### PASS category COLUMNS IN PANDAS DF ###

model = BoostRFE(clf_lgbm, param_grid=param_grid, min_features_to_select=1, step=1)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


12 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00074 ### eval_score: 0.23709
trial: 0002 ### iterations: 00076 ### eval_score: 0.23733
trial: 0003 ### iterations: 00048 ### eval_score: 0.23524
trial: 0004 ### iterations: 00042 ### eval_score: 0.24027
trial: 0005 ### iterations: 00064 ### eval_score: 0.23977
trial: 0006 ### iterations: 00055 ### eval_score: 0.23727
trial: 0007 ### iterations: 00128 ### eval_score: 0.23446
trial: 0008 ### iterations: 00143 ### eval_score: 0.23278
trial: 0009 ### iterations: 00111 ### eval_score: 0.23298
trial: 0010 ### iterations: 00090 ### eval_score: 0.23798
trial: 0011 ### iterations: 00123 ### eval_score: 0.23489
trial: 0012 ### iterations: 00128 ### eval_score: 0.23242


BoostRFE(estimator=LGBMClassifier(n_estimators=150, random_state=0),
         min_features_to_select=1,
         param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],
                     'num_leaves': [25, 30, 35]})