In [1]:
import numpy as np
import pandas as pd
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_regression

from hyperopt import hp
from hyperopt import Trials

from lightgbm import *

try:
    from shaphypetune import BoostSearch, BoostBoruta, BoostRFE, BoostRFA
except:
    !pip install --upgrade shap-hypetune
    from shaphypetune import BoostSearch, BoostBoruta, BoostRFE, BoostRFA

import warnings
warnings.simplefilter('ignore')

In [2]:
X_clf, y_clf = make_classification(n_samples=6000, n_features=20, n_classes=2, 
                                   n_informative=4, n_redundant=6, random_state=0)

X_clf_train, X_clf_valid, y_clf_train, y_clf_valid = train_test_split(
    X_clf, y_clf, test_size=0.3, shuffle=False)

X_regr, y_regr = make_classification(n_samples=6000, n_features=20,
                                     n_informative=7, random_state=0)

X_regr_train, X_regr_valid, y_regr_train, y_regr_valid = train_test_split(
    X_regr, y_regr, test_size=0.3, shuffle=False)

In [3]:
param_grid = {
    'learning_rate': [0.2, 0.1],
    'num_leaves': [25, 35],
    'max_depth': [10, 12]
}

param_dist = {
    'learning_rate': stats.uniform(0.09, 0.25),
    'num_leaves': stats.randint(20,40),
    'max_depth': [10, 12]
}

param_dist_hyperopt = {
    'max_depth': 15 + hp.randint('num_leaves', 5), 
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0)
}


regr_lgbm = LGBMRegressor(n_estimators=150, random_state=0, n_jobs=-1)
clf_lgbm = LGBMClassifier(n_estimators=150, random_state=0, n_jobs=-1)

# Hyperparameters Tuning

In [4]:
### HYPERPARAM TUNING WITH GRID-SEARCH ###

model = BoostSearch(clf_lgbm, param_grid=param_grid)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00023 ### eval_score: 0.2085
trial: 0002 ### iterations: 00019 ### eval_score: 0.21112
trial: 0003 ### iterations: 00026 ### eval_score: 0.21162
trial: 0004 ### iterations: 00032 ### eval_score: 0.20747
trial: 0005 ### iterations: 00054 ### eval_score: 0.20244
trial: 0006 ### iterations: 00071 ### eval_score: 0.20052
trial: 0007 ### iterations: 00047 ### eval_score: 0.20306
trial: 0008 ### iterations: 00050 ### eval_score: 0.20506


BoostSearch(estimator=LGBMClassifier(n_estimators=150, random_state=0),
            param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],
                        'num_leaves': [25, 35]})

In [5]:
model.estimator_, model.best_params_, model.best_score_

(LGBMClassifier(max_depth=12, n_estimators=150, num_leaves=25, random_state=0),
 {'learning_rate': 0.1, 'num_leaves': 25, 'max_depth': 12},
 0.20051586840398297)

In [6]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.predict_proba(X_clf_valid).shape)

(0.9183333333333333, (1800,), (1800, 2))

In [7]:
### HYPERPARAM TUNING WITH RANDOM-SEARCH ###

model = BoostSearch(
    regr_lgbm, param_grid=param_dist,
    n_iter=8, sampling_seed=0
)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)


8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00023 ### eval_score: 0.07643
trial: 0002 ### iterations: 00052 ### eval_score: 0.06818
trial: 0003 ### iterations: 00062 ### eval_score: 0.07042
trial: 0004 ### iterations: 00033 ### eval_score: 0.07035
trial: 0005 ### iterations: 00032 ### eval_score: 0.07153
trial: 0006 ### iterations: 00012 ### eval_score: 0.07547
trial: 0007 ### iterations: 00041 ### eval_score: 0.07355
trial: 0008 ### iterations: 00025 ### eval_score: 0.07805


BoostSearch(estimator=LGBMRegressor(n_estimators=150, random_state=0), n_iter=8,
            param_grid={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd50407f2d0>,
                        'max_depth': [10, 12],
                        'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd50407f590>},
            sampling_seed=0)

In [8]:
model.estimator_, model.best_params_, model.best_score_

(LGBMRegressor(learning_rate=0.1350674222191923, max_depth=10, n_estimators=150,
               num_leaves=38, random_state=0),
 {'learning_rate': 0.1350674222191923, 'num_leaves': 38, 'max_depth': 10},
 0.06817737242646997)

In [9]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.7272820930747703, (1800,), (1800, 21))

In [10]:
### HYPERPARAM TUNING WITH HYPEROPT ###

model = BoostSearch(
    regr_lgbm, param_grid=param_dist_hyperopt,
    n_iter=8, sampling_seed=0
)
model.fit(
    X_regr_train, y_regr_train, trials=Trials(), 
    eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0
)


8 trials detected for ('max_depth', 'learning_rate', 'colsample_bytree')

trial: 0001 ### iterations: 00149 ### eval_score: 0.06979
trial: 0002 ### iterations: 00055 ### eval_score: 0.07039
trial: 0003 ### iterations: 00056 ### eval_score: 0.0716
trial: 0004 ### iterations: 00150 ### eval_score: 0.07352
trial: 0005 ### iterations: 00150 ### eval_score: 0.07936
trial: 0006 ### iterations: 00147 ### eval_score: 0.06833
trial: 0007 ### iterations: 00032 ### eval_score: 0.07261
trial: 0008 ### iterations: 00096 ### eval_score: 0.07074


BoostSearch(estimator=LGBMRegressor(n_estimators=150, random_state=0), n_iter=8,
            param_grid={'colsample_bytree': <hyperopt.pyll.base.Apply object at 0x7fd50407fd10>,
                        'learning_rate': <hyperopt.pyll.base.Apply object at 0x7fd50407fa50>,
                        'max_depth': <hyperopt.pyll.base.Apply object at 0x7fd50407f710>},
            sampling_seed=0)

In [11]:
model.estimator_, model.best_params_, model.best_score_

(LGBMRegressor(colsample_bytree=0.7597292534356749,
               learning_rate=0.059836658149176665, max_depth=16,
               n_estimators=150, random_state=0),
 {'colsample_bytree': 0.7597292534356749,
  'learning_rate': 0.059836658149176665,
  'max_depth': 16},
 0.06832542425080958)

In [12]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.7266898674988451, (1800,), (1800, 21))

# Features Selection

In [13]:
### BORUTA ###

model = BoostBoruta(clf_lgbm, max_iter=200, perc=100)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)

BoostBoruta(estimator=LGBMClassifier(n_estimators=150, random_state=0),
            max_iter=200)

In [14]:
model.estimator_, model.n_features_

(LGBMClassifier(n_estimators=150, random_state=0), 10)

In [15]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict_proba(X_clf_valid).shape)

(0.91, (1800,), (1800, 10), (1800, 2))

In [16]:
### RECURSIVE FEATURE ELIMINATION (RFE) ###

model = BoostRFE(regr_lgbm, min_features_to_select=1, step=1)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)

BoostRFE(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         min_features_to_select=1)

In [17]:
model.estimator_, model.n_features_

(LGBMRegressor(n_estimators=150, random_state=0), 7)

In [18]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.7766363424352807, (1800,), (1800, 7), (1800, 8))

In [19]:
### RECURSIVE FEATURE ADDITION (RFA) ###

model = BoostRFA(regr_lgbm, min_features_to_select=1, step=1)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)

BoostRFA(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         min_features_to_select=1)

In [20]:
model.estimator_, model.n_features_

(LGBMRegressor(n_estimators=150, random_state=0), 8)

In [21]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.7723191919698336, (1800,), (1800, 8), (1800, 9))

# Features Selection with SHAP

In [22]:
### BORUTA SHAP ###

model = BoostBoruta(
    clf_lgbm, max_iter=200, perc=100,
    importance_type='shap_importances', train_importance=False
)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)

BoostBoruta(estimator=LGBMClassifier(n_estimators=150, random_state=0),
            importance_type='shap_importances', max_iter=200,
            train_importance=False)

In [23]:
model.estimator_, model.n_features_

(LGBMClassifier(n_estimators=150, random_state=0), 9)

In [24]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict_proba(X_clf_valid).shape)

(0.9111111111111111, (1800,), (1800, 9), (1800, 2))

In [25]:
### RECURSIVE FEATURE ELIMINATION (RFE) SHAP ###

model = BoostRFE(
    regr_lgbm, min_features_to_select=1, step=1,
    importance_type='shap_importances', train_importance=False
)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)

BoostRFE(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         importance_type='shap_importances', min_features_to_select=1,
         train_importance=False)

In [26]:
model.estimator_, model.n_features_

(LGBMRegressor(n_estimators=150, random_state=0), 7)

In [27]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.7766363424352807, (1800,), (1800, 7), (1800, 8))

In [28]:
### RECURSIVE FEATURE ADDITION (RFA) SHAP ###

model = BoostRFA(
    regr_lgbm, min_features_to_select=1, step=1,
    importance_type='shap_importances', train_importance=False
)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)

BoostRFA(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         importance_type='shap_importances', min_features_to_select=1,
         train_importance=False)

In [29]:
model.estimator_, model.n_features_

(LGBMRegressor(n_estimators=150, random_state=0), 9)

In [30]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.7699366468805918, (1800,), (1800, 9), (1800, 10))

# Hyperparameters Tuning + Features Selection

In [31]:
### HYPERPARAM TUNING WITH GRID-SEARCH + BORUTA ###

model = BoostBoruta(clf_lgbm, param_grid=param_grid, max_iter=200, perc=100)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00023 ### eval_score: 0.19868
trial: 0002 ### iterations: 00030 ### eval_score: 0.19844
trial: 0003 ### iterations: 00023 ### eval_score: 0.19695
trial: 0004 ### iterations: 00026 ### eval_score: 0.19949
trial: 0005 ### iterations: 00067 ### eval_score: 0.19583
trial: 0006 ### iterations: 00051 ### eval_score: 0.1949
trial: 0007 ### iterations: 00045 ### eval_score: 0.19675
trial: 0008 ### iterations: 00055 ### eval_score: 0.19906


BoostBoruta(estimator=LGBMClassifier(n_estimators=150, random_state=0),
            max_iter=200,
            param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],
                        'num_leaves': [25, 35]})

In [32]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMClassifier(max_depth=12, n_estimators=150, num_leaves=25, random_state=0),
 {'learning_rate': 0.1, 'num_leaves': 25, 'max_depth': 12},
 0.19489866976777023,
 9)

In [33]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict_proba(X_clf_valid).shape)

(0.915, (1800,), (1800, 9), (1800, 2))

In [34]:
### HYPERPARAM TUNING WITH RANDOM-SEARCH + RECURSIVE FEATURE ELIMINATION (RFE) ###

model = BoostRFE(
    regr_lgbm, param_grid=param_dist, min_features_to_select=1, step=1,
    n_iter=8, sampling_seed=0
)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)


8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00107 ### eval_score: 0.06016
trial: 0002 ### iterations: 00095 ### eval_score: 0.05711
trial: 0003 ### iterations: 00121 ### eval_score: 0.05926
trial: 0004 ### iterations: 00103 ### eval_score: 0.05688
trial: 0005 ### iterations: 00119 ### eval_score: 0.05618
trial: 0006 ### iterations: 00049 ### eval_score: 0.06188
trial: 0007 ### iterations: 00150 ### eval_score: 0.05538
trial: 0008 ### iterations: 00083 ### eval_score: 0.06084


BoostRFE(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         min_features_to_select=1, n_iter=8,
         param_grid={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd50407f2d0>,
                     'max_depth': [10, 12],
                     'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd50407f590>},
         sampling_seed=0)

In [35]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMRegressor(learning_rate=0.13639381870463482, max_depth=12, n_estimators=150,
               num_leaves=25, random_state=0),
 {'learning_rate': 0.13639381870463482, 'num_leaves': 25, 'max_depth': 12},
 0.0553821617278472,
 7)

In [36]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.7784645155736596, (1800,), (1800, 7), (1800, 8))

In [37]:
### HYPERPARAM TUNING WITH HYPEROPT + RECURSIVE FEATURE ADDITION (RFA) ###

model = BoostRFA(
    regr_lgbm, param_grid=param_dist_hyperopt, min_features_to_select=1, step=1,
    n_iter=8, sampling_seed=0
)
model.fit(
    X_regr_train, y_regr_train, trials=Trials(), 
    eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0
)


8 trials detected for ('max_depth', 'learning_rate', 'colsample_bytree')

trial: 0001 ### iterations: 00150 ### eval_score: 0.06507
trial: 0002 ### iterations: 00075 ### eval_score: 0.05784
trial: 0003 ### iterations: 00095 ### eval_score: 0.06088
trial: 0004 ### iterations: 00150 ### eval_score: 0.06976
trial: 0005 ### iterations: 00150 ### eval_score: 0.07593
trial: 0006 ### iterations: 00149 ### eval_score: 0.05995
trial: 0007 ### iterations: 00058 ### eval_score: 0.05916
trial: 0008 ### iterations: 00150 ### eval_score: 0.06366


BoostRFA(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         min_features_to_select=1, n_iter=8,
         param_grid={'colsample_bytree': <hyperopt.pyll.base.Apply object at 0x7fd50407fd10>,
                     'learning_rate': <hyperopt.pyll.base.Apply object at 0x7fd50407fa50>,
                     'max_depth': <hyperopt.pyll.base.Apply object at 0x7fd50407f710>},
         sampling_seed=0)

In [38]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMRegressor(colsample_bytree=0.8515260655364685,
               learning_rate=0.13520045129619862, max_depth=18, n_estimators=150,
               random_state=0),
 {'colsample_bytree': 0.8515260655364685,
  'learning_rate': 0.13520045129619862,
  'max_depth': 18},
 0.0578369356489881,
 8)

In [39]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.7686451168212334, (1800,), (1800, 8), (1800, 9))

# Hyperparameters Tuning + Features Selection with SHAP

In [40]:
### HYPERPARAM TUNING WITH GRID-SEARCH + BORUTA SHAP ###

model = BoostBoruta(
    clf_lgbm, param_grid=param_grid, max_iter=200, perc=100,
    importance_type='shap_importances', train_importance=False
)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00036 ### eval_score: 0.19716
trial: 0002 ### iterations: 00030 ### eval_score: 0.19818
trial: 0003 ### iterations: 00031 ### eval_score: 0.19881
trial: 0004 ### iterations: 00026 ### eval_score: 0.19949
trial: 0005 ### iterations: 00067 ### eval_score: 0.19583
trial: 0006 ### iterations: 00051 ### eval_score: 0.1949
trial: 0007 ### iterations: 00045 ### eval_score: 0.19675
trial: 0008 ### iterations: 00057 ### eval_score: 0.19284


BoostBoruta(estimator=LGBMClassifier(n_estimators=150, random_state=0),
            importance_type='shap_importances', max_iter=200,
            param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],
                        'num_leaves': [25, 35]},
            train_importance=False)

In [41]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMClassifier(max_depth=12, n_estimators=150, num_leaves=35, random_state=0),
 {'learning_rate': 0.1, 'num_leaves': 35, 'max_depth': 12},
 0.1928371931511303,
 10)

In [42]:
(model.score(X_clf_valid, y_clf_valid), 
 model.predict(X_clf_valid).shape, 
 model.transform(X_clf_valid).shape,
 model.predict_proba(X_clf_valid).shape)

(0.9111111111111111, (1800,), (1800, 10), (1800, 2))

In [43]:
### HYPERPARAM TUNING WITH RANDOM-SEARCH + RECURSIVE FEATURE ELIMINATION (RFE) SHAP ###

model = BoostRFE(
    regr_lgbm, param_grid=param_dist, min_features_to_select=1, step=1,
    n_iter=8, sampling_seed=0,
    importance_type='shap_importances', train_importance=False
)
model.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)


8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00107 ### eval_score: 0.06016
trial: 0002 ### iterations: 00102 ### eval_score: 0.05525
trial: 0003 ### iterations: 00150 ### eval_score: 0.05869
trial: 0004 ### iterations: 00149 ### eval_score: 0.05863
trial: 0005 ### iterations: 00119 ### eval_score: 0.05618
trial: 0006 ### iterations: 00049 ### eval_score: 0.06188
trial: 0007 ### iterations: 00150 ### eval_score: 0.05538
trial: 0008 ### iterations: 00083 ### eval_score: 0.06084


BoostRFE(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         importance_type='shap_importances', min_features_to_select=1, n_iter=8,
         param_grid={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd50407f2d0>,
                     'max_depth': [10, 12],
                     'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd50407f590>},
         sampling_seed=0, train_importance=False)

In [44]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMRegressor(learning_rate=0.1350674222191923, max_depth=10, n_estimators=150,
               num_leaves=38, random_state=0),
 {'learning_rate': 0.1350674222191923, 'num_leaves': 38, 'max_depth': 10},
 0.05524518772497125,
 9)

In [45]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.779012428496056, (1800,), (1800, 9), (1800, 10))

In [46]:
### HYPERPARAM TUNING WITH HYPEROPT + RECURSIVE FEATURE ADDITION (RFA) SHAP ###

model = BoostRFA(
    regr_lgbm, param_grid=param_dist_hyperopt, min_features_to_select=1, step=1,
    n_iter=8, sampling_seed=0,
    importance_type='shap_importances', train_importance=False
)
model.fit(
    X_regr_train, y_regr_train, trials=Trials(), 
    eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0
)


8 trials detected for ('max_depth', 'learning_rate', 'colsample_bytree')

trial: 0001 ### iterations: 00150 ### eval_score: 0.06508
trial: 0002 ### iterations: 00091 ### eval_score: 0.05997
trial: 0003 ### iterations: 00094 ### eval_score: 0.06078
trial: 0004 ### iterations: 00150 ### eval_score: 0.06773
trial: 0005 ### iterations: 00150 ### eval_score: 0.07565
trial: 0006 ### iterations: 00150 ### eval_score: 0.05935
trial: 0007 ### iterations: 00083 ### eval_score: 0.06047
trial: 0008 ### iterations: 00150 ### eval_score: 0.05966


BoostRFA(estimator=LGBMRegressor(n_estimators=150, random_state=0),
         importance_type='shap_importances', min_features_to_select=1, n_iter=8,
         param_grid={'colsample_bytree': <hyperopt.pyll.base.Apply object at 0x7fd50407fd10>,
                     'learning_rate': <hyperopt.pyll.base.Apply object at 0x7fd50407fa50>,
                     'max_depth': <hyperopt.pyll.base.Apply object at 0x7fd50407f710>},
         sampling_seed=0, train_importance=False)

In [47]:
model.estimator_, model.best_params_, model.best_score_, model.n_features_

(LGBMRegressor(colsample_bytree=0.7597292534356749,
               learning_rate=0.059836658149176665, max_depth=16,
               n_estimators=150, random_state=0),
 {'colsample_bytree': 0.7597292534356749,
  'learning_rate': 0.059836658149176665,
  'max_depth': 16},
 0.059352961644604275,
 9)

In [48]:
(model.score(X_regr_valid, y_regr_valid), 
 model.predict(X_regr_valid).shape, 
 model.transform(X_regr_valid).shape,
 model.predict(X_regr_valid, pred_contrib=True).shape)

(0.7625808256692885, (1800,), (1800, 9), (1800, 10))

# CUSTOM EVAL METRIC SUPPORT

In [49]:
from sklearn.metrics import roc_auc_score

def AUC(y_true, y_hat):
    return 'auc', roc_auc_score(y_true, y_hat), True

In [50]:
model = BoostRFE(
    LGBMClassifier(n_estimators=150, random_state=0, metric="custom"), 
    param_grid=param_grid, min_features_to_select=1, step=1,
    greater_is_better=True
)
model.fit(
    X_clf_train, y_clf_train, 
    eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0, 
    eval_metric=AUC
)


8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00028 ### eval_score: 0.97581
trial: 0002 ### iterations: 00016 ### eval_score: 0.97514
trial: 0003 ### iterations: 00015 ### eval_score: 0.97574
trial: 0004 ### iterations: 00032 ### eval_score: 0.97549
trial: 0005 ### iterations: 00075 ### eval_score: 0.97551
trial: 0006 ### iterations: 00041 ### eval_score: 0.97597
trial: 0007 ### iterations: 00076 ### eval_score: 0.97592
trial: 0008 ### iterations: 00060 ### eval_score: 0.97539


BoostRFE(estimator=LGBMClassifier(metric='custom', n_estimators=150,
                                  random_state=0),
         greater_is_better=True, min_features_to_select=1,
         param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],
                     'num_leaves': [25, 35]})

# CATEGORICAL FEATURE SUPPORT

In [51]:
categorical_feature = [0,1,2]

X_clf_train[:,categorical_feature] = (X_clf_train[:,categorical_feature]+100).clip(0).astype(int)
X_clf_valid[:,categorical_feature] = (X_clf_valid[:,categorical_feature]+100).clip(0).astype(int)

In [52]:
### MANUAL PASS categorical_feature WITH NUMPY ARRAYS ###

model = BoostRFE(clf_lgbm, param_grid=param_grid, min_features_to_select=1, step=1)
model.fit(
    X_clf_train, y_clf_train, 
    eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0,
    categorical_feature=categorical_feature
)


8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00029 ### eval_score: 0.2036
trial: 0002 ### iterations: 00030 ### eval_score: 0.2034
trial: 0003 ### iterations: 00027 ### eval_score: 0.20617
trial: 0004 ### iterations: 00024 ### eval_score: 0.20003
trial: 0005 ### iterations: 00060 ### eval_score: 0.20332
trial: 0006 ### iterations: 00063 ### eval_score: 0.20329
trial: 0007 ### iterations: 00054 ### eval_score: 0.20136
trial: 0008 ### iterations: 00052 ### eval_score: 0.19959


BoostRFE(estimator=LGBMClassifier(n_estimators=150, random_state=0),
         min_features_to_select=1,
         param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],
                     'num_leaves': [25, 35]})

In [53]:
X_clf_train = pd.DataFrame(X_clf_train)
X_clf_train[categorical_feature] = X_clf_train[categorical_feature].astype('category')

X_clf_valid = pd.DataFrame(X_clf_valid)
X_clf_valid[categorical_feature] = X_clf_valid[categorical_feature].astype('category')

In [54]:
### PASS category COLUMNS IN PANDAS DF ###

model = BoostRFE(clf_lgbm, param_grid=param_grid, min_features_to_select=1, step=1)
model.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)


8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')

trial: 0001 ### iterations: 00029 ### eval_score: 0.2036
trial: 0002 ### iterations: 00030 ### eval_score: 0.2034
trial: 0003 ### iterations: 00027 ### eval_score: 0.20617
trial: 0004 ### iterations: 00024 ### eval_score: 0.20003
trial: 0005 ### iterations: 00060 ### eval_score: 0.20332
trial: 0006 ### iterations: 00063 ### eval_score: 0.20329
trial: 0007 ### iterations: 00054 ### eval_score: 0.20136
trial: 0008 ### iterations: 00052 ### eval_score: 0.19959


BoostRFE(estimator=LGBMClassifier(n_estimators=150, random_state=0),
         min_features_to_select=1,
         param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],
                     'num_leaves': [25, 35]})