# Extending Auto-Sklearn with LGB  Component

In [1]:
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \
    UniformIntegerHyperparameter, CategoricalHyperparameter

import sklearn.metrics
import autosklearn.regression
import autosklearn.pipeline.components.regression
from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm
from autosklearn.pipeline.constants import SPARSE, DENSE, \
    SIGNED_DATA, UNSIGNED_DATA, PREDICTIONS

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

import xgboost as xgb
print("xgb", xgb.__version__)
print("autosklearn", autosklearn.__version__)


xgb 1.2.1
autosklearn 0.10.0


  self.re = re.compile(self.reString)


## Generate data

In [2]:
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Test LGB Regressor 

In [3]:
# lgb_model = lgb_model = lgb.LGBMRegressor(
#     boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
#     importance_type='split', learning_rate=0.1, max_depth=-1,
#     min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
#     n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
#     random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
#     subsample=1.0, subsample_for_bin=200000, subsample_freq=0
# )

# lgb_model.fit(X_train, y_train, verbose=1)
# lgb_model.predict(X_test)

## Implement LGBcomponent

In [4]:
class LGBReg(AutoSklearnRegressionAlgorithm):
    def __init__(self, boosting_type, num_leaves, max_depth, learning_rate, n_estimators, reg_alpha, reg_lambda
                 ,random_state=None):
        self.boosting_type=boosting_type
        self.num_leaves=num_leaves
        self.max_depth=max_depth
        self.learning_rate=learning_rate
        self.n_estimators=n_estimators
        self.reg_alpha=reg_alpha
        self.reg_lambda=reg_lambda
        
        self.random_state = random_state
        self.estimator = None
        
    def fit(self, X, y):
        import lightgbm as lgb
        self.estimator = lgb.LGBMRegressor(boosting_type=self.boosting_type
                                           ,num_leaves=self.num_leaves
                                           ,max_depth=self.max_depth
                                           ,learning_rate=self.learning_rate
                                           ,n_estimators=self.n_estimators
                                           ,reg_alpha=self.reg_alpha
                                           ,reg_lambda=self.reg_lambda
                                            )
        
        self.estimator.fit(X, y)
        return self
    
    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)
    
    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'LGBReg',
                'name': 'LGB Regression',
                'handles_regression': True,
                'handles_classification': False,
                'handles_multiclass': False,
                'handles_multilabel': False,
                'handles_multioutput': True,
                'is_deterministic': True,
                'input': (SPARSE, DENSE, UNSIGNED_DATA, SIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()
        boosting_type = CategoricalHyperparameter(
            name='boosting_type',choices=['gbdt','dart','goss','rf'],default_value='gbdt')
        num_leaves=UniformIntegerHyperparameter(
            name='num_leaves', lower=1, upper=1000, default_value=31)
        max_depth=UniformIntegerHyperparameter(
            name='max_depth', lower=-1, upper=1000, default_value=-1)
        learning_rate = UniformFloatHyperparameter(
            name='learning_rate', lower=0.000000001, upper=1,default_value=0.1) #log=True
        n_estimators=UniformIntegerHyperparameter(
            name='n_estimators', lower=1, upper=2000, default_value=100)
        reg_alpha = UniformFloatHyperparameter(
            name='reg_alpha', lower=0.0000000, upper=1, default_value=0.0) #log=True
        reg_lambda = UniformFloatHyperparameter(
            name='reg_lambda', lower=0.0000000, upper=1, default_value=1) #log=True
        cs.add_hyperparameters([boosting_type,num_leaves, max_depth, learning_rate,n_estimators, reg_alpha,reg_lambda])
        return cs
    
# Add XGB component to auto-sklearn.
autosklearn.pipeline.components.regression.add_regressor(LGBReg)
cs = LGBReg.get_hyperparameter_search_space()
print(cs)

Configuration space object:
  Hyperparameters:
    boosting_type, Type: Categorical, Choices: {gbdt, dart, goss, rf}, Default: gbdt
    learning_rate, Type: UniformFloat, Range: [1e-09, 1.0], Default: 0.1
    max_depth, Type: UniformInteger, Range: [-1, 1000], Default: -1
    n_estimators, Type: UniformInteger, Range: [1, 2000], Default: 100
    num_leaves, Type: UniformInteger, Range: [1, 1000], Default: 31
    reg_alpha, Type: UniformFloat, Range: [0.0, 1.0], Default: 0.0
    reg_lambda, Type: UniformFloat, Range: [0.0, 1.0], Default: 1.0



## Build and Fit the model using the created LGB component

In [5]:
automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=30 
    ,per_run_time_limit=10
    ,ensemble_size=1
    ,include_estimators=['LGBReg'] 
    ,initial_configurations_via_metalearning=0,
)

In [6]:
automl.fit(X_train, y_train)



AutoSklearnRegressor(dask_client=None,
                     delete_output_folder_after_terminate=True,
                     delete_tmp_folder_after_terminate=True,
                     disable_evaluator_output=False, ensemble_memory_limit=1024,
                     ensemble_nbest=50, ensemble_size=1,
                     exclude_estimators=None, exclude_preprocessors=None,
                     get_smac_object_callback=None,
                     include_estimators=['LGBReg'], include_preprocessors=None,
                     initial_configurations_via_metalearning=0,
                     logging_config=None, max_models_on_disc=50,
                     metadata_directory=None, metric=None, ml_memory_limit=3072,
                     n_jobs=None, output_folder=None, per_run_time_limit=10,
                     resampling_strategy='holdout',
                     resampling_strategy_arguments=None, seed=1,
                     smac_scenario_args=None, time_left_for_this_task=30,
              

## Print prediction 



In [7]:
y_pred = automl.predict(X_test)
y_pred

array([ 66.53259277, 103.89299774, 136.52035522, 293.59533691,
       186.27400208,  76.51495361,  56.70431519, 112.90505219,
       130.87165833, 116.31547546, 124.48735809, 118.27326965,
       141.48805237, 187.00762939, 155.06814575, 131.96495056,
       194.85415649, 237.56455994, 144.42370605,  71.26400757,
       134.99624634, 285.84072876, 126.3681488 , 162.30859375,
        23.5545826 , 142.72735596,  61.02006912, 205.13531494,
       100.60227203, 246.43344116, 115.33290863,  54.67824936,
       180.66041565, 201.71412659, 219.90782166,  27.64725113,
        69.43060303,  71.19359589,  61.78504944, 159.76596069,
       124.44114685, 215.11134338, 150.71516418, 150.51506042,
       140.82475281, 102.83168793,  67.84287262,  50.54925537,
       196.36239624, 219.18096924,  94.47736359, 257.85559082,
       121.86736298, 128.85049438, 246.40852356, 178.89678955,
       142.73390198, 197.31465149, 245.0425415 , 174.84680176,
       108.14919281, 155.99594116,  65.76758575,  87.46

## Print search results

In [8]:
print(automl.sprint_statistics())

auto-sklearn results:
  Dataset name: 998d86a16f123feaa038754a008b44b3
  Metric: r2
  Best validation score: 0.419776
  Number of target algorithm runs: 18
  Number of successful target algorithm runs: 11
  Number of crashed target algorithm runs: 7
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0



## Print model parameters

In [9]:
print("r2 score: ", sklearn.metrics.r2_score(y_pred, y_test))
print(automl.show_models())

r2 score:  0.18172291187203238
[(1.000000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'standardize', 'feature_preprocessor:__choice__': 'fast_ica', 'regressor:__choice__': 'LGBReg', 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.06411384473064781, 'feature_preprocessor:fast_ica:algorithm': 'deflation', 'feature_preprocessor:fast_ica:fun': 'logcosh', 'feature_preprocessor:fast_ica:whiten': 'False', 'regressor:LGBReg:boosting_type': 'dart', 'regressor:LGBReg:learning_rate': 0.42089639838202075, 'regressor:LGBReg:max_depth': 980, 'regressor:LGBReg:n_estimators': 183, 'regressor:LGBReg:num_leaves': 363, 'regressor:LG