# Extending Auto-Sklearn with XGBboost Component

In [1]:
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \
    UniformIntegerHyperparameter, CategoricalHyperparameter

import sklearn.metrics
import autosklearn.regression
import autosklearn.pipeline.components.regression
from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm
from autosklearn.pipeline.constants import SPARSE, DENSE, \
    SIGNED_DATA, UNSIGNED_DATA, PREDICTIONS

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

import xgboost as xgb
print("xgb", xgb.__version__)
print("autosklearn", autosklearn.__version__)



  self.re = re.compile(self.reString)


xgb 1.2.1
autosklearn 0.10.0


## Generate data

In [2]:
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Test XBG

In [3]:
# xgb_model = xgb.XGBRegressor(n_estimators=1)
# xgb_model.fit(X_train, y_train, verbose=1)
# xgb_model.predict(X_test)

## Implement XBG component

In [4]:
class XGBReg(AutoSklearnRegressionAlgorithm):
    def __init__(self, learning_rate, booster, tree_method, gamma, reg_alpha, reg_lambda, base_score, n_estimators, 
                 max_depth, min_child_weight, random_state=None):
        self.learning_rate=learning_rate
        self.booster = booster
        self.tree_method = tree_method
        self.gamma=gamma
        self.reg_alpha=reg_alpha
        self.reg_lambda=reg_lambda
        self.base_score=base_score
        self.n_estimators=n_estimators
        self.max_depth=max_depth
        self.min_child_weight=min_child_weight
        
        self.random_state = random_state
        self.estimator = None
        
    def fit(self, X, y):
        import xgboost as xgb
        self.estimator = xgb.XGBRegressor(learning_rate=self.learning_rate, booster=self.booster, 
                                          tree_method=self.tree_method, gamma=self.gamma, reg_alpha=self.reg_alpha,
                                          reg_lambda=self.reg_lambda, base_score=self.base_score, 
                                          n_estimators=self.n_estimators, max_depth=self.max_depth,
                                          min_child_weight = self.min_child_weight)
        self.estimator.fit(X, y)
        return self
    
    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)
    
    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'XGBReg',
                'name': 'XGB Regression',
                'handles_regression': True,
                'handles_classification': False,
                'handles_multiclass': False,
                'handles_multilabel': False,
                'handles_multioutput': True,
                'is_deterministic': True,
                'input': (SPARSE, DENSE, UNSIGNED_DATA, SIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()
        learning_rate = UniformFloatHyperparameter(
            name='learning_rate', lower=10 ** -7, upper=1,default_value=0.3) #log=True
        booster = CategoricalHyperparameter(
            name='booster',choices=['gbtree','gblinear','dart'],default_value='gbtree') 
        tree_method = CategoricalHyperparameter(
            name='tree_method', choices=['auto','exact','approx', 'hist'], default_value='auto') #'gpu_hist'
        gamma = UniformFloatHyperparameter(
            name='gamma', lower=0.0000000, upper=1, default_value=0.0) #log=True
        reg_alpha = UniformFloatHyperparameter(
            name='reg_alpha', lower=0.0000000, upper=1, default_value=0.0) #log=True
        reg_lambda = UniformFloatHyperparameter(
            name='reg_lambda', lower=0.0000000, upper=1, default_value=1) #log=True
        base_score = UniformFloatHyperparameter(
            name='base_score', lower=0.1, upper=1, default_value=0.5) #log=True
        
        n_estimators=UniformIntegerHyperparameter(
            name='n_estimators', lower=1, upper=2000, default_value=100)
        max_depth=UniformIntegerHyperparameter(
            name='max_depth', lower=1, upper=10, default_value=6)
        min_child_weight= UniformIntegerHyperparameter(
            name='min_child_weight', lower=1, upper=10, default_value=1)

        
        cs.add_hyperparameters([learning_rate, booster, tree_method, gamma,reg_alpha, reg_lambda,base_score,n_estimators,
                               max_depth, min_child_weight])
        return cs
    
# Add XGB component to auto-sklearn.
autosklearn.pipeline.components.regression.add_regressor(XGBReg)
cs = XGBReg.get_hyperparameter_search_space()
print(cs)

Configuration space object:
  Hyperparameters:
    base_score, Type: UniformFloat, Range: [0.1, 1.0], Default: 0.5
    booster, Type: Categorical, Choices: {gbtree, gblinear, dart}, Default: gbtree
    gamma, Type: UniformFloat, Range: [0.0, 1.0], Default: 0.0
    learning_rate, Type: UniformFloat, Range: [1e-07, 1.0], Default: 0.3
    max_depth, Type: UniformInteger, Range: [1, 10], Default: 6
    min_child_weight, Type: UniformInteger, Range: [1, 10], Default: 1
    n_estimators, Type: UniformInteger, Range: [1, 2000], Default: 100
    reg_alpha, Type: UniformFloat, Range: [0.0, 1.0], Default: 0.0
    reg_lambda, Type: UniformFloat, Range: [0.0, 1.0], Default: 1.0
    tree_method, Type: Categorical, Choices: {auto, exact, approx, hist}, Default: auto



## Fit the model using the created XBG component

In [5]:
reg = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=30,
    per_run_time_limit=10,
    include_estimators=['XGBReg'] 
    ,ensemble_size=1
    ,initial_configurations_via_metalearning=0,
)
reg.fit(X_train, y_train)

Parameters: { gamma, max_depth, min_child_weight, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { gamma, max_depth, min_child_weight, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { gamma, max_depth, min_child_weight, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { gamma, max_depth,



AutoSklearnRegressor(dask_client=None,
                     delete_output_folder_after_terminate=True,
                     delete_tmp_folder_after_terminate=True,
                     disable_evaluator_output=False, ensemble_memory_limit=1024,
                     ensemble_nbest=50, ensemble_size=1,
                     exclude_estimators=None, exclude_preprocessors=None,
                     get_smac_object_callback=None,
                     include_estimators=['XGBReg'], include_preprocessors=None,
                     initial_configurations_via_metalearning=0,
                     logging_config=None, max_models_on_disc=50,
                     metadata_directory=None, metric=None, ml_memory_limit=3072,
                     n_jobs=None, output_folder=None, per_run_time_limit=10,
                     resampling_strategy='holdout',
                     resampling_strategy_arguments=None, seed=1,
                     smac_scenario_args=None, time_left_for_this_task=30,
              

## Print prediction 



In [6]:
y_pred = reg.predict(X_test)
y_pred

array([158.2543335 , 165.25517273, 232.31369019, 171.4210968 ,
       180.06288147, 128.54096985, 118.91077423, 171.2124939 ,
       212.3092041 , 128.57797241, 116.26277924, 141.31118774,
        74.54495239, 118.4146347 , 240.20515442, 185.80239868,
       137.2454071 , 117.47792053,  97.5131073 , 247.29804993,
       135.97555542, 264.20507812, 102.75144196, 100.00663757,
       187.87504578, 144.37947083, 128.80046082, 264.92733765,
       181.68066406, 130.91119385, 164.19534302,  65.94696045,
       168.15518188, 229.20506287, 104.61754608, 208.60090637,
       139.85713196, 140.45094299, 152.35533142, 115.55531311,
       181.44374084, 116.09146118, 189.69267273, 129.76060486,
       156.42976379, 188.09292603, 153.91339111, 107.44093323,
       175.9956665 , 137.43299866, 151.84436035, 154.20431519,
       211.23632812, 177.24919128, 216.05328369, 145.0786438 ,
       152.38664246,  57.05230331, 129.16923523,  83.51231384,
       212.88713074, 106.92141724, 175.31744385,  80.62

## Print search results

In [7]:
print(reg.sprint_statistics())

auto-sklearn results:
  Dataset name: f5006b84b27d0942603109061919c7e5
  Metric: r2
  Best validation score: 0.585748
  Number of target algorithm runs: 11
  Number of successful target algorithm runs: 10
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 1
  Number of target algorithms that exceeded the memory limit: 0



## Print model paramerts

In [8]:
print("r2 score: ", sklearn.metrics.r2_score(y_pred, y_test))
print(reg.show_models())

r2 score:  -0.15646095406938665
[(1.000000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'robust_scaler', 'feature_preprocessor:__choice__': 'select_percentile_regression', 'regressor:__choice__': 'XGBReg', 'data_preprocessing:numerical_transformer:rescaling:robust_scaler:q_max': 0.7412066674374952, 'data_preprocessing:numerical_transformer:rescaling:robust_scaler:q_min': 0.28631016731182696, 'feature_preprocessor:select_percentile_regression:percentile': 71.2593340979654, 'feature_preprocessor:select_percentile_regression:score_func': 'f_regression', 'regressor:XGBReg:base_score': 0.3197409069902052, 'regressor:XGBReg:booster': 'gblinear', 'regressor:XGBReg:gamma': 0.7142038042875509