In [1]:
!pip3 install lightgbm

Collecting lightgbm
  Using cached https://files.pythonhosted.org/packages/70/cd/2b7783e8c250f8191b72e9a0010e0429a799d3305c27764d7bf113dfd078/lightgbm-3.1.1-py2.py3-none-manylinux1_x86_64.whl
Collecting wheel (from lightgbm)
  Using cached https://files.pythonhosted.org/packages/65/63/39d04c74222770ed1589c0eaba06c05891801219272420b40311cd60c880/wheel-0.36.2-py2.py3-none-any.whl
Collecting numpy (from lightgbm)
  Using cached https://files.pythonhosted.org/packages/45/b2/6c7545bb7a38754d63048c7696804a0d947328125d81bf12beaa692c3ae3/numpy-1.19.5-cp36-cp36m-manylinux1_x86_64.whl
Collecting scikit-learn!=0.22.0 (from lightgbm)
  Using cached https://files.pythonhosted.org/packages/a4/11/e5862273960aef46cf98e571db5433bdabe5e816ef3317260dcdabc9b438/scikit_learn-0.24.1-cp36-cp36m-manylinux1_x86_64.whl
Collecting scipy (from lightgbm)
  Using cached https://files.pythonhosted.org/packages/c8/89/63171228d5ced148f5ced50305c89e8576ffc695a90b58fe5bb602b910c2/scipy-1.5.4-cp36-cp36m-manylinux1_x86_64

In [1]:
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, UniformFloatHyperparameter

import sklearn
from sklearn.datasets import load_breast_cancer, load_wine
from sklearn.model_selection import train_test_split

import autosklearn.classification
import autosklearn.pipeline.components.classification
from autosklearn.pipeline.components.classification \
    import AutoSklearnClassificationAlgorithm
from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, SIGNED_DATA, PREDICTIONS
from autosklearn.metrics import balanced_accuracy, precision, recall, f1


############################################################################
# Subclass auto-sklearn's lightgbm classifier
# ================================================

# This classifier only has one of the hyperparameter's of auto-sklearn's
# default parametrization (``max_features``). Instead, it also
# tunes the number of estimators (``n_estimators``).

class CustomLightGBM(AutoSklearnClassificationAlgorithm):
    def __init__(self,
                 n_estimators,
                 num_leaves,
                 random_state=None,
                 ):
        self.n_estimators = n_estimators
        self.num_leaves = num_leaves
        self.random_state = random_state

    def fit(self, X, y):
        from lightgbm import LGBMClassifier

        self.n_estimators = int(self.n_estimators)
        self.num_leaves = int(self.num_leaves)

        self.estimator = LGBMClassifier(
            n_estimators=self.n_estimators,
            max_features=self.num_leaves,
            random_state=self.random_state,
        )
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'LGBM',
                'name': 'LightGBM Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'handles_multioutput': True,
                'is_deterministic': True,
                'input': (DENSE, SIGNED_DATA, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        num_leaves = UniformFloatHyperparameter("num_leaves", 10, 100, default_value=30)
        n_estimators = UniformIntegerHyperparameter("n_estimators", 200, 300, default_value=200)

        cs.add_hyperparameters([num_leaves, n_estimators])
        return cs


# Add custom random forest classifier component to auto-sklearn.
autosklearn.pipeline.components.classification.add_classifier(CustomLightGBM)
cs = CustomLightGBM.get_hyperparameter_search_space()
print(cs)

############################################################################
# Data Loading
# ============

X, y = load_breast_cancer(return_X_y=True)
# X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

############################################################################
# Fit Random forest classifier to the data
# ========================================

# n_jobs를 설정하지 않으면 무지막지하게 느려진다.
clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=600,
    per_run_time_limit=240,
    # Here we exclude auto-sklearn's default random forest component
    include_estimators=['CustomLightGBM'],
    n_jobs=1,
    # Bellow two flags are provided to speed up calculations
    # Not recommended for a real implementation
    initial_configurations_via_metalearning=0,
    scoring_functions=[balanced_accuracy],
    smac_scenario_args={'runcount_limit': 2},
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("accuracy: ", sklearn.metrics.accuracy_score(y_pred, y_test))
print(clf.show_models())
# print(clf.show_models())
############################################################################
# Print the configuration space
# =============================

# Observe that this configuration space only contains our custom random
# forest, but not auto-sklearn's ``random_forest``
# cs = clf.get_configuration_space(X_train, y_train)
# assert 'random_forest' not in str(cs)
# print(cs)

Configuration space object:
  Hyperparameters:
    n_estimators, Type: UniformInteger, Range: [200, 300], Default: 200
    num_leaves, Type: UniformFloat, Range: [10.0, 100.0], Default: 30.0

accuracy:  0.9440559440559441
[(0.880000, SimpleClassificationPipeline({'balancing:strategy': 'none', 'classifier:__choice__': 'CustomLightGBM', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'standardize', 'feature_preprocessor:__choice__': 'no_preprocessing', 'classifier:CustomLightGBM:n_estimators': 200, 'classifier:CustomLightGBM:num_leaves': 30.0, 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.01},
dataset_properties={
  'task': 1,
  'sparse': False,
  

In [2]:
print(clf.automl_.runhistory_.data)

OrderedDict([(RunKey(config_id=1, instance_id='{"task_id": "aa099a9c-808f-11eb-8376-b42e99e8c846"}', seed=0, budget=0.0), RunValue(cost=0.049645390070921946, time=0.2700483798980713, status=<StatusType.SUCCESS: 1>, starttime=1615264008.8356369, endtime=1615264009.118551, additional_info={'balanced_accuracy': 0.06228559176672377, 'accuracy': 0.049645390070921946, 'duration': 0.2559242248535156, 'num_run': 2, 'train_loss': 0.0, 'configuration_origin': 'Default'})), (RunKey(config_id=2, instance_id='{"task_id": "aa099a9c-808f-11eb-8376-b42e99e8c846"}', seed=0, budget=0.0), RunValue(cost=0.05673758865248224, time=0.2784600257873535, status=<StatusType.SUCCESS: 1>, starttime=1615264009.7946033, endtime=1615264010.0850773, additional_info={'balanced_accuracy': 0.07171955403087482, 'accuracy': 0.05673758865248224, 'duration': 0.26608824729919434, 'num_run': 3, 'train_loss': 0.0, 'configuration_origin': 'Random Search (sorted)'}))])


In [3]:
print(clf.cv_results_)

{'mean_test_score': array([0.95035461, 0.94326241]), 'metric_balanced_accuracy': masked_array(data=[0.9377144082332762, 0.9282804459691252],
             mask=[False, False],
       fill_value=1e+20), 'mean_fit_time': array([0.27004838, 0.27846003]), 'params': [{'balancing:strategy': 'none', 'classifier:__choice__': 'CustomLightGBM', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'standardize', 'feature_preprocessor:__choice__': 'no_preprocessing', 'classifier:CustomLightGBM:n_estimators': 200, 'classifier:CustomLightGBM:num_leaves': 30.0, 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.01}, {'balancing:strategy': 'none', 'classifier:__choice__': 