In [1]:
!pip install auto-sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting auto-sklearn
  Downloading auto-sklearn-0.15.0.tar.gz (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 4.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting liac-arff
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
Collecting ConfigSpace<0.5,>=0.4.21
  Downloading ConfigSpace-0.4.21-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 33.6 MB/s 
Collecting pynisher<0.7,>=0.6.3
  Downloading pynisher-0.6.4.tar.gz (11 kB)
Collecting pyrfr<0.9,>=0.8.1
  Downloading pyrfr-0.8.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 40.8 MB/s 
[?25hCollecting scikit-learn<0.25.0,>=0.24.0
  Downloading scikit_learn-0.24

In [2]:
%matplotlib inline


# Extending Auto-Sklearn with Classification Component

The following example demonstrates how to create a new classification
component for using in auto-sklearn.


In [1]:
from typing import Optional
from pprint import pprint

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformIntegerHyperparameter,
    UniformFloatHyperparameter,
)

import sklearn.metrics

from autosklearn.askl_typing import FEAT_TYPE_TYPE
import autosklearn.classification
import autosklearn.pipeline.components.classification
from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm
from autosklearn.pipeline.constants import (
    DENSE,
    SIGNED_DATA,
    UNSIGNED_DATA,
    PREDICTIONS,
)

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

## Create MLP classifier component for auto-sklearn



In [2]:
class MLPClassifier(AutoSklearnClassificationAlgorithm):
    def __init__(
        self,
        hidden_layer_depth,
        num_nodes_per_layer,
        activation,
        alpha,
        solver,
        random_state=None,
    ):
        self.hidden_layer_depth = hidden_layer_depth
        self.num_nodes_per_layer = num_nodes_per_layer
        self.activation = activation
        self.alpha = alpha
        self.solver = solver
        self.random_state = random_state

    def fit(self, X, y):
        self.num_nodes_per_layer = int(self.num_nodes_per_layer)
        self.hidden_layer_depth = int(self.hidden_layer_depth)
        self.alpha = float(self.alpha)

        from sklearn.neural_network import MLPClassifier

        hidden_layer_sizes = tuple(
            self.num_nodes_per_layer for i in range(self.hidden_layer_depth)
        )

        self.estimator = MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=self.activation,
            alpha=self.alpha,
            solver=self.solver,
            random_state=self.random_state,
        )
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            "shortname": "MLP Classifier",
            "name": "MLP CLassifier",
            "handles_regression": False,
            "handles_classification": True,
            "handles_multiclass": True,
            "handles_multilabel": False,
            "handles_multioutput": False,
            "is_deterministic": False,
            # Both input and output must be tuple(iterable)
            "input": [DENSE, SIGNED_DATA, UNSIGNED_DATA],
            "output": [PREDICTIONS],
        }

    @staticmethod
    def get_hyperparameter_search_space(
        feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None
    ):
        cs = ConfigurationSpace()
        hidden_layer_depth = UniformIntegerHyperparameter(
            name="hidden_layer_depth", lower=1, upper=3, default_value=1
        )
        num_nodes_per_layer = UniformIntegerHyperparameter(
            name="num_nodes_per_layer", lower=16, upper=216, default_value=32
        )
        activation = CategoricalHyperparameter(
            name="activation",
            choices=["identity", "logistic", "tanh", "relu"],
            default_value="relu",
        )
        alpha = UniformFloatHyperparameter(
            name="alpha", lower=0.0001, upper=1.0, default_value=0.0001
        )
        solver = CategoricalHyperparameter(
            name="solver", choices=["lbfgs", "sgd", "adam"], default_value="adam"
        )
        cs.add_hyperparameters(
            [
                hidden_layer_depth,
                num_nodes_per_layer,
                activation,
                alpha,
                solver,
            ]
        )
        return cs


# Add MLP classifier component to auto-sklearn.
autosklearn.pipeline.components.classification.add_classifier(MLPClassifier)
cs = MLPClassifier.get_hyperparameter_search_space()
print(cs)

Configuration space object:
  Hyperparameters:
    activation, Type: Categorical, Choices: {identity, logistic, tanh, relu}, Default: relu
    alpha, Type: UniformFloat, Range: [0.0001, 1.0], Default: 0.0001
    hidden_layer_depth, Type: UniformInteger, Range: [1, 3], Default: 1
    num_nodes_per_layer, Type: UniformInteger, Range: [16, 216], Default: 32
    solver, Type: Categorical, Choices: {lbfgs, sgd, adam}, Default: adam



## Data Loading



In [3]:
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Fit MLP classifier to the data



In [4]:
clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=30,
    per_run_time_limit=10,
    include={"classifier": ["MLPClassifier"]},
    # Bellow two flags are provided to speed up calculations
    # Not recommended for a real implementation
    initial_configurations_via_metalearning=0,
    smac_scenario_args={"runcount_limit": 5},
)
clf.fit(X_train, y_train)

AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      include={'classifier': ['MLPClassifier']},
                      initial_configurations_via_metalearning=0,
                      per_run_time_limit=10,
                      smac_scenario_args={'runcount_limit': 5},
                      time_left_for_this_task=30)

## Print test accuracy and statistics



In [5]:
y_pred = clf.predict(X_test)
print("accuracy: ", sklearn.metrics.accuracy_score(y_pred, y_test))
pprint(clf.show_models(), indent=4)

accuracy:  0.965034965034965
{   2: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f607fef48d0>,
           'cost': 0.021276595744680882,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f607fcafc90>,
           'ensemble_weight': 0.06,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f607fef47d0>,
           'model_id': 2,
           'rank': 1,
           'sklearn_classifier': MLPClassifier(hidden_layer_sizes=(32,), random_state=1)},
    3: {   'balancing': Balancing(random_state=1, strategy='weighting'),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f6080100f50>,
           'cost': 0.06382978723404253,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocess

In [6]:
%matplotlib inline


# Extending Auto-Sklearn with Data Preprocessor Component

The following example demonstrates how to turn off data preprocessing step in auto-skearn.


In [7]:
from typing import Optional
from pprint import pprint

import autosklearn.classification
import autosklearn.pipeline.components.data_preprocessing
import sklearn.metrics
from ConfigSpace.configuration_space import ConfigurationSpace

from autosklearn.askl_typing import FEAT_TYPE_TYPE
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

## Create NoPreprocessing component for auto-sklearn



In [8]:
class NoPreprocessing(AutoSklearnPreprocessingAlgorithm):
    def __init__(self, **kwargs):
        """This preprocessors does not change the data"""
        # Some internal checks makes sure parameters are set
        for key, val in kwargs.items():
            setattr(self, key, val)

    def fit(self, X, Y=None):
        return self

    def transform(self, X):
        return X

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            "shortname": "NoPreprocessing",
            "name": "NoPreprocessing",
            "handles_regression": True,
            "handles_classification": True,
            "handles_multiclass": True,
            "handles_multilabel": True,
            "handles_multioutput": True,
            "is_deterministic": True,
            "input": (SPARSE, DENSE, UNSIGNED_DATA),
            "output": (INPUT,),
        }

    @staticmethod
    def get_hyperparameter_search_space(
        feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None
    ):
        return ConfigurationSpace()  # Return an empty configuration as there is None


# Add NoPreprocessing component to auto-sklearn.
autosklearn.pipeline.components.data_preprocessing.add_preprocessor(NoPreprocessing)

## Create dataset



In [9]:
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Fit the model without performing data preprocessing



In [10]:
clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    include={"data_preprocessor": ["NoPreprocessing"]},
    # Bellow two flags are provided to speed up calculations
    # Not recommended for a real implementation
    initial_configurations_via_metalearning=0,
    smac_scenario_args={"runcount_limit": 5},
)
clf.fit(X_train, y_train)

# To check that models were found without issue when running examples
assert len(clf.get_models_with_weights()) > 0
print(clf.sprint_statistics())

auto-sklearn results:
  Dataset name: 209f37da-58bb-11ed-80b5-0242ac1c0002
  Metric: accuracy
  Best validation score: 0.971631
  Number of target algorithm runs: 5
  Number of successful target algorithm runs: 5
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0



## Print prediction score and statistics



In [11]:
y_pred = clf.predict(X_test)
print("accuracy: ", sklearn.metrics.accuracy_score(y_pred, y_test))
pprint(clf.show_models(), indent=4)

accuracy:  0.958041958041958
{   2: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f607d166550>,
           'cost': 0.028368794326241176,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f607ffc9d10>,
           'ensemble_weight': 0.2,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f607d166ed0>,
           'model_id': 2,
           'rank': 1,
           'sklearn_classifier': RandomForestClassifier(max_features=5, n_estimators=512, n_jobs=1,
                       random_state=1, warm_start=True)},
    3: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f607ff82550>,
           'cost': 0.099290780141844,
           'data_preprocessor': <autos

In [None]:
%matplotlib inline


# Extending Auto-Sklearn with Preprocessor Component

The following example demonstrates how to create a wrapper around the linear
discriminant analysis (LDA) algorithm from sklearn and use it as a preprocessor
in auto-sklearn.


In [12]:
from typing import Optional
from pprint import pprint

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    UniformFloatHyperparameter,
    CategoricalHyperparameter,
)
from ConfigSpace.conditions import InCondition

import sklearn.metrics

from autosklearn.askl_typing import FEAT_TYPE_TYPE
import autosklearn.classification
import autosklearn.pipeline.components.feature_preprocessing
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, SIGNED_DATA, UNSIGNED_DATA
from autosklearn.util.common import check_none

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

## Create LDA component for auto-sklearn



In [13]:
class LDA(AutoSklearnPreprocessingAlgorithm):
    def __init__(self, solver, tol, shrinkage=None, random_state=None):
        self.solver = solver
        self.shrinkage = shrinkage
        self.tol = tol
        self.random_state = random_state
        self.preprocessor = None

    def fit(self, X, y=None):
        if check_none(self.shrinkage):
            self.shrinkage = None
        else:
            self.shrinkage = float(self.shrinkage)
        self.tol = float(self.tol)

        import sklearn.discriminant_analysis

        self.preprocessor = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            shrinkage=self.shrinkage,
            solver=self.solver,
            tol=self.tol,
        )
        self.preprocessor.fit(X, y)
        return self

    def transform(self, X):
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            "shortname": "LDA",
            "name": "Linear Discriminant Analysis",
            "handles_regression": False,
            "handles_classification": True,
            "handles_multiclass": False,
            "handles_multilabel": False,
            "handles_multioutput": False,
            "is_deterministic": True,
            "input": (DENSE, UNSIGNED_DATA, SIGNED_DATA),
            "output": (DENSE, UNSIGNED_DATA, SIGNED_DATA),
        }

    @staticmethod
    def get_hyperparameter_search_space(
        feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None
    ):
        cs = ConfigurationSpace()
        solver = CategoricalHyperparameter(
            name="solver", choices=["svd", "lsqr", "eigen"], default_value="svd"
        )
        shrinkage = UniformFloatHyperparameter(
            name="shrinkage", lower=0.0, upper=1.0, default_value=0.5
        )
        tol = UniformFloatHyperparameter(
            name="tol", lower=0.0001, upper=1, default_value=0.0001
        )
        cs.add_hyperparameters([solver, shrinkage, tol])
        shrinkage_condition = InCondition(shrinkage, solver, ["lsqr", "eigen"])
        cs.add_condition(shrinkage_condition)
        return cs


# Add LDA component to auto-sklearn.
autosklearn.pipeline.components.feature_preprocessing.add_preprocessor(LDA)

## Create dataset



In [14]:
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Configuration space



In [15]:
cs = LDA.get_hyperparameter_search_space()
print(cs)

Configuration space object:
  Hyperparameters:
    shrinkage, Type: UniformFloat, Range: [0.0, 1.0], Default: 0.5
    solver, Type: Categorical, Choices: {svd, lsqr, eigen}, Default: svd
    tol, Type: UniformFloat, Range: [0.0001, 1.0], Default: 0.0001
  Conditions:
    shrinkage | solver in {'lsqr', 'eigen'}



## Fit the model using LDA as preprocessor



In [16]:
clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=30,
    include={"feature_preprocessor": ["LDA"]},
    # Bellow two flags are provided to speed up calculations
    # Not recommended for a real implementation
    initial_configurations_via_metalearning=0,
    smac_scenario_args={"runcount_limit": 5},
)
clf.fit(X_train, y_train)

AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      include={'feature_preprocessor': ['LDA']},
                      initial_configurations_via_metalearning=0,
                      per_run_time_limit=3,
                      smac_scenario_args={'runcount_limit': 5},
                      time_left_for_this_task=30)

## Print prediction score and statistics



In [17]:
y_pred = clf.predict(X_test)
print("accuracy: ", sklearn.metrics.accuracy_score(y_pred, y_test))
pprint(clf.show_models(), indent=4)

accuracy:  0.9790209790209791
{   2: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f607d314c90>,
           'cost': 0.05673758865248224,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f607d2e0a90>,
           'ensemble_weight': 0.18,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f609fb60110>,
           'model_id': 2,
           'rank': 1,
           'sklearn_classifier': RandomForestClassifier(max_features=1, n_estimators=512, n_jobs=1,
                       random_state=1, warm_start=True)},
    3: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f607ffd2110>,
           'cost': 0.028368794326241176,
           'data_preprocessor': <a

In [18]:
%matplotlib inline


# Extending Auto-Sklearn with Regression Component

The following example demonstrates how to create a new regression
component for using in auto-sklearn.


In [19]:
from typing import Optional
from pprint import pprint

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    UniformFloatHyperparameter,
    UniformIntegerHyperparameter,
    CategoricalHyperparameter,
)
from ConfigSpace.conditions import EqualsCondition

import sklearn.metrics

from autosklearn.askl_typing import FEAT_TYPE_TYPE
import autosklearn.regression
import autosklearn.pipeline.components.regression
from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm
from autosklearn.pipeline.constants import (
    SPARSE,
    DENSE,
    SIGNED_DATA,
    UNSIGNED_DATA,
    PREDICTIONS,
)

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

## Implement kernel ridge regression component for auto-sklearn



In [20]:
class KernelRidgeRegression(AutoSklearnRegressionAlgorithm):
    def __init__(self, alpha, kernel, gamma, degree, coef0, random_state=None):
        self.alpha = alpha
        self.kernel = kernel
        self.gamma = gamma
        self.degree = degree
        self.coef0 = coef0
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        self.alpha = float(self.alpha)
        self.gamma = float(self.gamma)
        self.degree = int(self.degree)
        self.coef0 = float(self.coef0)

        import sklearn.kernel_ridge

        self.estimator = sklearn.kernel_ridge.KernelRidge(
            alpha=self.alpha,
            kernel=self.kernel,
            gamma=self.gamma,
            degree=self.degree,
            coef0=self.coef0,
        )
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            "shortname": "KRR",
            "name": "Kernel Ridge Regression",
            "handles_regression": True,
            "handles_classification": False,
            "handles_multiclass": False,
            "handles_multilabel": False,
            "handles_multioutput": True,
            "is_deterministic": True,
            "input": (SPARSE, DENSE, UNSIGNED_DATA, SIGNED_DATA),
            "output": (PREDICTIONS,),
        }

    @staticmethod
    def get_hyperparameter_search_space(
        feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None
    ):
        cs = ConfigurationSpace()
        alpha = UniformFloatHyperparameter(
            name="alpha", lower=10**-5, upper=1, log=True, default_value=1.0
        )
        kernel = CategoricalHyperparameter(
            name="kernel",
            # We restrict ourselves to two possible kernels for this example
            choices=["polynomial", "rbf"],
            default_value="polynomial",
        )
        gamma = UniformFloatHyperparameter(
            name="gamma", lower=0.00001, upper=1, default_value=0.1, log=True
        )
        degree = UniformIntegerHyperparameter(
            name="degree", lower=2, upper=5, default_value=3
        )
        coef0 = UniformFloatHyperparameter(
            name="coef0",
            lower=1e-2,
            upper=1e2,
            log=True,
            default_value=1,
        )
        cs.add_hyperparameters([alpha, kernel, gamma, degree, coef0])
        degree_condition = EqualsCondition(degree, kernel, "polynomial")
        coef0_condition = EqualsCondition(coef0, kernel, "polynomial")
        cs.add_conditions([degree_condition, coef0_condition])
        return cs


# Add KRR component to auto-sklearn.
autosklearn.pipeline.components.regression.add_regressor(KernelRidgeRegression)
cs = KernelRidgeRegression.get_hyperparameter_search_space()
print(cs)

Configuration space object:
  Hyperparameters:
    alpha, Type: UniformFloat, Range: [1e-05, 1.0], Default: 1.0, on log-scale
    coef0, Type: UniformFloat, Range: [0.01, 100.0], Default: 1.0, on log-scale
    degree, Type: UniformInteger, Range: [2, 5], Default: 3
    gamma, Type: UniformFloat, Range: [1e-05, 1.0], Default: 0.1, on log-scale
    kernel, Type: Categorical, Choices: {polynomial, rbf}, Default: polynomial
  Conditions:
    coef0 | kernel == 'polynomial'
    degree | kernel == 'polynomial'



## Generate data



In [22]:
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Fit the model using KRR



In [23]:
reg = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=30,
    per_run_time_limit=10,
    include={"regressor": ["KernelRidgeRegression"]},
    # Bellow two flags are provided to speed up calculations
    # Not recommended for a real implementation
    initial_configurations_via_metalearning=0,
    smac_scenario_args={"runcount_limit": 5},
)
reg.fit(X_train, y_train)

AutoSklearnRegressor(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                     include={'regressor': ['KernelRidgeRegression']},
                     initial_configurations_via_metalearning=0,
                     per_run_time_limit=10,
                     smac_scenario_args={'runcount_limit': 5},
                     time_left_for_this_task=30)

## Print prediction score and statistics



In [24]:
y_pred = reg.predict(X_test)
print("r2 score: ", sklearn.metrics.r2_score(y_pred, y_test))
pprint(reg.show_models(), indent=4)

r2 score:  -0.45930565031627846
{   2: {   'cost': 0.7061804240788295,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f607ff9ac50>,
           'ensemble_weight': 0.24,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f607c720990>,
           'model_id': 2,
           'rank': 1,
           'regressor': <autosklearn.pipeline.components.regression.RegressorChoice object at 0x7f607c720490>,
           'sklearn_regressor': KernelRidge(alpha=1.0, coef0=1.0, gamma=0.1, kernel='polynomial')},
    5: {   'cost': 0.5995304732284444,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f607ff9ac10>,
           'ensemble_weight': 0.76,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f607c7009d0>,
     

In [None]:
%matplotlib inline


# Restricting the number of hyperparameters for an existing component

The following example demonstrates how to replace an existing
component with a new component, implementing the same classifier,
but with different hyperparameters .


In [25]:
from typing import Optional

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    UniformIntegerHyperparameter,
    UniformFloatHyperparameter,
)

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from autosklearn.askl_typing import FEAT_TYPE_TYPE
import autosklearn.classification
import autosklearn.pipeline.components.classification
from autosklearn.pipeline.components.classification import (
    AutoSklearnClassificationAlgorithm,
)
from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE

## Subclass auto-sklearn's random forest classifier



In [26]:
# This classifier only has one of the hyperparameter's of auto-sklearn's
# default parametrization (``max_features``). Instead, it also
# tunes the number of estimators (``n_estimators``).


class CustomRandomForest(AutoSklearnClassificationAlgorithm):
    def __init__(self, n_estimators, max_features, random_state=None):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.random_state = random_state

    def fit(self, X, y):
        from sklearn.ensemble import RandomForestClassifier

        self.n_estimators = int(self.n_estimators)

        if self.max_features not in ("sqrt", "log2", "auto"):
            max_features = int(X.shape[1] ** float(self.max_features))
        else:
            max_features = self.max_features

        self.estimator = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_features=max_features,
            random_state=self.random_state,
        )
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            "shortname": "RF",
            "name": "Random Forest Classifier",
            "handles_regression": False,
            "handles_classification": True,
            "handles_multiclass": True,
            "handles_multilabel": True,
            "handles_multioutput": False,
            "is_deterministic": True,
            "input": (DENSE, SPARSE, UNSIGNED_DATA),
            "output": (PREDICTIONS,),
        }

    @staticmethod
    def get_hyperparameter_search_space(
        feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None
    ):
        cs = ConfigurationSpace()

        # The maximum number of features used in the forest is calculated as m^max_features, where
        # m is the total number of features, and max_features is the hyperparameter specified below.
        # The default is 0.5, which yields sqrt(m) features as max_features in the estimator. This
        # corresponds with Geurts' heuristic.
        max_features = UniformFloatHyperparameter(
            "max_features", 0.0, 1.0, default_value=0.5
        )
        n_estimators = UniformIntegerHyperparameter(
            "n_estimators", 10, 1000, default_value=100
        )

        cs.add_hyperparameters([max_features, n_estimators])
        return cs


# Add custom random forest classifier component to auto-sklearn.
autosklearn.pipeline.components.classification.add_classifier(CustomRandomForest)
cs = CustomRandomForest.get_hyperparameter_search_space()
print(cs)

Configuration space object:
  Hyperparameters:
    max_features, Type: UniformFloat, Range: [0.0, 1.0], Default: 0.5
    n_estimators, Type: UniformInteger, Range: [10, 1000], Default: 100



## Data Loading



In [27]:
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Fit Random forest classifier to the data



In [28]:
clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=30,
    per_run_time_limit=10,
    # Here we exclude auto-sklearn's default random forest component
    exclude={"classifier": ["random_forest"]},
    # Bellow two flags are provided to speed up calculations
    # Not recommended for a real implementation
    initial_configurations_via_metalearning=0,
    smac_scenario_args={"runcount_limit": 1},
)
clf.fit(X_train, y_train)

AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      exclude={'classifier': ['random_forest']},
                      initial_configurations_via_metalearning=0,
                      per_run_time_limit=10,
                      smac_scenario_args={'runcount_limit': 1},
                      time_left_for_this_task=30)

## Print the configuration space



In [29]:
# Observe that this configuration space only contains our custom random
# forest, but not auto-sklearn's ``random_forest``
cs = clf.get_configuration_space(X_train, y_train)
assert "random_forest" not in str(cs)
print(cs)

Configuration space object:
  Hyperparameters:
    balancing:strategy, Type: Categorical, Choices: {none, weighting}, Default: none
    classifier:CustomRandomForest:max_features, Type: UniformFloat, Range: [0.0, 1.0], Default: 0.5
    classifier:CustomRandomForest:n_estimators, Type: UniformInteger, Range: [10, 1000], Default: 100
    classifier:MLPClassifier:activation, Type: Categorical, Choices: {identity, logistic, tanh, relu}, Default: relu
    classifier:MLPClassifier:alpha, Type: UniformFloat, Range: [0.0001, 1.0], Default: 0.0001
    classifier:MLPClassifier:hidden_layer_depth, Type: UniformInteger, Range: [1, 3], Default: 1
    classifier:MLPClassifier:num_nodes_per_layer, Type: UniformInteger, Range: [16, 216], Default: 32
    classifier:MLPClassifier:solver, Type: Categorical, Choices: {lbfgs, sgd, adam}, Default: adam
    classifier:__choice__, Type: Categorical, Choices: {adaboost, bernoulli_nb, decision_tree, extra_trees, gaussian_nb, gradient_boosting, k_nearest_neighbo