Skip to content

Commit

Permalink
Feedback from PR
Browse files Browse the repository at this point in the history
  • Loading branch information
franchuterivera committed Apr 12, 2021
1 parent e5539dc commit 426c2e1
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 34 deletions.
35 changes: 25 additions & 10 deletions autosklearn/automl.py
Expand Up @@ -8,7 +8,7 @@
import os
import sys
import time
from typing import Any, Dict, Optional, List, Tuple
from typing import Any, Dict, Optional, List, Tuple, Union
import uuid
import unittest.mock
import warnings
Expand Down Expand Up @@ -950,10 +950,11 @@ def fit_pipeline(
y: SUPPORTED_TARGET_TYPES,
task: int,
is_classification: bool,
config: Configuration,
config: Union[Configuration, Dict[str, Union[str, float, int]]],
dataset_name: Optional[str] = None,
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
feat_type: Optional[List[str]] = None,
**kwargs: Dict,
) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]:
""" Fits and individual pipeline configuration and returns
Expand All @@ -975,13 +976,21 @@ def fit_pipeline(
If provided, the testing performance will be tracked on these features.
y_test: array-like
If provided, the testing performance will be tracked on these labels
config: Configuration
A configuration object used to define a pipeline steps
config: Union[Configuration, Dict[str, Union[str, float, int]]]
A configuration object used to define the pipeline steps. If a dictionary is passed,
a configuration is created based on this dictionary.
dataset_name: Optional[str]
A string to tag and identify the Auto-Sklearn run
is_classification: bool
Whether the task is for classification or regression. This affects
how the targets are treated
feat_type : list, optional (default=None)
List of str of `len(X.shape[1])` describing the attribute type.
Possible types are `Categorical` and `Numerical`. `Categorical`
attributes will be automatically One-Hot encoded. The values
used for a categorical attribute must be integers, obtained for
example by `sklearn.preprocessing.LabelEncoder
<http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html>`_.
Returns
-------
Expand All @@ -999,23 +1008,25 @@ def fit_pipeline(
# dataset
if self.configuration_space is None:
self.configuration_space = self.fit(
X=X, y=y, task=task,
X=X, y=y,
dataset_name=dataset_name if dataset_name is not None else self._dataset_name,
X_test=X_test,
y_test=y_test,
feat_type=kwargs.pop('feat_type', self._feat_type),
feat_type=feat_type,
only_return_configuration_space=True)

# We do not want to overwrite existing runs
self.num_run += 1
if isinstance(config, dict):
config = Configuration(self.configuration_space, config)
config.config_id = self.num_run

# Get the components to include and exclude on the configuration space
# from the estimator attributes
include, exclude = parse_include_exclude_components(
task=self._task,
include_estimators=self._exclude_estimators,
exclude_estimators=self._include_estimators,
include_estimators=self._include_estimators,
exclude_estimators=self._exclude_estimators,
include_preprocessors=self._include_preprocessors,
exclude_preprocessors=self._exclude_preprocessors,
)
Expand Down Expand Up @@ -1591,10 +1602,11 @@ def fit_pipeline(
self,
X: SUPPORTED_FEAT_TYPES,
y: SUPPORTED_TARGET_TYPES,
config: Configuration,
config: Union[Configuration, Dict[str, Union[str, float, int]]],
dataset_name: Optional[str] = None,
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
feat_type: Optional[List[str]] = None,
**kwargs,
) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]:
y_task = type_of_target(y)
Expand All @@ -1615,6 +1627,7 @@ def fit_pipeline(
config=config,
task=task,
is_classification=True,
feat_type=feat_type,
**kwargs,
)

Expand Down Expand Up @@ -1681,10 +1694,11 @@ def fit_pipeline(
self,
X: SUPPORTED_FEAT_TYPES,
y: SUPPORTED_TARGET_TYPES,
config: Configuration,
config: Union[Configuration, Dict[str, Union[str, float, int]]],
dataset_name: Optional[str] = None,
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
feat_type: Optional[List[str]] = None,
**kwargs: Dict,
) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]:

Expand All @@ -1703,6 +1717,7 @@ def fit_pipeline(
X_test=X_test, y_test=y_test,
config=config,
task=task,
feat_type=feat_type,
dataset_name=dataset_name,
is_classification=False,
**kwargs,
Expand Down
54 changes: 32 additions & 22 deletions autosklearn/estimators.py
@@ -1,6 +1,6 @@
# -*- encoding: utf-8 -*-

from typing import Optional, Dict, List, Tuple
from typing import Optional, Dict, List, Tuple, Union

from ConfigSpace.configuration_space import Configuration
import dask.distributed
Expand Down Expand Up @@ -349,10 +349,11 @@ def fit_pipeline(
self,
X: SUPPORTED_FEAT_TYPES,
y: SUPPORTED_TARGET_TYPES,
config: Configuration,
config: Union[Configuration, Dict[str, Union[str, float, int]]],
dataset_name: Optional[str] = None,
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
feat_type: Optional[List[str]] = None,
*args,
**kwargs: Dict,
) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]:
Expand All @@ -369,35 +370,44 @@ def fit_pipeline(
Parameters
----------
X: array-like, shape = (n_samples, n_features)
The features used for training
y: array-like
The labels used for training
X_test: Optional array-like, shape = (n_samples, n_features)
If provided, the testing performance will be tracked on these features.
y_test: array-like
If provided, the testing performance will be tracked on these labels
config: Configuration
A configuration object used to define a pipeline steps
dataset_name: Optional[str]
Name that will be used to tag the Auto-Sklearn run and identify the
Auto-Sklearn run
X: array-like, shape = (n_samples, n_features)
The features used for training
y: array-like
The labels used for training
        X_test: Optional array-like, shape = (n_samples, n_features)
            If provided, the testing performance will be tracked on these features.
        y_test: array-like
            If provided, the testing performance will be tracked on these labels
config: Union[Configuration, Dict[str, Union[str, float, int]]]
A configuration object used to define the pipeline steps.
If a dictionary is passed, a configuration is created based on this dictionary.
dataset_name: Optional[str]
Name that will be used to tag the Auto-Sklearn run and identify the
Auto-Sklearn run
feat_type : list, optional (default=None)
List of str of `len(X.shape[1])` describing the attribute type.
Possible types are `Categorical` and `Numerical`. `Categorical`
attributes will be automatically One-Hot encoded. The values
used for a categorical attribute must be integers, obtained for
example by `sklearn.preprocessing.LabelEncoder
<http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html>`_.
Returns
-------
pipeline: Optional[BasePipeline]
The fitted pipeline. In case of failure while fitting the pipeline,
a None is returned.
run_info: RunInfo
A named tuple that contains the configuration launched
run_value: RunValue
A named tuple that contains the result of the run
pipeline: Optional[BasePipeline]
The fitted pipeline. In case of failure while fitting the pipeline,
a None is returned.
        run_info: RunInfo
A named tuple that contains the configuration launched
run_value: RunValue
A named tuple that contains the result of the run
"""
if self.automl_ is None:
self.automl_ = self.build_automl()
return self.automl_.fit_pipeline(X=X, y=y,
dataset_name=dataset_name,
config=config,
feat_type=feat_type,
X_test=X_test, y_test=y_test,
*args, **kwargs)

Expand Down
4 changes: 2 additions & 2 deletions autosklearn/smbo.py
Expand Up @@ -418,8 +418,8 @@ def run_smbo(self):
# into a queue and querying them once the time is over
include, exclude = parse_include_exclude_components(
task=self.task,
include_estimators=self.exclude_estimators,
exclude_estimators=self.include_estimators,
include_estimators=self.include_estimators,
exclude_estimators=self.exclude_estimators,
include_preprocessors=self.include_preprocessors,
exclude_preprocessors=self.exclude_preprocessors,
)
Expand Down
2 changes: 2 additions & 0 deletions test/test_automl/test_estimators.py
Expand Up @@ -732,6 +732,7 @@ def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_
per_run_time_limit=30,
ensemble_size=0,
dask_client=dask_client,
include_estimators=['random_forest'],
seed=seed,
# We cannot get the configuration space with 'test' not fit with it
resampling_strategy=resampling_strategy if resampling_strategy != 'test' else 'holdout',
Expand Down Expand Up @@ -765,6 +766,7 @@ def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_
else:
# We should have fitted a pipeline with named_steps
assert hasattr(pipeline, 'named_steps')
assert 'RandomForest' in pipeline.steps[-1][-1].choice.__class__.__name__

# Num run should be 2, as 1 is for dummy classifier and we have not launch
# another pipeline
Expand Down

0 comments on commit 426c2e1

Please sign in to comment.