Skip to content

Commit

Permalink
making data preprocessing step configurable with two options no prepr…
Browse files Browse the repository at this point in the history
…ocessing and feature type split (#977)

* making data preprocessing step configurable with two options no preprocessing and feature type split

* Fix: execution fails when data_preprocessor is no_preprocessing

* Incorporating review comments

* Fixing test cases; updating metalearning with updated hyperparameters

* Fixing examples

* Updating portfolios with new config

* Incorporated review comments and fix test case

* Test fixes

* Test fixes

* Fix metalearning config

* Remove unused imports

* Fix test cases

* Fix test cases and examples

* Adding more checks for include and exclude params

* Fix flake error

* Fix flake error

* Handling target_type in dataset_properties

* Fixes

* Fixes

* Fix error

* Fix test cases

* Adding datatype annotations

* Fix test cases

* Fix build

* Fix test case

* Update stale.yaml

* Fix annotation type

* Update portfolios with new config

Co-authored-by: Rohit Agarwal <rohit.agarwal4@aexp.com>
  • Loading branch information
2 people authored and eddiebergman committed Aug 18, 2021
1 parent 90658e5 commit e7b5daa
Show file tree
Hide file tree
Showing 158 changed files with 23,419 additions and 22,420 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/stale.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
steps:
- uses: actions/stale@v3
with:
days-before-stale: 30
days-before-stale: 60
days-before-close: 7
stale-issue-message: >
This issue has been automatically marked as stale because it has not had
Expand All @@ -18,7 +18,5 @@ jobs:
close-issue-message: >
This issue has been automatically closed due to inactivity.
stale-issue-label: 'stale'
# Only issues with ANY of these labels are checked.
# Separate multiple labels with commas (eg. "incomplete,waiting-feedback").
any-of-labels: 'Answered,Feedback-Required,invalid,wontfix'
only-issue-labels: 'Answered,Feedback-Required,invalid,wontfix'
exempt-all-milestones: true
60 changes: 19 additions & 41 deletions autosklearn/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@
get_named_client_logger,
)
from autosklearn.util import pipeline, RE_PATTERN
from autosklearn.util.pipeline import parse_include_exclude_components
from autosklearn.util.parallel import preload_modules
from autosklearn.ensemble_builder import EnsembleBuilderManager
from autosklearn.ensembles.singlebest_ensemble import SingleBest
Expand Down Expand Up @@ -125,10 +124,8 @@ def __init__(self,
memory_limit=3072,
metadata_directory=None,
debug_mode=False,
include_estimators=None,
exclude_estimators=None,
include_preprocessors=None,
exclude_preprocessors=None,
include=None,
exclude=None,
resampling_strategy='holdout-iterative-fit',
resampling_strategy_arguments=None,
n_jobs=None,
Expand Down Expand Up @@ -157,10 +154,8 @@ def __init__(self,
self._memory_limit = memory_limit
self._data_memory_limit = None
self._metadata_directory = metadata_directory
self._include_estimators = include_estimators
self._exclude_estimators = exclude_estimators
self._include_preprocessors = include_preprocessors
self._exclude_preprocessors = exclude_preprocessors
self._include = include
self._exclude = exclude
self._resampling_strategy = resampling_strategy
self._scoring_functions = scoring_functions if scoring_functions is not None else []
self._resampling_strategy_arguments = resampling_strategy_arguments \
Expand Down Expand Up @@ -565,10 +560,8 @@ def fit(
self._logger.debug(' memory_limit: %s', str(self._memory_limit))
self._logger.debug(' metadata_directory: %s', self._metadata_directory)
self._logger.debug(' debug_mode: %s', self._debug_mode)
self._logger.debug(' include_estimators: %s', str(self._include_estimators))
self._logger.debug(' exclude_estimators: %s', str(self._exclude_estimators))
self._logger.debug(' include_preprocessors: %s', str(self._include_preprocessors))
self._logger.debug(' exclude_preprocessors: %s', str(self._exclude_preprocessors))
self._logger.debug(' include: %s', str(self._include))
self._logger.debug(' exclude: %s', str(self._exclude))
self._logger.debug(' resampling_strategy: %s', str(self._resampling_strategy))
self._logger.debug(' resampling_strategy_arguments: %s',
str(self._resampling_strategy_arguments))
Expand Down Expand Up @@ -629,10 +622,9 @@ def fit(
self._backend.temporary_directory,
self._backend,
datamanager,
include_estimators=self._include_estimators,
exclude_estimators=self._exclude_estimators,
include_preprocessors=self._include_preprocessors,
exclude_preprocessors=self._exclude_preprocessors)
include=self._include,
exclude=self._exclude,
)
if only_return_configuration_space:
self._fit_cleanup()
return self.configuration_space
Expand Down Expand Up @@ -748,10 +740,8 @@ def fit(
metric=self._metric,
resampling_strategy=self._resampling_strategy,
resampling_strategy_args=self._resampling_strategy_arguments,
include_estimators=self._include_estimators,
exclude_estimators=self._exclude_estimators,
include_preprocessors=self._include_preprocessors,
exclude_preprocessors=self._exclude_preprocessors,
include=self._include,
exclude=self._exclude,
disable_file_output=self._disable_evaluator_output,
get_smac_object_callback=self._get_smac_object_callback,
smac_scenario_args=self._smac_scenario_args,
Expand Down Expand Up @@ -1088,21 +1078,11 @@ def fit_pipeline(
config = Configuration(self.configuration_space, config)
config.config_id = self.num_run

# Get the components to include and exclude on the configuration space
# from the estimator attributes
include, exclude = parse_include_exclude_components(
task=self._task,
include_estimators=self._include_estimators,
exclude_estimators=self._exclude_estimators,
include_preprocessors=self._include_preprocessors,
exclude_preprocessors=self._exclude_preprocessors,
)

# Prepare missing components to the TAE function call
if 'include' not in kwargs:
kwargs['include'] = include
kwargs['include'] = self._include
if 'exclude' not in kwargs:
kwargs['exclude'] = exclude
kwargs['exclude'] = self._exclude
if 'memory_limit' not in kwargs:
kwargs['memory_limit'] = self._memory_limit
if 'resampling_strategy' not in kwargs:
Expand Down Expand Up @@ -1575,20 +1555,18 @@ def show_models(self):
return sio.getvalue()

def _create_search_space(self, tmp_dir, backend, datamanager,
include_estimators=None,
exclude_estimators=None,
include_preprocessors=None,
exclude_preprocessors=None):
include=None,
exclude=None,
):
task_name = 'CreateConfigSpace'

self._stopwatch.start_task(task_name)
configspace_path = os.path.join(tmp_dir, 'space.json')
configuration_space = pipeline.get_configuration_space(
datamanager.info,
include_estimators=include_estimators,
exclude_estimators=exclude_estimators,
include_preprocessors=include_preprocessors,
exclude_preprocessors=exclude_preprocessors)
include=include,
exclude=exclude,
)
configuration_space = self.configuration_space_created_hook(
datamanager, configuration_space)
backend.write_txt_file(
Expand Down
8 changes: 4 additions & 4 deletions autosklearn/data/abstract_data_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

import scipy.sparse

from autosklearn.pipeline.components.data_preprocessing.data_preprocessing \
import DataPreprocessor
from autosklearn.pipeline.components.data_preprocessing.feature_type \
import FeatTypeSplit


class AbstractDataManager():
Expand Down Expand Up @@ -39,11 +39,11 @@ def feat_type(self, value: Dict[Union[str, int], str]) -> None:
self._feat_type = value

@property
def encoder(self) -> DataPreprocessor:
def encoder(self) -> FeatTypeSplit:
return self._encoder

@encoder.setter
def encoder(self, value: DataPreprocessor) -> DataPreprocessor:
def encoder(self, value: FeatTypeSplit) -> FeatTypeSplit:
self._encoder = value

def __repr__(self) -> str:
Expand Down
44 changes: 16 additions & 28 deletions autosklearn/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,8 @@ def __init__(
max_models_on_disc=50,
seed=1,
memory_limit=3072,
include_estimators=None,
exclude_estimators=None,
include_preprocessors=None,
exclude_preprocessors=None,
include=None,
exclude=None,
resampling_strategy='holdout',
resampling_strategy_arguments=None,
tmp_folder=None,
Expand Down Expand Up @@ -102,22 +100,16 @@ def __init__(
In case of multi-processing, `memory_limit` will be per job.
This memory limit also applies to the ensemble creation process.
include_estimators : list, optional (None)
If None, all possible estimators are used. Otherwise specifies
set of estimators to use.
include : dict, optional (None)
If None, all possible algorithms are used. Otherwise specifies
set of algorithms for each added component is used. Include and
exclude are incompatible if used together on the same component
exclude_estimators : list, optional (None)
If None, all possible estimators are used. Otherwise specifies
set of estimators not to use. Incompatible with include_estimators.
include_preprocessors : list, optional (None)
If None all possible preprocessors are used. Otherwise specifies set
of preprocessors to use.
exclude_preprocessors : list, optional (None)
If None all possible preprocessors are used. Otherwise specifies set
of preprocessors not to use. Incompatible with
include_preprocessors.
exclude : dict, optional (None)
If None, all possible algorithms are used. Otherwise specifies
set of algorithms for each added component is not used.
Incompatible with include. Include and exclude are incompatible
if used together on the same component
resampling_strategy : string or object, optional ('holdout')
how to to handle overfitting, might need 'resampling_strategy_arguments'
Expand Down Expand Up @@ -161,7 +153,7 @@ def __init__(
folder to store configuration output and log files, if ``None``
automatically use ``/tmp/autosklearn_tmp_$pid_$random_number``
delete_tmp_folder_after_terminate: string, optional (True)
delete_tmp_folder_after_terminate: bool, optional (True)
remove tmp_folder, when finished. If tmp_folder is None
tmp_dir will always be deleted
Expand Down Expand Up @@ -254,10 +246,8 @@ def __init__(
self.max_models_on_disc = max_models_on_disc
self.seed = seed
self.memory_limit = memory_limit
self.include_estimators = include_estimators
self.exclude_estimators = exclude_estimators
self.include_preprocessors = include_preprocessors
self.exclude_preprocessors = exclude_preprocessors
self.include = include
self.exclude = exclude
self.resampling_strategy = resampling_strategy
self.resampling_strategy_arguments = resampling_strategy_arguments
self.tmp_folder = tmp_folder
Expand Down Expand Up @@ -309,10 +299,8 @@ def build_automl(self):
max_models_on_disc=self.max_models_on_disc,
seed=self.seed,
memory_limit=self.memory_limit,
include_estimators=self.include_estimators,
exclude_estimators=self.exclude_estimators,
include_preprocessors=self.include_preprocessors,
exclude_preprocessors=self.exclude_preprocessors,
include=self.include,
exclude=self.exclude,
resampling_strategy=self.resampling_strategy,
resampling_strategy_arguments=self.resampling_strategy_arguments,
n_jobs=self._n_jobs,
Expand Down
3 changes: 1 addition & 2 deletions autosklearn/evaluation/abstract_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,7 @@ def __init__(
self.predict_function = self._predict_proba

self._init_params = {
'data_preprocessing:feat_type':
self.datamanager.feat_type
'data_preprocessor:feat_type': self.datamanager.feat_type
}

if init_params is not None:
Expand Down
8 changes: 4 additions & 4 deletions autosklearn/experimental/askl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,8 @@ def __init__(
'extra_trees', 'passive_aggressive', 'random_forest', 'sgd', 'gradient_boosting', 'mlp',
]
include_preprocessors = ["no_preprocessing"]
include = {'classifier': include_estimators,
'feature_preprocessor': include_preprocessors}
super().__init__(
time_left_for_this_task=time_left_for_this_task,
per_run_time_limit=per_run_time_limit,
Expand All @@ -314,10 +316,8 @@ def __init__(
max_models_on_disc=max_models_on_disc,
seed=seed,
memory_limit=memory_limit,
include_estimators=include_estimators,
exclude_estimators=None,
include_preprocessors=include_preprocessors,
exclude_preprocessors=None,
include=include,
exclude=None,
resampling_strategy=None,
resampling_strategy_arguments=None,
tmp_folder=tmp_folder,
Expand Down

0 comments on commit e7b5daa

Please sign in to comment.