[feature] Greedy Portfolio (#200)
* initial configurations added

* In progress, adding flag in search function

* Adds documentation, example and fixes setup.py

* Address comments from shuhei, change run_greedy to portfolio_selection

* address comments from francisco, move portfolio to configs

* Address comments from francisco, add tests for greedy portfolio

* fix flake tests

* Simplify portfolio selection

* Update autoPyTorch/optimizer/smbo.py

Co-authored-by: Francisco Rivera Valverde <44504424+franchuterivera@users.noreply.github.com>

* Address comments from francisco, add path exception handling and test

* fix flake

* Address comments from shuhei

* fix bug in setup.py

* fix tests in base trainer evaluate, increase n samples and add seed

* fix tests in base trainer evaluate, increase n samples (fix)

Co-authored-by: Francisco Rivera Valverde <44504424+franchuterivera@users.noreply.github.com>
ravinkohli and franchuterivera committed May 31, 2021
1 parent 097cb99 commit 1e08fc9
Showing 16 changed files with 853 additions and 35 deletions.
17 changes: 15 additions & 2 deletions autoPyTorch/api/base_task.py
@@ -121,6 +121,9 @@ class BaseTask:
exclude_components (Optional[Dict]): If None, all possible components are used.
Otherwise specifies set of components not to use. Incompatible with include
components
search_space_updates (Optional[HyperparameterSearchSpaceUpdates]):
search space updates that can be used to modify the search
space of particular components or choice modules of the pipeline
"""

def __init__(
@@ -697,6 +700,7 @@ def _search(
precision: int = 32,
disable_file_output: List = [],
load_models: bool = True,
portfolio_selection: Optional[str] = None
) -> 'BaseTask':
"""
Search for the best pipeline configuration for the given dataset.
@@ -767,7 +771,15 @@ def _search(
disable_file_output (Union[bool, List]):
load_models (bool), (default=True): Whether to load the
models after fitting AutoPyTorch.
portfolio_selection (str), (default=None):
This argument controls the initial configurations that
AutoPyTorch uses to warm start SMAC for hyperparameter
optimization. By default, no warm-starting happens.
The user can provide a path to a json file containing
configurations, similar to (...herepathtogreedy...).
Additionally, the keyword 'greedy' is supported,
which would use the default portfolio from
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_
Returns:
self
@@ -955,7 +967,8 @@ def _search(
# We do not increase the num_run here, this is something
# smac does internally
start_num_run=self._backend.get_next_num_run(peek=True),
search_space_updates=self.search_space_updates
search_space_updates=self.search_space_updates,
portfolio_selection=portfolio_selection,
)
try:
run_history, self.trajectory, budget_type = \
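For context on what the new `portfolio_selection` argument threaded through `_search` ultimately does: a portfolio is a fixed list of hyperparameter configurations that is handed to SMAC as its initial design, so the first evaluations are spent on configurations known to work well across many datasets rather than on random draws. The sketch below illustrates that general mechanism with ConfigSpace; the helper name, the file path and the assumed JSON layout (a list of hyperparameter dictionaries) are illustrative assumptions, not AutoPyTorch's actual internals.

```python
# Illustrative sketch of how a JSON portfolio can seed SMAC's initial design.
# The helper name and the assumed file layout (a list of {hyperparameter: value}
# dictionaries) are assumptions for illustration, not AutoPyTorch internals.
import json
from typing import List

from ConfigSpace.configuration_space import Configuration, ConfigurationSpace


def read_portfolio(path: str, config_space: ConfigurationSpace) -> List[Configuration]:
    """Turn a JSON portfolio file into Configuration objects for the given space."""
    with open(path) as fh:
        raw_configs = json.load(fh)

    portfolio = []
    for values in raw_configs:
        try:
            portfolio.append(Configuration(config_space, values=values))
        except Exception:
            # A portfolio is built against a generic search space; skip entries
            # that are not valid in the search space of the current run.
            continue
    return portfolio


# The resulting list would then be given to SMAC (e.g. via the
# `initial_configurations` argument of its facades) so those configurations
# are evaluated before the model-based search takes over.
```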
47 changes: 30 additions & 17 deletions autoPyTorch/api/tabular_classification.py
@@ -56,6 +56,9 @@ class TabularClassificationTask(BaseTask):
If None, all possible components are used. Otherwise
specifies set of components not to use. Incompatible
with include components
search_space_updates (Optional[HyperparameterSearchSpaceUpdates]):
search space updates that can be used to modify the search
space of particular components or choice modules of the pipeline
"""
def __init__(
self,
@@ -119,6 +122,7 @@ def search(
precision: int = 32,
disable_file_output: List = [],
load_models: bool = True,
portfolio_selection: Optional[str] = None,
) -> 'BaseTask':
"""
Search for the best pipeline configuration for the given dataset.
@@ -131,21 +135,21 @@ def search(
A pair of features (X_train) and targets (y_train) used to fit a
pipeline. Additionally, a holdout of this pair (X_test, y_test) can
be provided to track the generalization performance of each stage.
optimize_metric (str): name of the metric that is used to
evaluate a pipeline.
optimize_metric (str):
name of the metric that is used to evaluate a pipeline.
budget_type (Optional[str]):
Type of budget to be used when fitting the pipeline.
Either 'epochs' or 'runtime'. If not provided, uses
the default in the pipeline config ('epochs')
budget (Optional[float]):
Budget to fit a single run of the pipeline. If not
provided, uses the default in the pipeline config
total_walltime_limit (int), (default=100): Time limit
in seconds for the search of appropriate models.
total_walltime_limit (int), (default=100):
Time limit in seconds for the search of appropriate models.
By increasing this value, autopytorch has a higher
chance of finding better models.
func_eval_time_limit_secs (int), (default=None): Time limit
for a single call to the machine learning model.
func_eval_time_limit_secs (int), (default=None):
Time limit for a single call to the machine learning model.
Model fitting will be terminated if the machine
learning algorithm runs over the time limit. Set
this value high enough so that typical machine
@@ -162,32 +166,40 @@ def search(
feature by turning this flag to False. All machine learning
algorithms that are fitted during search() are considered for
ensemble building.
memory_limit (Optional[int]), (default=4096): Memory
limit in MB for the machine learning algorithm. autopytorch
memory_limit (Optional[int]), (default=4096):
Memory limit in MB for the machine learning algorithm. autopytorch
will stop fitting the machine learning algorithm if it tries
to allocate more than memory_limit MB. If None is provided,
no memory limit is set. In case of multi-processing, memory_limit
will be per job. This memory limit also applies to the ensemble
creation process.
smac_scenario_args (Optional[Dict]): Additional arguments inserted
into the scenario of SMAC. See the
smac_scenario_args (Optional[Dict]):
Additional arguments inserted into the scenario of SMAC. See the
[SMAC documentation] (https://automl.github.io/SMAC3/master/options.html?highlight=scenario#scenario)
get_smac_object_callback (Optional[Callable]): Callback function
to create an object of class
get_smac_object_callback (Optional[Callable]):
Callback function to create an object of class
[smac.optimizer.smbo.SMBO](https://automl.github.io/SMAC3/master/apidoc/smac.optimizer.smbo.html).
The function must accept the arguments scenario_dict,
instances, num_params, runhistory, seed and ta. This is
an advanced feature. Use only if you are familiar with
[SMAC](https://automl.github.io/SMAC3/master/index.html).
all_supported_metrics (bool), (default=True): if True, all
metrics supporting current task will be calculated
all_supported_metrics (bool), (default=True):
if True, all metrics supporting current task will be calculated
for each pipeline and results will be available via cv_results
precision (int), (default=32): Numeric precision used when loading
ensemble data. Can be either '16', '32' or '64'.
disable_file_output (Union[bool, List]):
load_models (bool), (default=True): Whether to load the
models after fitting AutoPyTorch.
load_models (bool), (default=True):
Whether to load the models after fitting AutoPyTorch.
portfolio_selection (str), (default=None):
This argument controls the initial configurations that
AutoPyTorch uses to warm start SMAC for hyperparameter
optimization. By default, no warm-starting happens.
The user can provide a path to a json file containing
configurations, similar to (...herepathtogreedy...).
Additionally, the keyword 'greedy' is supported,
which would use the default portfolio from
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_
Returns:
self
@@ -233,6 +245,7 @@ def search(
precision=precision,
disable_file_output=disable_file_output,
load_models=load_models,
portfolio_selection=portfolio_selection,
)

def predict(
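To make the classification-side change concrete, here is a minimal usage sketch of `search()` with the new argument. Only parameters documented in this diff are used; the dataset, metric and time budgets are illustrative choices rather than recommendations from the commit.

```python
# Minimal usage sketch of `portfolio_selection` on the tabular classification API.
# Dataset, metric and budgets are illustrative choices.
import sklearn.datasets
import sklearn.model_selection

from autoPyTorch.api.tabular_classification import TabularClassificationTask

X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1
)

api = TabularClassificationTask()

# 'greedy' warm-starts SMAC with the default portfolio from the
# AutoPyTorch Tabular paper instead of starting from scratch.
api.search(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    optimize_metric='accuracy',
    total_walltime_limit=300,
    func_eval_time_limit_secs=50,
    portfolio_selection='greedy',
)

y_pred = api.predict(X_test)
```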
14 changes: 14 additions & 0 deletions autoPyTorch/api/tabular_regression.py
@@ -47,6 +47,9 @@ class TabularRegressionTask(BaseTask):
exclude_components (Optional[Dict]): If None, all possible components are used.
Otherwise specifies set of components not to use. Incompatible with include
components
search_space_updates (Optional[HyperparameterSearchSpaceUpdates]):
search space updates that can be used to modify the search
space of particular components or choice modules of the pipeline
"""

def __init__(
@@ -111,6 +114,7 @@ def search(
precision: int = 32,
disable_file_output: List = [],
load_models: bool = True,
portfolio_selection: Optional[str] = None,
) -> 'BaseTask':
"""
Search for the best pipeline configuration for the given dataset.
@@ -175,6 +179,15 @@ def search(
disable_file_output (Union[bool, List]):
load_models (bool), (default=True): Whether to load the
models after fitting AutoPyTorch.
portfolio_selection (str), (default=None):
This argument controls the initial configurations that
AutoPyTorch uses to warm start SMAC for hyperparameter
optimization. By default, no warm-starting happens.
The user can provide a path to a json file containing
configurations, similar to (...herepathtogreedy...).
Additionally, the keyword 'greedy' is supported,
which would use the default portfolio from
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_
Returns:
self
@@ -221,6 +234,7 @@ def search(
precision=precision,
disable_file_output=disable_file_output,
load_models=load_models,
portfolio_selection=portfolio_selection,
)

def predict(
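The regression API gains the same argument, and a custom portfolio can be supplied as a file path instead of the built-in 'greedy' keyword. In the sketch below, the path is a placeholder, the assumption that the file mirrors the layout of the bundled greedy portfolio is exactly that, an assumption, and the metric choice is illustrative.

```python
# Warm-starting the regression search from a user-supplied portfolio file.
# The path is a placeholder; the file is assumed to follow the same JSON
# layout as the bundled greedy portfolio referenced in the docstring above.
import sklearn.datasets
import sklearn.model_selection

from autoPyTorch.api.tabular_regression import TabularRegressionTask

X, y = sklearn.datasets.load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1
)

api = TabularRegressionTask()
api.search(
    X_train=X_train,
    y_train=y_train,
    optimize_metric='r2',  # illustrative metric choice
    total_walltime_limit=300,
    portfolio_selection='/path/to/my_portfolio.json',  # placeholder path
)
```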
