Merge branch 'development' into add-col_tfr

automl · Oct 26, 2021 · f1e837d · f1e837d
2 parents 7288128 + 9002937
commit f1e837d
Show file tree

Hide file tree

Showing 32 changed files with 709 additions and 273 deletions.
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.7, 3.8]
+        python-version: [3.7, 3.8, 3.9]
         include:
           - python-version: 3.8
             code-cov: true

diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py
@@ -110,8 +110,9 @@ def search(
         X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
         y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
         dataset_name: Optional[str] = None,
-        budget_type: Optional[str] = None,
-        budget: Optional[float] = None,
+        budget_type: str = 'epochs',
+        min_budget: int = 5,
+        max_budget: int = 50,
         total_walltime_limit: int = 100,
         func_eval_time_limit_secs: Optional[int] = None,
         enable_traditional_pipeline: bool = True,
@@ -137,15 +138,38 @@ def search(
                 be provided to track the generalization performance of each stage.
             optimize_metric (str):
                 name of the metric that is used to evaluate a pipeline.
-            budget_type (Optional[str]):
+            budget_type (str):
                 Type of budget to be used when fitting the pipeline.
-                Either 'epochs' or 'runtime'. If not provided, uses
-                the default in the pipeline config ('epochs')
-            budget (Optional[float]):
-                Budget to fit a single run of the pipeline. If not
-                provided, uses the default in the pipeline config
-            total_walltime_limit (int), (default=100):
-                Time limit in seconds for the search of appropriate models.
+                It can be one of:
+                + 'epochs': The training of each pipeline will be terminated after
+                  a number of epochs have passed. This number of epochs is determined by the
+                  min_budget/max_budget arguments of this method.
+                + 'runtime': The training of each pipeline will be terminated after
+                  a number of seconds have passed. This number of seconds is determined by the
+                  min_budget/max_budget arguments of this method. The overall fitting time of a pipeline is
+                  controlled by func_eval_time_limit_secs. 'runtime' only controls the allocated
+                  time to train a pipeline, but it does not consider the overall time it takes
+                  to create a pipeline (data loading and preprocessing, other I/O operations, etc.).
+                budget_type will determine the units of min_budget/max_budget. If budget_type=='epochs'
+                is used, min_budget/max_budget will refer to epochs, whereas if budget_type=='runtime'
+                then min_budget/max_budget will refer to seconds.
+            min_budget (int):
+                Auto-PyTorch uses `Hyperband <https://arxiv.org/abs/1603.06560>`_ to
+                trade off resources between running many pipelines at min_budget and
+                running the top performing pipelines on max_budget.
+                min_budget states the minimum resource allocation a pipeline should have
+                so that we can compare and quickly discard badly performing models.
+                For example, if the budget_type is epochs, and min_budget=5, then we will
+                run every pipeline for a minimum of 5 epochs before performance comparison.
+            max_budget (int):
+                Auto-PyTorch uses `Hyperband <https://arxiv.org/abs/1603.06560>`_ to
+                trade off resources between running many pipelines at min_budget and
+                running the top performing pipelines on max_budget.
+                max_budget states the maximum resource allocation a pipeline is going to
+                be run with. For example, if the budget_type is epochs, and max_budget=50,
+                then the pipeline training will be terminated after 50 epochs.
+            total_walltime_limit (int), (default=100): Time limit
+                in seconds for the search of appropriate models.
                 By increasing this value, autopytorch has a higher
                 chance of finding better models.
             func_eval_time_limit_secs (int), (default=None):
@@ -234,7 +258,8 @@ def search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
             budget_type=budget_type,
-            budget=budget,
+            min_budget=min_budget,
+            max_budget=max_budget,
             total_walltime_limit=total_walltime_limit,
             func_eval_time_limit_secs=func_eval_time_limit_secs,
             enable_traditional_pipeline=enable_traditional_pipeline,

diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py
@@ -53,23 +53,23 @@ class TabularRegressionTask(BaseTask):
     """
 
     def __init__(
-            self,
-            seed: int = 1,
-            n_jobs: int = 1,
-            logging_config: Optional[Dict] = None,
-            ensemble_size: int = 50,
-            ensemble_nbest: int = 50,
-            max_models_on_disc: int = 50,
-            temporary_directory: Optional[str] = None,
-            output_directory: Optional[str] = None,
-            delete_tmp_folder_after_terminate: bool = True,
-            delete_output_folder_after_terminate: bool = True,
-            include_components: Optional[Dict] = None,
-            exclude_components: Optional[Dict] = None,
-            resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
-            resampling_strategy_args: Optional[Dict[str, Any]] = None,
-            backend: Optional[Backend] = None,
-            search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
+        self,
+        seed: int = 1,
+        n_jobs: int = 1,
+        logging_config: Optional[Dict] = None,
+        ensemble_size: int = 50,
+        ensemble_nbest: int = 50,
+        max_models_on_disc: int = 50,
+        temporary_directory: Optional[str] = None,
+        output_directory: Optional[str] = None,
+        delete_tmp_folder_after_terminate: bool = True,
+        delete_output_folder_after_terminate: bool = True,
+        include_components: Optional[Dict] = None,
+        exclude_components: Optional[Dict] = None,
+        resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
+        resampling_strategy_args: Optional[Dict[str, Any]] = None,
+        backend: Optional[Backend] = None,
+        search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
     ):
         super().__init__(
             seed=seed,
@@ -102,8 +102,9 @@ def search(
         X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
         y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
         dataset_name: Optional[str] = None,
-        budget_type: Optional[str] = None,
-        budget: Optional[float] = None,
+        budget_type: str = 'epochs',
+        min_budget: int = 5,
+        max_budget: int = 50,
         total_walltime_limit: int = 100,
         func_eval_time_limit_secs: Optional[int] = None,
         enable_traditional_pipeline: bool = True,
@@ -129,13 +130,36 @@ def search(
                 be provided to track the generalization performance of each stage.
             optimize_metric (str): name of the metric that is used to
                 evaluate a pipeline.
-            budget_type (Optional[str]):
+            budget_type (str):
                 Type of budget to be used when fitting the pipeline.
-                Either 'epochs' or 'runtime'. If not provided, uses
-                the default in the pipeline config ('epochs')
-            budget (Optional[float]):
-                Budget to fit a single run of the pipeline. If not
-                provided, uses the default in the pipeline config
+                It can be one of:
+                + 'epochs': The training of each pipeline will be terminated after
+                  a number of epochs have passed. This number of epochs is determined by the
+                  min_budget/max_budget arguments of this method.
+                + 'runtime': The training of each pipeline will be terminated after
+                  a number of seconds have passed. This number of seconds is determined by the
+                  min_budget/max_budget arguments of this method. The overall fitting time of a pipeline is
+                  controlled by func_eval_time_limit_secs. 'runtime' only controls the allocated
+                  time to train a pipeline, but it does not consider the overall time it takes
+                  to create a pipeline (data loading and preprocessing, other I/O operations, etc.).
+                budget_type will determine the units of min_budget/max_budget. If budget_type=='epochs'
+                is used, min_budget/max_budget will refer to epochs, whereas if budget_type=='runtime'
+                then min_budget/max_budget will refer to seconds.
+            min_budget (int):
+                Auto-PyTorch uses `Hyperband <https://arxiv.org/abs/1603.06560>`_ to
+                trade off resources between running many pipelines at min_budget and
+                running the top performing pipelines on max_budget.
+                min_budget states the minimum resource allocation a pipeline should have
+                so that we can compare and quickly discard badly performing models.
+                For example, if the budget_type is epochs, and min_budget=5, then we will
+                run every pipeline for a minimum of 5 epochs before performance comparison.
+            max_budget (int):
+                Auto-PyTorch uses `Hyperband <https://arxiv.org/abs/1603.06560>`_ to
+                trade off resources between running many pipelines at min_budget and
+                running the top performing pipelines on max_budget.
+                max_budget states the maximum resource allocation a pipeline is going to
+                be run with. For example, if the budget_type is epochs, and max_budget=50,
+                then the pipeline training will be terminated after 50 epochs.
             total_walltime_limit (int), (default=100): Time limit
                 in seconds for the search of appropriate models.
                 By increasing this value, autopytorch has a higher
@@ -227,7 +251,8 @@ def search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
             budget_type=budget_type,
-            budget=budget,
+            min_budget=min_budget,
+            max_budget=max_budget,
             total_walltime_limit=total_walltime_limit,
             func_eval_time_limit_secs=func_eval_time_limit_secs,
             enable_traditional_pipeline=enable_traditional_pipeline,

diff --git a/autoPyTorch/configs/default_pipeline_options.json b/autoPyTorch/configs/default_pipeline_options.json
@@ -1,11 +1,10 @@
 {
-            "device": "cpu",
-            "budget_type": "epochs",
-            "min_epochs": 5,
-            "epochs": 50,
-            "runtime": 3600,
-            "torch_num_threads": 1,
-            "early_stopping": 20,
-            "use_tensorboard_logger": "False",
-            "metrics_during_training": "True"
+    "device": "cpu",
+    "budget_type": "epochs",
+    "epochs": 50,
+    "runtime": 3600,
+    "torch_num_threads": 1,
+    "early_stopping": 20,
+    "use_tensorboard_logger": "False",
+    "metrics_during_training": "True"
 }
diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py
@@ -42,7 +42,7 @@
     calculate_loss,
     get_metrics,
 )
-from autoPyTorch.utils.common import subsampler
+from autoPyTorch.utils.common import dict_repr, subsampler
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
 from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger
 from autoPyTorch.utils.pipeline import get_dataset_requirements
@@ -515,6 +515,12 @@ def __init__(self, backend: Backend,
         # If the budget is epochs, we want to limit that in the fit dictionary
         if self.budget_type == 'epochs':
             self.fit_dictionary['epochs'] = budget
+            self.fit_dictionary.pop('runtime', None)
+        elif self.budget_type == 'runtime':
+            self.fit_dictionary['runtime'] = budget
+            self.fit_dictionary.pop('epochs', None)
+        else:
+            raise ValueError(f"Unsupported budget type {self.budget_type} provided")
 
         self.num_run = 0 if num_run is None else num_run
 
@@ -531,7 +537,7 @@ def __init__(self, backend: Backend,
         self.Y_actual_train: Optional[np.ndarray] = None
         self.pipelines: Optional[List[BaseEstimator]] = None
         self.pipeline: Optional[BaseEstimator] = None
-        self.logger.debug("Fit dictionary in Abstract evaluator: {}".format(self.fit_dictionary))
+        self.logger.debug("Fit dictionary in Abstract evaluator: {}".format(dict_repr(self.fit_dictionary)))
         self.logger.debug("Search space updates :{}".format(self.search_space_updates))
 
     def _get_pipeline(self) -> BaseEstimator:

diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py
@@ -26,7 +26,7 @@
 from autoPyTorch.automl_common.common.utils.backend import Backend
 from autoPyTorch.evaluation.utils import empty_queue, extract_learning_curve, read_queue
 from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
-from autoPyTorch.utils.common import replace_string_bool_to_bool
+from autoPyTorch.utils.common import dict_repr, replace_string_bool_to_bool
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
 from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger
 from autoPyTorch.utils.parallel import preload_modules
@@ -209,9 +209,14 @@ def run_wrapper(
                 )
         else:
             if run_info.budget == 0:
-                run_info = run_info._replace(budget=self.pipeline_config[self.budget_type])
-            elif run_info.budget <= 0 or run_info.budget > 100:
-                raise ValueError('Illegal value for budget, must be >0 and <=100, but is %f' %
+                # SMAC can return budget zero for intensifiers that don't have a concept
+                # of budget, for example a simple bayesian optimization intensifier.
+                # Budget determines how our pipeline trains, which can be via runtime or epochs
+                epochs_budget = self.pipeline_config.get('epochs', np.inf)
+                runtime_budget = self.pipeline_config.get('runtime', np.inf)
+                run_info = run_info._replace(budget=min(epochs_budget, runtime_budget))
+            elif run_info.budget <= 0:
+                raise ValueError('Illegal value for budget, must be greater than zero but is %f' %
                                  run_info.budget)
             if self.budget_type not in ('epochs', 'runtime'):
                 raise ValueError("Illegal value for budget type, must be one of "
@@ -454,7 +459,14 @@ def run(
 
         empty_queue(queue)
         self.logger.debug(
-            'Finished function evaluation %s. Status: %s, Cost: %f, Runtime: %f, Additional %s',
-            str(num_run), status, cost, runtime, additional_run_info,
+            "Finish function evaluation {}.\n"
+            "Status: {}, Cost: {}, Runtime: {},\n"
+            "Additional information:\n{}".format(
+                str(num_run),
+                status,
+                cost,
+                runtime,
+                dict_repr(additional_run_info)
+            )
         )
         return status, cost, runtime, additional_run_info
diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py
@@ -19,7 +19,7 @@
     fit_and_suppress_warnings
 )
 from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
-from autoPyTorch.utils.common import subsampler
+from autoPyTorch.utils.common import dict_repr, subsampler
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
 
 __all__ = ['TrainEvaluator', 'eval_function']
@@ -172,11 +172,11 @@ def fit_predict_and_loss(self) -> None:
 
             status = StatusType.SUCCESS
 
-            self.logger.debug("In train evaluator fit_predict_and_loss, num_run: {} loss:{},"
-                              " additional run info:{}, status: {}".format(self.num_run,
-                                                                           loss,
-                                                                           additional_run_info,
-                                                                           status))
+            self.logger.debug("In train evaluator.fit_predict_and_loss, num_run: {} loss:{},"
+                              " status: {},\nadditional run info:\n{}".format(self.num_run,
+                                                                              loss,
+                                                                              dict_repr(additional_run_info),
+                                                                              status))
             self.finish_up(
                 loss=loss,
                 train_loss=train_loss,