Merge branch 'development' into feat-epoch-wise-LR-scheduler
nabenabe0928 committed Jun 19, 2021
2 parents 017595d + 999f3c3 commit ae354c8
Showing 135 changed files with 2,174 additions and 1,387 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/long_regression_test.yml
@@ -0,0 +1,35 @@
name: Tests

on:
schedule:
# Every Tuesday at 7AM UTC
# TODO: temporarily set to every day just for the PR
#- cron: '0 07 * * 2'
- cron: '0 07 * * *'


jobs:
ubuntu:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.8]
fail-fast: false

steps:
- uses: actions/checkout@v2
with:
ref: development
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install test dependencies
run: |
git submodule update --init --recursive
python -m pip install --upgrade pip
pip install -e .[test]
- name: Run tests
run: |
python -m pytest --durations=200 cicd/test_preselected_configs.py -vs
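The active cron expression fires daily at 07:00 UTC; the commented-out variant ('0 07 * * 2') restricts it to Tuesdays. A minimal sketch for previewing firing times locally, assuming the third-party croniter package is available:

    from datetime import datetime

    from croniter import croniter  # third-party helper; pip install croniter

    # Preview the next firings of the daily schedule from a fixed start time.
    schedule = croniter('0 07 * * *', datetime(2021, 6, 19))
    print(schedule.get_next(datetime))  # 2021-06-19 07:00:00
    print(schedule.get_next(datetime))  # 2021-06-20 07:00:00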
4 changes: 2 additions & 2 deletions .github/workflows/pytest.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: [3.7, 3.8]
include:
- python-version: 3.8
code-cov: true
@@ -52,4 +52,4 @@ jobs:
uses: codecov/codecov-action@v1
with:
fail_ci_if_error: true
verbose: true
verbose: true
16 changes: 10 additions & 6 deletions .pre-commit-config.yaml
@@ -3,21 +3,25 @@ repos:
rev: v0.761
hooks:
- id: mypy
args: [--show-error-codes]
name: mypy AutoPyTorch
args: [--show-error-codes,
--warn-redundant-casts,
--warn-return-any,
--warn-unreachable,
]
files: autoPyTorch/.*
exclude: autoPyTorch/ensemble/
- repo: https://gitlab.com/pycqa/flake8
rev: 3.8.3
hooks:
- id: flake8
name: flake8 AutoPyTorch
files: autoPyTorch/.*
additional_dependencies:
- flake8-print==3.1.4
- flake8-import-order
name: flake8 autoPyTorch
files: autoPyTorch/.*
- id: flake8
name: flake8 tests
files: test/.*
additional_dependencies:
- flake8-print==3.1.4
- flake8-import-order
name: flake8 test
files: test/.*
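The mypy hook now runs with extra strictness flags. For instance, --warn-return-any flags a value typed Any returned from a function with a concrete return annotation; a minimal sketch of code the new configuration would reject:

    from typing import Any

    def get_name(config: dict) -> str:
        value: Any = config.get('name')
        # With --warn-return-any, mypy reports:
        # error: Returning Any from function declared to return "str"
        return value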
15 changes: 8 additions & 7 deletions MANIFEST.in
@@ -1,10 +1,11 @@
include requirements.txt
include autoPyTorch/utils/logging.yaml
include autoPyTorch/configs/default_pipeline_options.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/catboost.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/rotation_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/random_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/knn.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/svm.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/extra_trees.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/lgb.json
include autoPyTorch/configs/greedy_portfolio.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json
6 changes: 1 addition & 5 deletions README.md
@@ -27,11 +27,7 @@ git submodule update --init --recursive
# Create the environment
conda create -n autopytorch python=3.8
conda activate autopytorch
For Linux:
conda install gxx_linux-64 gcc_linux-64 swig
For mac:
conda install -c conda-forge clang_osx-64 clangxx_osx-64
conda install -c anaconda swig
conda install swig
cat requirements.txt | xargs -n 1 -L 1 pip install
python setup.py install

52 changes: 27 additions & 25 deletions autoPyTorch/api/base_task.py
@@ -12,11 +12,12 @@
import unittest.mock
import warnings
from abc import abstractmethod
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from ConfigSpace.configuration_space import Configuration, ConfigurationSpace

import dask
import dask.distributed

import joblib

@@ -38,13 +39,12 @@
from autoPyTorch.datasets.base_dataset import BaseDataset
from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes
from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager
from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection
from autoPyTorch.ensemble.singlebest_ensemble import SingleBest
from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings
from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash
from autoPyTorch.optimizer.smbo import AutoMLSMBO
from autoPyTorch.pipeline.base_pipeline import BasePipeline
from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models import get_available_classifiers
from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import get_available_traditional_learners
from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score, get_metrics
from autoPyTorch.utils.common import FitRequirement, replace_string_bool_to_bool
@@ -198,7 +198,7 @@ def __init__(
# examples. Nevertheless, multi-process runs
# have spawn as requirement to reduce the
# possibility of a deadlock
self._dask_client = None
self._dask_client: Optional[dask.distributed.Client] = None
self._multiprocessing_context = 'forkserver'
if self.n_jobs == 1:
self._multiprocessing_context = 'fork'
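A brief sketch of the start-method trade-off encoded here: 'fork' is cheapest when only one worker is used, while 'forkserver' avoids inheriting locks from a multi-threaded parent and thus reduces the possibility of a deadlock in multi-process runs. Illustrative only:

    import multiprocessing
    from multiprocessing.context import BaseContext

    def pick_context(n_jobs: int) -> BaseContext:
        # Mirrors the logic above: fork for a single job, forkserver otherwise.
        return multiprocessing.get_context('fork' if n_jobs == 1 else 'forkserver')

    queue = pick_context(n_jobs=4).Queue()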
@@ -590,7 +590,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs:
memory_limit = self._memory_limit
if memory_limit is not None:
memory_limit = int(math.ceil(memory_limit))
available_classifiers = get_available_classifiers()
available_classifiers = get_available_traditional_learners()
dask_futures = []

total_number_classifiers = len(available_classifiers)
@@ -711,7 +711,8 @@ def _search(
precision: int = 32,
disable_file_output: List = [],
load_models: bool = True,
portfolio_selection: Optional[str] = None
portfolio_selection: Optional[str] = None,
dask_client: Optional[dask.distributed.Client] = None
) -> 'BaseTask':
"""
Search for the best pipeline configuration for the given dataset.
@@ -838,6 +839,8 @@ def _search(
self._metric = get_metrics(
names=[optimize_metric], dataset_properties=dataset_properties)[0]

self.pipeline_options['optimize_metric'] = optimize_metric

self.search_space = self.get_search_space(dataset)

budget_config: Dict[str, Union[float, str]] = {}
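get_metrics resolves metric names against the dataset properties; a hedged usage sketch, where the metric name and property values are illustrative assumptions:

    from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics

    # 'accuracy' and the dataset_properties values are assumptions for
    # illustration, not the only valid inputs.
    metric = get_metrics(
        names=['accuracy'],
        dataset_properties={'task_type': 'tabular_classification',
                            'output_type': 'binary'},
    )[0]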
@@ -855,10 +858,11 @@
# If no dask client was provided, we create one, so that we can
# start an ensemble process in parallel to the SMBO optimization
if (
self._dask_client is None and (self.ensemble_size > 0 or self.n_jobs is not None and self.n_jobs > 1)
dask_client is None and (self.ensemble_size > 0 or self.n_jobs > 1)
):
self._create_dask_client()
else:
self._dask_client = dask_client
self._is_dask_client_internally_created = False
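When no client is passed in, _create_dask_client builds one internally; a hedged sketch of what such a local client might look like (the worker counts and flags here are illustrative, not the project's actual settings):

    import dask.distributed

    client = dask.distributed.Client(
        n_workers=4,           # illustrative worker count
        processes=True,        # run workers as separate processes
        threads_per_worker=1,
    )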

# Handle time resource allocation
@@ -892,21 +896,18 @@
# ============> Run traditional ml

if enable_traditional_pipeline:
if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS:
self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...")
else:
traditional_task_name = 'runTraditional'
self._stopwatch.start_task(traditional_task_name)
elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
# We want time for at least 1 Neural network in SMAC
time_for_traditional = int(
self._time_for_task - elapsed_time - func_eval_time_limit_secs
)
self._do_traditional_prediction(
func_eval_time_limit_secs=func_eval_time_limit_secs,
time_left=time_for_traditional,
)
self._stopwatch.stop_task(traditional_task_name)
traditional_task_name = 'runTraditional'
self._stopwatch.start_task(traditional_task_name)
elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
# We want time for at least 1 Neural network in SMAC
time_for_traditional = int(
self._time_for_task - elapsed_time - func_eval_time_limit_secs
)
self._do_traditional_prediction(
func_eval_time_limit_secs=func_eval_time_limit_secs,
time_left=time_for_traditional,
)
self._stopwatch.stop_task(traditional_task_name)
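A worked example of the budget arithmetic above, with hypothetical numbers:

    # With a 600s total budget, 30s already elapsed and a 60s per-evaluation
    # limit, the traditional learners receive 510s, leaving at least one full
    # evaluation slot for a neural network in SMAC afterwards.
    time_for_task = 600
    elapsed_time = 30
    func_eval_time_limit_secs = 60
    time_for_traditional = int(time_for_task - elapsed_time - func_eval_time_limit_secs)
    assert time_for_traditional == 510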

# ============> Starting ensemble
elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
@@ -1207,7 +1208,6 @@ def predict(

# Mypy assert
assert self.ensemble_ is not None, "Load models should error out if no ensemble"
self.ensemble_ = cast(Union[SingleBest, EnsembleSelection], self.ensemble_)

if isinstance(self.resampling_strategy, HoldoutValTypes):
models = self.models_
@@ -1316,15 +1316,17 @@ def get_models_with_weights(self) -> List:
self._load_models()

assert self.ensemble_ is not None
return self.ensemble_.get_models_with_weights(self.models_)
models_with_weights: List[Tuple[float, BasePipeline]] = self.ensemble_.get_models_with_weights(self.models_)
return models_with_weights

def show_models(self) -> str:
df = []
for weight, model in self.get_models_with_weights():
representation = model.get_pipeline_representation()
representation.update({'Weight': weight})
df.append(representation)
return pd.DataFrame(df).to_markdown()
models_markdown: str = pd.DataFrame(df).to_markdown()
return models_markdown
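A hedged usage sketch of the two inspection methods above; the dataset and the search arguments are assumptions for illustration:

    import sklearn.datasets

    from autoPyTorch.api.tabular_classification import TabularClassificationTask

    X_train, y_train = sklearn.datasets.load_breast_cancer(return_X_y=True)
    api = TabularClassificationTask()
    api.search(X_train=X_train, y_train=y_train,
               optimize_metric='accuracy', total_walltime_limit=300)
    for weight, model in api.get_models_with_weights():
        print(weight, model)
    print(api.show_models())  # markdown table of pipelines and their weights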

def _print_debug_info_to_log(self) -> None:
"""
10 changes: 7 additions & 3 deletions autoPyTorch/api/tabular_regression.py
@@ -106,7 +106,7 @@ def search(
budget: Optional[float] = None,
total_walltime_limit: int = 100,
func_eval_time_limit_secs: Optional[int] = None,
enable_traditional_pipeline: bool = False,
enable_traditional_pipeline: bool = True,
memory_limit: Optional[int] = 4096,
smac_scenario_args: Optional[Dict[str, Any]] = None,
get_smac_object_callback: Optional[Callable] = None,
@@ -151,7 +151,7 @@ def search(
total_walltime_limit // 2 to allow enough time to fit
at least 2 individual machine learning algorithms.
Set to np.inf in case no time limit is desired.
enable_traditional_pipeline (bool), (default=False):
enable_traditional_pipeline (bool), (default=True):
We fit traditional machine learning algorithms
(LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM)
prior to building PyTorch neural networks.
memory_limit (Optional[int]), (default=4096): Memory
@@ -187,7 +187,11 @@ def search(
configurations, similar to (...herepathtogreedy...).
Additionally, the keyword 'greedy' is supported,
which would use the default portfolio from
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`.
Although portfolio selection is supported for tabular
regression, the portfolio has been built using
classification datasets. We will update the portfolio
to cover tabular regression datasets in the future.
Returns:
self
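A hedged sketch of warm-starting the regression search from the portfolio; the dataset and metric name are assumptions for illustration:

    import sklearn.datasets

    from autoPyTorch.api.tabular_regression import TabularRegressionTask

    X_train, y_train = sklearn.datasets.load_diabetes(return_X_y=True)
    api = TabularRegressionTask()
    api.search(X_train=X_train, y_train=y_train,
               optimize_metric='r2', portfolio_selection='greedy')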
1 change: 0 additions & 1 deletion autoPyTorch/data/base_target_validator.py
@@ -95,7 +95,6 @@ def fit(
np.shape(y_test)
))
if isinstance(y_train, pd.DataFrame):
y_train = typing.cast(pd.DataFrame, y_train)
y_test = typing.cast(pd.DataFrame, y_test)
if y_train.columns.tolist() != y_test.columns.tolist():
raise ValueError(
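The removed typing.cast is redundant because mypy already narrows a variable's type after an isinstance check; a minimal illustration:

    from typing import Union

    import pandas as pd

    def first_rows(y: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
        if isinstance(y, pd.DataFrame):
            # mypy narrows y to pd.DataFrame in this branch, so an explicit
            # typing.cast adds nothing.
            return y.head()
        return y.to_frame().head()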
1 change: 0 additions & 1 deletion autoPyTorch/data/tabular_feature_validator.py
@@ -145,7 +145,6 @@ def transform(
X = self.numpy_array_to_pandas(X)

if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
X = typing.cast(pd.DataFrame, X)
if np.any(pd.isnull(X)):
for column in X.columns:
if X[column].isna().all():
5 changes: 3 additions & 2 deletions autoPyTorch/data/tabular_target_validator.py
@@ -194,8 +194,9 @@ def _check_data(
A set of targets whose dimensionality and data type are going to be checked
"""

if not isinstance(
y, (np.ndarray, pd.DataFrame, list, pd.Series)) and not scipy.sparse.issparse(y):
if not isinstance(y, (np.ndarray, pd.DataFrame,
typing.List, pd.Series)) \
and not scipy.sparse.issparse(y): # type: ignore[misc]
raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
" pd.Series, sparse data and Python Lists as targets, yet, "
"the provided input is of type {}".format(
16 changes: 9 additions & 7 deletions autoPyTorch/datasets/base_dataset.py
@@ -26,6 +26,7 @@
from autoPyTorch.utils.common import FitRequirement

BaseDatasetInputType = Union[Tuple[np.ndarray, np.ndarray], Dataset]
BaseDatasetPropertiesType = Union[int, float, str, List, bool]


def check_valid_data(data: Any) -> None:
@@ -125,7 +126,6 @@ def __init__(
self.task_type: Optional[str] = None
self.issparse: bool = issparse(self.train_tensors[0])
self.input_shape: Tuple[int] = self.train_tensors[0].shape[1:]

if len(self.train_tensors) == 2 and self.train_tensors[1] is not None:
self.output_type: str = type_of_target(self.train_tensors[1])

@@ -205,7 +205,7 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]:
return X, Y

def __len__(self) -> int:
return self.train_tensors[0].shape[0]
return int(self.train_tensors[0].shape[0])

def _get_indices(self) -> np.ndarray:
return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self))
@@ -349,7 +349,9 @@ def replace_data(self, X_train: BaseDatasetInputType,
self.test_tensors = (X_test, self.test_tensors[1])
return self

def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> Dict[str, Any]:
def get_dataset_properties(
self, dataset_requirements: List[FitRequirement]
) -> Dict[str, BaseDatasetPropertiesType]:
"""
Gets the dataset properties required in the fit dictionary.
This depends on the components that are active in the
@@ -364,7 +366,7 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) ->
<https://github.com/automl/Auto-PyTorch/blob/refactor_development/autoPyTorch/utils/pipeline.py#L25>`
Returns:
dataset_properties (Dict[str, Any]):
dataset_properties (Dict[str, BaseDatasetPropertiesType]):
Dict of the dataset properties.
"""
dataset_properties = dict()
@@ -376,11 +378,11 @@
dataset_properties.update(self.get_required_dataset_info())
return dataset_properties

def get_required_dataset_info(self) -> Dict[str, Any]:
def get_required_dataset_info(self) -> Dict[str, BaseDatasetPropertiesType]:
"""
Returns a dictionary containing required dataset
properties to instantiate a pipeline.
"""
info = {'output_type': self.output_type,
'issparse': self.issparse}
info: Dict[str, BaseDatasetPropertiesType] = {'output_type': self.output_type,
'issparse': self.issparse}
return info
