Merge branch 'development' into feat-epoch-wise-LR-scheduler
nabenabe0928 committed Jun 19, 2021
2 parents 017595d + 999f3c3 commit ae354c8
Showing 135 changed files with 2,174 additions and 1,387 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/long_regression_test.yml
@@ -0,0 +1,35 @@
name: Tests

on:
schedule:
# Every Tuesday at 7AM UTC
# TODO: temporarily set to every day just for the PR
#- cron: '0 07 * * 2'
- cron: '0 07 * * *'


jobs:
ubuntu:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.8]
fail-fast: false

steps:
- uses: actions/checkout@v2
with:
ref: development
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install test dependencies
run: |
git submodule update --init --recursive
python -m pip install --upgrade pip
pip install -e .[test]
- name: Run tests
run: |
python -m pytest --durations=200 cicd/test_preselected_configs.py -vs
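The active cron expression fires daily at 07:00 UTC; the commented-out variant ('0 07 * * 2') restricts it to Tuesdays. A minimal sketch for previewing firing times locally, assuming the third-party croniter package is available:

    from datetime import datetime

    from croniter import croniter  # third-party helper; pip install croniter

    # Preview the next firings of the daily schedule from a fixed start time.
    schedule = croniter('0 07 * * *', datetime(2021, 6, 19))
    print(schedule.get_next(datetime))  # 2021-06-19 07:00:00
    print(schedule.get_next(datetime))  # 2021-06-20 07:00:00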
4 changes: 2 additions & 2 deletions .github/workflows/pytest.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: [3.7, 3.8]
include:
- python-version: 3.8
code-cov: true
@@ -52,4 +52,4 @@ jobs:
uses: codecov/codecov-action@v1
with:
fail_ci_if_error: true
verbose: true
verbose: true
16 changes: 10 additions & 6 deletions .pre-commit-config.yaml
@@ -3,21 +3,25 @@ repos:
rev: v0.761
hooks:
- id: mypy
args: [--show-error-codes]
name: mypy AutoPyTorch
args: [--show-error-codes,
--warn-redundant-casts,
--warn-return-any,
--warn-unreachable,
]
files: autoPyTorch/.*
exclude: autoPyTorch/ensemble/
- repo: https://gitlab.com/pycqa/flake8
rev: 3.8.3
hooks:
- id: flake8
name: flake8 AutoPyTorch
files: autoPyTorch/.*
additional_dependencies:
- flake8-print==3.1.4
- flake8-import-order
name: flake8 autoPyTorch
files: autoPyTorch/.*
- id: flake8
name: flake8 tests
files: test/.*
additional_dependencies:
- flake8-print==3.1.4
- flake8-import-order
name: flake8 test
files: test/.*
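The mypy hook now runs with extra strictness flags. For instance, --warn-return-any flags a value typed Any returned from a function with a concrete return annotation; a minimal sketch of code the new configuration would reject:

    from typing import Any

    def get_name(config: dict) -> str:
        value: Any = config.get('name')
        # With --warn-return-any, mypy reports:
        # error: Returning Any from function declared to return "str"
        return value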
15 changes: 8 additions & 7 deletions MANIFEST.in
@@ -1,10 +1,11 @@
include requirements.txt
include autoPyTorch/utils/logging.yaml
include autoPyTorch/configs/default_pipeline_options.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/catboost.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/rotation_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/random_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/knn.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/svm.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/extra_trees.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/lgb.json
include autoPyTorch/configs/greedy_portfolio.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json
6 changes: 1 addition & 5 deletions README.md
@@ -27,11 +27,7 @@ git submodule update --init --recursive
# Create the environment
conda create -n autopytorch python=3.8
conda activate autopytorch
For Linux:
conda install gxx_linux-64 gcc_linux-64 swig
For mac:
conda install -c conda-forge clang_osx-64 clangxx_osx-64
conda install -c anaconda swig
conda install swig
cat requirements.txt | xargs -n 1 -L 1 pip install
python setup.py install

52 changes: 27 additions & 25 deletions autoPyTorch/api/base_task.py
@@ -12,11 +12,12 @@
import unittest.mock
import warnings
from abc import abstractmethod
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from ConfigSpace.configuration_space import Configuration, ConfigurationSpace

import dask
import dask.distributed

import joblib

@@ -38,13 +39,12 @@
from autoPyTorch.datasets.base_dataset import BaseDataset
from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes
from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager
from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection
from autoPyTorch.ensemble.singlebest_ensemble import SingleBest
from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings
from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash
from autoPyTorch.optimizer.smbo import AutoMLSMBO
from autoPyTorch.pipeline.base_pipeline import BasePipeline
from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models import get_available_classifiers
from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import get_available_traditional_learners
from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score, get_metrics
from autoPyTorch.utils.common import FitRequirement, replace_string_bool_to_bool
@@ -198,7 +198,7 @@ def __init__(
# examples. Nevertheless, multi-process runs
# have spawn as requirement to reduce the
# possibility of a deadlock
self._dask_client = None
self._dask_client: Optional[dask.distributed.Client] = None
self._multiprocessing_context = 'forkserver'
if self.n_jobs == 1:
self._multiprocessing_context = 'fork'
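A brief sketch of the start-method trade-off encoded here: 'fork' is cheapest when only one worker is used, while 'forkserver' avoids inheriting locks from a multi-threaded parent and thus reduces the possibility of a deadlock in multi-process runs. Illustrative only:

    import multiprocessing
    from multiprocessing.context import BaseContext

    def pick_context(n_jobs: int) -> BaseContext:
        # Mirrors the logic above: fork for a single job, forkserver otherwise.
        return multiprocessing.get_context('fork' if n_jobs == 1 else 'forkserver')

    queue = pick_context(n_jobs=4).Queue()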
@@ -590,7 +590,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs:
memory_limit = self._memory_limit
if memory_limit is not None:
memory_limit = int(math.ceil(memory_limit))
available_classifiers = get_available_classifiers()
available_classifiers = get_available_traditional_learners()
dask_futures = []

total_number_classifiers = len(available_classifiers)
@@ -711,7 +711,8 @@ def _search(
precision: int = 32,
disable_file_output: List = [],
load_models: bool = True,
portfolio_selection: Optional[str] = None
portfolio_selection: Optional[str] = None,
dask_client: Optional[dask.distributed.Client] = None
) -> 'BaseTask':
"""
Search for the best pipeline configuration for the given dataset.
@@ -838,6 +839,8 @@ def _search(
self._metric = get_metrics(
names=[optimize_metric], dataset_properties=dataset_properties)[0]

self.pipeline_options['optimize_metric'] = optimize_metric

self.search_space = self.get_search_space(dataset)

budget_config: Dict[str, Union[float, str]] = {}
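get_metrics resolves metric names against the dataset properties; a hedged usage sketch, where the metric name and property values are illustrative assumptions:

    from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics

    # 'accuracy' and the dataset_properties values are assumptions for
    # illustration, not the only valid inputs.
    metric = get_metrics(
        names=['accuracy'],
        dataset_properties={'task_type': 'tabular_classification',
                            'output_type': 'binary'},
    )[0]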
@@ -855,10 +858,11 @@
# If no dask client was provided, we create one, so that we can
# start an ensemble process in parallel to the SMBO optimization
if (
self._dask_client is None and (self.ensemble_size > 0 or self.n_jobs is not None and self.n_jobs > 1)
dask_client is None and (self.ensemble_size > 0 or self.n_jobs > 1)
):
self._create_dask_client()
else:
self._dask_client = dask_client
self._is_dask_client_internally_created = False
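When no client is passed in, _create_dask_client builds one internally; a hedged sketch of what such a local client might look like (the worker counts and flags here are illustrative, not the project's actual settings):

    import dask.distributed

    client = dask.distributed.Client(
        n_workers=4,           # illustrative worker count
        processes=True,        # run workers as separate processes
        threads_per_worker=1,
    )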

# Handle time resource allocation
@@ -892,21 +896,18 @@
# ============> Run traditional ml

if enable_traditional_pipeline:
if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS:
self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...")
else:
traditional_task_name = 'runTraditional'
self._stopwatch.start_task(traditional_task_name)
elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
# We want time for at least 1 Neural network in SMAC
time_for_traditional = int(
self._time_for_task - elapsed_time - func_eval_time_limit_secs
)
self._do_traditional_prediction(
func_eval_time_limit_secs=func_eval_time_limit_secs,
time_left=time_for_traditional,
)
self._stopwatch.stop_task(traditional_task_name)
traditional_task_name = 'runTraditional'
self._stopwatch.start_task(traditional_task_name)
elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
# We want time for at least 1 Neural network in SMAC
time_for_traditional = int(
self._time_for_task - elapsed_time - func_eval_time_limit_secs
)
self._do_traditional_prediction(
func_eval_time_limit_secs=func_eval_time_limit_secs,
time_left=time_for_traditional,
)
self._stopwatch.stop_task(traditional_task_name)
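A worked example of the budget arithmetic above, with hypothetical numbers:

    # With a 600s total budget, 30s already elapsed and a 60s per-evaluation
    # limit, the traditional learners receive 510s, leaving at least one full
    # evaluation slot for a neural network in SMAC afterwards.
    time_for_task = 600
    elapsed_time = 30
    func_eval_time_limit_secs = 60
    time_for_traditional = int(time_for_task - elapsed_time - func_eval_time_limit_secs)
    assert time_for_traditional == 510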

# ============> Starting ensemble
elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
@@ -1207,7 +1208,6 @@ def predict(

# Mypy assert
assert self.ensemble_ is not None, "Load models should error out if no ensemble"
self.ensemble_ = cast(Union[SingleBest, EnsembleSelection], self.ensemble_)

if isinstance(self.resampling_strategy, HoldoutValTypes):
models = self.models_
@@ -1316,15 +1316,17 @@ def get_models_with_weights(self) -> List:
self._load_models()

assert self.ensemble_ is not None
return self.ensemble_.get_models_with_weights(self.models_)
models_with_weights: List[Tuple[float, BasePipeline]] = self.ensemble_.get_models_with_weights(self.models_)
return models_with_weights

def show_models(self) -> str:
df = []
for weight, model in self.get_models_with_weights():
representation = model.get_pipeline_representation()
representation.update({'Weight': weight})
df.append(representation)
return pd.DataFrame(df).to_markdown()
models_markdown: str = pd.DataFrame(df).to_markdown()
return models_markdown
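A hedged usage sketch of the two inspection methods above; the dataset and the search arguments are assumptions for illustration:

    import sklearn.datasets

    from autoPyTorch.api.tabular_classification import TabularClassificationTask

    X_train, y_train = sklearn.datasets.load_breast_cancer(return_X_y=True)
    api = TabularClassificationTask()
    api.search(X_train=X_train, y_train=y_train,
               optimize_metric='accuracy', total_walltime_limit=300)
    for weight, model in api.get_models_with_weights():
        print(weight, model)
    print(api.show_models())  # markdown table of pipelines and their weights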

def _print_debug_info_to_log(self) -> None:
"""
10 changes: 7 additions & 3 deletions autoPyTorch/api/tabular_regression.py
@@ -106,7 +106,7 @@ def search(
budget: Optional[float] = None,
total_walltime_limit: int = 100,
func_eval_time_limit_secs: Optional[int] = None,
enable_traditional_pipeline: bool = False,
enable_traditional_pipeline: bool = True,
memory_limit: Optional[int] = 4096,
smac_scenario_args: Optional[Dict[str, Any]] = None,
get_smac_object_callback: Optional[Callable] = None,
@@ -151,7 +151,7 @@ def search(
total_walltime_limit // 2 to allow enough time to fit
at least 2 individual machine learning algorithms.
Set to np.inf in case no time limit is desired.
enable_traditional_pipeline (bool), (default=False):
enable_traditional_pipeline (bool), (default=True):
We fit traditional machine learning algorithms
(LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM)
prior to building PyTorch neural networks.
memory_limit (Optional[int]), (default=4096): Memory
@@ -187,7 +187,11 @@ def search(
configurations, similar to (...herepathtogreedy...).
Additionally, the keyword 'greedy' is supported,
which would use the default portfolio from
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`.
Although portfolio selection is supported for tabular
regression, the portfolio has been built using
classification datasets. We will update the portfolio
to cover tabular regression datasets in the future.
Returns:
self
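A hedged sketch of warm-starting the regression search from the portfolio; the dataset and metric name are assumptions for illustration:

    import sklearn.datasets

    from autoPyTorch.api.tabular_regression import TabularRegressionTask

    X_train, y_train = sklearn.datasets.load_diabetes(return_X_y=True)
    api = TabularRegressionTask()
    api.search(X_train=X_train, y_train=y_train,
               optimize_metric='r2', portfolio_selection='greedy')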
1 change: 0 additions & 1 deletion autoPyTorch/data/base_target_validator.py
@@ -95,7 +95,6 @@ def fit(
np.shape(y_test)
))
if isinstance(y_train, pd.DataFrame):
y_train = typing.cast(pd.DataFrame, y_train)
y_test = typing.cast(pd.DataFrame, y_test)
if y_train.columns.tolist() != y_test.columns.tolist():
raise ValueError(
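The removed typing.cast is redundant because mypy already narrows a variable's type after an isinstance check; a minimal illustration:

    from typing import Union

    import pandas as pd

    def first_rows(y: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
        if isinstance(y, pd.DataFrame):
            # mypy narrows y to pd.DataFrame in this branch, so an explicit
            # typing.cast adds nothing.
            return y.head()
        return y.to_frame().head()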
1 change: 0 additions & 1 deletion autoPyTorch/data/tabular_feature_validator.py
@@ -145,7 +145,6 @@ def transform(
X = self.numpy_array_to_pandas(X)

if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
X = typing.cast(pd.DataFrame, X)
if np.any(pd.isnull(X)):
for column in X.columns:
if X[column].isna().all():
5 changes: 3 additions & 2 deletions autoPyTorch/data/tabular_target_validator.py
@@ -194,8 +194,9 @@ def _check_data(
A set of targets whose dimensionality and data type are going to be checked
"""

if not isinstance(
y, (np.ndarray, pd.DataFrame, list, pd.Series)) and not scipy.sparse.issparse(y):
if not isinstance(y, (np.ndarray, pd.DataFrame,
typing.List, pd.Series)) \
and not scipy.sparse.issparse(y): # type: ignore[misc]
raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
" pd.Series, sparse data and Python Lists as targets, yet, "
"the provided input is of type {}".format(
16 changes: 9 additions & 7 deletions autoPyTorch/datasets/base_dataset.py
@@ -26,6 +26,7 @@
from autoPyTorch.utils.common import FitRequirement

BaseDatasetInputType = Union[Tuple[np.ndarray, np.ndarray], Dataset]
BaseDatasetPropertiesType = Union[int, float, str, List, bool]


def check_valid_data(data: Any) -> None:
@@ -125,7 +126,6 @@ def __init__(
self.task_type: Optional[str] = None
self.issparse: bool = issparse(self.train_tensors[0])
self.input_shape: Tuple[int] = self.train_tensors[0].shape[1:]

if len(self.train_tensors) == 2 and self.train_tensors[1] is not None:
self.output_type: str = type_of_target(self.train_tensors[1])

@@ -205,7 +205,7 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]:
return X, Y

def __len__(self) -> int:
return self.train_tensors[0].shape[0]
return int(self.train_tensors[0].shape[0])

def _get_indices(self) -> np.ndarray:
return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self))
@@ -349,7 +349,9 @@ def replace_data(self, X_train: BaseDatasetInputType,
self.test_tensors = (X_test, self.test_tensors[1])
return self

def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> Dict[str, Any]:
def get_dataset_properties(
self, dataset_requirements: List[FitRequirement]
) -> Dict[str, BaseDatasetPropertiesType]:
"""
Gets the dataset properties required in the fit dictionary.
This depends on the components that are active in the
@@ -364,7 +366,7 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) ->
<https://github.com/automl/Auto-PyTorch/blob/refactor_development/autoPyTorch/utils/pipeline.py#L25>`
Returns:
dataset_properties (Dict[str, Any]):
dataset_properties (Dict[str, BaseDatasetPropertiesType]):
Dict of the dataset properties.
"""
dataset_properties = dict()
@@ -376,11 +378,11 @@
dataset_properties.update(self.get_required_dataset_info())
return dataset_properties

def get_required_dataset_info(self) -> Dict[str, Any]:
def get_required_dataset_info(self) -> Dict[str, BaseDatasetPropertiesType]:
"""
Returns a dictionary containing required dataset
properties to instantiate a pipeline.
"""
info = {'output_type': self.output_type,
'issparse': self.issparse}
info: Dict[str, BaseDatasetPropertiesType] = {'output_type': self.output_type,
'issparse': self.issparse}
return info
