diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml
index 7f8ef50d5..59f70facf 100644
--- a/.github/workflows/examples.yml
+++ b/.github/workflows/examples.yml
@@ -34,5 +34,4 @@ jobs:
         python examples/tabular/20_basics/example_tabular_regression.py
         python examples/tabular/40_advanced/example_custom_configuration_space.py
         python examples/tabular/40_advanced/example_resampling_strategy.py
-        python examples/tabular/40_advanced/example_single_configuration.py
         python examples/example_image_classification.py
\ No newline at end of file
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index e4b226d86..2084d7138 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -29,7 +29,7 @@ jobs:
     - name: Run tests
       run: |
         if [ ${{ matrix.code-cov }} ]; then codecov='--cov=autoPyTorch --cov-report=xml'; fi
-        python -m pytest --forked --durations=20 --timeout=600 --timeout-method=signal -v $codecov test
+        python -m pytest --durations=20 --timeout=600 --timeout-method=signal -v $codecov test
     - name: Check for files left behind by test
       if: ${{ always() }}
       run: |
diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index b7616e865..c4fa0e7ce 100644
--- a/autoPyTorch/api/base_task.py
+++ b/autoPyTorch/api/base_task.py
@@ -13,7 +13,7 @@
 import uuid
 import warnings
 from abc import abstractmethod
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Union, cast
 
 from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
 
@@ -25,7 +25,7 @@
 
 import pandas as pd
 
-from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue
+from smac.runhistory.runhistory import DataOrigin, RunHistory
 from smac.stats.stats import Stats
 from smac.tae import StatusType
 
@@ -122,17 +122,6 @@ class BaseTask:
         exclude_components (Optional[Dict]): If None, all possible components are used.
             Otherwise specifies set of components not to use. Incompatible with include
             components
-        search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): updates to be made
-            to the hyperparameter search space of the pipeline
-        resampling_strategy (Union[CrossValTypes, HoldoutValTypes]),
-            (default=HoldoutValTypes.holdout_validation):
-            strategy to split the training data.
-        resampling_strategy_args (Optional[Dict[str, Any]]): arguments
-            required for the chosen resampling strategy. If None, uses
-            the default values provided in DEFAULT_RESAMPLING_PARAMETERS
-            in ```datasets/resampling_strategy.py```.
-        task_type (str): The task of the experiment as a string. Currently, supported
-            tasks are 'tabular_classification' and 'tabular_regression'
     """
 
     def __init__(
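The removed docstring entries describe constructor arguments that `__init__` below still accepts (`search_space_updates`, `resampling_strategy`, `resampling_strategy_args`). A minimal sketch of how they are typically passed, assuming the `TabularClassificationTask` subclass changed later in this diff; the 33% validation share is taken from the example file this diff deletes:

```python
from autoPyTorch.api.tabular_classification import TabularClassificationTask
from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes

# Sketch only: these keyword arguments correspond to the docstring entries
# removed above; when omitted, the defaults come from
# DEFAULT_RESAMPLING_PARAMETERS in datasets/resampling_strategy.py.
estimator = TabularClassificationTask(
    resampling_strategy=HoldoutValTypes.holdout_validation,
    resampling_strategy_args={'val_share': 0.33},  # hold out 33% for validation
)
```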
@@ -155,7 +144,6 @@ def __init__(
         search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None,
         task_type: Optional[str] = None
     ) -> None:
-
         self.seed = seed
         self.n_jobs = n_jobs
         self.ensemble_size = ensemble_size
@@ -217,11 +205,7 @@ def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, An
         raise NotImplementedError
 
     @abstractmethod
-    def build_pipeline(self, dataset_properties: Dict[str, Any],
-                       include_components: Optional[Dict] = None,
-                       exclude_components: Optional[Dict] = None,
-                       search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
-                       ) -> BasePipeline:
+    def build_pipeline(self, dataset_properties: Dict[str, Any]) -> BasePipeline:
         """
         Build pipeline according to current task and for the passed dataset properties
 
@@ -231,23 +215,7 @@ def build_pipeline(self, dataset_properties: Dict[str, Any],
 
         Returns:
 
         """
-
-        raise NotImplementedError("Function called on BaseTask, this can only be called by "
-                                  "specific task which is a child of the BaseTask")
-
-    @abstractmethod
-    def get_dataset(self,
-                    X_train: Union[List, pd.DataFrame, np.ndarray],
-                    y_train: Union[List, pd.DataFrame, np.ndarray],
-                    X_test: Union[List, pd.DataFrame, np.ndarray],
-                    y_test: Union[List, pd.DataFrame, np.ndarray],
-                    resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
-                    resampling_strategy_args: Optional[Dict[str, Any]] = None,
-                    dataset_name: Optional[str] = None,
-                    return_only: Optional[bool] = False
-                    ) -> BaseDataset:
-        raise NotImplementedError("Function called on BaseTask, this can only be called by "
-                                  "specific task which is a child of the BaseTask")
+        raise NotImplementedError
 
     def set_pipeline_config(
         self,
@@ -428,9 +396,9 @@ def _close_dask_client(self) -> None:
             None
         """
         if (
-                hasattr(self, '_is_dask_client_internally_created')
-                and self._is_dask_client_internally_created
-                and self._dask_client
+            hasattr(self, '_is_dask_client_internally_created')
+            and self._is_dask_client_internally_created
+            and self._dask_client
         ):
             self._dask_client.shutdown()
             self._dask_client.close()
@@ -452,13 +420,6 @@ def _load_models(self) -> bool:
             raise ValueError("Resampling strategy is needed to determine what models to load")
         self.ensemble_ = self._backend.load_ensemble(self.seed)
 
-        if isinstance(self._disable_file_output, List):
-            disabled_file_outputs = self._disable_file_output
-            disable_file_output = False
-        elif isinstance(self._disable_file_output, bool):
-            disable_file_output = self._disable_file_output
-            disabled_file_outputs = []
-
         # If no ensemble is loaded, try to get the best performing model
         if not self.ensemble_:
             self.ensemble_ = self._load_best_individual_model()
@@ -473,7 +434,7 @@ def _load_models(self) -> bool:
             if len(self.cv_models_) == 0:
                 raise ValueError('No models fitted!')
 
-        elif disable_file_output or 'pipeline' not in disabled_file_outputs:
+        elif 'pipeline' not in self._disable_file_output:
             model_names = self._backend.list_all_models(self.seed)
 
             if len(model_names) == 0:
@@ -555,7 +516,7 @@ def _do_dummy_prediction(self) -> None:
             initial_num_run=num_run,
             stats=stats,
             memory_limit=memory_limit,
-            disable_file_output=self._disable_file_output,
+            disable_file_output=True if len(self._disable_file_output) > 0 else False,
             all_supported_metrics=self._all_supported_metrics
         )
 
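Both the dummy runner above and the traditional runner in the next hunk collapse the list-typed `_disable_file_output` to a plain bool before handing it to `ExecuteTaFuncWithQueue`, whose own `disable_file_output` parameter becomes `bool` in `tae.py` further down. A small illustration of that conversion (the list value is hypothetical):

```python
# The executor only needs to know whether *any* file output is disabled,
# so the list of artefact names is collapsed into a boolean.
_disable_file_output = ['pipeline']  # hypothetical setting

disable_file_output = True if len(_disable_file_output) > 0 else False
disable_file_output = bool(_disable_file_output)  # equivalent, shorter spelling
```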
@@ -648,7 +609,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs:
                 initial_num_run=self._backend.get_next_num_run(),
                 stats=stats,
                 memory_limit=memory_limit,
-                disable_file_output=self._disable_file_output,
+                disable_file_output=True if len(self._disable_file_output) > 0 else False,
                 all_supported_metrics=self._all_supported_metrics
             )
             dask_futures.append([
@@ -737,7 +698,7 @@ def _search(
         get_smac_object_callback: Optional[Callable] = None,
         all_supported_metrics: bool = True,
         precision: int = 32,
-        disable_file_output: Union[bool, List] = False,
+        disable_file_output: List = [],
         load_models: bool = True,
     ) -> 'BaseTask':
         """
@@ -1047,10 +1008,10 @@ def _search(
         return self
 
     def refit(
-            self,
-            dataset: BaseDataset,
-            budget_config: Dict[str, Union[int, str]] = {},
-            split_id: int = 0
+        self,
+        dataset: BaseDataset,
+        budget_config: Dict[str, Union[int, str]] = {},
+        split_id: int = 0
     ) -> "BaseTask":
         """
         Refit all models found with fit to new data.
@@ -1118,118 +1079,37 @@ def refit(
 
         return self
 
-    def fit_pipeline(self,
-                     configuration: Configuration,
-                     dataset: Optional[BaseDataset] = None,
-                     X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
-                     y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
-                     X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
-                     y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
-                     dataset_name: Optional[str] = None,
-                     resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes]] = None,
-                     resampling_strategy_args: Optional[Dict[str, Any]] = None,
-                     run_time_limit_secs: int = 60,
-                     memory_limit: Optional[int] = None,
-                     eval_metric: Optional[str] = None,
-                     all_supported_metrics: bool = False,
-                     budget_type: Optional[str] = None,
-                     include_components: Optional[Dict] = None,
-                     exclude_components: Optional[Dict] = None,
-                     search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None,
-                     budget: float = 50,
-                     pipeline_options: Optional[Dict] = None,
-                     disable_file_output: Optional[Union[bool, List]] = False,
-                     return_dataset: bool = True
-                     ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue, Optional[BaseDataset]]:
-
+    def fit(self,
+            dataset: BaseDataset,
+            budget_config: Dict[str, Union[int, str]] = {},
+            pipeline_config: Optional[Configuration] = None,
+            split_id: int = 0) -> BasePipeline:
         """
         Fit a pipeline on the given task for the budget.
         A pipeline configuration can be specified if None,
         uses default
-
         Args:
-            X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame]
-                A pair of features (X_train) and targets (y_train) used to fit a
-                pipeline. Additionally, a holdout of this pairs (X_test, y_test) can
-                be provided to track the generalization performance of each stage.
-            dataset_name (Optional[str]):
-                Name of the dataset, if None, random value is used.
-            resampling_strategy (Union[CrossValTypes, HoldoutValTypes]),
-                (default=HoldoutValTypes.holdout_validation):
-                strategy to split the training data.
-            resampling_strategy_args (Optional[Dict[str, Any]]): arguments
-                required for the chosen resampling strategy. If None, uses
-                the default values provided in DEFAULT_RESAMPLING_PARAMETERS
-                in ```datasets/resampling_strategy.py```.
-            run_time_limit_secs (int), (default=120): Time limit
-                for a single call to the machine learning model.
-                Model fitting will be terminated if the machine
-                learning algorithm runs over the time limit. Set
-                this value high enough so that typical machine
-                learning algorithms can be fit on the training
-                data.
-            memory_limit (Optional[int]), (default=None): Memory
-                limit in MB for the machine learning algorithm. autopytorch
-                will stop fitting the machine learning algorithm if it tries
-                to allocate more than memory_limit MB. If None is provided,
-                no memory limit is set. In case of multi-processing, memory_limit
-                will be per job. This memory limit also applies to the ensemble
-                creation process.
-            eval_metric (str): name of the metric that is used to
-                evaluate a pipeline.
-            all_supported_metrics (bool), (default=True): if True, all
-                metrics supporting current task will be calculated
-                for each pipeline and results will be available via cv_results
-            budget_type (Optional[str]):
-                Type of budget to be used when fitting the pipeline.
-                Either 'epochs' or 'runtime'. If not provided, uses
-                the default in the pipeline config ('epochs')
-            include_components (Optional[Dict]): If None, all possible components are used.
-                Otherwise specifies set of components to use.
-            exclude_components (Optional[Dict]): If None, all possible components are used.
-                Otherwise specifies set of components not to use. Incompatible with include
-                components
-            search_space_updates(Optional[HyperparameterSearchSpaceUpdates]): updates to be made
-                to the hyperparameter search space of the pipeline
-            budget (Optional[float]):
-                Budget to fit a single run of the pipeline. If not
-                provided, uses the default in the pipeline config
-            pipeline_options (Optional[Dict]):
-                Valid config options include "device",
-                "torch_num_threads", "early_stopping", "use_tensorboard_logger",
-                "metrics_during_training"
-            disable_file_output (Optional[Union[bool, List]]):
-                By default, the model, it's predictions and other metadata is stored on disk
-                for each finished configuration. This argument allows the user to skip
-                saving certain file type, for example the model, from being written to disk.
-            configuration: (Optional[Configuration])
-                configuration to fit the pipeline with.
+            dataset: (Dataset)
+                The argument that will provide the dataset splits. It can either
+                be a dictionary with the splits, or the dataset object which can
+                generate the splits based on different restrictions.
+            budget_config: (Optional[Dict[str, Union[int, str]]])
+                can contain keys from 'budget_type' and the budget
+                specified using 'epochs' or 'runtime'.
+            split_id: (int) (default=0)
+                split id to fit on.
+            pipeline_config: (Optional[Configuration])
+                configuration to fit the pipeline with. If None,
+                uses default
 
         Returns:
             (BasePipeline): fitted pipeline
-            (RunInfo): Run information
-            (RunValue): Result of fitting the pipeline
-            (BaseDataset): Dataset created from the given tensors
         """
-        if dataset is None:
-            assert X_train is not None or \
-                y_train is not None or \
-                X_test is not None or \
-                y_test is not None, "No dataset provided, must provide X_train, y_train, X_test, y_test tensors"
-            dataset = self.get_dataset(X_train=X_train,
-                                       y_train=y_train,
-                                       X_test=X_test,
-                                       y_test=y_test,
-                                       resampling_strategy=resampling_strategy,
-                                       resampling_strategy_args=resampling_strategy_args,
-                                       dataset_name=dataset_name,
-                                       return_only=True)
-
-        # TAE expects each configuration to have a config_id.
-        # For fitting a pipeline as it is not part of the
-        # search process, it makes sense to set it to 0
-        if hasattr(configuration, 'config_id') or configuration.config_id is None:
-            configuration.__setattr__('config_id', 0)
+        if self.dataset_name is None:
+            self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+
+        if self._logger is None:
+            self._logger = self._get_logger(self.dataset_name)
 
         # get dataset properties
         dataset_requirements = get_dataset_requirements(
@@ -1237,101 +1117,35 @@ def fit_pipeline(self,
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
         self._backend.save_datamanager(dataset)
 
-        if self._logger is None:
-            self._logger = self._get_logger(dataset.dataset_name)
-
         # build pipeline
-        if include_components is None:
-            include_components = self.include_components
-        if exclude_components is None:
-            exclude_components = self.exclude_components
-        if search_space_updates is None:
-            search_space_updates = self.search_space_updates
+        pipeline = self.build_pipeline(dataset_properties)
+        if pipeline_config is not None:
+            pipeline.set_hyperparameters(pipeline_config)
 
-        scenario_mock = unittest.mock.Mock()
-        scenario_mock.wallclock_limit = run_time_limit_secs
-        # This stats object is a hack - maybe the SMAC stats object should
-        # already be generated here!
-        stats = Stats(scenario_mock)
-
-        if memory_limit is None:
-            if hasattr(self, '_memory_limit') and self._memory_limit is not None:
-                memory_limit = self._memory_limit
-
-        metric = get_metrics(dataset_properties=dataset_properties,
-                             names=[eval_metric] if eval_metric is not None else None,
-                             all_supported_metrics=False).pop()
-
-        pipeline_options = self.pipeline_options.copy().update(pipeline_options) if pipeline_options is not None \
-            else self.pipeline_options.copy()
-        if budget_type is not None:
-            assert pipeline_options is not None
-            pipeline_options.update({'budget_type': budget_type})
-        if disable_file_output is None:
-            disable_file_output = self._disable_file_output
-        stats.start_timing()
-        tae = ExecuteTaFuncWithQueue(
-            backend=self._backend,
-            seed=self.seed,
-            metric=metric,
-            logger_port=self._logger_port,
-            cost_for_crash=get_cost_of_crash(metric),
-            abort_on_first_run_crash=False,
-            initial_num_run=self._backend.get_next_num_run(),
-            stats=stats,
-            memory_limit=memory_limit,
-            disable_file_output=disable_file_output,
-            all_supported_metrics=all_supported_metrics,
-            budget_type=budget_type,
-            include=include_components,
-            exclude=exclude_components,
-            search_space_updates=search_space_updates,
-            pipeline_config=pipeline_options
-        )
+        # initialise fit dictionary
+        X: Dict[str, Any] = dict({'dataset_properties': dataset_properties,
+                                  'backend': self._backend,
+                                  'X_train': dataset.train_tensors[0],
+                                  'y_train': dataset.train_tensors[1],
+                                  'X_test': dataset.test_tensors[0] if dataset.test_tensors is not None else None,
+                                  'y_test': dataset.test_tensors[1] if dataset.test_tensors is not None else None,
+                                  'train_indices': dataset.splits[split_id][0],
+                                  'val_indices': dataset.splits[split_id][1],
+                                  'split_id': split_id,
+                                  'num_run': self._backend.get_next_num_run(),
+                                  })
+        X.update({**self.pipeline_options, **budget_config})
 
-        run_info, run_value = tae.run_wrapper(
-            RunInfo(config=configuration,
-                    budget=budget,
-                    seed=self.seed,
-                    cutoff=run_time_limit_secs,
-                    capped=False,
-                    instance_specific=None,
-                    instance=None)
-        )
-        disabled_file_outputs: List = []
-        if isinstance(disable_file_output, bool):
-            disable_file_output = disable_file_output
-        elif isinstance(disable_file_output, List):
-            disabled_file_outputs = disable_file_output
-        else:
-            raise ValueError('disable_file_output should be either a bool or a list')
-
-        fitted_pipeline: Optional[BasePipeline] = None
-        if disable_file_output or 'pipeline' in disabled_file_outputs:
-            self._logger.warning("File output is disabled. No pipeline can returned")
-        elif run_value.status == StatusType.SUCCESS:
-            if self.resampling_strategy in CrossValTypes:
-                load_function = self._backend.load_cv_model_by_seed_and_id_and_budget
-            else:
-                load_function = self._backend.load_model_by_seed_and_id_and_budget
-            fitted_pipeline = load_function(
-                seed=self.seed,
-                idx=run_info.config.config_id + tae.initial_num_run,
-                budget=float(run_info.budget),
-            )
+        fit_and_suppress_warnings(self._logger, pipeline, X, y=None)
 
         self._clean_logger()
-
-        if not return_dataset:
-            dataset = None  # type: ignore [assignment]
-
-        return fitted_pipeline, run_info, run_value, dataset
+        return pipeline
 
     def predict(
-            self,
-            X_test: np.ndarray,
-            batch_size: Optional[int] = None,
-            n_jobs: int = 1
+        self,
+        X_test: np.ndarray,
+        batch_size: Optional[int] = None,
+        n_jobs: int = 1
     ) -> np.ndarray:
         """Generate the estimator predictions.
         Generate the predictions based on the given examples from the test set.
@@ -1381,9 +1195,9 @@ def predict(
         return predictions
 
     def score(
-            self,
-            y_pred: np.ndarray,
-            y_test: Union[np.ndarray, pd.DataFrame]
+        self,
+        y_pred: np.ndarray,
+        y_test: Union[np.ndarray, pd.DataFrame]
     ) -> Dict[str, float]:
         """Calculate the score on the test set.
         Calculate the evaluation measure on the test set.
@@ -1425,13 +1239,13 @@ def __del__(self) -> None:
 
     @typing.no_type_check
     def get_incumbent_results(
-            self
+        self
     ):
         pass
 
     @typing.no_type_check
     def get_incumbent_config(
-            self
+        self
     ):
         pass
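The heavyweight `fit_pipeline()` (which routed a single configuration through SMAC's `ExecuteTaFuncWithQueue`) is replaced above by a lean `fit()` that builds a pipeline from the dataset properties and fits it in-process. A hedged sketch of calling the new method; the validator/dataset construction mirrors what `search()` does later in this diff, and the import paths, placeholder arrays and budget values are assumptions rather than part of the patch:

```python
from autoPyTorch.api.tabular_classification import TabularClassificationTask
from autoPyTorch.data.tabular_validator import TabularInputValidator  # assumed path
from autoPyTorch.datasets.tabular_dataset import TabularDataset       # assumed path

estimator = TabularClassificationTask()

# The caller now supplies the dataset object; search() builds it the same way.
validator = TabularInputValidator(is_classification=True)
validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
dataset = TabularDataset(X=X_train, Y=y_train, X_test=X_test, Y_test=y_test,
                         validator=validator)

# budget_config takes 'budget_type' plus 'epochs' or 'runtime';
# pipeline_config=None falls back to the default configuration.
pipeline = estimator.fit(dataset=dataset,
                         budget_config={'budget_type': 'epochs', 'epochs': 5},
                         pipeline_config=None,
                         split_id=0)
```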
diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py
index 71bb99729..deeb5244b 100644
--- a/autoPyTorch/api/tabular_classification.py
+++ b/autoPyTorch/api/tabular_classification.py
@@ -108,67 +108,16 @@ def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, An
                 'numerical_columns': dataset.numerical_columns,
                 'categorical_columns': dataset.categorical_columns}
 
-    def build_pipeline(self, dataset_properties: Dict[str, Any],
-                       include_components: Optional[Dict] = None,
-                       exclude_components: Optional[Dict] = None,
-                       search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
-                       ) -> TabularClassificationPipeline:
-        return TabularClassificationPipeline(dataset_properties=dataset_properties,
-                                              include=include_components,
-                                              exclude=exclude_components,
-                                              search_space_updates=search_space_updates)
-
-    def get_dataset(self,
-                    X_train: Union[List, pd.DataFrame, np.ndarray],
-                    y_train: Union[List, pd.DataFrame, np.ndarray],
-                    X_test: Union[List, pd.DataFrame, np.ndarray],
-                    y_test: Union[List, pd.DataFrame, np.ndarray],
-                    resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
-                    resampling_strategy_args: Optional[Dict[str, Any]] = None,
-                    dataset_name: Optional[str] = None,
-                    return_only: Optional[bool] = False
-                    ) -> BaseDataset:
-
-        if dataset_name is None:
-            dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
-
-        resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy
-        resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \
-            self.resampling_strategy_args
-
-        # Create a validator object to make sure that the data provided by
-        # the user matches the autopytorch requirements
-        InputValidator = TabularInputValidator(
-            is_classification=True,
-            logger_port=self._logger_port,
-        )
-
-        # Fit a input validator to check the provided data
-        # Also, an encoder is fit to both train and test data,
-        # to prevent unseen categories during inference
-        InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
-
-        dataset = TabularDataset(
-            X=X_train, Y=y_train,
-            X_test=X_test, Y_test=y_test,
-            validator=InputValidator,
-            resampling_strategy=resampling_strategy,
-            resampling_strategy_args=resampling_strategy_args,
-            dataset_name=dataset_name
-        )
-        if not return_only:
-            self.InputValidator = InputValidator
-            self.dataset = dataset
-
-        return dataset
+    def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassificationPipeline:
+        return TabularClassificationPipeline(dataset_properties=dataset_properties)
 
     def search(
         self,
         optimize_metric: str,
-        X_train: Union[List, pd.DataFrame, np.ndarray],
-        y_train: Union[List, pd.DataFrame, np.ndarray],
-        X_test: Union[List, pd.DataFrame, np.ndarray],
-        y_test: Union[List, pd.DataFrame, np.ndarray],
+        X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
+        y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
+        X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
+        y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
         dataset_name: Optional[str] = None,
         budget_type: Optional[str] = None,
         budget: Optional[float] = None,
@@ -194,8 +143,6 @@ def search(
                 A pair of features (X_train) and targets (y_train) used to fit a
                 pipeline. Additionally, a holdout of this pairs (X_test, y_test) can
                 be provided to track the generalization performance of each stage.
-            dataset_name (Optional[str]):
-                Name of the dayaset, if None, random value is used
             optimize_metric (str): name of the metric that is used to
                 evaluate a pipeline.
             budget_type (Optional[str]):
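With `X_train`/`y_train` now optional keyword arguments and the validator plus `TabularDataset` created inside `search()` (next hunk), a typical call hands the raw tensors straight to the estimator. A sketch; the dataset choice mirrors the tests in this diff, while the time-limit keywords are assumptions not shown in this hunk:

```python
import sklearn.datasets
import sklearn.model_selection

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# OpenML task 40981 is the dataset used by test_tabular_classification below.
X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1)

estimator = TabularClassificationTask()
estimator.search(
    optimize_metric='accuracy',
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    total_walltime_limit=300,        # assumed keyword, not shown in this hunk
    func_eval_time_limit_secs=50,    # assumed keyword, not shown in this hunk
)

y_pred = estimator.predict(X_test)
print(estimator.score(y_pred, y_test))
```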
@@ -257,12 +204,31 @@ def search(
             self
 
         """
+        if dataset_name is None:
+            dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
 
-        self.get_dataset(X_train=X_train,
-                         y_train=y_train,
-                         X_test=X_test,
-                         y_test=y_test,
-                         dataset_name=dataset_name)
+        # we have to create a logger at this point for the validator
+        self._logger = self._get_logger(dataset_name)
+
+        # Create a validator object to make sure that the data provided by
+        # the user matches the autopytorch requirements
+        self.InputValidator = TabularInputValidator(
+            is_classification=True,
+            logger_port=self._logger_port,
+        )
+
+        # Fit an input validator to check the provided data
+        # Also, an encoder is fit to both train and test data,
+        # to prevent unseen categories during inference
+        self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
+
+        self.dataset = TabularDataset(
+            X=X_train, Y=y_train,
+            X_test=X_test, Y_test=y_test,
+            validator=self.InputValidator,
+            resampling_strategy=self.resampling_strategy,
+            resampling_strategy_args=self.resampling_strategy_args,
+        )
 
         return self._search(
             dataset=self.dataset,
diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py
index dc867c21a..afef8ce9f 100644
--- a/autoPyTorch/api/tabular_regression.py
+++ b/autoPyTorch/api/tabular_regression.py
@@ -100,59 +100,8 @@ def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, An
                 'numerical_columns': dataset.numerical_columns,
                 'categorical_columns': dataset.categorical_columns}
 
-    def build_pipeline(self, dataset_properties: Dict[str, Any],
-                       include_components: Optional[Dict] = None,
-                       exclude_components: Optional[Dict] = None,
-                       search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
-                       ) -> TabularRegressionPipeline:
-        return TabularRegressionPipeline(dataset_properties=dataset_properties,
-                                         include=include_components,
-                                         exclude=exclude_components,
-                                         search_space_updates=search_space_updates)
-
-    def get_dataset(self,
-                    X_train: Union[List, pd.DataFrame, np.ndarray],
-                    y_train: Union[List, pd.DataFrame, np.ndarray],
-                    X_test: Union[List, pd.DataFrame, np.ndarray],
-                    y_test: Union[List, pd.DataFrame, np.ndarray],
-                    resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
-                    resampling_strategy_args: Optional[Dict[str, Any]] = None,
-                    dataset_name: Optional[str] = None,
-                    return_only: Optional[bool] = False
-                    ) -> BaseDataset:
-
-        if dataset_name is None:
-            dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
-
-        resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy
-        resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \
-            self.resampling_strategy_args
-
-        # Create a validator object to make sure that the data provided by
-        # the user matches the autopytorch requirements
-        InputValidator = TabularInputValidator(
-            is_classification=False,
-            logger_port=self._logger_port,
-        )
-
-        # Fit a input validator to check the provided data
-        # Also, an encoder is fit to both train and test data,
-        # to prevent unseen categories during inference
-        InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
-
-        dataset = TabularDataset(
-            X=X_train, Y=y_train,
-            X_test=X_test, Y_test=y_test,
-            validator=InputValidator,
-            resampling_strategy=resampling_strategy,
-            resampling_strategy_args=resampling_strategy_args,
-            dataset_name=dataset_name
-        )
-        if not return_only:
-            self.InputValidator = InputValidator
-            self.dataset = dataset
-
-        return dataset
+    def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularRegressionPipeline:
+        return TabularRegressionPipeline(dataset_properties=dataset_properties)
 
     def search(
         self,
@@ -243,14 +192,31 @@ def search(
             self
 
         """
+        if dataset_name is None:
+            dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
 
-        self.get_dataset(X_train=X_train,
-                         y_train=y_train,
-                         X_test=X_test,
-                         y_test=y_test,
-                         resampling_strategy=self.resampling_strategy,
-                         resampling_strategy_args=self.resampling_strategy_args,
-                         dataset_name=dataset_name)
+        # we have to create a logger at this point for the validator
+        self._logger = self._get_logger(dataset_name)
+
+        # Create a validator object to make sure that the data provided by
+        # the user matches the autopytorch requirements
+        self.InputValidator = TabularInputValidator(
+            is_classification=False,
+            logger_port=self._logger_port,
+        )
+
+        # Fit an input validator to check the provided data
+        # Also, an encoder is fit to both train and test data,
+        # to prevent unseen categories during inference
+        self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
+
+        self.dataset = TabularDataset(
+            X=X_train, Y=y_train,
+            X_test=X_test, Y_test=y_test,
+            validator=self.InputValidator,
+            resampling_strategy=self.resampling_strategy,
+            resampling_strategy_args=self.resampling_strategy_args,
+        )
 
         return self._search(
             dataset=self.dataset,
diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py
index 192b70d93..a40678ec4 100644
--- a/autoPyTorch/evaluation/tae.py
+++ b/autoPyTorch/evaluation/tae.py
@@ -107,7 +107,7 @@ def __init__(
         include: typing.Optional[typing.Dict[str, typing.Any]] = None,
         exclude: typing.Optional[typing.Dict[str, typing.Any]] = None,
         memory_limit: typing.Optional[int] = None,
-        disable_file_output: typing.Union[bool, typing.List] = False,
+        disable_file_output: bool = False,
         init_params: typing.Dict[str, typing.Any] = None,
         budget_type: str = None,
         ta: typing.Optional[typing.Callable] = None,
diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py
index 1478f83b5..c00965bbb 100644
--- a/autoPyTorch/optimizer/smbo.py
+++ b/autoPyTorch/optimizer/smbo.py
@@ -97,7 +97,7 @@ def __init__(self,
                  resampling_strategy_args: typing.Optional[typing.Dict[str, typing.Any]] = None,
                  include: typing.Optional[typing.Dict[str, typing.Any]] = None,
                  exclude: typing.Optional[typing.Dict[str, typing.Any]] = None,
-                 disable_file_output: typing.Union[bool, typing.List] = [],
+                 disable_file_output: typing.List = [],
                  smac_scenario_args: typing.Optional[typing.Dict[str, typing.Any]] = None,
                  get_smac_object_callback: typing.Optional[typing.Callable] = None,
                  all_supported_metrics: bool = True,
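Together with the `_search` change in `base_task.py`, `disable_file_output` is now always a list at the API level (naming the artefacts to skip), while only the executor in `tae.py` still receives a bool. A hedged sketch of the resulting call convention, reusing the estimator and data from the earlier sketch and assuming the public `search()` forwards this keyword the way `_search` declares it:

```python
# Old convention (removed in this diff): disable_file_output=True or False.
# New convention: list the artefacts to skip; an empty list keeps everything.
estimator.search(
    optimize_metric='accuracy',
    X_train=X_train, y_train=y_train,
    disable_file_output=['pipeline'],  # e.g. do not persist fitted pipelines
)
```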
diff --git a/examples/tabular/40_advanced/example_single_configuration.py b/examples/tabular/40_advanced/example_single_configuration.py
deleted file mode 100644
index f9aa27278..000000000
--- a/examples/tabular/40_advanced/example_single_configuration.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# -*- encoding: utf-8 -*-
-"""
-==========================
-Fit a single configuration
-==========================
-*Auto-PyTorch* searches for the best combination of machine learning algorithms
-and their hyper-parameter configuration for a given task.
-
-This example shows how one can fit one of these pipelines, both, with a user defined
-configuration, and a randomly sampled one form the configuration space.
-
-The pipelines that Auto-PyTorch fits are compatible with Scikit-Learn API. You can
-get further documentation about Scikit-Learn models here: _
-"""
-import os
-import tempfile as tmp
-import warnings
-
-os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
-os.environ['OMP_NUM_THREADS'] = '1'
-os.environ['OPENBLAS_NUM_THREADS'] = '1'
-os.environ['MKL_NUM_THREADS'] = '1'
-
-warnings.simplefilter(action='ignore', category=UserWarning)
-warnings.simplefilter(action='ignore', category=FutureWarning)
-
-import sklearn.datasets
-import sklearn.metrics
-
-from autoPyTorch.api.tabular_classification import TabularClassificationTask
-from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes
-
-
-if __name__ == '__main__':
-    ############################################################################
-    # Data Loading
-    # ============
-
-    X, y = sklearn.datasets.fetch_openml(data_id=3, return_X_y=True, as_frame=True)
-    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
-        X, y, test_size=0.5, random_state=3
-    )
-
-    ############################################################################
-    # Define an estimator
-    # ============================
-
-    # Search for a good configuration
-    estimator = TabularClassificationTask(
-        resampling_strategy=HoldoutValTypes.holdout_validation,
-        resampling_strategy_args={'val_share': 0.33}
-    )
-
-    ############################################################################
-    # Get a random configuration of the pipeline for current dataset
-    # ===============================================================
-
-    dataset = estimator.get_dataset(X_train=X_train,
-                                    y_train=y_train,
-                                    X_test=X_test,
-                                    y_test=y_test)
-    configuration = estimator.get_search_space(dataset).get_default_configuration()
-
-    ###########################################################################
-    # Fit the configuration
-    # ==================================
-
-    pipeline, run_info, run_value, dataset = estimator.fit_pipeline(X_train=X_train, y_train=y_train,
-                                                                    dataset_name='kr-vs-kp',
-                                                                    X_test=X_test, y_test=y_test,
-                                                                    disable_file_output=False,
-                                                                    configuration=configuration
-                                                                    )
-
-    # This object complies with Scikit-Learn Pipeline API.
-    # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
-    print(pipeline.named_steps)
-
-    # The fit_pipeline command also returns a named tuple with the pipeline constraints
-    print(run_info)
-
-    # The fit_pipeline command also returns a named tuple with train/test performance
-    print(run_value)
-
-    print("Passed Configuration:", pipeline.config)
-    print("Network:", pipeline.named_steps['network'].network)
diff --git a/setup.py b/setup.py
index a3055b41c..30a9a0697 100755
--- a/setup.py
+++ b/setup.py
@@ -48,8 +48,7 @@
         "codecov",
         "pep8",
         "mypy",
-        "openml",
-        "pytest-forked"
+        "openml"
     ],
     "examples": [
         "matplotlib",
diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py
index 20633db1e..7462a75d7 100644
--- a/test/test_api/test_api.py
+++ b/test/test_api/test_api.py
@@ -1,50 +1,48 @@
 import os
 import pickle
 import sys
-import tempfile
 import time
 import unittest
 from test.test_api.api_utils import print_debug_information
 
-from ConfigSpace.configuration_space import Configuration
-
 import numpy as np
 
 import pandas as pd
 
 import pytest
+
 import sklearn
 import sklearn.datasets
-from sklearn.base import BaseEstimator, clone
+from sklearn.base import clone
 from sklearn.ensemble import VotingClassifier, VotingRegressor
 
-from smac.runhistory.runhistory import RunHistory, RunInfo, RunValue
+from smac.runhistory.runhistory import RunHistory
 
 import torch
 
 from autoPyTorch.api.tabular_classification import TabularClassificationTask
 from autoPyTorch.api.tabular_regression import TabularRegressionTask
-from autoPyTorch.datasets.base_dataset import BaseDataset
 from autoPyTorch.datasets.resampling_strategy import (
     CrossValTypes,
     HoldoutValTypes,
 )
 from autoPyTorch.optimizer.smbo import AutoMLSMBO
-from autoPyTorch.pipeline.base_pipeline import BasePipeline
 from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy
 
+
 # Fixtures
 # ========
 
 
 # Test
 # ========
-@pytest.mark.parametrize('openml_id', (40981,))
+@pytest.mark.parametrize('openml_id', (40981, ))
 @pytest.mark.parametrize('resampling_strategy', (HoldoutValTypes.holdout_validation,
                                                  CrossValTypes.k_fold_cross_validation,
                                                  ))
 def test_tabular_classification(openml_id, resampling_strategy, backend):
+
     # Get the data and check that contents of data-manager make sense
     X, y = sklearn.datasets.fetch_openml(
         data_id=int(openml_id),
         return_X_y=True, as_frame=True
     )
@@ -199,11 +197,12 @@ def test_tabular_classification(openml_id, resampling_strategy, backend):
     restored_estimator.predict(X_test)
 
 
-@pytest.mark.parametrize('openml_name', ("boston",))
+@pytest.mark.parametrize('openml_name', ("boston", ))
 @pytest.mark.parametrize('resampling_strategy', (HoldoutValTypes.holdout_validation,
                                                  CrossValTypes.k_fold_cross_validation,
                                                  ))
 def test_tabular_regression(openml_name, resampling_strategy, backend):
+
     # Get the data and check that contents of data-manager make sense
     X, y = sklearn.datasets.fetch_openml(
         openml_name,
@@ -453,103 +452,3 @@ def test_do_dummy_prediction(dask_client, fit_dictionary_tabular):
     estimator._clean_logger()
 
     del estimator
-
-
-@pytest.mark.parametrize("disable_file_output", [True, False])
-@pytest.mark.parametrize('openml_id', (40984,))
-@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args',
-                         ((HoldoutValTypes.holdout_validation, {'val_share': 0.8}),
-                          (CrossValTypes.k_fold_cross_validation, {'num_splits': 2})
-                          )
-                         )
-def test_pipeline_fit(openml_id,
-                      resampling_strategy,
-                      resampling_strategy_args,
-                      backend,
-                      disable_file_output):
-    # Get the data and check that contents of data-manager make sense
-    X, y = sklearn.datasets.fetch_openml(
-        data_id=int(openml_id),
-        return_X_y=True, as_frame=True
-    )
-    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
-        X, y, random_state=1)
-
-    # Search for a good configuration
-    estimator = TabularClassificationTask(
-        backend=backend,
-        resampling_strategy=resampling_strategy,
-    )
-
-    dataset = estimator.get_dataset(X_train=X_train,
-                                    y_train=y_train,
-                                    X_test=X_test,
-                                    y_test=y_test,
-                                    resampling_strategy=resampling_strategy,
-                                    resampling_strategy_args=resampling_strategy_args)
-
-    configuration = estimator.get_search_space(dataset).get_default_configuration()
-    pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset,
-                                                                    configuration=configuration,
-                                                                    run_time_limit_secs=50,
-                                                                    disable_file_output=disable_file_output
-                                                                    )
-    assert isinstance(dataset, BaseDataset)
-    assert isinstance(run_info, RunInfo)
-    assert isinstance(run_info.config, Configuration)
-
-    assert isinstance(run_value, RunValue)
-    assert 'SUCCESS' in str(run_value.status)
-
-    if not disable_file_output:
-        if resampling_strategy in CrossValTypes:
-            pytest.skip("Bug, Can't predict with cross validation pipeline")
-            assert isinstance(pipeline, BaseEstimator)
-            X_test = dataset.test_tensors[0]
-            preds = pipeline.predict(X_test)
-            assert isinstance(preds, np.ndarray)
-
-            score = accuracy(dataset.test_tensors[1], preds)
-            assert isinstance(score, float)
-            assert score > 0.8
-        else:
-            assert isinstance(pipeline, BasePipeline)
-            # To make sure we fitted the model, there should be a
-            # run summary object with accuracy
-            run_summary = pipeline.named_steps['trainer'].run_summary
-            assert run_summary is not None
-            X_test = dataset.test_tensors[0]
-            preds = pipeline.predict(X_test)
-            assert isinstance(preds, np.ndarray)
-
-            score = accuracy(dataset.test_tensors[1], preds)
-            assert isinstance(score, float)
-            assert score > 0.8
-    else:
-        assert pipeline is None
-        assert run_value.cost < 0.2
-
-    # Make sure that the pipeline can be pickled
-    dump_file = os.path.join(tempfile.gettempdir(), 'automl.dump.pkl')
-    with open(dump_file, 'wb') as f:
-        pickle.dump(pipeline, f)
-
-    num_run_dir = estimator._backend.get_numrun_directory(
-        run_info.seed, run_value.additional_info['num_run'], budget=50.0)
-
-    cv_model_path = os.path.join(num_run_dir, estimator._backend.get_cv_model_filename(
-        run_info.seed, run_value.additional_info['num_run'], budget=50.0))
-    model_path = os.path.join(num_run_dir, estimator._backend.get_model_filename(
-        run_info.seed, run_value.additional_info['num_run'], budget=50.0))
-
-    if disable_file_output:
-        # No file output is expected
-        assert not os.path.exists(num_run_dir)
-    else:
-        # We expect the model path always
-        # And the cv model only on 'cv'
-        assert os.path.exists(model_path)
-        if resampling_strategy in CrossValTypes:
-            assert os.path.exists(cv_model_path)
-        elif resampling_strategy in HoldoutValTypes:
-            assert not os.path.exists(cv_model_path)
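The deleted `test_pipeline_fit` covered the removed `fit_pipeline()` API. Roughly equivalent coverage against the new `fit()` entry point might look like the sketch below; `estimator` and `dataset` are assumed to be built as in the `search()` methods above, and the epoch budget is illustrative:

```python
import numpy as np

from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy

# fit() returns the in-memory pipeline, so its trainer summary and test-set
# predictions can be checked directly instead of loading models from disk.
pipeline = estimator.fit(dataset=dataset,
                         budget_config={'budget_type': 'epochs', 'epochs': 5})

assert pipeline.named_steps['trainer'].run_summary is not None
preds = pipeline.predict(dataset.test_tensors[0])
assert isinstance(preds, np.ndarray)
assert isinstance(accuracy(dataset.test_tensors[1], preds), float)
```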