Skip to content

Commit

Permalink
[FIX] Fixes for Tabular Regression (#235)
Browse files Browse the repository at this point in the history
* Fix hardcoded score and add score in classification. Also set torch seed in regression

* Enabled traditional estimators for regression, with tests and examples

* added documentation, test in progress

* Remove traditional pipeline as it is introduced in PR #224

* fix flake and mypy

* Address comments from Francisco

* Increase number of samples and add torch seed to base training evaluate

* Fix mypy and flake

* in progress

* Address comments from shuhei

* add coverage for base task pipeline config, and fix tests

* Fix flake

* revert import statement to fix the patch

* fix flake

* increase coverage

* Increase coverage for api fit

* Add coverage for errors while predicting

* Update documentation for tabular regression task and installation instructions
  • Loading branch information
ravinkohli committed Jun 16, 2021
1 parent 1818445 commit 3995391
Show file tree
Hide file tree
Showing 38 changed files with 1,302 additions and 907 deletions.
15 changes: 8 additions & 7 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
include requirements.txt
include autoPyTorch/utils/logging.yaml
include autoPyTorch/configs/default_pipeline_options.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/catboost.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/rotation_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/random_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/knn.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/svm.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/extra_trees.json
include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/lgb.json
include autoPyTorch/configs/greedy_portfolio.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json
include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json
31 changes: 14 additions & 17 deletions autoPyTorch/api/base_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash
from autoPyTorch.optimizer.smbo import AutoMLSMBO
from autoPyTorch.pipeline.base_pipeline import BasePipeline
from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models import get_available_classifiers
from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import get_available_traditional_learners
from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score, get_metrics
from autoPyTorch.utils.common import FitRequirement, replace_string_bool_to_bool
Expand Down Expand Up @@ -590,7 +590,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs:
memory_limit = self._memory_limit
if memory_limit is not None:
memory_limit = int(math.ceil(memory_limit))
available_classifiers = get_available_classifiers()
available_classifiers = get_available_traditional_learners()
dask_futures = []

total_number_classifiers = len(available_classifiers)
Expand Down Expand Up @@ -892,21 +892,18 @@ def _search(
# ============> Run traditional ml

if enable_traditional_pipeline:
if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS:
self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...")
else:
traditional_task_name = 'runTraditional'
self._stopwatch.start_task(traditional_task_name)
elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
# We want time for at least 1 Neural network in SMAC
time_for_traditional = int(
self._time_for_task - elapsed_time - func_eval_time_limit_secs
)
self._do_traditional_prediction(
func_eval_time_limit_secs=func_eval_time_limit_secs,
time_left=time_for_traditional,
)
self._stopwatch.stop_task(traditional_task_name)
traditional_task_name = 'runTraditional'
self._stopwatch.start_task(traditional_task_name)
elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
# We want time for at least 1 Neural network in SMAC
time_for_traditional = int(
self._time_for_task - elapsed_time - func_eval_time_limit_secs
)
self._do_traditional_prediction(
func_eval_time_limit_secs=func_eval_time_limit_secs,
time_left=time_for_traditional,
)
self._stopwatch.stop_task(traditional_task_name)

# ============> Starting ensemble
elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
Expand Down
10 changes: 7 additions & 3 deletions autoPyTorch/api/tabular_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def search(
budget: Optional[float] = None,
total_walltime_limit: int = 100,
func_eval_time_limit_secs: Optional[int] = None,
enable_traditional_pipeline: bool = False,
enable_traditional_pipeline: bool = True,
memory_limit: Optional[int] = 4096,
smac_scenario_args: Optional[Dict[str, Any]] = None,
get_smac_object_callback: Optional[Callable] = None,
Expand Down Expand Up @@ -151,7 +151,7 @@ def search(
total_walltime_limit // 2 to allow enough time to fit
at least 2 individual machine learning algorithms.
Set to np.inf in case no time limit is desired.
enable_traditional_pipeline (bool), (default=False):
enable_traditional_pipeline (bool), (default=True):
If True, traditional machine learning estimators (e.g.
CatBoost, Random Forest) are fitted before the neural
architecture search begins.
memory_limit (Optional[int]), (default=4096): Memory
Expand Down Expand Up @@ -187,7 +187,11 @@ def search(
configurations, similar to (...herepathtogreedy...).
Additionally, the keyword 'greedy' is supported,
which would use the default portfolio from
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`.
Although portfolio selection is supported for tabular
regression, the portfolio has been built using
classification datasets. We will update the portfolio
to cover tabular regression datasets in the future.
Returns:
self
Expand Down
100 changes: 81 additions & 19 deletions autoPyTorch/evaluation/abstract_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import autoPyTorch.pipeline.tabular_classification
import autoPyTorch.pipeline.tabular_regression
import autoPyTorch.pipeline.traditional_tabular_classification
import autoPyTorch.pipeline.traditional_tabular_regression
from autoPyTorch.automl_common.common.utils.backend import Backend
from autoPyTorch.constants import (
CLASSIFICATION_TASKS,
Expand Down Expand Up @@ -64,7 +65,7 @@ class MyTraditionalTabularClassificationPipeline(BaseEstimator):
Attributes:
dataset_properties (Dict[str, Any]):
A dictionary containing dataset specific information
random_state (Optional[Union[int, np.random.RandomState]]):
random_state (Optional[np.random.RandomState]):
Object that contains a seed and allows for reproducible results
init_params (Optional[Dict]):
An optional dictionary that is passed to the pipeline's steps. It complies
Expand All @@ -73,18 +74,18 @@ class MyTraditionalTabularClassificationPipeline(BaseEstimator):

def __init__(self, config: str,
dataset_properties: Dict[str, Any],
random_state: Optional[Union[int, np.random.RandomState]] = None,
random_state: Optional[np.random.RandomState] = None,
init_params: Optional[Dict] = None):
self.config = config
self.dataset_properties = dataset_properties
self.random_state = random_state
self.init_params = init_params
self.pipeline = autoPyTorch.pipeline.traditional_tabular_classification.\
self.pipeline = autoPyTorch.pipeline.traditional_tabular_classification. \
TraditionalTabularClassificationPipeline(dataset_properties=dataset_properties,
random_state=self.random_state)
configuration_space = self.pipeline.get_hyperparameter_search_space()
default_configuration = configuration_space.get_default_configuration().get_dictionary()
default_configuration['model_trainer:tabular_classifier:classifier'] = config
default_configuration['model_trainer:tabular_traditional_model:traditional_learner'] = config
self.configuration = Configuration(configuration_space, default_configuration)
self.pipeline.set_hyperparameters(self.configuration)

Expand All @@ -100,18 +101,15 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame],
batch_size: int = 1000) -> np.array:
return self.pipeline.predict(X, batch_size=batch_size)

def estimator_supports_iterative_fit(self) -> bool: # pylint: disable=R0201
return False

def get_additional_run_info(self) -> Dict[str, Any]: # pylint: disable=R0201
def get_additional_run_info(self) -> Dict[str, Any]:
"""
Can be used to return additional info for the run.
Returns:
Dict[str, Any]:
Currently contains
1. pipeline_configuration: the configuration of the pipeline, i.e, the traditional model used
2. trainer_configuration: the parameters for the traditional model used.
Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs
Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs
"""
return {'pipeline_configuration': self.configuration,
'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config(),
Expand All @@ -126,6 +124,71 @@ def get_default_pipeline_options() -> Dict[str, Any]:
TraditionalTabularClassificationPipeline.get_default_pipeline_options()


class MyTraditionalTabularRegressionPipeline(BaseEstimator):
    """
    Wrapper exposing a traditional-ML regression pipeline through the
    estimator interface used for stored inference objects.

    Traditional estimators (e.g. CatBoost, Random Forest) are fitted
    before neural architecture search; an instance of this class wraps
    one such pipeline and is the final object persisted for inference.

    Attributes:
        dataset_properties (Dict[str, Any]):
            Dataset-specific information used to build the pipeline.
        random_state (Optional[np.random.RandomState]):
            Object that contains a seed and allows for reproducible results.
        init_params (Optional[Dict]):
            Optional dictionary forwarded to the pipeline's steps,
            fulfilling a role similar to kwargs.
    """
    def __init__(self, config: str,
                 dataset_properties: Dict[str, Any],
                 random_state: Optional[np.random.RandomState] = None,
                 init_params: Optional[Dict] = None):
        self.config = config
        self.dataset_properties = dataset_properties
        self.random_state = random_state
        self.init_params = init_params
        # Build the underlying traditional tabular-regression pipeline.
        regression_module = autoPyTorch.pipeline.traditional_tabular_regression
        self.pipeline = regression_module.TraditionalTabularRegressionPipeline(
            dataset_properties=dataset_properties,
            random_state=self.random_state)
        # Pin the requested traditional learner inside the default configuration.
        search_space = self.pipeline.get_hyperparameter_search_space()
        hyperparameters = search_space.get_default_configuration().get_dictionary()
        hyperparameters['model_trainer:tabular_traditional_model:traditional_learner'] = config
        self.configuration = Configuration(search_space, hyperparameters)
        self.pipeline.set_hyperparameters(self.configuration)

    def fit(self, X: Dict[str, Any], y: Any,
            sample_weight: Optional[np.ndarray] = None) -> object:
        """Fit the wrapped pipeline.

        NOTE(review): `sample_weight` is accepted for interface compatibility
        but is not forwarded to the underlying pipeline.
        """
        return self.pipeline.fit(X, y)

    def predict(self, X: Union[np.ndarray, pd.DataFrame],
                batch_size: int = 1000) -> np.array:
        """Predict targets for `X`, processing `batch_size` rows at a time."""
        return self.pipeline.predict(X, batch_size=batch_size)

    def get_additional_run_info(self) -> Dict[str, Any]:
        """
        Can be used to return additional info for the run.

        Returns:
            Dict[str, Any]: currently contains
                1. pipeline_configuration: the configuration of the pipeline,
                   i.e. the traditional model used
                2. trainer_configuration: the parameters for the traditional
                   model used. Can be found in
                   autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs
        """
        model = self.pipeline.named_steps['model_trainer'].choice.model
        return {'pipeline_configuration': self.configuration,
                'trainer_configuration': model.get_config()}

    def get_pipeline_representation(self) -> Dict[str, str]:
        """Delegate to the wrapped pipeline's human-readable representation."""
        return self.pipeline.get_pipeline_representation()

    @staticmethod
    def get_default_pipeline_options() -> Dict[str, Any]:
        """Return the default pipeline options of the wrapped pipeline class."""
        module = autoPyTorch.pipeline.traditional_tabular_regression
        return module.TraditionalTabularRegressionPipeline.get_default_pipeline_options()


class DummyClassificationPipeline(DummyClassifier):
"""
A wrapper class that holds a pipeline for dummy classification.
Expand Down Expand Up @@ -175,9 +238,6 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame],
new_X = np.ones((X.shape[0], 1))
return super(DummyClassificationPipeline, self).predict(new_X).astype(np.float32)

def estimator_supports_iterative_fit(self) -> bool: # pylint: disable=R0201
return False

def get_additional_run_info(self) -> Dict: # pylint: disable=R0201
return {'configuration_origin': 'DUMMY'}

Expand Down Expand Up @@ -234,12 +294,15 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame],
new_X = np.ones((X.shape[0], 1))
return super(DummyRegressionPipeline, self).predict(new_X).astype(np.float32)

def estimator_supports_iterative_fit(self) -> bool: # pylint: disable=R0201
return False

def get_additional_run_info(self) -> Dict: # pylint: disable=R0201
return {'configuration_origin': 'DUMMY'}

def get_pipeline_representation(self) -> Dict[str, str]:
return {
'Preprocessing': 'None',
'Estimator': 'Dummy',
}

@staticmethod
def get_default_pipeline_options() -> Dict[str, Any]:
return {'budget_type': 'epochs',
Expand Down Expand Up @@ -401,8 +464,7 @@ def __init__(self, backend: Backend,
if isinstance(self.configuration, int):
self.pipeline_class = DummyRegressionPipeline
elif isinstance(self.configuration, str):
raise ValueError("Only tabular classifications tasks "
"are currently supported with traditional methods")
self.pipeline_class = MyTraditionalTabularRegressionPipeline
elif isinstance(self.configuration, Configuration):
self.pipeline_class = autoPyTorch.pipeline.tabular_regression.TabularRegressionPipeline
else:
Expand All @@ -415,8 +477,7 @@ def __init__(self, backend: Backend,
if self.task_type in TABULAR_TASKS:
self.pipeline_class = MyTraditionalTabularClassificationPipeline
else:
raise ValueError("Only tabular classifications tasks "
"are currently supported with traditional methods")
raise ValueError("Only tabular tasks are currently supported with traditional methods")
elif isinstance(self.configuration, Configuration):
if self.task_type in TABULAR_TASKS:
self.pipeline_class = autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline
Expand Down Expand Up @@ -446,6 +507,7 @@ def __init__(self, backend: Backend,
'y_test': self.y_test,
'backend': self.backend,
'logger_port': logger_port,
'optimize_metric': self.metric.name
})
assert self.pipeline_class is not None, "Could not infer pipeline class"
pipeline_config = pipeline_config if pipeline_config is not None \
Expand Down

0 comments on commit 3995391

Please sign in to comment.