
[ADD] dataset compression #387

Merged (12 commits) on Feb 25, 2022
2 changes: 1 addition & 1 deletion autoPyTorch/api/base_task.py
@@ -243,7 +243,7 @@ def __init__(
if self.n_jobs == 1:
self._multiprocessing_context = 'fork'

self.InputValidator: Optional[BaseInputValidator] = None
self.input_validator: Optional[BaseInputValidator] = None

self.search_space_updates = search_space_updates
if search_space_updates is not None:
68 changes: 53 additions & 15 deletions autoPyTorch/api/tabular_classification.py
@@ -1,4 +1,4 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union

import numpy as np

@@ -11,6 +11,9 @@
TASK_TYPES_TO_STRING,
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.data.utils import (
get_dataset_compression_mapping
)
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
HoldoutValTypes,
@@ -163,6 +166,7 @@ def _get_dataset_input_validator(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -199,26 +203,27 @@ def _get_dataset_input_validator(

# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
InputValidator = TabularInputValidator(
input_validator = TabularInputValidator(
is_classification=True,
logger_port=self._logger_port,
dataset_compression=dataset_compression
)

# Fit an input validator to check the provided data
# Also, an encoder is fit to both train and test data,
# to prevent unseen categories during inference
InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

dataset = TabularDataset(
X=X_train, Y=y_train,
X_test=X_test, Y_test=y_test,
validator=InputValidator,
validator=input_validator,
resampling_strategy=resampling_strategy,
resampling_strategy_args=resampling_strategy_args,
dataset_name=dataset_name
)

return dataset, InputValidator
return dataset, input_validator

def search(
self,
@@ -234,14 +239,15 @@ def search(
total_walltime_limit: int = 100,
func_eval_time_limit_secs: Optional[int] = None,
enable_traditional_pipeline: bool = True,
memory_limit: Optional[int] = 4096,
memory_limit: int = 4096,
smac_scenario_args: Optional[Dict[str, Any]] = None,
get_smac_object_callback: Optional[Callable] = None,
all_supported_metrics: bool = True,
precision: int = 32,
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
load_models: bool = True,
portfolio_selection: Optional[str] = None,
dataset_compression: Union[Mapping[str, Any], bool] = False,
) -> 'BaseTask':
"""
Search for the best pipeline configuration for the given dataset.
@@ -310,7 +316,7 @@ def search(
feature by turning this flag to False. All machine learning
algorithms that are fitted during search() are considered for
ensemble building.
memory_limit (Optional[int]: default=4096):
memory_limit (int: default=4096):
Memory limit in MB for the machine learning algorithm.
Autopytorch will stop fitting the machine learning algorithm
if it tries to allocate more than memory_limit MB. If None
@@ -368,20 +374,52 @@ def search(
Additionally, the keyword 'greedy' is supported,
which would use the default portfolio from
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_.
dataset_compression (Union[bool, Mapping[str, Any]]: default=False):
    We compress datasets so that they fit into some predefined amount of memory.
    **NOTE**

    Default configuration when set to ``True``:

    .. code-block:: python

        {
            "memory_allocation": 0.1,
            "methods": ["precision"]
        }

    You can also pass your own configuration with the same keys, choosing
    from the available ``"methods"``. The available options are described here:

    **memory_allocation**
        By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This
        float value can be set with ``"memory_allocation": 0.1``. We also allow
        specifying absolute memory in MB, e.g. 10 MB is ``"memory_allocation": 10``.
        The memory used by the dataset is checked after each reduction method is
        performed. If the dataset fits into the allocated memory, any further methods
        listed in ``"methods"`` will not be performed.

    **methods**
        We currently provide the following methods for reducing the dataset size.
        These can be provided in a list and are performed in the order given.

        * ``"precision"`` - We reduce floating point precision as follows:

          * ``np.float128 -> np.float64``
          * ``np.float96 -> np.float64``
          * ``np.float64 -> np.float32``
          * pandas DataFrames are reduced using the ``downcast`` option of
            ``pd.to_numeric`` to the lowest possible precision.

Returns:
self

"""
self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression)

self.dataset, self.InputValidator = self._get_dataset_input_validator(
self.dataset, self.input_validator = self._get_dataset_input_validator(
X_train=X_train,
y_train=y_train,
X_test=X_test,
y_test=y_test,
resampling_strategy=self.resampling_strategy,
resampling_strategy_args=self.resampling_strategy_args,
dataset_name=dataset_name)
dataset_name=dataset_name,
dataset_compression=self._dataset_compression)

return self._search(
dataset=self.dataset,
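An illustrative usage sketch for the ``dataset_compression`` argument documented above. This is not part of the PR: the OpenML dataset id and the split are only assumptions for demonstration, while the ``search()`` parameters and the ``"memory_allocation"``/``"methods"`` keys come from the diff and docstring above.

.. code-block:: python

    # Hypothetical usage sketch; the dataset choice (OpenML data_id=40981)
    # and split are assumptions, not part of this PR.
    from sklearn.datasets import fetch_openml
    from sklearn.model_selection import train_test_split

    from autoPyTorch.api.tabular_classification import TabularClassificationTask

    X, y = fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    api = TabularClassificationTask()
    api.search(
        X_train=X_train, y_train=y_train,
        X_test=X_test, y_test=y_test,
        total_walltime_limit=300,
        memory_limit=4096,
        # Try to fit the dataset into 0.2 * memory_limit, reducing precision if needed.
        dataset_compression={"memory_allocation": 0.2, "methods": ["precision"]},
    )
    y_pred = api.predict(X_test)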
@@ -418,28 +456,28 @@ def predict(
Returns:
Array with estimator predictions.
"""
if self.InputValidator is None or not self.InputValidator._is_fitted:
if self.input_validator is None or not self.input_validator._is_fitted:
raise ValueError("predict() is only supported after calling search. Kindly call first "
"the estimator search() method.")

X_test = self.InputValidator.feature_validator.transform(X_test)
X_test = self.input_validator.feature_validator.transform(X_test)
predicted_probabilities = super().predict(X_test, batch_size=batch_size,
n_jobs=n_jobs)

if self.InputValidator.target_validator.is_single_column_target():
if self.input_validator.target_validator.is_single_column_target():
predicted_indexes = np.argmax(predicted_probabilities, axis=1)
else:
predicted_indexes = (predicted_probabilities > 0.5).astype(int)

# Allow to predict in the original domain -- that is, the user is not interested
# in our encoded values
return self.InputValidator.target_validator.inverse_transform(predicted_indexes)
return self.input_validator.target_validator.inverse_transform(predicted_indexes)

def predict_proba(self,
X_test: Union[np.ndarray, pd.DataFrame, List],
batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray:
if self.InputValidator is None or not self.InputValidator._is_fitted:
if self.input_validator is None or not self.input_validator._is_fitted:
raise ValueError("predict() is only supported after calling search. Kindly call first "
"the estimator search() method.")
X_test = self.InputValidator.feature_validator.transform(X_test)
X_test = self.input_validator.feature_validator.transform(X_test)
return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs)
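A small sketch of how ``predict()`` and ``predict_proba()`` relate under the logic in the hunk above, assuming ``api`` and ``X_test`` from the previous sketch. The manual recomputation is for illustration only and uses the renamed ``input_validator`` attribute introduced in this PR.

.. code-block:: python

    # Assumes `api` was fitted via search() and `X_test` matches the training schema.
    import numpy as np

    probs = api.predict_proba(X_test)   # class probabilities, shape (n_samples, n_classes)
    labels = api.predict(X_test)        # predictions mapped back to the original labels

    # For a single-column target, predict() amounts to an argmax over the
    # probabilities followed by the target validator's inverse transform.
    manual = api.input_validator.target_validator.inverse_transform(
        np.argmax(probs, axis=1)
    )
    print((manual == labels).all())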
64 changes: 52 additions & 12 deletions autoPyTorch/api/tabular_regression.py
@@ -1,4 +1,4 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union

import numpy as np

@@ -11,6 +11,9 @@
TASK_TYPES_TO_STRING
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.data.utils import (
get_dataset_compression_mapping
)
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
HoldoutValTypes,
@@ -164,6 +167,7 @@ def _get_dataset_input_validator(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -200,26 +204,27 @@ def _get_dataset_input_validator(

# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
InputValidator = TabularInputValidator(
input_validator = TabularInputValidator(
is_classification=False,
logger_port=self._logger_port,
dataset_compression=dataset_compression
)

# Fit an input validator to check the provided data
# Also, an encoder is fit to both train and test data,
# to prevent unseen categories during inference
InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

dataset = TabularDataset(
X=X_train, Y=y_train,
X_test=X_test, Y_test=y_test,
validator=InputValidator,
validator=input_validator,
resampling_strategy=resampling_strategy,
resampling_strategy_args=resampling_strategy_args,
dataset_name=dataset_name
)

return dataset, InputValidator
return dataset, input_validator

def search(
self,
@@ -235,14 +240,15 @@ def search(
total_walltime_limit: int = 100,
func_eval_time_limit_secs: Optional[int] = None,
enable_traditional_pipeline: bool = True,
memory_limit: Optional[int] = 4096,
memory_limit: int = 4096,
smac_scenario_args: Optional[Dict[str, Any]] = None,
get_smac_object_callback: Optional[Callable] = None,
all_supported_metrics: bool = True,
precision: int = 32,
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
load_models: bool = True,
portfolio_selection: Optional[str] = None,
dataset_compression: Union[Mapping[str, Any], bool] = False,
) -> 'BaseTask':
"""
Search for the best pipeline configuration for the given dataset.
@@ -311,7 +317,7 @@ def search(
feature by turning this flag to False. All machine learning
algorithms that are fitted during search() are considered for
ensemble building.
memory_limit (Optional[int]: default=4096):
memory_limit (int: default=4096):
Memory limit in MB for the machine learning algorithm.
Autopytorch will stop fitting the machine learning algorithm
if it tries to allocate more than memory_limit MB. If None
@@ -369,19 +375,53 @@ def search(
Additionally, the keyword 'greedy' is supported,
which would use the default portfolio from
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_.
dataset_compression (Union[bool, Mapping[str, Any]]: default=False):
    We compress datasets so that they fit into some predefined amount of memory.
    **NOTE**

    Default configuration when set to ``True``:

    .. code-block:: python

        {
            "memory_allocation": 0.1,
            "methods": ["precision"]
        }

    You can also pass your own configuration with the same keys, choosing
    from the available ``"methods"``. The available options are described here:

    **memory_allocation**
        By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This
        float value can be set with ``"memory_allocation": 0.1``. We also allow
        specifying absolute memory in MB, e.g. 10 MB is ``"memory_allocation": 10``.
        The memory used by the dataset is checked after each reduction method is
        performed. If the dataset fits into the allocated memory, any further methods
        listed in ``"methods"`` will not be performed.

    **methods**
        We currently provide the following methods for reducing the dataset size.
        These can be provided in a list and are performed in the order given.

        * ``"precision"`` - We reduce floating point precision as follows:

          * ``np.float128 -> np.float64``
          * ``np.float96 -> np.float64``
          * ``np.float64 -> np.float32``
          * pandas DataFrames are reduced using the ``downcast`` option of
            ``pd.to_numeric`` to the lowest possible precision.

Returns:
self

"""
self.dataset, self.InputValidator = self._get_dataset_input_validator(

self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression)

self.dataset, self.input_validator = self._get_dataset_input_validator(
X_train=X_train,
y_train=y_train,
X_test=X_test,
y_test=y_test,
resampling_strategy=self.resampling_strategy,
resampling_strategy_args=self.resampling_strategy_args,
dataset_name=dataset_name)
dataset_name=dataset_name,
dataset_compression=self._dataset_compression)

return self._search(
dataset=self.dataset,
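A standalone sketch of what the ``"precision"`` reduction described above amounts to for a raw numpy array and a pandas DataFrame. The real behaviour is implemented by the helpers imported from ``autoPyTorch.data.utils`` in this diff and may differ in detail; this snippet only illustrates the idea.

.. code-block:: python

    import numpy as np
    import pandas as pd

    # Standalone illustration of the "precision" method; not autoPyTorch's code path.
    X_np = np.random.rand(1000, 20)                # float64 by default
    X_np_small = X_np.astype(np.float32)           # np.float64 -> np.float32
    print(X_np.nbytes, "->", X_np_small.nbytes)    # memory roughly halved

    X_df = pd.DataFrame(X_np)
    # pandas columns are downcast to the lowest precision that still holds the values
    X_df_small = X_df.apply(pd.to_numeric, downcast="float")
    print(X_df.memory_usage(deep=True).sum(), "->",
          X_df_small.memory_usage(deep=True).sum())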
@@ -408,14 +448,14 @@ def predict(
batch_size: Optional[int] = None,
n_jobs: int = 1
) -> np.ndarray:
if self.InputValidator is None or not self.InputValidator._is_fitted:
if self.input_validator is None or not self.input_validator._is_fitted:
raise ValueError("predict() is only supported after calling search. Kindly call first "
"the estimator search() method.")

X_test = self.InputValidator.feature_validator.transform(X_test)
X_test = self.input_validator.feature_validator.transform(X_test)
predicted_values = super().predict(X_test, batch_size=batch_size,
n_jobs=n_jobs)

# Allow to predict in the original domain -- that is, the user is not interested
# in our encoded values
return self.InputValidator.target_validator.inverse_transform(predicted_values)
return self.input_validator.target_validator.inverse_transform(predicted_values)
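To round off, a mirrored sketch for the regression API using an absolute ``memory_allocation`` budget in MB, as described in the docstring above. The sklearn dataset and the chosen limits are assumptions for demonstration, not part of this PR.

.. code-block:: python

    # Hypothetical usage sketch; the sklearn dataset is an assumption, not part of this PR.
    from sklearn.datasets import fetch_california_housing
    from sklearn.model_selection import train_test_split

    from autoPyTorch.api.tabular_regression import TabularRegressionTask

    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    api = TabularRegressionTask()
    api.search(
        X_train=X_train, y_train=y_train,
        X_test=X_test, y_test=y_test,
        total_walltime_limit=300,
        memory_limit=4096,
        # Absolute budget: try to fit the dataset into 100 MB before training.
        dataset_compression={"memory_allocation": 100, "methods": ["precision"]},
    )
    y_pred = api.predict(X_test)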