[ADD] Subsampling Dataset #398

Merged · 9 commits · Mar 9, 2022
41 changes: 40 additions & 1 deletion autoPyTorch/api/base_task.py
@@ -39,6 +39,7 @@
STRING_TO_TASK_TYPES,
)
from autoPyTorch.data.base_validator import BaseInputValidator
from autoPyTorch.data.utils import DatasetCompressionSpec
from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
@@ -299,6 +300,7 @@ def _get_dataset_input_validator(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
) -> Tuple[BaseDataset, BaseInputValidator]:
"""
Returns an object of a child class of `BaseDataset` and
@@ -323,6 +325,9 @@ def _get_dataset_input_validator(
in ```datasets/resampling_strategy.py```.
dataset_name (Optional[str]):
name of the dataset, used as experiment name.
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more information, see the
documentation of `BaseTask.get_dataset`.

Returns:
BaseDataset:
@@ -341,6 +346,7 @@ def get_dataset(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
) -> BaseDataset:
"""
Returns an object of a child class of `BaseDataset` according to the current task.
@@ -363,6 +369,38 @@ def get_dataset(
in ```datasets/resampling_strategy.py```.
dataset_name (Optional[str]):
name of the dataset, used as experiment name.
dataset_compression (Optional[DatasetCompressionSpec]):
We compress datasets so that they fit into some predefined amount of memory.
You can pass your own configuration with the keys below, choosing from the
available ``"methods"``. The available options are described here:

**memory_allocation**
The memory budget the dataset should fit into. It can be either an int
(absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``) or a
float (a fraction of the total memory limit).
The memory used by the dataset is checked after each reduction method is
performed. If the dataset fits into the allocated memory, any further methods
listed in ``"methods"`` will not be performed.

**methods**
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` -
We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
* ``"subsample"`` -
We subsample data such that it **fits directly into
the memory allocation** ``memory_allocation * memory_limit``.
Therefore, this should likely be the last method listed in
``"methods"``.
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.

Returns:
BaseDataset:
@@ -375,7 +413,8 @@ def get_dataset(
y_test=y_test,
resampling_strategy=resampling_strategy,
resampling_strategy_args=resampling_strategy_args,
dataset_name=dataset_name)
dataset_name=dataset_name,
dataset_compression=dataset_compression)

return dataset

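For orientation, here is a minimal sketch of how the ``dataset_compression`` spec documented in ``get_dataset`` above might be used. The synthetic data, the ``TabularClassificationTask`` entry point and the 64 MB budget are illustrative assumptions, not part of this diff.

```python
import numpy as np
from sklearn.datasets import make_classification

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# Synthetic data purely for illustration.
X, y = make_classification(n_samples=2000, n_features=20, random_state=0)

# Compression spec with the two documented keys: an absolute 64 MB budget and
# both reduction methods, applied in the order given ("subsample" last, as the
# docstring recommends).
dataset_compression = {
    "memory_allocation": 64,               # int -> absolute memory in MB
    "methods": ["precision", "subsample"],
}

api = TabularClassificationTask()
dataset = api.get_dataset(
    X_train=X,
    y_train=y,
    dataset_compression=dataset_compression,
)
```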
34 changes: 24 additions & 10 deletions autoPyTorch/api/tabular_classification.py
@@ -12,7 +12,8 @@
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.data.utils import (
get_dataset_compression_mapping
DatasetCompressionSpec,
get_dataset_compression_mapping,
)
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
@@ -166,7 +167,7 @@ def _get_dataset_input_validator(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -190,6 +191,10 @@ def _get_dataset_input_validator(
in ```datasets/resampling_strategy.py```.
dataset_name (Optional[str]):
name of the dataset, used as experiment name.
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more information, see the
documentation of `BaseTask.get_dataset`.

Returns:
TabularDataset:
the dataset object.
@@ -396,14 +401,23 @@ def search(
listed in ``"methods"`` will not be performed.

**methods**
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` - We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` -
We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
* ``"subsample"`` -
We subsample data such that it **fits directly into
the memory allocation** ``memory_allocation * memory_limit``.
Therefore, this should likely be the last method listed in
``"methods"``.
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.

Returns:
self
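The stratified subsampling these docstrings describe (every label keeps at least one occurrence) could look roughly like the following. This is a hypothetical helper for illustration only, not the implementation this PR adds to ``autoPyTorch.data.utils``.

```python
import numpy as np


def subsample_stratified(X: np.ndarray, y: np.ndarray, n_samples: int, seed: int = 42):
    """Illustrative stratified subsample: keep one row per label, then fill the
    remaining budget roughly proportionally to the class frequencies."""
    rng = np.random.RandomState(seed)
    classes, counts = np.unique(y, return_counts=True)
    budget = max(n_samples - len(classes), 0)   # rows left after the per-label guarantee
    keep = []
    for cls, count in zip(classes, counts):
        cls_idx = np.flatnonzero(y == cls)
        rng.shuffle(cls_idx)
        # 1 guaranteed row plus this class's share of the remaining budget
        take = 1 + int(round(budget * count / len(y)))
        keep.append(cls_idx[: min(take, count)])
    idx = np.concatenate(keep)
    return X[idx], y[idx]


X = np.random.rand(10_000, 5)
y = np.random.randint(0, 3, size=10_000)
X_sub, y_sub = subsample_stratified(X, y, n_samples=1_000)
print(X_sub.shape, np.unique(y_sub))            # every label is still present
```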
33 changes: 23 additions & 10 deletions autoPyTorch/api/tabular_regression.py
@@ -12,7 +12,8 @@
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.data.utils import (
get_dataset_compression_mapping
DatasetCompressionSpec,
get_dataset_compression_mapping,
)
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
@@ -167,7 +168,7 @@ def _get_dataset_input_validator(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -191,6 +192,9 @@ def _get_dataset_input_validator(
in ```datasets/resampling_strategy.py```.
dataset_name (Optional[str]):
name of the dataset, used as experiment name.
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more information, see the
documentation of `BaseTask.get_dataset`.
Returns:
TabularDataset:
the dataset object.
@@ -397,14 +401,23 @@ def search(
listed in ``"methods"`` will not be performed.

**methods**
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` - We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` -
We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
* ``"subsample"`` -
We subsample data such that it **fits directly into
the memory allocation** ``memory_allocation * memory_limit``.
Therefore, this should likely be the last method listed in
``"methods"``.
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.

Returns:
self
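As a rough sketch of what the ``"precision"`` method amounts to for the two dense containers the docstrings list; this is plain NumPy/pandas, not the library's own code.

```python
import numpy as np
import pandas as pd

# NumPy arrays: drop one precision level, e.g. float64 -> float32.
X_np = np.random.rand(1_000, 5)                    # float64 by default
X_np_small = X_np.astype(np.float32)

# pandas DataFrames: downcast every numeric column to the lowest float
# precision that can hold the values, via pd.to_numeric(downcast="float").
X_df = pd.DataFrame(X_np, columns=[f"f{i}" for i in range(5)])
X_df_small = X_df.apply(lambda col: pd.to_numeric(col, downcast="float"))

print(X_np_small.dtype)             # float32
print(X_df_small.dtypes.unique())   # [dtype('float32')]
```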
42 changes: 2 additions & 40 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -1,6 +1,6 @@
import functools
from logging import Logger
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast
from typing import Dict, List, Optional, Tuple, Union, cast

import numpy as np

@@ -18,11 +18,6 @@
from sklearn.pipeline import make_pipeline

from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
from autoPyTorch.data.utils import (
DatasetCompressionInputType,
DatasetDTypeContainerType,
reduce_dataset_size_if_too_large
)
from autoPyTorch.utils.common import ispandas
from autoPyTorch.utils.logging_ import PicklableClientLogger

@@ -103,10 +98,7 @@ class TabularFeatureValidator(BaseFeatureValidator):
def __init__(
self,
logger: Optional[Union[PicklableClientLogger, Logger]] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
) -> None:
self._dataset_compression = dataset_compression
self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
):
super().__init__(logger)

@staticmethod
@@ -290,38 +282,8 @@ def transform(
"numerical or categorical values.")
raise e

X = self._compress_dataset(X)

return X

# TODO: modify once we have added subsampling as well.
def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressionInputType:
"""
Compress the dataset. This function ensures that
the testing data is converted to the same dtype as
the training data.


Args:
X (DatasetCompressionInputType):
Dataset

Returns:
DatasetCompressionInputType:
Compressed dataset.
"""
is_dataframe = ispandas(X)
is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
if not is_reducible_type or self._dataset_compression is None:
return X
elif self._reduced_dtype is not None:
X = X.astype(self._reduced_dtype)
return X
else:
X = reduce_dataset_size_if_too_large(X, **self._dataset_compression)
self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
return X

def _check_data(
self,
X: SupportedFeatTypes,
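With the compression logic moved out of ``TabularFeatureValidator`` and into ``TabularInputValidator`` (next file), features and targets can be reduced together, which the stratified subsampling needs. A hedged usage sketch follows, assuming the constructor shown in this diff and the ``fit(X_train, y_train)`` signature of ``BaseInputValidator``; in the API flow the spec is normally normalised by ``get_dataset_compression_mapping`` first, so an already-absolute budget in MB is used here.

```python
import numpy as np
from sklearn.datasets import make_classification

from autoPyTorch.data.tabular_validator import TabularInputValidator

# Synthetic data deliberately larger than the 1 MB budget below.
X, y = make_classification(n_samples=20_000, n_features=40, random_state=0)

validator = TabularInputValidator(
    is_classification=True,
    dataset_compression={
        "memory_allocation": 1,                 # 1 MB budget (int -> absolute MB)
        "methods": ["precision", "subsample"],
    },
)
validator.fit(X_train=X, y_train=y)

# After this PR, transform returns the (possibly reduced) features and targets.
X_red, y_red = validator.transform(X, y)
print(X_red.dtype, X_red.shape, y_red.shape)
```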
80 changes: 73 additions & 7 deletions autoPyTorch/data/tabular_validator.py
@@ -1,10 +1,21 @@
# -*- encoding: utf-8 -*-
import logging
from typing import Any, Mapping, Optional, Union
from typing import Optional, Tuple, Union

import numpy as np

from scipy.sparse import issparse

from autoPyTorch.data.base_validator import BaseInputValidator
from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator
from autoPyTorch.data.tabular_target_validator import TabularTargetValidator
from autoPyTorch.data.tabular_feature_validator import SupportedFeatTypes, TabularFeatureValidator
from autoPyTorch.data.tabular_target_validator import SupportedTargetTypes, TabularTargetValidator
from autoPyTorch.data.utils import (
DatasetCompressionInputType,
DatasetCompressionSpec,
DatasetDTypeContainerType,
reduce_dataset_size_if_too_large
)
from autoPyTorch.utils.common import ispandas
from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger


@@ -27,16 +38,22 @@ class TabularInputValidator(BaseInputValidator):
target_validator (TargetValidator):
A TargetValidator instance used to validate and encode (in case of classification)
the target values
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more information, see the
documentation of `BaseTask.get_dataset`.
"""
def __init__(
self,
is_classification: bool = False,
logger_port: Optional[int] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
) -> None:
dataset_compression: Optional[DatasetCompressionSpec] = None,
seed: int = 42,
):
self.dataset_compression = dataset_compression
self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
self.is_classification = is_classification
self.logger_port = logger_port
self.dataset_compression = dataset_compression
self.seed = seed
if self.logger_port is not None:
self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger(
name='Validation',
@@ -46,10 +63,59 @@ def __init__(
self.logger = logging.getLogger('Validation')

self.feature_validator = TabularFeatureValidator(
dataset_compression=self.dataset_compression,
logger=self.logger)
self.target_validator = TabularTargetValidator(
is_classification=self.is_classification,
logger=self.logger
)
self._is_fitted = False

def _compress_dataset(
self,
X: DatasetCompressionInputType,
y: SupportedTargetTypes,
) -> DatasetCompressionInputType:
"""
Compress the dataset. This function ensures that
the testing data is converted to the same dtype as
the training data.
See `autoPyTorch.data.utils.reduce_dataset_size_if_too_large`
for more information.

Args:
X (DatasetCompressionInputType):
features of dataset
y (SupportedTargetTypes):
targets of dataset
Returns:
DatasetCompressionInputType:
Compressed dataset.
"""
is_dataframe = ispandas(X)
is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
if not is_reducible_type or self.dataset_compression is None:
return X, y
elif self._reduced_dtype is not None:
X = X.astype(self._reduced_dtype)
return X, y
else:
X, y = reduce_dataset_size_if_too_large(
X,
y=y,
is_classification=self.is_classification,
random_state=self.seed,
**self.dataset_compression # type: ignore [arg-type]
)
self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
return X, y

def transform(
self,
X: SupportedFeatTypes,
y: Optional[SupportedTargetTypes] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:

X, y = super().transform(X, y)
X_reduced, y_reduced = self._compress_dataset(X, y)

return X_reduced, y_reduced
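
Finally, a small sketch of the size check both docstrings refer to ("the memory used by the dataset is checked after each reduction method is performed"); ``megabytes`` is a hypothetical helper name here, not necessarily what ``autoPyTorch.data.utils`` calls it.

```python
import numpy as np
import pandas as pd
from scipy.sparse import issparse


def megabytes(X) -> float:
    """Approximate memory footprint of X in MB (hypothetical helper)."""
    if isinstance(X, np.ndarray):
        nbytes = X.nbytes
    elif issparse(X):
        # CSR/CSC layout: data array plus the two index arrays.
        nbytes = X.data.nbytes + X.indices.nbytes + X.indptr.nbytes
    elif isinstance(X, pd.DataFrame):
        nbytes = int(X.memory_usage(deep=True).sum())
    else:
        nbytes = np.asarray(X).nbytes
    return nbytes / float(1 << 20)


X = np.random.rand(50_000, 20)               # ~7.6 MB of float64
print(megabytes(X))                          # compare against "memory_allocation"
print(megabytes(X.astype(np.float32)))       # the "precision" step halves it
```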