[ADD] Subsampling Dataset #398

Merged · 9 commits · Mar 9, 2022
41 changes: 40 additions & 1 deletion autoPyTorch/api/base_task.py
@@ -39,6 +39,7 @@
STRING_TO_TASK_TYPES,
)
from autoPyTorch.data.base_validator import BaseInputValidator
from autoPyTorch.data.utils import DatasetCompressionSpec
from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
@@ -299,6 +300,7 @@ def _get_dataset_input_validator(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
) -> Tuple[BaseDataset, BaseInputValidator]:
"""
Returns an object of a child class of `BaseDataset` and
@@ -323,6 +325,9 @@ def _get_dataset_input_validator(
in ```datasets/resampling_strategy.py```.
dataset_name (Optional[str]):
name of the dataset, used as experiment name.
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more information, see the
documentation of `BaseTask.get_dataset`.

Returns:
BaseDataset:
@@ -341,6 +346,7 @@ def get_dataset(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
) -> BaseDataset:
"""
Returns an object of a child class of `BaseDataset` according to the current task.
@@ -363,6 +369,38 @@ def get_dataset(
in ```datasets/resampling_strategy.py```.
dataset_name (Optional[str]):
name of the dataset, used as experiment name.
dataset_compression (Optional[DatasetCompressionSpec]):
We compress datasets so that they fit into some predefined amount of memory.
You can pass your own configuration with the keys below, choosing from the
available ``"methods"``. The available options are described here:

**memory_allocation**
The memory budget the dataset should fit into. It can be either an int
(absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``) or a
float (a fraction of the total memory limit).
The memory used by the dataset is checked after each reduction method is
performed. If the dataset fits into the allocated memory, any further methods
listed in ``"methods"`` will not be performed.

**methods**
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` -
We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
* ``"subsample"`` -
We subsample data such that it **fits directly into
the memory allocation** ``memory_allocation * memory_limit``.
Therefore, this should likely be the last method listed in
``"methods"``.
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.

Returns:
BaseDataset:
@@ -375,7 +413,8 @@ def get_dataset(
y_test=y_test,
resampling_strategy=resampling_strategy,
resampling_strategy_args=resampling_strategy_args,
dataset_name=dataset_name)
dataset_name=dataset_name,
dataset_compression=dataset_compression)

return dataset

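For orientation, here is a minimal sketch of how the ``dataset_compression`` spec documented in ``get_dataset`` above might be used. The synthetic data, the ``TabularClassificationTask`` entry point and the 64 MB budget are illustrative assumptions, not part of this diff.

```python
import numpy as np
from sklearn.datasets import make_classification

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# Synthetic data purely for illustration.
X, y = make_classification(n_samples=2000, n_features=20, random_state=0)

# Compression spec with the two documented keys: an absolute 64 MB budget and
# both reduction methods, applied in the order given ("subsample" last, as the
# docstring recommends).
dataset_compression = {
    "memory_allocation": 64,               # int -> absolute memory in MB
    "methods": ["precision", "subsample"],
}

api = TabularClassificationTask()
dataset = api.get_dataset(
    X_train=X,
    y_train=y,
    dataset_compression=dataset_compression,
)
```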
34 changes: 24 additions & 10 deletions autoPyTorch/api/tabular_classification.py
@@ -12,7 +12,8 @@
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.data.utils import (
get_dataset_compression_mapping
DatasetCompressionSpec,
get_dataset_compression_mapping,
)
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
@@ -166,7 +167,7 @@ def _get_dataset_input_validator(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -190,6 +191,10 @@ def _get_dataset_input_validator(
in ```datasets/resampling_strategy.py```.
dataset_name (Optional[str]):
name of the dataset, used as experiment name.
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more information, see the
documentation of `BaseTask.get_dataset`.

Returns:
TabularDataset:
the dataset object.
@@ -396,14 +401,23 @@ def search(
listed in ``"methods"`` will not be performed.

**methods**
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` - We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` -
We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
* ``"subsample"`` -
We subsample data such that it **fits directly into
the memory allocation** ``memory_allocation * memory_limit``.
Therefore, this should likely be the last method listed in
``"methods"``.
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.

Returns:
self
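The stratified subsampling these docstrings describe (every label keeps at least one occurrence) could look roughly like the following. This is a hypothetical helper for illustration only, not the implementation this PR adds to ``autoPyTorch.data.utils``.

```python
import numpy as np


def subsample_stratified(X: np.ndarray, y: np.ndarray, n_samples: int, seed: int = 42):
    """Illustrative stratified subsample: keep one row per label, then fill the
    remaining budget roughly proportionally to the class frequencies."""
    rng = np.random.RandomState(seed)
    classes, counts = np.unique(y, return_counts=True)
    budget = max(n_samples - len(classes), 0)   # rows left after the per-label guarantee
    keep = []
    for cls, count in zip(classes, counts):
        cls_idx = np.flatnonzero(y == cls)
        rng.shuffle(cls_idx)
        # 1 guaranteed row plus this class's share of the remaining budget
        take = 1 + int(round(budget * count / len(y)))
        keep.append(cls_idx[: min(take, count)])
    idx = np.concatenate(keep)
    return X[idx], y[idx]


X = np.random.rand(10_000, 5)
y = np.random.randint(0, 3, size=10_000)
X_sub, y_sub = subsample_stratified(X, y, n_samples=1_000)
print(X_sub.shape, np.unique(y_sub))            # every label is still present
```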
33 changes: 23 additions & 10 deletions autoPyTorch/api/tabular_regression.py
@@ -12,7 +12,8 @@
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.data.utils import (
get_dataset_compression_mapping
DatasetCompressionSpec,
get_dataset_compression_mapping,
)
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
@@ -167,7 +168,7 @@ def _get_dataset_input_validator(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -191,6 +192,9 @@ def _get_dataset_input_validator(
in ```datasets/resampling_strategy.py```.
dataset_name (Optional[str]):
name of the dataset, used as experiment name.
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more information, see the
documentation of `BaseTask.get_dataset`.
Returns:
TabularDataset:
the dataset object.
@@ -397,14 +401,23 @@ def search(
listed in ``"methods"`` will not be performed.

**methods**
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` - We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` -
We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
* ``"subsample"`` -
We subsample data such that it **fits directly into
the memory allocation** ``memory_allocation * memory_limit``.
Therefore, this should likely be the last method listed in
``"methods"``.
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.

Returns:
self
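As a rough sketch of what the ``"precision"`` method amounts to for the two dense containers the docstrings list; this is plain NumPy/pandas, not the library's own code.

```python
import numpy as np
import pandas as pd

# NumPy arrays: drop one precision level, e.g. float64 -> float32.
X_np = np.random.rand(1_000, 5)                    # float64 by default
X_np_small = X_np.astype(np.float32)

# pandas DataFrames: downcast every numeric column to the lowest float
# precision that can hold the values, via pd.to_numeric(downcast="float").
X_df = pd.DataFrame(X_np, columns=[f"f{i}" for i in range(5)])
X_df_small = X_df.apply(lambda col: pd.to_numeric(col, downcast="float"))

print(X_np_small.dtype)             # float32
print(X_df_small.dtypes.unique())   # [dtype('float32')]
```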
42 changes: 2 additions & 40 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -1,6 +1,6 @@
import functools
from logging import Logger
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast
from typing import Dict, List, Optional, Tuple, Union, cast

import numpy as np

@@ -18,11 +18,6 @@
from sklearn.pipeline import make_pipeline

from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
from autoPyTorch.data.utils import (
DatasetCompressionInputType,
DatasetDTypeContainerType,
reduce_dataset_size_if_too_large
)
from autoPyTorch.utils.common import ispandas
from autoPyTorch.utils.logging_ import PicklableClientLogger

@@ -103,10 +98,7 @@ class TabularFeatureValidator(BaseFeatureValidator):
def __init__(
self,
logger: Optional[Union[PicklableClientLogger, Logger]] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
) -> None:
self._dataset_compression = dataset_compression
self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
):
super().__init__(logger)

@staticmethod
@@ -290,38 +282,8 @@ def transform(
"numerical or categorical values.")
raise e

X = self._compress_dataset(X)

return X

# TODO: modify once we have added subsampling as well.
def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressionInputType:
"""
Compress the dataset. This function ensures that
the testing data is converted to the same dtype as
the training data.


Args:
X (DatasetCompressionInputType):
Dataset

Returns:
DatasetCompressionInputType:
Compressed dataset.
"""
is_dataframe = ispandas(X)
is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
if not is_reducible_type or self._dataset_compression is None:
return X
elif self._reduced_dtype is not None:
X = X.astype(self._reduced_dtype)
return X
else:
X = reduce_dataset_size_if_too_large(X, **self._dataset_compression)
self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
return X

def _check_data(
self,
X: SupportedFeatTypes,
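With the compression logic moved out of ``TabularFeatureValidator`` and into ``TabularInputValidator`` (next file), features and targets can be reduced together, which the stratified subsampling needs. A hedged usage sketch follows, assuming the constructor shown in this diff and the ``fit(X_train, y_train)`` signature of ``BaseInputValidator``; in the API flow the spec is normally normalised by ``get_dataset_compression_mapping`` first, so an already-absolute budget in MB is used here.

```python
import numpy as np
from sklearn.datasets import make_classification

from autoPyTorch.data.tabular_validator import TabularInputValidator

# Synthetic data deliberately larger than the 1 MB budget below.
X, y = make_classification(n_samples=20_000, n_features=40, random_state=0)

validator = TabularInputValidator(
    is_classification=True,
    dataset_compression={
        "memory_allocation": 1,                 # 1 MB budget (int -> absolute MB)
        "methods": ["precision", "subsample"],
    },
)
validator.fit(X_train=X, y_train=y)

# After this PR, transform returns the (possibly reduced) features and targets.
X_red, y_red = validator.transform(X, y)
print(X_red.dtype, X_red.shape, y_red.shape)
```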
80 changes: 73 additions & 7 deletions autoPyTorch/data/tabular_validator.py
@@ -1,10 +1,21 @@
# -*- encoding: utf-8 -*-
import logging
from typing import Any, Mapping, Optional, Union
from typing import Optional, Tuple, Union

import numpy as np

from scipy.sparse import issparse

from autoPyTorch.data.base_validator import BaseInputValidator
from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator
from autoPyTorch.data.tabular_target_validator import TabularTargetValidator
from autoPyTorch.data.tabular_feature_validator import SupportedFeatTypes, TabularFeatureValidator
from autoPyTorch.data.tabular_target_validator import SupportedTargetTypes, TabularTargetValidator
from autoPyTorch.data.utils import (
DatasetCompressionInputType,
DatasetCompressionSpec,
DatasetDTypeContainerType,
reduce_dataset_size_if_too_large
)
from autoPyTorch.utils.common import ispandas
from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger


@@ -27,16 +38,22 @@ class TabularInputValidator(BaseInputValidator):
target_validator (TargetValidator):
A TargetValidator instance used to validate and encode (in case of classification)
the target values
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more information, see the
documentation of `BaseTask.get_dataset`.
"""
def __init__(
self,
is_classification: bool = False,
logger_port: Optional[int] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
) -> None:
dataset_compression: Optional[DatasetCompressionSpec] = None,
seed: int = 42,
):
self.dataset_compression = dataset_compression
self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
self.is_classification = is_classification
self.logger_port = logger_port
self.dataset_compression = dataset_compression
self.seed = seed
if self.logger_port is not None:
self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger(
name='Validation',
@@ -46,10 +63,59 @@ def __init__(
self.logger = logging.getLogger('Validation')

self.feature_validator = TabularFeatureValidator(
dataset_compression=self.dataset_compression,
logger=self.logger)
self.target_validator = TabularTargetValidator(
is_classification=self.is_classification,
logger=self.logger
)
self._is_fitted = False

def _compress_dataset(
self,
X: DatasetCompressionInputType,
y: SupportedTargetTypes,
) -> DatasetCompressionInputType:
"""
Compress the dataset. This function ensures that
the testing data is converted to the same dtype as
the training data.
See `autoPyTorch.data.utils.reduce_dataset_size_if_too_large`
for more information.

Args:
X (DatasetCompressionInputType):
features of dataset
y (SupportedTargetTypes):
targets of dataset
Returns:
DatasetCompressionInputType:
Compressed dataset.
"""
is_dataframe = ispandas(X)
is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
if not is_reducible_type or self.dataset_compression is None:
return X, y
elif self._reduced_dtype is not None:
X = X.astype(self._reduced_dtype)
return X, y
else:
X, y = reduce_dataset_size_if_too_large(
X,
y=y,
is_classification=self.is_classification,
random_state=self.seed,
**self.dataset_compression # type: ignore [arg-type]
)
self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
return X, y

def transform(
self,
X: SupportedFeatTypes,
y: Optional[SupportedTargetTypes] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:

X, y = super().transform(X, y)
X_reduced, y_reduced = self._compress_dataset(X, y)

return X_reduced, y_reduced
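
Finally, a small sketch of the size check both docstrings refer to ("the memory used by the dataset is checked after each reduction method is performed"); ``megabytes`` is a hypothetical helper name here, not necessarily what ``autoPyTorch.data.utils`` calls it.

```python
import numpy as np
import pandas as pd
from scipy.sparse import issparse


def megabytes(X) -> float:
    """Approximate memory footprint of X in MB (hypothetical helper)."""
    if isinstance(X, np.ndarray):
        nbytes = X.nbytes
    elif issparse(X):
        # CSR/CSC layout: data array plus the two index arrays.
        nbytes = X.data.nbytes + X.indices.nbytes + X.indptr.nbytes
    elif isinstance(X, pd.DataFrame):
        nbytes = int(X.memory_usage(deep=True).sum())
    else:
        nbytes = np.asarray(X).nbytes
    return nbytes / float(1 << 20)


X = np.random.rand(50_000, 20)               # ~7.6 MB of float64
print(megabytes(X))                          # compare against "memory_allocation"
print(megabytes(X.astype(np.float32)))       # the "precision" step halves it
```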