[ADD] scalers from autosklearn #372

Merged · 9 commits · Feb 9, 2022
38 changes: 38 additions & 0 deletions autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py
@@ -0,0 +1,38 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler


class PowerTransformer(BaseScaler):
    """
    Map data to as close to a Gaussian distribution as possible
    in order to reduce variance and minimize skewness.

    Uses the `yeo-johnson` power transform method. The data is also
    normalised to zero mean and unit variance.
    """
    def __init__(self,
                 random_state: Optional[Union[np.random.RandomState, int]] = None):
        super().__init__()
        self.random_state = random_state

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:

        self.check_requirements(X, y)

        self.preprocessor['numerical'] = SklearnPowerTransformer(copy=False)
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'PowerTransformer',
            'name': 'PowerTransformer',
            'handles_sparse': False
        }
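For context, a minimal standalone sketch of what the wrapped sklearn transformer does (the sample data is illustrative, not from this PR):

import numpy as np
from sklearn.preprocessing import PowerTransformer

# A right-skewed column: yeo-johnson with standardize=True (the sklearn
# default) maps it towards a zero-mean, unit-variance Gaussian shape.
X = np.array([[1.0], [2.0], [4.0], [8.0], [16.0]])
pt = PowerTransformer(method='yeo-johnson', standardize=True)
print(pt.fit_transform(X))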
70 changes: 70 additions & 0 deletions autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py
@@ -0,0 +1,70 @@
from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformIntegerHyperparameter
)

import numpy as np

from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter


class QuantileTransformer(BaseScaler):
    """
    Transforms the features to follow a uniform or a normal distribution
    using quantile information.
    """
    def __init__(
        self,
        n_quantiles: int = 1000,
        output_distribution: str = "normal",
        random_state: Optional[Union[np.random.RandomState, int]] = None
    ):
        super().__init__()
        self.random_state = random_state
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:

        self.check_requirements(X, y)

        self.preprocessor['numerical'] = SklearnQuantileTransformer(n_quantiles=self.n_quantiles,
                                                                    output_distribution=self.output_distribution,
                                                                    copy=False)
        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        n_quantiles: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="n_quantiles",
                                                                           value_range=(10, 2000),
                                                                           default_value=1000,
                                                                           ),
        output_distribution: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="output_distribution",
                                                                                   value_range=("uniform", "normal"),
                                                                                   default_value="normal",
                                                                                   )
    ) -> ConfigurationSpace:
        cs = ConfigurationSpace()

        # TODO parametrize like the Random Forest as n_quantiles = n_features^param
        add_hyperparameter(cs, n_quantiles, UniformIntegerHyperparameter)
        add_hyperparameter(cs, output_distribution, CategoricalHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'QuantileTransformer',
            'name': 'QuantileTransformer',
            'handles_sparse': False
        }
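As a usage sketch (hypothetical driver code, not part of the PR), the search space above can be sampled with ConfigSpace and fed back into the component:

# Draw one configuration from the component's search space and
# instantiate the scaler with the sampled hyperparameters.
cs = QuantileTransformer.get_hyperparameter_search_space()
config = cs.sample_configuration()
scaler = QuantileTransformer(n_quantiles=config['n_quantiles'],
                             output_distribution=config['output_distribution'])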
@@ -66,7 +66,14 @@ def get_hyperparameter_search_space(self,
             raise ValueError("no scalers found, please add a scaler")
 
         if default is None:
-            defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler', 'NoScaler']
+            defaults = [
+                'StandardScaler',
+                'Normalizer',
+                'MinMaxScaler',
+                'PowerTransformer',
+                'QuantileTransformer',
+                'NoScaler'
+            ]
         for default_ in defaults:
             if default_ in available_scalers:
                 default = default_
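The loop above then picks the first preferred name that is actually available. A self-contained sketch of that fallback, with a hypothetical available_scalers dict standing in for the components discovered at runtime:

defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler',
            'PowerTransformer', 'QuantileTransformer', 'NoScaler']
available_scalers = {'QuantileTransformer': None, 'NoScaler': None}
default = next(d for d in defaults if d in available_scalers)
print(default)  # 'QuantileTransformer'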
124 changes: 124 additions & 0 deletions test/test_pipeline/components/preprocessing/test_scalers.py
@@ -9,6 +9,10 @@
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.MinMaxScaler import MinMaxScaler
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.NoScaler import NoScaler
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.Normalizer import Normalizer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.PowerTransformer import \
    PowerTransformer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.QuantileTransformer import \
    QuantileTransformer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.StandardScaler import StandardScaler


@@ -239,3 +243,123 @@ def test_none_scaler(self):
        self.assertIsInstance(X['scaler'], dict)
        self.assertIsNone(X['scaler']['categorical'])
        self.assertIsNone(X['scaler']['numerical'])


def test_power_transformer():
    data = np.array([[1, 2, 3],
                     [7, 8, 9],
                     [4, 5, 6],
                     [11, 12, 13],
                     [17, 18, 19],
                     [14, 15, 16]])
    train_indices = np.array([0, 2, 5])
    test_indices = np.array([1, 4, 3])
    categorical_columns = list()
    numerical_columns = [0, 1, 2]
    dataset_properties = {'categorical_columns': categorical_columns,
                          'numerical_columns': numerical_columns,
                          'issparse': False}
    X = {
        'X_train': data[train_indices],
        'dataset_properties': dataset_properties
    }
    scaler_component = PowerTransformer()

    scaler_component = scaler_component.fit(X)
    X = scaler_component.transform(X)
    scaler = X['scaler']['numerical']

    # check that the fit dictionary X is modified as expected
    assert isinstance(X['scaler'], dict)
    assert isinstance(scaler, BaseEstimator)
    assert X['scaler']['categorical'] is None

    # make a column transformer with the returned scaler and fit it on the data
    column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']),
                                                 remainder='passthrough')
    column_transformer = column_transformer.fit(X['X_train'])
    transformed = column_transformer.transform(data[test_indices])

    assert_allclose(transformed, np.array([[0.531648, 0.522782, 0.515394],
                                           [1.435794, 1.451064, 1.461685],
                                           [0.993609, 1.001055, 1.005734]]), rtol=1e-06)


class TestQuantileTransformer():
    def test_quantile_transformer_uniform(self):
        data = np.array([[1, 2, 3],
                         [7, 8, 9],
                         [4, 5, 6],
                         [11, 12, 13],
                         [17, 18, 19],
                         [14, 15, 16]])
        train_indices = np.array([0, 2, 5])
        test_indices = np.array([1, 4, 3])
        categorical_columns = list()
        numerical_columns = [0, 1, 2]
        dataset_properties = {'categorical_columns': categorical_columns,
                              'numerical_columns': numerical_columns,
                              'issparse': False}
        X = {
            'X_train': data[train_indices],
            'dataset_properties': dataset_properties
        }
        scaler_component = QuantileTransformer(output_distribution='uniform')

        scaler_component = scaler_component.fit(X)
        X = scaler_component.transform(X)
        scaler = X['scaler']['numerical']

        # check that the fit dictionary X is modified as expected
        assert isinstance(X['scaler'], dict)
        assert isinstance(scaler, BaseEstimator)
        assert X['scaler']['categorical'] is None

        # make a column transformer with the returned scaler and fit it on the data
        column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']),
                                                     remainder='passthrough')
        column_transformer = column_transformer.fit(X['X_train'])
        transformed = column_transformer.transform(data[test_indices])

        assert_allclose(transformed, np.array([[0.65, 0.65, 0.65],
                                               [1, 1, 1],
                                               [0.85, 0.85, 0.85]]), rtol=1e-06)

    def test_quantile_transformer_normal(self):
        data = np.array([[1, 2, 3],
                         [7, 8, 9],
                         [4, 5, 6],
                         [11, 12, 13],
                         [17, 18, 19],
                         [14, 15, 16]])
        train_indices = np.array([0, 2, 5])
        test_indices = np.array([1, 4, 3])
        categorical_columns = list()
        numerical_columns = [0, 1, 2]
        dataset_properties = {'categorical_columns': categorical_columns,
                              'numerical_columns': numerical_columns,
                              'issparse': False}
        X = {
            'X_train': data[train_indices],
            'dataset_properties': dataset_properties
        }
        scaler_component = QuantileTransformer(output_distribution='normal')

        scaler_component = scaler_component.fit(X)
        X = scaler_component.transform(X)
        scaler = X['scaler']['numerical']

        # check that the fit dictionary X is modified as expected
        assert isinstance(X['scaler'], dict)
        assert isinstance(scaler, BaseEstimator)
        assert X['scaler']['categorical'] is None

        # make a column transformer with the returned scaler and fit it on the data
        column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']),
                                                     remainder='passthrough')
        column_transformer = column_transformer.fit(X['X_train'])
        transformed = column_transformer.transform(data[test_indices])

        assert_allclose(transformed, np.array([[0.38532, 0.38532, 0.38532],
                                               [5.199338, 5.199338, 5.199338],
                                               [1.036433, 1.036433, 1.036433]]), rtol=1e-05)
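A behavioral note on these fixtures (an aside, not part of the PR): with only three training rows, recent scikit-learn versions clip the default n_quantiles=1000 down to the number of samples and emit a warning, so the transform is driven by just three empirical quantiles. A minimal sketch:

import numpy as np
from sklearn.preprocessing import QuantileTransformer

# Three training rows, mirroring data[train_indices] in the tests above.
X_train = np.array([[1., 2., 3.], [4., 5., 6.], [14., 15., 16.]])
qt = QuantileTransformer(n_quantiles=1000, output_distribution='normal')
qt.fit(X_train)  # warns that n_quantiles > n_samples
print(qt.n_quantiles_)  # 3: clipped to the number of training samples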