[ADD] scalers from autosklearn #372

Merged · 9 commits · Feb 9, 2022
38 changes: 38 additions & 0 deletions autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py
@@ -0,0 +1,38 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler


class PowerTransformer(BaseScaler):
    """
    Map data to as close to a Gaussian distribution as possible
    in order to reduce variance and minimize skewness.

    Uses the `yeo-johnson` power transform method. The data is also
    normalised to zero mean and unit variance.
    """
    def __init__(self,
                 random_state: Optional[Union[np.random.RandomState, int]] = None):
        super().__init__()
        self.random_state = random_state

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:

        self.check_requirements(X, y)

        self.preprocessor['numerical'] = SklearnPowerTransformer(copy=False)
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'PowerTransformer',
            'name': 'PowerTransformer',
            'handles_sparse': False
        }
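For context, a minimal standalone sketch of what the wrapped sklearn transformer does (the sample data is illustrative, not from this PR):

import numpy as np
from sklearn.preprocessing import PowerTransformer

# A right-skewed column: yeo-johnson with standardize=True (the sklearn
# default) maps it towards a zero-mean, unit-variance Gaussian shape.
X = np.array([[1.0], [2.0], [4.0], [8.0], [16.0]])
pt = PowerTransformer(method='yeo-johnson', standardize=True)
print(pt.fit_transform(X))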
70 changes: 70 additions & 0 deletions autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py
@@ -0,0 +1,70 @@
from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformIntegerHyperparameter
)

import numpy as np

from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter


class QuantileTransformer(BaseScaler):
    """
    Transforms the features to follow a uniform or a normal distribution
    using quantile information.
    """
    def __init__(
        self,
        n_quantiles: int = 1000,
        output_distribution: str = "normal",
        random_state: Optional[Union[np.random.RandomState, int]] = None
    ):
        super().__init__()
        self.random_state = random_state
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:

        self.check_requirements(X, y)

        self.preprocessor['numerical'] = SklearnQuantileTransformer(n_quantiles=self.n_quantiles,
                                                                    output_distribution=self.output_distribution,
                                                                    copy=False)
        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        n_quantiles: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="n_quantiles",
                                                                           value_range=(10, 2000),
                                                                           default_value=1000,
                                                                           ),
        output_distribution: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="output_distribution",
                                                                                   value_range=("uniform", "normal"),
                                                                                   default_value="normal",
                                                                                   )
    ) -> ConfigurationSpace:
        cs = ConfigurationSpace()

        # TODO parametrize like the Random Forest as n_quantiles = n_features^param
        add_hyperparameter(cs, n_quantiles, UniformIntegerHyperparameter)
        add_hyperparameter(cs, output_distribution, CategoricalHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'QuantileTransformer',
            'name': 'QuantileTransformer',
            'handles_sparse': False
        }
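As a usage sketch (hypothetical driver code, not part of the PR), the search space above can be sampled with ConfigSpace and fed back into the component:

# Draw one configuration from the component's search space and
# instantiate the scaler with the sampled hyperparameters.
cs = QuantileTransformer.get_hyperparameter_search_space()
config = cs.sample_configuration()
scaler = QuantileTransformer(n_quantiles=config['n_quantiles'],
                             output_distribution=config['output_distribution'])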
@@ -66,7 +66,14 @@ def get_hyperparameter_search_space(self,
             raise ValueError("no scalers found, please add a scaler")
 
         if default is None:
-            defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler', 'NoScaler']
+            defaults = [
+                'StandardScaler',
+                'Normalizer',
+                'MinMaxScaler',
+                'PowerTransformer',
+                'QuantileTransformer',
+                'NoScaler'
+            ]
         for default_ in defaults:
             if default_ in available_scalers:
                 default = default_
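The loop above then picks the first preferred name that is actually available. A self-contained sketch of that fallback, with a hypothetical available_scalers dict standing in for the components discovered at runtime:

defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler',
            'PowerTransformer', 'QuantileTransformer', 'NoScaler']
available_scalers = {'QuantileTransformer': None, 'NoScaler': None}
default = next(d for d in defaults if d in available_scalers)
print(default)  # 'QuantileTransformer'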
124 changes: 124 additions & 0 deletions test/test_pipeline/components/preprocessing/test_scalers.py
@@ -9,6 +9,10 @@
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.MinMaxScaler import MinMaxScaler
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.NoScaler import NoScaler
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.Normalizer import Normalizer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.PowerTransformer import \
    PowerTransformer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.QuantileTransformer import \
    QuantileTransformer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.StandardScaler import StandardScaler


@@ -239,3 +243,123 @@ def test_none_scaler(self):
        self.assertIsInstance(X['scaler'], dict)
        self.assertIsNone(X['scaler']['categorical'])
        self.assertIsNone(X['scaler']['numerical'])


def test_power_transformer():
    data = np.array([[1, 2, 3],
                     [7, 8, 9],
                     [4, 5, 6],
                     [11, 12, 13],
                     [17, 18, 19],
                     [14, 15, 16]])
    train_indices = np.array([0, 2, 5])
    test_indices = np.array([1, 4, 3])
    categorical_columns = list()
    numerical_columns = [0, 1, 2]
    dataset_properties = {'categorical_columns': categorical_columns,
                          'numerical_columns': numerical_columns,
                          'issparse': False}
    X = {
        'X_train': data[train_indices],
        'dataset_properties': dataset_properties
    }
    scaler_component = PowerTransformer()

    scaler_component = scaler_component.fit(X)
    X = scaler_component.transform(X)
    scaler = X['scaler']['numerical']

    # check that the fit dictionary X is modified as expected
    assert isinstance(X['scaler'], dict)
    assert isinstance(scaler, BaseEstimator)
    assert X['scaler']['categorical'] is None

    # make a column transformer with the returned scaler and fit it on the data
    column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']),
                                                 remainder='passthrough')
    column_transformer = column_transformer.fit(X['X_train'])
    transformed = column_transformer.transform(data[test_indices])

    assert_allclose(transformed, np.array([[0.531648, 0.522782, 0.515394],
                                           [1.435794, 1.451064, 1.461685],
                                           [0.993609, 1.001055, 1.005734]]), rtol=1e-06)


class TestQuantileTransformer():
    def test_quantile_transformer_uniform(self):
        data = np.array([[1, 2, 3],
                         [7, 8, 9],
                         [4, 5, 6],
                         [11, 12, 13],
                         [17, 18, 19],
                         [14, 15, 16]])
        train_indices = np.array([0, 2, 5])
        test_indices = np.array([1, 4, 3])
        categorical_columns = list()
        numerical_columns = [0, 1, 2]
        dataset_properties = {'categorical_columns': categorical_columns,
                              'numerical_columns': numerical_columns,
                              'issparse': False}
        X = {
            'X_train': data[train_indices],
            'dataset_properties': dataset_properties
        }
        scaler_component = QuantileTransformer(output_distribution='uniform')

        scaler_component = scaler_component.fit(X)
        X = scaler_component.transform(X)
        scaler = X['scaler']['numerical']

        # check that the fit dictionary X is modified as expected
        assert isinstance(X['scaler'], dict)
        assert isinstance(scaler, BaseEstimator)
        assert X['scaler']['categorical'] is None

        # make a column transformer with the returned scaler and fit it on the data
        column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']),
                                                     remainder='passthrough')
        column_transformer = column_transformer.fit(X['X_train'])
        transformed = column_transformer.transform(data[test_indices])

        assert_allclose(transformed, np.array([[0.65, 0.65, 0.65],
                                               [1, 1, 1],
                                               [0.85, 0.85, 0.85]]), rtol=1e-06)

    def test_quantile_transformer_normal(self):
        data = np.array([[1, 2, 3],
                         [7, 8, 9],
                         [4, 5, 6],
                         [11, 12, 13],
                         [17, 18, 19],
                         [14, 15, 16]])
        train_indices = np.array([0, 2, 5])
        test_indices = np.array([1, 4, 3])
        categorical_columns = list()
        numerical_columns = [0, 1, 2]
        dataset_properties = {'categorical_columns': categorical_columns,
                              'numerical_columns': numerical_columns,
                              'issparse': False}
        X = {
            'X_train': data[train_indices],
            'dataset_properties': dataset_properties
        }
        scaler_component = QuantileTransformer(output_distribution='normal')

        scaler_component = scaler_component.fit(X)
        X = scaler_component.transform(X)
        scaler = X['scaler']['numerical']

        # check that the fit dictionary X is modified as expected
        assert isinstance(X['scaler'], dict)
        assert isinstance(scaler, BaseEstimator)
        assert X['scaler']['categorical'] is None

        # make a column transformer with the returned scaler and fit it on the data
        column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']),
                                                     remainder='passthrough')
        column_transformer = column_transformer.fit(X['X_train'])
        transformed = column_transformer.transform(data[test_indices])

        assert_allclose(transformed, np.array([[0.38532, 0.38532, 0.38532],
                                               [5.199338, 5.199338, 5.199338],
                                               [1.036433, 1.036433, 1.036433]]), rtol=1e-05)
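A behavioral note on these fixtures (an aside, not part of the PR): with only three training rows, recent scikit-learn versions clip the default n_quantiles=1000 down to the number of samples and emit a warning, so the transform is driven by just three empirical quantiles. A minimal sketch:

import numpy as np
from sklearn.preprocessing import QuantileTransformer

# Three training rows, mirroring data[train_indices] in the tests above.
X_train = np.array([[1., 2., 3.], [4., 5., 6.], [14., 15., 16.]])
qt = QuantileTransformer(n_quantiles=1000, output_distribution='normal')
qt.fit(X_train)  # warns that n_quantiles > n_samples
print(qt.n_quantiles_)  # 3: clipped to the number of training samples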