Skip to content

Commit

Permalink
[fix] [test] Adapt the modification of targets to scipy.sparse.xxx_ma…
Browse files Browse the repository at this point in the history
…trix
  • Loading branch information
nabenabe0928 committed Feb 23, 2022
1 parent e8d7685 commit 5134c49
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 59 deletions.
16 changes: 2 additions & 14 deletions autoPyTorch/data/base_feature_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,13 @@

import pandas as pd

import scipy.sparse

from sklearn.base import BaseEstimator

from autoPyTorch.utils.common import SparseMatrixType
from autoPyTorch.utils.logging_ import PicklableClientLogger


SupportedFeatTypes = Union[
List,
pd.DataFrame,
np.ndarray,
scipy.sparse.bsr_matrix,
scipy.sparse.coo_matrix,
scipy.sparse.csc_matrix,
scipy.sparse.csr_matrix,
scipy.sparse.dia_matrix,
scipy.sparse.dok_matrix,
scipy.sparse.lil_matrix,
]
SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, SparseMatrixType]


class BaseFeatureValidator(BaseEstimator):
Expand Down
17 changes: 2 additions & 15 deletions autoPyTorch/data/base_target_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,13 @@

import pandas as pd

import scipy.sparse

from sklearn.base import BaseEstimator

from autoPyTorch.utils.common import SparseMatrixType
from autoPyTorch.utils.logging_ import PicklableClientLogger


SupportedTargetTypes = Union[
List,
pd.Series,
pd.DataFrame,
np.ndarray,
scipy.sparse.bsr_matrix,
scipy.sparse.coo_matrix,
scipy.sparse.csc_matrix,
scipy.sparse.csr_matrix,
scipy.sparse.dia_matrix,
scipy.sparse.dok_matrix,
scipy.sparse.lil_matrix,
]
SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, SparseMatrixType]


class BaseTargetValidator(BaseEstimator):
Expand Down
59 changes: 32 additions & 27 deletions autoPyTorch/data/tabular_target_validator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Optional, cast
from typing import List, Optional, Union, cast

import numpy as np

Expand All @@ -14,13 +14,37 @@
from sklearn.utils.multiclass import type_of_target

from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes
from autoPyTorch.utils.common import SparseMatrixType


def _check_and_to_numpy(y: SupportedTargetTypes) -> np.ndarray:
# Container type handled by the helpers in this module: either a dense numpy
# array or any member of the SparseMatrixType union.
ArrayType = Union[np.ndarray, SparseMatrixType]


def _check_and_to_array(y: SupportedTargetTypes) -> ArrayType:
    """
    Run the targets through sklearn's `check_array` and return its result.

    The check is configured to require finite values only, to accept sparse
    input in CSR format and to not insist on a 2-dimensional input.
    """
    checked = sklearn.utils.check_array(
        y,
        accept_sparse='csr',
        force_all_finite=True,
        ensure_2d=False,
    )
    return checked


def _modify_regression_target(y: ArrayType) -> ArrayType:
# Regression targets must have numbers after a decimal point.
# Ref: https://github.com/scikit-learn/scikit-learn/issues/8952
y_min = np.abs(y).min()
offset = y_min * 1e-16 # Sufficiently small number
if y_min > 1e15:
raise ValueError(
"The minimum value for the target labels of regression tasks must be smaller than "
f"1e15 to avoid errors caused by an overflow, but got {y_min}"
)

# Since it is all integer, we can just add a random small number
if isinstance(y, np.ndarray):
y = y.astype(dtype=np.float64) + offset
else:
y.data = y.data.astype(dtype=np.float64) + offset

return y


class TabularTargetValidator(BaseTargetValidator):
def _fit(
self,
Expand Down Expand Up @@ -101,7 +125,7 @@ def _fit(

def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray:
if self.encoder is None:
return _check_and_to_numpy(y)
return _check_and_to_array(y)

# remove ravel warning from pandas Series
shape = np.shape(y)
Expand All @@ -115,12 +139,9 @@ def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray:
else:
y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1)

return _check_and_to_numpy(y)
return _check_and_to_array(y)

def transform(
self,
y: SupportedTargetTypes,
) -> np.ndarray:
def transform(self, y: SupportedTargetTypes) -> np.ndarray:
"""
Validates and fit a categorical encoder (if needed) to the features.
The supported data types are List, numpy arrays and pandas DataFrames.
Expand All @@ -146,24 +167,11 @@ def transform(
y = np.ravel(y)

if not self.is_classification and "continuous" not in type_of_target(y):
# Regression targets must have numbers after a decimal point.
# Ref: https://github.com/scikit-learn/scikit-learn/issues/8952
y_min = np.abs(y).min()
offset = y_min * 1e-16 # Sufficiently small number
if y_min > 1e15:
raise ValueError(
"The minimum value for the target labels of regression tasks must be smaller than "
f"1e15 to avoid errors caused by an overflow, but got {y_min}"
)

y = y.astype(dtype=np.float64) + offset # Since it is all integer, we can just add a random small number
y = _modify_regression_target(y)

return y

def inverse_transform(
self,
y: SupportedTargetTypes,
) -> np.ndarray:
def inverse_transform(self, y: SupportedTargetTypes) -> np.ndarray:
"""
Revert any encoding transformation done on a target array
Expand Down Expand Up @@ -197,10 +205,7 @@ def inverse_transform(
y = y.astype(self.dtype)
return y

def _check_data(
self,
y: SupportedTargetTypes,
) -> None:
def _check_data(self, y: SupportedTargetTypes) -> None:
"""
Perform dimensionality and data type checks on the targets
Expand Down
9 changes: 9 additions & 0 deletions autoPyTorch/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@
from torch.utils.data.dataloader import default_collate

HyperparameterValueType = Union[int, str, float]
# Union of the scipy.sparse matrix classes (bsr, coo, csc, csr, dia, dok, lil),
# shared as a single alias for type annotations.
SparseMatrixType = Union[
scipy.sparse.bsr_matrix,
scipy.sparse.coo_matrix,
scipy.sparse.csc_matrix,
scipy.sparse.csr_matrix,
scipy.sparse.dia_matrix,
scipy.sparse.dok_matrix,
scipy.sparse.lil_matrix,
]


class FitRequirement(NamedTuple):
Expand Down
6 changes: 3 additions & 3 deletions test/test_data/test_target_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,17 +150,17 @@ def test_targetvalidator_supported_types_noclassification(input_data_targettest)
assert validator.encoder is None

if hasattr(input_data_targettest, "iloc"):
np.testing.assert_array_equal(
np.isclose(
np.ravel(input_data_targettest.to_numpy()),
np.ravel(transformed_y)
)
elif sparse.issparse(input_data_targettest):
np.testing.assert_array_equal(
np.isclose(
np.ravel(input_data_targettest.todense()),
np.ravel(transformed_y.todense())
)
else:
np.testing.assert_array_equal(
np.isclose(
np.ravel(np.array(input_data_targettest)),
np.ravel(transformed_y)
)
Expand Down

0 comments on commit 5134c49

Please sign in to comment.