Skip to content

Commit

Permalink
[fix] Modify so that tests pass
Browse files Browse the repository at this point in the history
  • Loading branch information
nabenabe0928 committed Feb 9, 2022
1 parent 1996d1c commit 6d64582
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ class NoCoalescer(BaseCoalescer):
def __init__(self, random_state: np.random.RandomState) -> None:
    """
    Args:
        random_state (np.random.RandomState):
            Random state kept on the instance as ``self.random_state``.
    """
    super().__init__()
    self.random_state = random_state
    # Must come after super().__init__(): BaseCoalescer.__init__ sets
    # _processing to True, and BaseCoalescer.transform only raises its
    # "fit() must be called before transform()" error while the flag is truthy.
    self._processing = False

def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseCoalescer:
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
class BaseCoalescer(autoPyTorchTabularPreprocessingComponent):
def __init__(self) -> None:
super().__init__()
self._processing = True
self.add_fit_requirements([
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
FitRequirement('categories', (List,), user_defined=True, dataset_property=True)
Expand All @@ -24,7 +25,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
Returns:
X (Dict[str, Any]): the updated fit dictionary
"""
if all(self.preprocessor[key] is None for key in ['numerical', 'categorical']):
if self._processing and all(self.preprocessor[key] is None for key in ['numerical', 'categorical']):
raise ValueError(f"fit() must be called before transform() on {self.__class__.__name__}")

X.update({'coalescer': self.preprocessor})
return X
58 changes: 39 additions & 19 deletions autoPyTorch/utils/implementations.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Callable, Dict, List, Optional, Set, Type, Union
from typing import Any, Callable, Dict, List, Optional, Type, Union

import numpy as np

Expand Down Expand Up @@ -69,7 +69,7 @@ class MinorityCoalesceTransformer(BaseEstimator, TransformerMixin):
""" Group together categories whose occurrence is less than a specified min_fraction."""
def __init__(self, min_fraction: Optional[float] = None):
self.min_fraction = min_fraction
self._categories_to_coalesce: Optional[List[Set[int]]] = None
self._categories_to_coalesce: Optional[List[np.ndarray]] = None

if self.min_fraction is not None and (self.min_fraction < 0 or self.min_fraction > 1):
raise ValueError(f"min_fraction for {self.__class__.__name__} must be in [0, 1], but got {min_fraction}")
Expand All @@ -91,6 +91,36 @@ def _check_dataset(self, X: Union[np.ndarray, sparse.csr_matrix]) -> None:
raise ValueError("The categoricals in input features for MinorityCoalesceTransformer "
"cannot have integers smaller than -2.")

@staticmethod
def _get_column_data(
    X: Union[np.ndarray, sparse.csr_matrix],
    col_idx: int,
    is_sparse: bool
) -> Union[np.ndarray, sparse.csr_matrix]:
    """
    Fetch the values stored for one column of the categorical features.

    Both branches use plain (basic) slicing, so for NumPy-backed data the
    result is a view rather than a copy; transform() depends on this when
    it assigns -2 into the returned array to update X in place.

    NOTE(review): slicing ``X.data`` with ``X.indptr[col_idx]`` selects the
    entries of a *column* only for a column-compressed layout. The
    annotation says csr_matrix -- confirm which format callers pass.

    Args:
        X (Union[np.ndarray, sparse.csr_matrix]):
            The feature tensor with only categoricals.
        col_idx (int):
            The index of the column to get the data.
        is_sparse (bool):
            Whether the tensor is sparse or not.

    Returns:
        col_data (Union[np.ndarray, sparse.csr_matrix]):
            The column data of the tensor.
    """
    # Dense input: an ordinary column slice is all that is needed.
    if not is_sparse:
        return X[:, col_idx]

    assert not isinstance(X, np.ndarray)  # mypy check
    # indptr holds the offsets into X.data delimiting each compressed slice.
    begin, end = X.indptr[col_idx], X.indptr[col_idx + 1]
    return X.data[begin:end]

def fit(self, X: Union[np.ndarray, sparse.csr_matrix],
y: Optional[np.ndarray] = None) -> 'MinorityCoalesceTransformer':
"""
Expand All @@ -103,24 +133,19 @@ def fit(self, X: Union[np.ndarray, sparse.csr_matrix],
Optional labels for the given task, not used by this estimator.
"""
self._check_dataset(X)
n_instances, n_features = X.shape

if self.min_fraction is None:
self._categories_to_coalesce = [np.array([]) for _ in range(n_features)]
return self

n_instances, n_features = X.shape
categories_to_coalesce: List[Set[int]] = [set() for _ in range(n_features)]
categories_to_coalesce: List[np.ndarray] = []
is_sparse = sparse.issparse(X)
for col in range(n_features):
if is_sparse:
indptr_start = X.indptr[col]
indptr_end = X.indptr[col + 1]
col_data = X.data[indptr_start:indptr_end]
else:
col_data = X[:, col]

col_data = self._get_column_data(X=X, col_idx=col, is_sparse=is_sparse)
unique_vals, counts = np.unique(col_data, return_counts=True)
frac = counts / n_instances
categories_to_coalesce[col].update(unique_vals[frac < self.min_fraction])
categories_to_coalesce.append(unique_vals[frac < self.min_fraction])

self._categories_to_coalesce = categories_to_coalesce
return self
Expand All @@ -139,7 +164,7 @@ def transform(
self._check_dataset(X)

if self._categories_to_coalesce is None:
raise ValueError("fit() must be called before transform()")
raise RuntimeError("fit() must be called before transform()")

if self.min_fraction is None:
return X
Expand All @@ -148,14 +173,9 @@ def transform(
is_sparse = sparse.issparse(X)

for col in range(n_features):
if is_sparse:
indptr_start = X.indptr[col]
indptr_end = X.indptr[col + 1]
col_data = X.data[indptr_start:indptr_end]
else:
col_data = X[:, col]
# The imputer uses -1 for unknown categories
# Then -2 means coalesced categories
col_data = self._get_column_data(X=X, col_idx=col, is_sparse=is_sparse)
mask = np.isin(col_data, self._categories_to_coalesce[col])
col_data[mask] = -2

Expand Down

0 comments on commit 6d64582

Please sign in to comment.