[fix] Modify so that tests pass

automl · Feb 9, 2022 · 6d64582 · 6d64582
1 parent 1996d1c
commit 6d64582
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 20 deletions.
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py
@@ -9,6 +9,7 @@ class NoCoalescer(BaseCoalescer):
     def __init__(self, random_state: np.random.RandomState):
         super().__init__()
         self.random_state = random_state
+        self._processing = False
 
     def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseCoalescer:
         """

diff --git a/...Torch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py b/...Torch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py
@@ -9,6 +9,7 @@
 class BaseCoalescer(autoPyTorchTabularPreprocessingComponent):
     def __init__(self) -> None:
         super().__init__()
+        self._processing = True
         self.add_fit_requirements([
             FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
             FitRequirement('categories', (List,), user_defined=True, dataset_property=True)
@@ -24,7 +25,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         Returns:
             X (Dict[str, Any]): the updated fit dictionary
         """
-        if all(self.preprocessor[key] is None for key in ['numerical', 'categorical']):
+        if self._processing and all(self.preprocessor[key] is None for key in ['numerical', 'categorical']):
             raise ValueError(f"fit() must be called before transform() on {self.__class__.__name__}")
+
         X.update({'coalescer': self.preprocessor})
         return X
diff --git a/autoPyTorch/utils/implementations.py b/autoPyTorch/utils/implementations.py
@@ -1,4 +1,4 @@
-from typing import Any, Callable, Dict, List, Optional, Set, Type, Union
+from typing import Any, Callable, Dict, List, Optional, Type, Union
 
 import numpy as np
 
@@ -69,7 +69,7 @@ class MinorityCoalesceTransformer(BaseEstimator, TransformerMixin):
     """ Group together categories whose occurrence is less than a specified min_fraction."""
     def __init__(self, min_fraction: Optional[float] = None):
         self.min_fraction = min_fraction
-        self._categories_to_coalesce: Optional[List[Set[int]]] = None
+        self._categories_to_coalesce: Optional[List[np.ndarray]] = None
 
         if self.min_fraction is not None and (self.min_fraction < 0 or self.min_fraction > 1):
             raise ValueError(f"min_fraction for {self.__class__.__name__} must be in [0, 1], but got {min_fraction}")
@@ -91,6 +91,36 @@ def _check_dataset(self, X: Union[np.ndarray, sparse.csr_matrix]) -> None:
             raise ValueError("The categoricals in input features for MinorityCoalesceTransformer "
                              "cannot have integers smaller than -2.")
 
+    @staticmethod
+    def _get_column_data(
+        X: Union[np.ndarray, sparse.csr_matrix],
+        col_idx: int,
+        is_sparse: bool
+    ) -> Union[np.ndarray, sparse.csr_matrix]:
+        """
+        Args:
+            X (Union[np.ndarray, sparse.csr_matrix]):
+                The feature tensor with only categoricals.
+            col_idx (int):
+                The index of the column to get the data.
+            is_sparse (bool):
+                Whether the tensor is sparse or not.
+
+        Returns:
+            col_data (Union[np.ndarray, sparse.csr_matrix]):
+                The column data of the tensor.
+        """
+
+        if is_sparse:
+            assert not isinstance(X, np.ndarray)  # mypy check
+            indptr_start = X.indptr[col_idx]
+            indptr_end = X.indptr[col_idx + 1]
+            col_data = X.data[indptr_start:indptr_end]
+        else:
+            col_data = X[:, col_idx]
+
+        return col_data
+
     def fit(self, X: Union[np.ndarray, sparse.csr_matrix],
             y: Optional[np.ndarray] = None) -> 'MinorityCoalesceTransformer':
         """
@@ -103,24 +133,19 @@ def fit(self, X: Union[np.ndarray, sparse.csr_matrix],
                 Optional labels for the given task, not used by this estimator.
         """
         self._check_dataset(X)
+        n_instances, n_features = X.shape
 
         if self.min_fraction is None:
+            self._categories_to_coalesce = [np.array([]) for _ in range(n_features)]
             return self
 
-        n_instances, n_features = X.shape
-        categories_to_coalesce: List[Set[int]] = [set() for _ in range(n_features)]
+        categories_to_coalesce: List[np.ndarray] = []
         is_sparse = sparse.issparse(X)
         for col in range(n_features):
-            if is_sparse:
-                indptr_start = X.indptr[col]
-                indptr_end = X.indptr[col + 1]
-                col_data = X.data[indptr_start:indptr_end]
-            else:
-                col_data = X[:, col]
-
+            col_data = self._get_column_data(X=X, col_idx=col, is_sparse=is_sparse)
             unique_vals, counts = np.unique(col_data, return_counts=True)
             frac = counts / n_instances
-            categories_to_coalesce[col].update(unique_vals[frac < self.min_fraction])
+            categories_to_coalesce.append(unique_vals[frac < self.min_fraction])
 
         self._categories_to_coalesce = categories_to_coalesce
         return self
@@ -139,7 +164,7 @@ def transform(
         self._check_dataset(X)
 
         if self._categories_to_coalesce is None:
-            raise ValueError("fit() must be called before transform()")
+            raise RuntimeError("fit() must be called before transform()")
 
         if self.min_fraction is None:
             return X
@@ -148,14 +173,9 @@ def transform(
         is_sparse = sparse.issparse(X)
 
         for col in range(n_features):
-            if is_sparse:
-                indptr_start = X.indptr[col]
-                indptr_end = X.indptr[col + 1]
-                col_data = X.data[indptr_start:indptr_end]
-            else:
-                col_data = X[:, col]
             # The imputer uses -1 for unknown categories
             # Then -2 means coalesced categories
+            col_data = self._get_column_data(X=X, col_idx=col, is_sparse=is_sparse)
             mask = np.isin(col_data, self._categories_to_coalesce[col])
             col_data[mask] = -2