[fix] Address ravin's comment

automl · Feb 9, 2022 · c00ea4a · c00ea4a
1 parent aa9409a
commit c00ea4a
Showing 1 changed file with 4 additions and 3 deletions.
diff --git a/autoPyTorch/utils/implementations.py b/autoPyTorch/utils/implementations.py
@@ -77,10 +77,12 @@ def __init__(self, min_frac: Optional[float] = None):
     def _check_dataset(self, X: Union[np.ndarray, sparse.csr_matrix]) -> None:
         """
         When transforming datasets, we modify values to:
-            * -1 for missing values
+            *  0 for nan values
+            * -1 for unknown values
             * -2 for values to be coalesced
         For this reason, we need to check whether datasets have values
         smaller than -2 to avoid mis-transformation.
+        Note that 0 is the default fill_value of sklearn's SimpleImputer for numerical data when strategy="constant".
 
         Args:
             X (np.ndarray):
@@ -173,8 +175,7 @@ def transform(
         is_sparse = sparse.issparse(X)
 
         for col in range(n_features):
-            # The imputer uses -1 for unknown categories
-            # Then -2 means coalesced categories
+            # -2 stands for coalesced categories. For more details, see the docstring of _check_dataset
             col_data = self._get_column_data(X=X, col_idx=col, is_sparse=is_sparse)
             mask = np.isin(col_data, self._categories_to_coalesce[col])
             col_data[mask] = -2