Split `fill_value` into `categorical_fill_value` and `numeric_fill_value` for Imputer (#1019)

* init
* fix
* fix simpleimputer and add tests
* remove unnecessary lines
* address some tests, cleanup
* fix tests and release note
* use evalml simpleimputer for imputer
* cleanup
* category update
* add pyfixture and reorganize tests
* update via most comments
* fix test
* update simple imputer
* cleanup
* linting
alteryx · Aug 10, 2020 · bbc315f · bbc315f
1 parent 2c1aa75
commit bbc315f
Show file tree

Hide file tree

Showing 6 changed files with 179 additions and 76 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -3,6 +3,7 @@ Release Notes
 
 **Future Releases**
     * Enhancements
+        * Split `fill_value` into `categorical_fill_value` and `numeric_fill_value` for Imputer :pr:`1019`
     * Fixes
     * Changes
     * Documentation Changes

diff --git a/evalml/pipelines/components/transformers/imputers/imputer.py b/evalml/pipelines/components/transformers/imputers/imputer.py
@@ -1,7 +1,7 @@
 import pandas as pd
-from sklearn.impute import SimpleImputer as SkImputer
 
 from evalml.pipelines.components.transformers import Transformer
+from evalml.pipelines.components.transformers.imputers import SimpleImputer
 from evalml.utils.gen_utils import boolean, categorical_dtypes, numeric_dtypes
 
 
@@ -16,15 +16,17 @@ class Imputer(Transformer):
     _valid_numeric_impute_strategies = set(["mean", "median", "most_frequent", "constant"])
 
     def __init__(self, categorical_impute_strategy="most_frequent",
+                 categorical_fill_value=None,
                  numeric_impute_strategy="mean",
-                 fill_value=None, random_state=0, **kwargs):
+                 numeric_fill_value=None,
+                 random_state=0, **kwargs):
         """Initalizes an transformer that imputes missing data according to the specified imputation strategy."
 
         Arguments:
             categorical_impute_strategy (string): Impute strategy to use for string, object, boolean, categorical dtypes. Valid values include "most_frequent" and "constant".
             numeric_impute_strategy (string): Impute strategy to use for numeric dtypes. Valid values include "mean", "median", "most_frequent", and "constant".
-            fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
-               Defaults to 0 when imputing  data and "missing_value" for strings or object data types.
+            categorical_fill_value (string): When categorical_impute_strategy == "constant", fill_value is used to replace missing data. The default value of None will fill with the string "missing_value".
+            numeric_fill_value (int, float): When numeric_impute_strategy == "constant", fill_value is used to replace missing data. The default value of None will fill with 0.
         """
         if categorical_impute_strategy not in self._valid_categorical_impute_strategies:
             raise ValueError(f"{categorical_impute_strategy} is an invalid parameter. Valid categorical impute strategies are {', '.join(self._valid_numeric_impute_strategies)}")
@@ -33,14 +35,15 @@ def __init__(self, categorical_impute_strategy="most_frequent",
 
         parameters = {"categorical_impute_strategy": categorical_impute_strategy,
                       "numeric_impute_strategy": numeric_impute_strategy,
-                      "fill_value": fill_value}
+                      "categorical_fill_value": categorical_fill_value,
+                      "numeric_fill_value": numeric_fill_value}
         parameters.update(kwargs)
-        self._categorical_imputer = SkImputer(strategy=categorical_impute_strategy,
-                                              fill_value=fill_value,
+        self._categorical_imputer = SimpleImputer(impute_strategy=categorical_impute_strategy,
+                                                  fill_value=categorical_fill_value,
+                                                  **kwargs)
+        self._numeric_imputer = SimpleImputer(impute_strategy=numeric_impute_strategy,
+                                              fill_value=numeric_fill_value,
                                               **kwargs)
-        self._numeric_imputer = SkImputer(strategy=numeric_impute_strategy,
-                                          fill_value=fill_value,
-                                          **kwargs)
         self._all_null_cols = None
         self._numeric_cols = None
         self._categorical_cols = None
@@ -87,20 +90,18 @@ def transform(self, X, y=None):
         """
         if not isinstance(X, pd.DataFrame):
             X = pd.DataFrame(X)
-
         X_null_dropped = X.copy()
         X_null_dropped.drop(self._all_null_cols, inplace=True, axis=1, errors='ignore')
+        X_null_dropped.reset_index(inplace=True, drop=True)
         if X_null_dropped.empty:
             return X_null_dropped
-        dtypes = X_null_dropped.dtypes.to_dict()
 
         if self._numeric_cols is not None and len(self._numeric_cols) > 0:
             X_numeric = X_null_dropped[self._numeric_cols]
             X_null_dropped[X_numeric.columns] = self._numeric_imputer.transform(X_numeric)
+
         if self._categorical_cols is not None and len(self._categorical_cols) > 0:
             X_categorical = X_null_dropped[self._categorical_cols]
             X_null_dropped[X_categorical.columns] = self._categorical_imputer.transform(X_categorical)
 
-        transformed = X_null_dropped.astype(dtypes)
-        transformed.reset_index(inplace=True, drop=True)
-        return transformed
+        return X_null_dropped
diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py
@@ -54,19 +54,22 @@ def transform(self, X, y=None):
         Returns:
             pd.DataFrame: Transformed X
         """
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X)
 
+        X_null_dropped = X.copy()
+        X_null_dropped.drop(self._all_null_cols, axis=1, errors='ignore', inplace=True)
+        category_cols = X_null_dropped.select_dtypes(include=['category']).columns
         X_t = self._component_obj.transform(X)
-        if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame):
-            # skLearn's SimpleImputer loses track of column type, so we need to restore
-            X_null_dropped = X.drop(self._all_null_cols, axis=1)
-            if X_null_dropped.empty:
-                return pd.DataFrame(X_t, columns=X_null_dropped.columns)
-            return pd.DataFrame(X_t, columns=X_null_dropped.columns).astype(X_null_dropped.dtypes.to_dict())
-        return pd.DataFrame(X_t)
+        if X_null_dropped.empty:
+            return pd.DataFrame(X_t, columns=X_null_dropped.columns)
+        X_t = pd.DataFrame(X_t, columns=X_null_dropped.columns)
+        if len(category_cols) > 0:
+            X_t[category_cols] = X_t[category_cols].astype('category')
+        return X_t
 
     def fit_transform(self, X, y=None):
         """Fits on X and transforms X
-
         Arguments:
             X (pd.DataFrame): Data to fit and transform
             y (pd. DataFrame): Labels to fit and transform

diff --git a/evalml/tests/component_tests/test_imputer.py b/evalml/tests/component_tests/test_imputer.py
@@ -6,6 +6,24 @@
 from evalml.pipelines.components import Imputer
 
 
+@pytest.fixture
+def imputer_test_data():
+    return pd.DataFrame({
+        "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
+        "int col": [0, 1, 2, 0, 3],
+        "object col": ["b", "b", "a", "c", "d"],
+        "float col": [0.0, 1.0, 0.0, -2.0, 5.],
+        "bool col": [True, False, False, True, True],
+        "categorical with nan": pd.Series([np.nan, "1", np.nan, "0", "3"], dtype='category'),
+        "int with nan": [np.nan, 1, 0, 0, 1],
+        "float with nan": [0.0, 1.0, np.nan, -1.0, 0.],
+        "object with nan": ["b", "b", np.nan, "c", np.nan],
+        "bool col with nan": [True, np.nan, False, np.nan, True],
+        "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan],
+        "all nan cat": pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], dtype='category')
+    })
+
+
 def test_invalid_strategy_parameters():
     with pytest.raises(ValueError, match="Valid impute strategies are"):
         Imputer(numeric_impute_strategy="not a valid strategy")
@@ -18,18 +36,25 @@ def test_imputer_default_parameters():
     expected_parameters = {
         'categorical_impute_strategy': 'most_frequent',
         'numeric_impute_strategy': 'mean',
-        'fill_value': None
+        'categorical_fill_value': None,
+        'numeric_fill_value': None
     }
     assert imputer.parameters == expected_parameters
 
 
-def test_imputer_init():
-    imputer = Imputer(categorical_impute_strategy="most_frequent",
-                      numeric_impute_strategy="median")
+@pytest.mark.parametrize("categorical_impute_strategy", ["most_frequent", "constant"])
+@pytest.mark.parametrize("numeric_impute_strategy", ["mean", "median", "most_frequent", "constant"])
+def test_imputer_init(categorical_impute_strategy, numeric_impute_strategy):
+
+    imputer = Imputer(categorical_impute_strategy=categorical_impute_strategy,
+                      numeric_impute_strategy=numeric_impute_strategy,
+                      categorical_fill_value="str_fill_value",
+                      numeric_fill_value=-1)
     expected_parameters = {
-        'categorical_impute_strategy': 'most_frequent',
-        'numeric_impute_strategy': 'median',
-        'fill_value': None
+        'categorical_impute_strategy': categorical_impute_strategy,
+        'numeric_impute_strategy': numeric_impute_strategy,
+        'categorical_fill_value': 'str_fill_value',
+        'numeric_fill_value': -1
     }
     expected_hyperparameters = {
         "categorical_impute_strategy": ["most_frequent"],
@@ -40,22 +65,17 @@ def test_imputer_init():
     assert imputer.hyperparameter_ranges == expected_hyperparameters
 
 
-def test_numeric_only_input():
-    X = pd.DataFrame({
-        "int col": [0, 1, 2, 0, 3],
-        "float col": [0.0, 1.0, 0.0, -2.0, 5.],
-        "int with nan": [np.nan, 1, 2, 1, 0],
-        "float with nan": [0.0, 1.0, np.nan, -1.0, 0.],
-        "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan]
-    })
+def test_numeric_only_input(imputer_test_data):
+    X = imputer_test_data[["int col", "float col",
+                           "int with nan", "float with nan", "all nan"]]
     y = pd.Series([0, 0, 1, 0, 1])
     imputer = Imputer(numeric_impute_strategy="median")
     imputer.fit(X, y)
     transformed = imputer.transform(X, y)
     expected = pd.DataFrame({
         "int col": [0, 1, 2, 0, 3],
         "float col": [0.0, 1.0, 0.0, -2.0, 5.],
-        "int with nan": [1, 1, 2, 1, 0],
+        "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0],
         "float with nan": [0.0, 1.0, 0, -1.0, 0.]
     })
     assert_frame_equal(transformed, expected, check_dtype=False)
@@ -65,25 +85,19 @@ def test_numeric_only_input():
     assert_frame_equal(transformed, expected, check_dtype=False)
 
 
-def test_categorical_only_input():
-    X = pd.DataFrame({
-        "categorical col": pd.Series([0, 1, 2, 0, 3], dtype='category'),
-        "object col": ["b", "b", "a", "c", "d"],
-        "bool col": [True, False, False, True, True],
-        "categorical with nan": pd.Series([np.nan, 1, np.nan, 0, 3], dtype='category'),
-        "object with nan": ["b", "b", np.nan, "c", np.nan],
-        "bool col with nan": [True, np.nan, False, np.nan, True],
-        "all nan": pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], dtype='category')
-    })
+def test_categorical_only_input(imputer_test_data):
+    X = imputer_test_data[["categorical col", "object col", "bool col",
+                           "categorical with nan", "object with nan",
+                           "bool col with nan", "all nan cat"]]
     y = pd.Series([0, 0, 1, 0, 1])
     imputer = Imputer()
     imputer.fit(X, y)
     transformed = imputer.transform(X, y)
     expected = pd.DataFrame({
-        "categorical col": pd.Series([0, 1, 2, 0, 3], dtype='category'),
+        "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
         "object col": ["b", "b", "a", "c", "d"],
         "bool col": [True, False, False, True, True],
-        "categorical with nan": pd.Series([0, 1, 0, 0, 3], dtype='category'),
+        "categorical with nan": pd.Series(["0", "1", "0", "0", "3"], dtype='category'),
         "object with nan": ["b", "b", "b", "c", "b"],
         "bool col with nan": [True, True, False, True, True]
     })
@@ -93,32 +107,20 @@ def test_categorical_only_input():
     assert_frame_equal(transformed, expected, check_dtype=False)
 
 
-def test_categorical_and_numeric_input():
-    X = pd.DataFrame({
-        "categorical col": pd.Series([0, 1, 2, 0, 3], dtype='category'),
-        "int col": [0, 1, 2, 0, 3],
-        "object col": ["b", "b", "a", "c", "d"],
-        "float col": [0.0, 1.0, 0.0, -2.0, 5.],
-        "bool col": [True, False, False, True, True],
-        "int with nan": [np.nan, 1, 2, 1, 0],
-        "categorical with nan": pd.Series([np.nan, 1, np.nan, 0, 3], dtype='category'),
-        "float with nan": [0.0, 1.0, np.nan, -1.0, 0.],
-        "object with nan": ["b", "b", np.nan, "c", np.nan],
-        "bool col with nan": [True, np.nan, False, np.nan, True],
-        "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan]
-    })
+def test_categorical_and_numeric_input(imputer_test_data):
+    X = imputer_test_data
     y = pd.Series([0, 0, 1, 0, 1])
     imputer = Imputer()
     imputer.fit(X, y)
     transformed = imputer.transform(X, y)
     expected = pd.DataFrame({
-        "categorical col": pd.Series([0, 1, 2, 0, 3], dtype='category'),
+        "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
         "int col": [0, 1, 2, 0, 3],
         "object col": ["b", "b", "a", "c", "d"],
         "float col": [0.0, 1.0, 0.0, -2.0, 5.],
         "bool col": [True, False, False, True, True],
-        "int with nan": [1, 1, 2, 1, 0],
-        "categorical with nan": pd.Series([0, 1, 0, 0, 3], dtype='category'),
+        "categorical with nan": pd.Series(["0", "1", "0", "0", "3"], dtype='category'),
+        "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0],
         "float with nan": [0.0, 1.0, 0, -1.0, 0.],
         "object with nan": ["b", "b", "b", "c", "b"],
         "bool col with nan": [True, True, False, True, True]
@@ -130,11 +132,8 @@ def test_categorical_and_numeric_input():
     assert_frame_equal(transformed, expected, check_dtype=False)
 
 
-def test_drop_all_columns():
-    X = pd.DataFrame({
-        "all nan cat": pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], dtype='category'),
-        "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan]
-    })
+def test_drop_all_columns(imputer_test_data):
+    X = imputer_test_data[["all nan cat", "all nan"]]
     y = pd.Series([0, 0, 1, 0, 1])
     imputer = Imputer()
     imputer.fit(X, y)
@@ -187,7 +186,7 @@ def test_imputer_empty_data(data_type):
     if data_type == 'pd':
         X = pd.DataFrame()
         y = pd.Series()
-        expected = pd.DataFrame(index=pd.Index([]), columns=pd.Index([]))
+        expected = pd.DataFrame(index=pd.Int64Index([]), columns=pd.Index([]))
     else:
         X = np.array([[]])
         y = np.array([])
@@ -221,3 +220,46 @@ def test_imputer_resets_index():
                                   pd.DataFrame({'input_val': [1.0, 2, 3, 4, 5, 6, 7, 8, 9]},
                                                dtype=float,
                                                index=list(range(0, 9))))
+
+
+def test_imputer_fill_value(imputer_test_data):
+    X = imputer_test_data[["int with nan", "categorical with nan",
+                           "float with nan", "object with nan", "bool col with nan"]]
+    y = pd.Series([0, 0, 1, 0, 1])
+    imputer = Imputer(categorical_impute_strategy="constant", numeric_impute_strategy="constant",
+                      categorical_fill_value="fill", numeric_fill_value=-1)
+    imputer.fit(X, y)
+    transformed = imputer.transform(X, y)
+    expected = pd.DataFrame({
+        "int with nan": [-1, 1, 0, 0, 1],
+        "categorical with nan": pd.Series(["fill", "1", "fill", "0", "3"], dtype='category'),
+        "float with nan": [0.0, 1.0, -1, -1.0, 0.],
+        "object with nan": ["b", "b", "fill", "c", "fill"],
+        "bool col with nan": [True, "fill", False, "fill", True]
+    })
+    assert_frame_equal(transformed, expected, check_dtype=False)
+
+    imputer = Imputer(categorical_impute_strategy="constant", numeric_impute_strategy="constant",
+                      categorical_fill_value="fill", numeric_fill_value=-1)
+    transformed = imputer.fit_transform(X, y)
+    assert_frame_equal(transformed, expected, check_dtype=False)
+
+
+def test_imputer_no_nans(imputer_test_data):
+    X = imputer_test_data[["categorical col", "object col", "bool col"]]
+    y = pd.Series([0, 0, 1, 0, 1])
+    imputer = Imputer(categorical_impute_strategy="constant", numeric_impute_strategy="constant",
+                      categorical_fill_value="fill", numeric_fill_value=-1)
+    imputer.fit(X, y)
+    transformed = imputer.transform(X, y)
+    expected = pd.DataFrame({
+        "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
+        "object col": ["b", "b", "a", "c", "d"],
+        "bool col": [True, False, False, True, True],
+    })
+    assert_frame_equal(transformed, expected, check_dtype=False)
+
+    imputer = Imputer(categorical_impute_strategy="constant", numeric_impute_strategy="constant",
+                      categorical_fill_value="fill", numeric_fill_value=-1)
+    transformed = imputer.fit_transform(X, y)
+    assert_frame_equal(transformed, expected, check_dtype=False)
diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py
@@ -149,3 +149,57 @@ def test_numpy_input():
     np.testing.assert_almost_equal(X, np.array([[np.nan, 0, 1, np.nan],
                                                 [np.nan, 2, 3, 2],
                                                 [np.nan, 2, 3, 0]]))
+
+
+@pytest.mark.parametrize("data_type", ["numeric", "categorical"])
+def test_simple_imputer_fill_value(data_type):
+    if data_type == "numeric":
+        X = pd.DataFrame({
+            "some numeric": [np.nan, 1, 0],
+            "another numeric": [0, np.nan, 2]
+        })
+        fill_value = -1
+        expected = pd.DataFrame({
+            "some numeric": [-1, 1, 0],
+            "another numeric": [0, -1, 2]
+        })
+    else:
+        X = pd.DataFrame({
+            "categorical with nan": pd.Series([np.nan, "1", np.nan, "0", "3"], dtype='category'),
+            "object with nan": ["b", "b", np.nan, "c", np.nan]
+        })
+        fill_value = "fill"
+        expected = pd.DataFrame({
+            "categorical with nan": pd.Series(["fill", "1", "fill", "0", "3"], dtype='category'),
+            "object with nan": ["b", "b", "fill", "c", "fill"],
+        })
+    y = pd.Series([0, 0, 1, 0, 1])
+    imputer = SimpleImputer(impute_strategy="constant", fill_value=fill_value)
+    imputer.fit(X, y)
+    transformed = imputer.transform(X, y)
+    assert_frame_equal(transformed, expected, check_dtype=False)
+
+    imputer = SimpleImputer(impute_strategy="constant", fill_value=fill_value)
+    transformed = imputer.fit_transform(X, y)
+    assert_frame_equal(transformed, expected, check_dtype=False)
+
+
+def test_simple_imputer_resets_index():
+    X = pd.DataFrame({'input_val': np.arange(10), 'target': np.arange(10)})
+    X.loc[5, 'input_val'] = np.nan
+    assert X.index.tolist() == list(range(10))
+
+    X.drop(0, inplace=True)
+    y = X.pop('target')
+    pd.testing.assert_frame_equal(X,
+                                  pd.DataFrame({'input_val': [1.0, 2, 3, 4, np.nan, 6, 7, 8, 9]},
+                                               dtype=float,
+                                               index=list(range(1, 10))))
+
+    imputer = SimpleImputer(impute_strategy="mean")
+    imputer.fit(X, y=y)
+    transformed = imputer.transform(X)
+    pd.testing.assert_frame_equal(transformed,
+                                  pd.DataFrame({'input_val': [1.0, 2, 3, 4, 5, 6, 7, 8, 9]},
+                                               dtype=float,
+                                               index=list(range(0, 9))))