Fixes issue where Imputer cannot fit when there is None in a categorical or boolean column (#1144)

* init * release notes * add docstr
alteryx · Sep 8, 2020 · b88c9c6 · b88c9c6
1 parent 221bceb
commit b88c9c6
Show file tree

Hide file tree

Showing 5 changed files with 64 additions and 4 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -9,6 +9,7 @@ Release Notes
     * Fixes
         * Fixed XGBoost column names for partial dependence methods :pr:`1104`
         * Removed dead code validating column type from `TextFeaturizer` :pr:`1122`
+        * Fixed issue where Imputer cannot fit when there is None in a categorical or boolean column :pr:`1144`
     * Changes
         * Pinned scikit-optimize version to 0.7.4 :pr:`1136`
     * Documentation Changes

diff --git a/evalml/pipelines/components/transformers/imputers/imputer.py b/evalml/pipelines/components/transformers/imputers/imputer.py
@@ -52,7 +52,8 @@ def __init__(self, categorical_impute_strategy="most_frequent",
                          random_state=random_state)
 
     def fit(self, X, y=None):
-        """Fits imputer to data
+        """Fits imputer to data. None values are converted to np.nan before imputation, so
+        None and np.nan are treated identically.
 
         Arguments:
             X (pd.DataFrame or np.array): the input training data of shape [n_samples, n_features]
@@ -80,7 +81,8 @@ def fit(self, X, y=None):
         return self
 
     def transform(self, X, y=None):
-        """Transforms data X by imputing missing values
+        """Transforms data X by imputing missing values. None values are converted to np.nan before
+        imputation, so None and np.nan are treated identically.
 
         Arguments:
             X (pd.DataFrame): Data to transform

diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 from sklearn.impute import SimpleImputer as SkImputer
 
@@ -30,7 +31,8 @@ def __init__(self, impute_strategy="most_frequent", fill_value=None, random_stat
                          random_state=random_state)
 
     def fit(self, X, y=None):
-        """Fits imputer to data
+        """Fits imputer to data. None values are converted to np.nan before imputation, so
+        None and np.nan are treated identically.
 
         Arguments:
             X (pd.DataFrame or np.array): the input training data of shape [n_samples, n_features]
@@ -41,12 +43,16 @@ def fit(self, X, y=None):
         """
         if not isinstance(X, pd.DataFrame):
             X = pd.DataFrame(X)
+        # Convert None to np.nan, since None cannot be properly handled
+        X = X.fillna(value=np.nan)
+
         self._component_obj.fit(X, y)
         self._all_null_cols = set(X.columns) - set(X.dropna(axis=1, how='all').columns)
         return self
 
     def transform(self, X, y=None):
-        """Transforms data X by imputing missing values
+        """Transforms data X by imputing missing values. None values are converted to np.nan before
+        imputation, so None and np.nan are treated identically.
 
         Arguments:
             X (pd.DataFrame): Data to transform
@@ -56,6 +62,8 @@ def transform(self, X, y=None):
         """
         if not isinstance(X, pd.DataFrame):
             X = pd.DataFrame(X)
+        # Convert None to np.nan, since None cannot be properly handled
+        X = X.fillna(value=np.nan)
 
         X_null_dropped = X.copy()
         X_null_dropped.drop(self._all_null_cols, axis=1, errors='ignore', inplace=True)

diff --git a/evalml/tests/component_tests/test_imputer.py b/evalml/tests/component_tests/test_imputer.py
@@ -263,3 +263,26 @@ def test_imputer_no_nans(imputer_test_data):
                       categorical_fill_value="fill", numeric_fill_value=-1)
     transformed = imputer.fit_transform(X, y)
     assert_frame_equal(transformed, expected, check_dtype=False)
+
+
+def test_imputer_with_none():  # Imputer must fit/transform columns that contain None
+    X = pd.DataFrame({"int with None": [1, 0, 5, None],
+                      "float with None": [0.1, 0.0, 0.5, None],
+                      "category with None": pd.Series(["b", "a", "a", None], dtype='category'),
+                      "boolean with None": [True, None, False, True],
+                      "object with None": ["b", "a", "a", None],
+                      "all None": [None, None, None, None]})  # this column is absent from `expected`
+    y = pd.Series([0, 0, 1, 0])  # one target value per row of X
+    imputer = Imputer()
+    imputer.fit(X, y)
+    transformed = imputer.transform(X, y)
+    expected = pd.DataFrame({"int with None": [1, 0, 5, 2],
+                             "float with None": [0.1, 0.0, 0.5, 0.2],
+                             "category with None": pd.Series(["b", "a", "a", "a"], dtype='category'),
+                             "boolean with None": [True, True, False, True],
+                             "object with None": ["b", "a", "a", "a"]})
+    assert_frame_equal(transformed, expected, check_dtype=False)
+    # fit_transform on a fresh Imputer must match fit followed by transform
+    imputer = Imputer()
+    transformed = imputer.fit_transform(X, y)
+    assert_frame_equal(transformed, expected, check_dtype=False)
diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py
@@ -203,3 +203,29 @@ def test_simple_imputer_resets_index():
                                   pd.DataFrame({'input_val': [1.0, 2, 3, 4, 5, 6, 7, 8, 9]},
                                                dtype=float,
                                                index=list(range(0, 9))))
+
+
+def test_simple_imputer_with_none():  # SimpleImputer must fit/transform columns that contain None
+    X = pd.DataFrame({"int with None": [1, 0, 5, None],
+                      "float with None": [0.1, 0.0, 0.5, None],
+                      "all None": [None, None, None, None]})  # this column is absent from `expected`
+    y = pd.Series([0, 0, 1, 0])  # one target value per row of X
+    imputer = SimpleImputer(impute_strategy="mean")
+    imputer.fit(X, y)
+    transformed = imputer.transform(X, y)
+    expected = pd.DataFrame({"int with None": [1, 0, 5, 2],  # None replaced by the column mean
+                             "float with None": [0.1, 0.0, 0.5, 0.2]})
+    assert_frame_equal(transformed, expected, check_dtype=False)
+    # Non-numeric columns, imputed with the default strategy ("most_frequent")
+    X = pd.DataFrame({"category with None": pd.Series(["b", "a", "a", None], dtype='category'),
+                      "boolean with None": [True, None, False, True],
+                      "object with None": ["b", "a", "a", None],
+                      "all None": [None, None, None, None]})
+    y = pd.Series([0, 0, 1, 0])  # one target value per row of X
+    imputer = SimpleImputer()
+    imputer.fit(X, y)
+    transformed = imputer.transform(X, y)
+    expected = pd.DataFrame({"category with None": pd.Series(["b", "a", "a", "a"], dtype='category'),
+                             "boolean with None": [True, True, False, True],
+                             "object with None": ["b", "a", "a", "a"]})
+    assert_frame_equal(transformed, expected, check_dtype=False)