Skip to content

Commit

Permalink
Fixes issue where Imputer cannot fit when there is None in a categori…
Browse files Browse the repository at this point in the history
…cal or boolean column (#1144)

* init

* release notes

* add docstr
  • Loading branch information
angela97lin committed Sep 8, 2020
1 parent 221bceb commit b88c9c6
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 4 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Release Notes
* Fixes
* Fixed XGBoost column names for partial dependence methods :pr:`1104`
* Removed dead code validating column type from `TextFeaturizer` :pr:`1122`
* Fixed issue where Imputer cannot fit when there is None in a categorical or boolean column :pr:`1144`
* Changes
* Pinned scikit-optimize version to 0.7.4 :pr:`1136`
* Documentation Changes
Expand Down
6 changes: 4 additions & 2 deletions evalml/pipelines/components/transformers/imputers/imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ def __init__(self, categorical_impute_strategy="most_frequent",
random_state=random_state)

def fit(self, X, y=None):
"""Fits imputer to data
"""Fits imputer to data. 'None' values are converted to np.nan before imputation and are
treated as the same.
Arguments:
X (pd.DataFrame or np.array): the input training data of shape [n_samples, n_features]
Expand Down Expand Up @@ -80,7 +81,8 @@ def fit(self, X, y=None):
return self

def transform(self, X, y=None):
"""Transforms data X by imputing missing values
"""Transforms data X by imputing missing values. 'None' values are converted to np.nan before imputation and are
treated as the same.
Arguments:
X (pd.DataFrame): Data to transform
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer as SkImputer

Expand Down Expand Up @@ -30,7 +31,8 @@ def __init__(self, impute_strategy="most_frequent", fill_value=None, random_stat
random_state=random_state)

def fit(self, X, y=None):
"""Fits imputer to data
"""Fits imputer to data. 'None' values are converted to np.nan before imputation and are
treated as the same.
Arguments:
X (pd.DataFrame or np.array): the input training data of shape [n_samples, n_features]
Expand All @@ -41,12 +43,16 @@ def fit(self, X, y=None):
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
# Convert None to np.nan, since None cannot be properly handled
X = X.fillna(value=np.nan)

self._component_obj.fit(X, y)
self._all_null_cols = set(X.columns) - set(X.dropna(axis=1, how='all').columns)
return self

def transform(self, X, y=None):
"""Transforms data X by imputing missing values
"""Transforms data X by imputing missing values. 'None' values are converted to np.nan before imputation and are
treated as the same.
Arguments:
X (pd.DataFrame): Data to transform
Expand All @@ -56,6 +62,8 @@ def transform(self, X, y=None):
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
# Convert None to np.nan, since None cannot be properly handled
X = X.fillna(value=np.nan)

X_null_dropped = X.copy()
X_null_dropped.drop(self._all_null_cols, axis=1, errors='ignore', inplace=True)
Expand Down
23 changes: 23 additions & 0 deletions evalml/tests/component_tests/test_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,3 +263,26 @@ def test_imputer_no_nans(imputer_test_data):
categorical_fill_value="fill", numeric_fill_value=-1)
transformed = imputer.fit_transform(X, y)
assert_frame_equal(transformed, expected, check_dtype=False)


def test_imputer_with_none():
    """Check that Imputer can fit and transform data containing ``None``.

    Builds a frame with ``None`` in int, float, category, boolean and object
    columns plus one column made up entirely of ``None``. The expected output
    has every ``None`` filled in and no "all None" column. The result is
    checked both for ``fit`` followed by ``transform`` and for
    ``fit_transform`` on a fresh Imputer.
    """
    X = pd.DataFrame({"int with None": [1, 0, 5, None],
                      "float with None": [0.1, 0.0, 0.5, None],
                      "category with None": pd.Series(["b", "a", "a", None], dtype='category'),
                      "boolean with None": [True, None, False, True],
                      "object with None": ["b", "a", "a", None],
                      "all None": [None, None, None, None]})
    # One target value per row of X (X has four rows).
    y = pd.Series([0, 0, 1, 0])

    # The numeric fills (2 and 0.2) equal the mean of the non-missing entries;
    # the other fills are the most frequent non-missing value of each column.
    expected = pd.DataFrame({"int with None": [1, 0, 5, 2],
                             "float with None": [0.1, 0.0, 0.5, 0.2],
                             "category with None": pd.Series(["b", "a", "a", "a"], dtype='category'),
                             "boolean with None": [True, True, False, True],
                             "object with None": ["b", "a", "a", "a"]})

    # Path 1: separate fit and transform calls.
    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    assert_frame_equal(transformed, expected, check_dtype=False)

    # Path 2: fit_transform on a fresh instance must give the same result.
    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed, expected, check_dtype=False)
26 changes: 26 additions & 0 deletions evalml/tests/component_tests/test_simple_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,3 +203,29 @@ def test_simple_imputer_resets_index():
pd.DataFrame({'input_val': [1.0, 2, 3, 4, 5, 6, 7, 8, 9]},
dtype=float,
index=list(range(0, 9))))


def test_simple_imputer_with_none():
    """Check that SimpleImputer can fit and transform data containing ``None``.

    Two scenarios are exercised:

    * numeric columns with ``impute_strategy="mean"``;
    * category / boolean / object columns with the default strategy.

    In both, the input also carries a column made up entirely of ``None``,
    and the expected output does not contain that column.
    """
    # --- Numeric columns, mean strategy -------------------------------
    X = pd.DataFrame({"int with None": [1, 0, 5, None],
                      "float with None": [0.1, 0.0, 0.5, None],
                      "all None": [None, None, None, None]})
    # One target value per row of X (X has four rows).
    y = pd.Series([0, 0, 1, 0])
    imputer = SimpleImputer(impute_strategy="mean")
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    # 2 and 0.2 are the means of the non-missing values in each column.
    expected = pd.DataFrame({"int with None": [1, 0, 5, 2],
                             "float with None": [0.1, 0.0, 0.5, 0.2]})
    assert_frame_equal(transformed, expected, check_dtype=False)

    # --- Non-numeric columns, default strategy ------------------------
    X = pd.DataFrame({"category with None": pd.Series(["b", "a", "a", None], dtype='category'),
                      "boolean with None": [True, None, False, True],
                      "object with None": ["b", "a", "a", None],
                      "all None": [None, None, None, None]})
    y = pd.Series([0, 0, 1, 0])
    imputer = SimpleImputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    # Each None is expected to become the column's most frequent value.
    expected = pd.DataFrame({"category with None": pd.Series(["b", "a", "a", "a"], dtype='category'),
                             "boolean with None": [True, True, False, True],
                             "object with None": ["b", "a", "a", "a"]})
    assert_frame_equal(transformed, expected, check_dtype=False)

0 comments on commit b88c9c6

Please sign in to comment.