Skip to content

Commit

Permalink
Fixes bug where SimpleImputer cannot handle dropped columns (#846)
Browse files Browse the repository at this point in the history
* init

* cleanup and adding test

* add more test cov

* cleanup and tests

* fix test
  • Loading branch information
angela97lin committed Jun 13, 2020
1 parent e2ea9a2 commit cc87a99
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 24 deletions.
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Changelog
* Added preprocessing component to handle DateTime columns featurization :pr:`838`
* Define getter method for component `parameters` :pr:`847`
* Fixes
* Fixed bug where SimpleImputer cannot handle dropped columns :pr:`846`
* Enforce requirement that builtin components save all inputted values in their parameters dict :pr:`847`
* Don't list base classes in `all_components` output :pr:`847`
* Changes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,27 @@ def __init__(self, impute_strategy="most_frequent", fill_value=None, random_stat
"fill_value": fill_value}
imputer = SkImputer(strategy=impute_strategy,
fill_value=fill_value)
self._all_null_cols = None
super().__init__(parameters=parameters,
component_obj=imputer,
random_state=random_state)

def fit(self, X, y=None):
    """Fits the wrapped imputer and remembers which columns are entirely null

    Arguments:
        X (pd.DataFrame or np.array): the input training data of shape [n_samples, n_features]
        y (pd.Series, optional): the target training labels of length [n_samples]

    Returns:
        self
    """
    # Anything that is not already a DataFrame is wrapped in one so that
    # column labels are available below.
    frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
    self._component_obj.fit(frame, y)
    # Columns that disappear when all-null columns are dropped are exactly
    # the all-null ones; they are recorded for later use after fitting.
    surviving = frame.dropna(axis=1, how='all').columns
    self._all_null_cols = set(frame.columns).difference(surviving)
    return self

def transform(self, X, y=None):
"""Transforms data X by imputing missing values
Expand All @@ -35,23 +52,24 @@ def transform(self, X, y=None):
Returns:
pd.DataFrame: Transformed X
"""
if self._all_null_cols is None:
raise RuntimeError("Must fit transformer before calling transform!")
X_t = self._component_obj.transform(X)
if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame):
# skLearn's SimpleImputer loses track of column type, so we need to restore
X_t = pd.DataFrame(X_t, columns=X.columns).astype(X.dtypes.to_dict())
X_null_dropped = X.drop(self._all_null_cols, axis=1)
if X_null_dropped.empty:
return pd.DataFrame(X_t, columns=X_null_dropped.columns)
X_t = pd.DataFrame(X_t, columns=X_null_dropped.columns).astype(X_null_dropped.dtypes.to_dict())
return X_t

def fit_transform(self, X, y=None):
"""Fits imputer on data X then imputes missing values in X
"""Fits on X and transforms X
Arguments:
X (pd.DataFrame): Data to fit and transform
y (pd.Series): Labels to fit and transform
y (pd.DataFrame): Labels to fit and transform
Returns:
pd.DataFrame: Transformed X
"""
X_t = self._component_obj.fit_transform(X, y)
if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame):
# skLearn's SimpleImputer loses track of column type, so we need to restore
X_t = pd.DataFrame(X_t, columns=X.columns).astype(X.dtypes.to_dict())
return X_t
return self.fit(X, y).transform(X, y)
72 changes: 56 additions & 16 deletions evalml/tests/component_tests/test_simple_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@ def test_median():
X_t = transformer.fit_transform(X)
assert_frame_equal(X_expected_arr, X_t, check_dtype=False)

X_t = transformer.transform(X)
assert_frame_equal(X_expected_arr, X_t, check_dtype=False)


def test_mean():
X = pd.DataFrame([[np.nan, 0, 1, np.nan],
Expand All @@ -37,9 +34,6 @@ def test_mean():
X_t = transformer.fit_transform(X)
assert_frame_equal(X_expected_arr, X_t, check_dtype=False)

X_t = transformer.transform(X)
assert_frame_equal(X_expected_arr, X_t, check_dtype=False)


def test_constant():
# test impute strategy is constant and fill value is not specified
Expand All @@ -54,9 +48,6 @@ def test_constant():
X_t = transformer.fit_transform(X)
assert_frame_equal(X_expected_arr, X_t, check_dtype=False)

X_t = transformer.transform(X)
assert_frame_equal(X_expected_arr, X_t, check_dtype=False)


def test_most_frequent():
X = pd.DataFrame([[np.nan, 0, 1, np.nan],
Expand All @@ -70,9 +61,6 @@ def test_most_frequent():
X_t = transformer.fit_transform(X)
assert_frame_equal(X_expected_arr, X_t, check_dtype=False)

X_t = transformer.transform(X)
assert_frame_equal(X_expected_arr, X_t, check_dtype=False)


def test_col_with_non_numeric():
# test col with all strings
Expand Down Expand Up @@ -101,9 +89,6 @@ def test_col_with_non_numeric():
X_t = transformer.fit_transform(X)
assert_frame_equal(X_expected_arr, X_t, check_dtype=False)

X_t = transformer.transform(X)
assert_frame_equal(X_expected_arr, X_t, check_dtype=False)

transformer = SimpleImputer(impute_strategy='constant', fill_value=2)
X_expected_arr = pd.DataFrame([["a", 0, 1, 2],
["b", 2, 3, 3],
Expand All @@ -112,5 +97,60 @@ def test_col_with_non_numeric():
X_t = transformer.fit_transform(X)
assert_frame_equal(X_expected_arr, X_t, check_dtype=False)

X_t = transformer.transform(X)

def test_fit_transform_drop_all_nan_columns():
    """fit_transform drops the all-NaN column and leaves the input frame unchanged."""
    raw = {"all_nan": [np.nan, np.nan, np.nan],
           "some_nan": [np.nan, 1, 0],
           "another_col": [0, 1, 2]}
    X = pd.DataFrame(raw)

    imputer = SimpleImputer(impute_strategy='most_frequent')
    expected = pd.DataFrame({"some_nan": [0, 1, 0], "another_col": [0, 1, 2]})
    result = imputer.fit_transform(X)

    assert_frame_equal(expected, result, check_dtype=False)
    # X is compared against a frame rebuilt from the same literal data to
    # check that fit_transform did not modify its input.
    assert_frame_equal(X, pd.DataFrame(raw))


def test_transform_drop_all_nan_columns():
    """fit followed by transform drops the all-NaN column and leaves the input unchanged."""
    raw = {"all_nan": [np.nan, np.nan, np.nan],
           "some_nan": [np.nan, 1, 0],
           "another_col": [0, 1, 2]}
    X = pd.DataFrame(raw)

    imputer = SimpleImputer(impute_strategy='most_frequent')
    imputer.fit(X)
    expected = pd.DataFrame({"some_nan": [0, 1, 0], "another_col": [0, 1, 2]})

    assert_frame_equal(expected, imputer.transform(X), check_dtype=False)
    # X is compared against a frame rebuilt from the same literal data to
    # check that neither fit nor transform modified it.
    assert_frame_equal(X, pd.DataFrame(raw))


def test_transform_drop_all_nan_columns_empty():
    """When every column is all-NaN the output is empty and the input is unchanged."""
    all_nan_row = [[np.nan, np.nan, np.nan]]
    X = pd.DataFrame(all_nan_row)

    # Path 1: fit_transform in a single call.
    one_shot = SimpleImputer(impute_strategy='most_frequent')
    assert one_shot.fit_transform(X).empty
    assert_frame_equal(X, pd.DataFrame(all_nan_row))

    # Path 2: a fresh imputer, fit and transform called separately.
    two_step = SimpleImputer(impute_strategy='most_frequent')
    two_step.fit(X)
    assert two_step.transform(X).empty
    assert_frame_equal(X, pd.DataFrame(all_nan_row))


def test_transform_before_fit():
    """Calling transform on an unfitted imputer raises a RuntimeError."""
    unfitted = SimpleImputer(impute_strategy='most_frequent')
    expected_message = "Must fit transformer before calling transform!"
    with pytest.raises(RuntimeError, match=expected_message):
        unfitted.transform(pd.DataFrame())


def test_numpy_input():
    """A numpy array is accepted by fit_transform and is not modified by it."""
    rows = [[np.nan, 0, 1, np.nan],
            [np.nan, 2, 3, 2],
            [np.nan, 2, 3, 0]]
    X = np.array(rows)

    imputer = SimpleImputer(impute_strategy='mean')
    # The expected output has three columns: the all-NaN first column is gone.
    expected = np.array([[0, 1, 1],
                         [2, 3, 2],
                         [2, 3, 0]])
    imputed = imputer.fit_transform(X)

    assert np.allclose(expected, imputed)
    # X is compared against an array rebuilt from the same literal rows.
    np.testing.assert_almost_equal(X, np.array(rows))

0 comments on commit cc87a99

Please sign in to comment.