Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed bug where PerColumnImputer cannot handle dropped columns #855

Merged
merged 15 commits into from Jun 16, 2020
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Expand Up @@ -14,6 +14,7 @@ Changelog
* Define getter method for component `parameters` :pr:`847`
* Fixes
* Fixed bug where SimpleImputer cannot handle dropped columns :pr:`846`
* Fixed bug where PerColumnImputer cannot handle dropped columns :pr:`855`
* Enforce requirement that builtin components save all inputted values in their parameters dict :pr:`847`
* Don't list base classes in `all_components` output :pr:`847`
* Changes
Expand Down
@@ -1,6 +1,7 @@
from sklearn.impute import SimpleImputer as SkImputer

from evalml.pipelines.components.transformers import Transformer
from evalml.pipelines.components.transformers.imputers.simple_imputer import (
SimpleImputer
)


class PerColumnImputer(Transformer):
Expand Down Expand Up @@ -50,7 +51,7 @@ def fit(self, X, y=None):
strategy_dict = self.impute_strategies.get(column, dict())
strategy = strategy_dict.get('impute_strategy', self.default_impute_strategy)
fill_value = strategy_dict.get('fill_value', None)
self.imputers[column] = SkImputer(strategy=strategy, fill_value=fill_value)
self.imputers[column] = SimpleImputer(impute_strategy=strategy, fill_value=fill_value)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than use SkImputer, I'm using our SimpleImputer which already handles the nan's from the previous PR :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤯 oh, great idea!! Modularity always wins 👍😁


for column, imputer in self.imputers.items():
imputer.fit(X[[column]])
Expand All @@ -67,8 +68,14 @@ def transform(self, X, y=None):
pd.DataFrame: Transformed X
"""
X_t = X.copy()
cols_to_drop = []
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't set column to empty dataframe, so updating to check first.

for column, imputer in self.imputers.items():
X_t[column] = imputer.transform(X[[column]]).astype(X.dtypes[column])
transformed = imputer.transform(X[[column]])
if transformed.empty:
cols_to_drop += [column]
angela97lin marked this conversation as resolved.
Show resolved Hide resolved
else:
X_t[column] = transformed
X_t = X_t.drop(cols_to_drop, axis=1)
return X_t

def fit_transform(self, X, y=None):
Expand Down
46 changes: 46 additions & 0 deletions evalml/tests/component_tests/test_per_column_imputer.py
Expand Up @@ -129,3 +129,49 @@ def test_non_numeric_valid(non_numeric_df):

X_t = transformer.fit_transform(X)
assert_frame_equal(X_expected, X_t, check_dtype=False)


def test_fit_transform_drop_all_nan_columns():
X = pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan],
"some_nan": [np.nan, 1, 0],
"another_col": [0, 1, 2]})
strategies = {'all_nan': {"impute_strategy": "most_frequent"},
'some_nan': {"impute_strategy": "most_frequent"},
'another_col': {"impute_strategy": "most_frequent"}}
transformer = PerColumnImputer(impute_strategies=strategies)
X_expected_arr = pd.DataFrame({"some_nan": [0, 1, 0], "another_col": [0, 1, 2]})
X_t = transformer.fit_transform(X)
assert_frame_equal(X_expected_arr, X_t, check_dtype=False)
assert_frame_equal(X, pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan],
"some_nan": [np.nan, 1, 0],
"another_col": [0, 1, 2]}))


def test_transform_drop_all_nan_columns():
X = pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan],
"some_nan": [np.nan, 1, 0],
"another_col": [0, 1, 2]})
strategies = {'all_nan': {"impute_strategy": "most_frequent"},
'some_nan': {"impute_strategy": "most_frequent"},
'another_col': {"impute_strategy": "most_frequent"}}
transformer = PerColumnImputer(impute_strategies=strategies)
transformer.fit(X)
X_expected_arr = pd.DataFrame({"some_nan": [0, 1, 0], "another_col": [0, 1, 2]})
assert_frame_equal(X_expected_arr, transformer.transform(X), check_dtype=False)
assert_frame_equal(X, pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan],
"some_nan": [np.nan, 1, 0],
"another_col": [0, 1, 2]}))


def test_transform_drop_all_nan_columns_empty():
X = pd.DataFrame([[np.nan, np.nan, np.nan]])
strategies = {'0': {"impute_strategy": "most_frequent"}, }
transformer = PerColumnImputer(impute_strategies=strategies)
assert transformer.fit_transform(X).empty
assert_frame_equal(X, pd.DataFrame([[np.nan, np.nan, np.nan]]))

strategies = {'0': {"impute_strategy": "most_frequent"}}
transformer = PerColumnImputer(impute_strategies=strategies)
transformer.fit(X)
assert transformer.transform(X).empty
assert_frame_equal(X, pd.DataFrame([[np.nan, np.nan, np.nan]]))