Skip to content

Commit

Permalink
Split `fill_value` into `categorical_fill_value` and `numeric_fill_value` for Imputer (#1019)
Browse files Browse the repository at this point in the history

* init

* fix

* fix simpleimputer and add tests

* remove unnecessary lines

* address some tests, cleanup

* fix tests and release note

* use evalml simpleimputer for imputer

* cleanup

* category update

* add pyfixture and reorganize tests

* update via most comments

* fix test

* update simple imputer

* cleanup

* linting
  • Loading branch information
angela97lin committed Aug 10, 2020
1 parent 2c1aa75 commit bbc315f
Show file tree
Hide file tree
Showing 6 changed files with 179 additions and 76 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Release Notes

**Future Releases**
* Enhancements
* Split `fill_value` into `categorical_fill_value` and `numeric_fill_value` for Imputer :pr:`1019`
* Fixes
* Changes
* Documentation Changes
Expand Down
31 changes: 16 additions & 15 deletions evalml/pipelines/components/transformers/imputers/imputer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
from sklearn.impute import SimpleImputer as SkImputer

from evalml.pipelines.components.transformers import Transformer
from evalml.pipelines.components.transformers.imputers import SimpleImputer
from evalml.utils.gen_utils import boolean, categorical_dtypes, numeric_dtypes


Expand All @@ -16,15 +16,17 @@ class Imputer(Transformer):
_valid_numeric_impute_strategies = set(["mean", "median", "most_frequent", "constant"])

def __init__(self, categorical_impute_strategy="most_frequent",
categorical_fill_value=None,
numeric_impute_strategy="mean",
fill_value=None, random_state=0, **kwargs):
numeric_fill_value=None,
random_state=0, **kwargs):
"""Initializes a transformer that imputes missing data according to the specified imputation strategy.
Arguments:
categorical_impute_strategy (string): Impute strategy to use for string, object, boolean, categorical dtypes. Valid values include "most_frequent" and "constant".
numeric_impute_strategy (string): Impute strategy to use for numeric dtypes. Valid values include "mean", "median", "most_frequent", and "constant".
fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
Defaults to 0 when imputing data and "missing_value" for strings or object data types.
categorical_fill_value (string): When categorical_impute_strategy == "constant", categorical_fill_value is used to replace missing data. The default value of None will fill with the string "missing_value".
numeric_fill_value (int, float): When numeric_impute_strategy == "constant", numeric_fill_value is used to replace missing data. The default value of None will fill with 0.
"""
if categorical_impute_strategy not in self._valid_categorical_impute_strategies:
raise ValueError(f"{categorical_impute_strategy} is an invalid parameter. Valid categorical impute strategies are {', '.join(self._valid_numeric_impute_strategies)}")
Expand All @@ -33,14 +35,15 @@ def __init__(self, categorical_impute_strategy="most_frequent",

parameters = {"categorical_impute_strategy": categorical_impute_strategy,
"numeric_impute_strategy": numeric_impute_strategy,
"fill_value": fill_value}
"categorical_fill_value": categorical_fill_value,
"numeric_fill_value": numeric_fill_value}
parameters.update(kwargs)
self._categorical_imputer = SkImputer(strategy=categorical_impute_strategy,
fill_value=fill_value,
self._categorical_imputer = SimpleImputer(impute_strategy=categorical_impute_strategy,
fill_value=categorical_fill_value,
**kwargs)
self._numeric_imputer = SimpleImputer(impute_strategy=numeric_impute_strategy,
fill_value=numeric_fill_value,
**kwargs)
self._numeric_imputer = SkImputer(strategy=numeric_impute_strategy,
fill_value=fill_value,
**kwargs)
self._all_null_cols = None
self._numeric_cols = None
self._categorical_cols = None
Expand Down Expand Up @@ -87,20 +90,18 @@ def transform(self, X, y=None):
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

X_null_dropped = X.copy()
X_null_dropped.drop(self._all_null_cols, inplace=True, axis=1, errors='ignore')
X_null_dropped.reset_index(inplace=True, drop=True)
if X_null_dropped.empty:
return X_null_dropped
dtypes = X_null_dropped.dtypes.to_dict()

if self._numeric_cols is not None and len(self._numeric_cols) > 0:
X_numeric = X_null_dropped[self._numeric_cols]
X_null_dropped[X_numeric.columns] = self._numeric_imputer.transform(X_numeric)

if self._categorical_cols is not None and len(self._categorical_cols) > 0:
X_categorical = X_null_dropped[self._categorical_cols]
X_null_dropped[X_categorical.columns] = self._categorical_imputer.transform(X_categorical)

transformed = X_null_dropped.astype(dtypes)
transformed.reset_index(inplace=True, drop=True)
return transformed
return X_null_dropped
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,22 @@ def transform(self, X, y=None):
Returns:
pd.DataFrame: Transformed X
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

X_null_dropped = X.copy()
X_null_dropped.drop(self._all_null_cols, axis=1, errors='ignore', inplace=True)
category_cols = X_null_dropped.select_dtypes(include=['category']).columns
X_t = self._component_obj.transform(X)
if not isinstance(X_t, pd.DataFrame) and isinstance(X, pd.DataFrame):
# skLearn's SimpleImputer loses track of column type, so we need to restore
X_null_dropped = X.drop(self._all_null_cols, axis=1)
if X_null_dropped.empty:
return pd.DataFrame(X_t, columns=X_null_dropped.columns)
return pd.DataFrame(X_t, columns=X_null_dropped.columns).astype(X_null_dropped.dtypes.to_dict())
return pd.DataFrame(X_t)
if X_null_dropped.empty:
return pd.DataFrame(X_t, columns=X_null_dropped.columns)
X_t = pd.DataFrame(X_t, columns=X_null_dropped.columns)
if len(category_cols) > 0:
X_t[category_cols] = X_t[category_cols].astype('category')
return X_t

def fit_transform(self, X, y=None):
"""Fits on X and transforms X
Arguments:
X (pd.DataFrame): Data to fit and transform
y (pd. DataFrame): Labels to fit and transform
Expand Down
144 changes: 93 additions & 51 deletions evalml/tests/component_tests/test_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,24 @@
from evalml.pipelines.components import Imputer


@pytest.fixture
def imputer_test_data():
    """Return the shared five-row DataFrame used by the Imputer tests.

    The frame holds fully populated columns, columns containing some NaN
    entries, and two columns that are entirely NaN (one with the default
    dtype and one declared as 'category').
    """
    nan = np.nan
    columns = {
        # Columns without any missing values.
        "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
        "int col": [0, 1, 2, 0, 3],
        "object col": ["b", "b", "a", "c", "d"],
        "float col": [0.0, 1.0, 0.0, -2.0, 5.],
        "bool col": [True, False, False, True, True],
        # Columns with some missing values.
        "categorical with nan": pd.Series([nan, "1", nan, "0", "3"], dtype='category'),
        "int with nan": [nan, 1, 0, 0, 1],
        "float with nan": [0.0, 1.0, nan, -1.0, 0.],
        "object with nan": ["b", "b", nan, "c", nan],
        "bool col with nan": [True, nan, False, nan, True],
        # Columns made up exclusively of missing values.
        "all nan": [nan, nan, nan, nan, nan],
        "all nan cat": pd.Series([nan, nan, nan, nan, nan], dtype='category'),
    }
    return pd.DataFrame(columns)


def test_invalid_strategy_parameters():
with pytest.raises(ValueError, match="Valid impute strategies are"):
Imputer(numeric_impute_strategy="not a valid strategy")
Expand All @@ -18,18 +36,25 @@ def test_imputer_default_parameters():
expected_parameters = {
'categorical_impute_strategy': 'most_frequent',
'numeric_impute_strategy': 'mean',
'fill_value': None
'categorical_fill_value': None,
'numeric_fill_value': None
}
assert imputer.parameters == expected_parameters


def test_imputer_init():
imputer = Imputer(categorical_impute_strategy="most_frequent",
numeric_impute_strategy="median")
@pytest.mark.parametrize("categorical_impute_strategy", ["most_frequent", "constant"])
@pytest.mark.parametrize("numeric_impute_strategy", ["mean", "median", "most_frequent", "constant"])
def test_imputer_init(categorical_impute_strategy, numeric_impute_strategy):

imputer = Imputer(categorical_impute_strategy=categorical_impute_strategy,
numeric_impute_strategy=numeric_impute_strategy,
categorical_fill_value="str_fill_value",
numeric_fill_value=-1)
expected_parameters = {
'categorical_impute_strategy': 'most_frequent',
'numeric_impute_strategy': 'median',
'fill_value': None
'categorical_impute_strategy': categorical_impute_strategy,
'numeric_impute_strategy': numeric_impute_strategy,
'categorical_fill_value': 'str_fill_value',
'numeric_fill_value': -1
}
expected_hyperparameters = {
"categorical_impute_strategy": ["most_frequent"],
Expand All @@ -40,22 +65,17 @@ def test_imputer_init():
assert imputer.hyperparameter_ranges == expected_hyperparameters


def test_numeric_only_input():
X = pd.DataFrame({
"int col": [0, 1, 2, 0, 3],
"float col": [0.0, 1.0, 0.0, -2.0, 5.],
"int with nan": [np.nan, 1, 2, 1, 0],
"float with nan": [0.0, 1.0, np.nan, -1.0, 0.],
"all nan": [np.nan, np.nan, np.nan, np.nan, np.nan]
})
def test_numeric_only_input(imputer_test_data):
X = imputer_test_data[["int col", "float col",
"int with nan", "float with nan", "all nan"]]
y = pd.Series([0, 0, 1, 0, 1])
imputer = Imputer(numeric_impute_strategy="median")
imputer.fit(X, y)
transformed = imputer.transform(X, y)
expected = pd.DataFrame({
"int col": [0, 1, 2, 0, 3],
"float col": [0.0, 1.0, 0.0, -2.0, 5.],
"int with nan": [1, 1, 2, 1, 0],
"int with nan": [0.5, 1.0, 0.0, 0.0, 1.0],
"float with nan": [0.0, 1.0, 0, -1.0, 0.]
})
assert_frame_equal(transformed, expected, check_dtype=False)
Expand All @@ -65,25 +85,19 @@ def test_numeric_only_input():
assert_frame_equal(transformed, expected, check_dtype=False)


def test_categorical_only_input():
X = pd.DataFrame({
"categorical col": pd.Series([0, 1, 2, 0, 3], dtype='category'),
"object col": ["b", "b", "a", "c", "d"],
"bool col": [True, False, False, True, True],
"categorical with nan": pd.Series([np.nan, 1, np.nan, 0, 3], dtype='category'),
"object with nan": ["b", "b", np.nan, "c", np.nan],
"bool col with nan": [True, np.nan, False, np.nan, True],
"all nan": pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], dtype='category')
})
def test_categorical_only_input(imputer_test_data):
X = imputer_test_data[["categorical col", "object col", "bool col",
"categorical with nan", "object with nan",
"bool col with nan", "all nan cat"]]
y = pd.Series([0, 0, 1, 0, 1])
imputer = Imputer()
imputer.fit(X, y)
transformed = imputer.transform(X, y)
expected = pd.DataFrame({
"categorical col": pd.Series([0, 1, 2, 0, 3], dtype='category'),
"categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
"object col": ["b", "b", "a", "c", "d"],
"bool col": [True, False, False, True, True],
"categorical with nan": pd.Series([0, 1, 0, 0, 3], dtype='category'),
"categorical with nan": pd.Series(["0", "1", "0", "0", "3"], dtype='category'),
"object with nan": ["b", "b", "b", "c", "b"],
"bool col with nan": [True, True, False, True, True]
})
Expand All @@ -93,32 +107,20 @@ def test_categorical_only_input():
assert_frame_equal(transformed, expected, check_dtype=False)


def test_categorical_and_numeric_input():
X = pd.DataFrame({
"categorical col": pd.Series([0, 1, 2, 0, 3], dtype='category'),
"int col": [0, 1, 2, 0, 3],
"object col": ["b", "b", "a", "c", "d"],
"float col": [0.0, 1.0, 0.0, -2.0, 5.],
"bool col": [True, False, False, True, True],
"int with nan": [np.nan, 1, 2, 1, 0],
"categorical with nan": pd.Series([np.nan, 1, np.nan, 0, 3], dtype='category'),
"float with nan": [0.0, 1.0, np.nan, -1.0, 0.],
"object with nan": ["b", "b", np.nan, "c", np.nan],
"bool col with nan": [True, np.nan, False, np.nan, True],
"all nan": [np.nan, np.nan, np.nan, np.nan, np.nan]
})
def test_categorical_and_numeric_input(imputer_test_data):
X = imputer_test_data
y = pd.Series([0, 0, 1, 0, 1])
imputer = Imputer()
imputer.fit(X, y)
transformed = imputer.transform(X, y)
expected = pd.DataFrame({
"categorical col": pd.Series([0, 1, 2, 0, 3], dtype='category'),
"categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
"int col": [0, 1, 2, 0, 3],
"object col": ["b", "b", "a", "c", "d"],
"float col": [0.0, 1.0, 0.0, -2.0, 5.],
"bool col": [True, False, False, True, True],
"int with nan": [1, 1, 2, 1, 0],
"categorical with nan": pd.Series([0, 1, 0, 0, 3], dtype='category'),
"categorical with nan": pd.Series(["0", "1", "0", "0", "3"], dtype='category'),
"int with nan": [0.5, 1.0, 0.0, 0.0, 1.0],
"float with nan": [0.0, 1.0, 0, -1.0, 0.],
"object with nan": ["b", "b", "b", "c", "b"],
"bool col with nan": [True, True, False, True, True]
Expand All @@ -130,11 +132,8 @@ def test_categorical_and_numeric_input():
assert_frame_equal(transformed, expected, check_dtype=False)


def test_drop_all_columns():
X = pd.DataFrame({
"all nan cat": pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], dtype='category'),
"all nan": [np.nan, np.nan, np.nan, np.nan, np.nan]
})
def test_drop_all_columns(imputer_test_data):
X = imputer_test_data[["all nan cat", "all nan"]]
y = pd.Series([0, 0, 1, 0, 1])
imputer = Imputer()
imputer.fit(X, y)
Expand Down Expand Up @@ -187,7 +186,7 @@ def test_imputer_empty_data(data_type):
if data_type == 'pd':
X = pd.DataFrame()
y = pd.Series()
expected = pd.DataFrame(index=pd.Index([]), columns=pd.Index([]))
expected = pd.DataFrame(index=pd.Int64Index([]), columns=pd.Index([]))
else:
X = np.array([[]])
y = np.array([])
Expand Down Expand Up @@ -221,3 +220,46 @@ def test_imputer_resets_index():
pd.DataFrame({'input_val': [1.0, 2, 3, 4, 5, 6, 7, 8, 9]},
dtype=float,
index=list(range(0, 9))))


def test_imputer_fill_value(imputer_test_data):
    """Constant strategies fill NaNs with the per-kind fill values.

    Runs the same check through fit + transform and through fit_transform.
    """
    nan_columns = ["int with nan", "categorical with nan",
                   "float with nan", "object with nan", "bool col with nan"]
    X = imputer_test_data[nan_columns]
    y = pd.Series([0, 0, 1, 0, 1])
    imputer_kwargs = dict(categorical_impute_strategy="constant",
                          numeric_impute_strategy="constant",
                          categorical_fill_value="fill",
                          numeric_fill_value=-1)
    expected = pd.DataFrame({
        "int with nan": [-1, 1, 0, 0, 1],
        "categorical with nan": pd.Series(["fill", "1", "fill", "0", "3"], dtype='category'),
        "float with nan": [0.0, 1.0, -1, -1.0, 0.],
        "object with nan": ["b", "b", "fill", "c", "fill"],
        "bool col with nan": [True, "fill", False, "fill", True]
    })

    # Path 1: separate fit and transform calls.
    fitted_imputer = Imputer(**imputer_kwargs)
    fitted_imputer.fit(X, y)
    assert_frame_equal(fitted_imputer.transform(X, y), expected, check_dtype=False)

    # Path 2: a fresh instance going through fit_transform.
    fresh_imputer = Imputer(**imputer_kwargs)
    assert_frame_equal(fresh_imputer.fit_transform(X, y), expected, check_dtype=False)


def test_imputer_no_nans(imputer_test_data):
    """Columns without NaNs come back unchanged under constant imputation.

    Runs the same check through fit + transform and through fit_transform.
    """
    complete_columns = ["categorical col", "object col", "bool col"]
    X = imputer_test_data[complete_columns]
    y = pd.Series([0, 0, 1, 0, 1])
    imputer_kwargs = dict(categorical_impute_strategy="constant",
                          numeric_impute_strategy="constant",
                          categorical_fill_value="fill",
                          numeric_fill_value=-1)
    expected = pd.DataFrame({
        "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
        "object col": ["b", "b", "a", "c", "d"],
        "bool col": [True, False, False, True, True],
    })

    # Path 1: separate fit and transform calls.
    fitted_imputer = Imputer(**imputer_kwargs)
    fitted_imputer.fit(X, y)
    assert_frame_equal(fitted_imputer.transform(X, y), expected, check_dtype=False)

    # Path 2: a fresh instance going through fit_transform.
    fresh_imputer = Imputer(**imputer_kwargs)
    assert_frame_equal(fresh_imputer.fit_transform(X, y), expected, check_dtype=False)
54 changes: 54 additions & 0 deletions evalml/tests/component_tests/test_simple_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,57 @@ def test_numpy_input():
np.testing.assert_almost_equal(X, np.array([[np.nan, 0, 1, np.nan],
[np.nan, 2, 3, 2],
[np.nan, 2, 3, 0]]))


@pytest.mark.parametrize("data_type", ["numeric", "categorical"])
def test_simple_imputer_fill_value(data_type):
    """SimpleImputer's "constant" strategy replaces NaNs with ``fill_value``.

    Parametrized over a numeric frame (filled with -1) and a
    categorical/object frame (filled with "fill"). Each case is checked
    through fit + transform and through fit_transform.

    Arguments:
        data_type (str): Either "numeric" or "categorical"; selects which
            input frame, fill value and expected output are used.
    """
    if data_type == "numeric":
        X = pd.DataFrame({
            "some numeric": [np.nan, 1, 0],
            "another numeric": [0, np.nan, 2]
        })
        # One label per row of X (this frame has three rows).
        y = pd.Series([0, 0, 1])
        fill_value = -1
        expected = pd.DataFrame({
            "some numeric": [-1, 1, 0],
            "another numeric": [0, -1, 2]
        })
    else:
        X = pd.DataFrame({
            "categorical with nan": pd.Series([np.nan, "1", np.nan, "0", "3"], dtype='category'),
            "object with nan": ["b", "b", np.nan, "c", np.nan]
        })
        # One label per row of X (this frame has five rows).
        y = pd.Series([0, 0, 1, 0, 1])
        fill_value = "fill"
        expected = pd.DataFrame({
            "categorical with nan": pd.Series(["fill", "1", "fill", "0", "3"], dtype='category'),
            "object with nan": ["b", "b", "fill", "c", "fill"],
        })

    imputer = SimpleImputer(impute_strategy="constant", fill_value=fill_value)
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    assert_frame_equal(transformed, expected, check_dtype=False)

    imputer = SimpleImputer(impute_strategy="constant", fill_value=fill_value)
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed, expected, check_dtype=False)


def test_simple_imputer_resets_index():
    """SimpleImputer.transform returns a frame with a fresh 0-based index.

    The input has its first row dropped, so its index runs 1..9; the
    transformed output is expected to be indexed 0..8 with the NaN
    replaced by the column mean.
    """
    X = pd.DataFrame({'input_val': np.arange(10), 'target': np.arange(10)})
    X.loc[5, 'input_val'] = np.nan
    assert X.index.tolist() == list(range(10))

    # Remove the first row so the index no longer starts at zero.
    X = X.drop(0)
    y = X.pop('target')
    expected_input = pd.DataFrame({'input_val': [1.0, 2, 3, 4, np.nan, 6, 7, 8, 9]},
                                  dtype=float,
                                  index=list(range(1, 10)))
    pd.testing.assert_frame_equal(X, expected_input)

    imputer = SimpleImputer(impute_strategy="mean")
    imputer.fit(X, y=y)
    expected_output = pd.DataFrame({'input_val': [1.0, 2, 3, 4, 5, 6, 7, 8, 9]},
                                   dtype=float,
                                   index=list(range(0, 9)))
    pd.testing.assert_frame_equal(imputer.transform(X), expected_output)

0 comments on commit bbc315f

Please sign in to comment.