Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Release Notes
* Standardized names of featurization components :pr:`3192`
* Changes
* Changed the default objective to ``MedianAE`` from ``R2`` for time series regression :pr:`3205`
* Removed all-nan Unknown to Double logical conversion in ``infer_feature_types`` :pr:`3196`
* Documentation Changes
* Testing Changes

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def fit(self, X, y=None):
)

for column, imputer in self.imputers.items():
imputer.fit(X[[column]])
imputer.fit(X.ww[[column]])

return self

Expand All @@ -107,7 +107,7 @@ def transform(self, X, y=None):

cols_to_drop = []
for column, imputer in self.imputers.items():
transformed = imputer.transform(X_ww[[column]])
transformed = imputer.transform(X_ww.ww[[column]])
if transformed.empty:
cols_to_drop.append(column)
else:
Expand Down
3 changes: 2 additions & 1 deletion evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ def _get_label_encoder(X, y, problem_type, estimator_class, sampler_name=None):

def _get_drop_all_null(X, y, problem_type, estimator_class, sampler_name=None):
component = []
all_null_cols = X.columns[X.isnull().all()]
non_index_unknown = X.ww.select(exclude=["index", "unknown"])
all_null_cols = non_index_unknown.columns[non_index_unknown.isnull().all()]
if len(all_null_cols) > 0:
component.append(DropNullColumns)
return component
Expand Down
13 changes: 10 additions & 3 deletions evalml/tests/component_tests/test_per_column_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ def test_fit_transform_drop_all_nan_columns():
"another_col": [0, 1, 2],
}
)
X.ww.init(logical_types={"all_nan": "Double"})
strategies = {
"all_nan": {"impute_strategy": "most_frequent"},
"some_nan": {"impute_strategy": "most_frequent"},
Expand All @@ -212,7 +213,7 @@ def test_fit_transform_drop_all_nan_columns():
pd.DataFrame(
{
"all_nan": [np.nan, np.nan, np.nan],
"some_nan": [np.nan, 1, 0],
"some_nan": [0.0, 1.0, 0.0],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did this need to change? Also, we should rename this column, since it no longer contains any NaNs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I thought I had answered this yesterday.

An all-NaN column can only be passed to the imputer if it is not typed as Unknown, and Unknown is what the default woodwork inference assigns to such a column. So we need to init woodwork on the dataframe with an explicit logical type for that column. However, because of #2055, the imputer will modify the input data.

As a result, the some_nan column will have its NaNs imputed. We can't rename the column, because the intent is to compare the input data before and after it runs through the imputer, and the imputer does not change column names.

"another_col": [0, 1, 2],
}
),
Expand All @@ -227,6 +228,7 @@ def test_transform_drop_all_nan_columns():
"another_col": [0, 1, 2],
}
)
X.ww.init(logical_types={"all_nan": "Double"})
strategies = {
"all_nan": {"impute_strategy": "most_frequent"},
"some_nan": {"impute_strategy": "most_frequent"},
Expand All @@ -243,7 +245,7 @@ def test_transform_drop_all_nan_columns():
pd.DataFrame(
{
"all_nan": [np.nan, np.nan, np.nan],
"some_nan": [np.nan, 1, 0],
"some_nan": [0.0, 1.0, 0.0],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as above ^

"another_col": [0, 1, 2],
}
),
Expand All @@ -255,6 +257,7 @@ def test_transform_drop_all_nan_columns_empty():
strategies = {
"0": {"impute_strategy": "most_frequent"},
}
X.ww.init(logical_types={0: "Double", 1: "Double", 2: "Double"})
transformer = PerColumnImputer(impute_strategies=strategies)
assert transformer.fit_transform(X).empty
assert_frame_equal(X, pd.DataFrame([[np.nan, np.nan, np.nan]]))
Expand Down Expand Up @@ -335,6 +338,9 @@ def test_per_column_imputer_impute_all_is_false():
"column_with_nan_included": "double",
}
)
X.ww.init(
logical_types={"all_nan_included": "Double", "all_nan_not_included": "Double"}
)
X_t = transformer.fit_transform(X)
assert_frame_equal(X_expected, X_t)
assert_frame_equal(
Expand All @@ -344,7 +350,8 @@ def test_per_column_imputer_impute_all_is_false():
"all_nan_not_included": [np.nan, np.nan, np.nan],
"all_nan_included": [np.nan, np.nan, np.nan],
"column_with_nan_not_included": [np.nan, 1, 0],
"column_with_nan_included": [0, 1, np.nan],
# Because of https://github.com/alteryx/evalml/issues/2055
"column_with_nan_included": [0.0, 1.0, 0.0],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rename

}
),
)
Expand Down
15 changes: 9 additions & 6 deletions evalml/tests/component_tests/test_simple_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def test_simple_imputer_fit_transform_drop_all_nan_columns():
"another_col": [0, 1, 2],
}
)

X.ww.init(logical_types={"all_nan": "Double"})
transformer = SimpleImputer(impute_strategy="most_frequent")
X_expected_arr = pd.DataFrame({"some_nan": [0, 1, 0], "another_col": [0, 1, 2]})
X_t = transformer.fit_transform(X)
Expand All @@ -190,6 +190,7 @@ def test_simple_imputer_transform_drop_all_nan_columns():
"another_col": [0, 1, 2],
}
)
X.ww.init(logical_types={"all_nan": "Double"})
transformer = SimpleImputer(impute_strategy="most_frequent")
transformer.fit(X)
X_expected_arr = pd.DataFrame({"some_nan": [0, 1, 0], "another_col": [0, 1, 2]})
Expand All @@ -208,6 +209,7 @@ def test_simple_imputer_transform_drop_all_nan_columns():

def test_simple_imputer_transform_drop_all_nan_columns_empty():
X = pd.DataFrame([[np.nan, np.nan, np.nan]])
X.ww.init(logical_types={0: "Double", 1: "Double", 2: "Double"})
transformer = SimpleImputer(impute_strategy="most_frequent")
assert transformer.fit_transform(X).empty
assert_frame_equal(X, pd.DataFrame([[np.nan, np.nan, np.nan]]))
Expand All @@ -219,12 +221,12 @@ def test_simple_imputer_transform_drop_all_nan_columns_empty():


def test_simple_imputer_numpy_input():
X = np.array([[np.nan, 0, 1, np.nan], [np.nan, 2, 3, 2], [np.nan, 2, 3, 0]])
X = np.array([[1, 0, 1, np.nan], [np.nan, 2, 3, 2], [np.nan, 2, 3, 0]])
transformer = SimpleImputer(impute_strategy="mean")
X_expected_arr = np.array([[0, 1, 1], [2, 3, 2], [2, 3, 0]])
X_expected_arr = np.array([[1, 0, 1, 1], [1, 2, 3, 2], [1, 2, 3, 0]])
assert np.allclose(X_expected_arr, transformer.fit_transform(X))
np.testing.assert_almost_equal(
X, np.array([[np.nan, 0, 1, np.nan], [np.nan, 2, 3, 2], [np.nan, 2, 3, 0]])
X, np.array([[1, 0, 1, np.nan], [np.nan, 2, 3, 2], [np.nan, 2, 3, 0]])
)


Expand Down Expand Up @@ -305,11 +307,12 @@ def test_simple_imputer_does_not_reset_index():


def test_simple_imputer_with_none():
# No all none here because ww default inference will treat
# it as unknown which is not a supported feature.
X = pd.DataFrame(
{
"int with None": [1, 0, 5, None],
"float with None": [0.1, 0.0, 0.5, None],
"all None": [None, None, None, None],
}
)
y = pd.Series([0, 0, 1, 0, 1])
Expand Down Expand Up @@ -440,7 +443,7 @@ def test_component_handles_pre_init_ww():
df = pd.DataFrame(
{"part_null": [0, 1, 2, None], "all_null": [None, None, None, None]}
)
df.ww.init()
df.ww.init(logical_types={"all_null": "Double"})
imputed = SimpleImputer().fit_transform(df)

assert "all_null" not in imputed.columns
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1147,6 +1147,7 @@ def test_partial_dependence_all_nan_value_error(
logistic_regression_binary_pipeline.fit(X, y)

pred_df = pd.DataFrame({"a": [None] * 5, "b": [1, 2, 3, 4, 4], "c": [None] * 5})
pred_df.ww.init(logical_types={"a": "Double", "c": "Double", "b": "Integer"})
message = "The following features have all NaN values and so the partial dependence cannot be computed: {}"
with pytest.raises(PartialDependenceError, match=message.format("'a'")) as e:
partial_dependence(
Expand Down Expand Up @@ -1181,7 +1182,7 @@ def test_partial_dependence_all_nan_value_error(
)
assert e.value.code == PartialDependenceErrorCode.FEATURE_IS_ALL_NANS

pred_df = pred_df.rename(columns={"a": 0})
pred_df = pred_df.ww.rename(columns={"a": 0})
with pytest.raises(PartialDependenceError, match=message.format("'0'")) as e:
partial_dependence(
logistic_regression_binary_pipeline, pred_df, features=0, grid_resolution=10
Expand Down
5 changes: 2 additions & 3 deletions evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from evalml.pipelines.components import (
DateTimeFeaturizer,
DropColumns,
DropNullColumns,
DropRowsTransformer,
EmailFeaturizer,
Estimator,
Expand Down Expand Up @@ -131,7 +130,7 @@ def test_make_pipeline(
if estimator_class.model_family == ModelFamily.LINEAR_MODEL
else []
)
drop_null = [DropNullColumns] if "all_null" in column_names else []
drop_null = [DropColumns] if "all_null" in column_names else []
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change is because all_null will now be considered an Unknown column rather than a Double one, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes!

replace_null = (
[ReplaceNullableTypes]
if (
Expand All @@ -147,7 +146,7 @@ def test_make_pipeline(
)
email_featurizer = [EmailFeaturizer] if "email" in column_names else []
url_featurizer = [URLFeaturizer] if "url" in column_names else []
imputer = [] if (column_names in [["ip"]]) else [Imputer]
imputer = [] if (column_names in [["ip"], ["all_null"]]) else [Imputer]

if is_time_series(problem_type):
expected_components = (
Expand Down
3 changes: 1 addition & 2 deletions evalml/tests/utils_tests/test_woodwork_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,7 @@ def test_infer_feature_types_NA_to_nan(null_col, already_inited):
df.ww.init()
inferred_df = infer_feature_types(df)
if all(df["unknown"].isnull()):
assert isinstance(inferred_df.ww.logical_types["unknown"], Double)
assert all([isinstance(x, type(np.nan)) for x in inferred_df["unknown"]])
assert all([isinstance(x, type(pd.NA)) for x in inferred_df["unknown"]])
else:
assert all([isinstance(x, str) for x in df["unknown"]])

Expand Down
23 changes: 2 additions & 21 deletions evalml/utils/woodwork_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import numpy as np
import pandas as pd
import woodwork as ww
from woodwork.logical_types import Unknown

from evalml.utils.gen_utils import is_all_numeric

Expand Down Expand Up @@ -48,24 +47,6 @@ def infer_feature_types(data, feature_types=None):
elif isinstance(data, np.ndarray):
data = _numpy_to_pandas(data)

def convert_all_nan_unknown_to_double(data):
def is_column_pd_na(data, col):
return data[col].isna().all()

def is_column_unknown(data, col):
return isinstance(data.ww.logical_types[col], Unknown)

if isinstance(data, pd.DataFrame):
all_null_unk_cols = [
col
for col in data.columns
if (is_column_unknown(data, col) and is_column_pd_na(data, col))
]
if len(all_null_unk_cols):
for col in all_null_unk_cols:
data.ww.set_types({col: "Double"})
return data

if data.ww.schema is not None:
if isinstance(data, pd.DataFrame) and not ww.is_schema_valid(
data, data.ww.schema
Expand All @@ -81,7 +62,7 @@ def is_column_unknown(data, col):
ww_error = f"{ww_error}. Please initialize ww with df.ww.init() to get rid of this message."
raise ValueError(ww_error)
data.ww.init(schema=data.ww.schema)
return convert_all_nan_unknown_to_double(data)
return data

if isinstance(data, pd.Series):
if all(data.isna()):
Expand All @@ -91,7 +72,7 @@ def is_column_unknown(data, col):
else:
ww_data = data.copy()
ww_data.ww.init(logical_types=feature_types)
return convert_all_nan_unknown_to_double(ww_data)
return ww_data


def _convert_numeric_dataset_pandas(X, y):
Expand Down