From b2c387014d44a92db447261ccb79545d219dcdab Mon Sep 17 00:00:00 2001
From: freddyaboulton
Date: Thu, 6 Jan 2022 17:06:55 -0500
Subject: [PATCH 1/4] Fix tests

---
 .../imputers/per_column_imputer.py            |  4 +--
 evalml/pipelines/utils.py                     |  3 +-
 .../test_per_column_imputer.py                | 12 ++++++--
 .../component_tests/test_simple_imputer.py    | 15 ++++++----
 .../test_partial_dependence.py                |  3 +-
 .../pipeline_tests/test_pipeline_utils.py     |  4 +--
 .../tests/utils_tests/test_woodwork_utils.py  |  3 +-
 evalml/utils/woodwork_utils.py                | 30 +++++++++----------
 8 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py
index cc018ed25c..7a047d3e0b 100644
--- a/evalml/pipelines/components/transformers/imputers/per_column_imputer.py
+++ b/evalml/pipelines/components/transformers/imputers/per_column_imputer.py
@@ -88,7 +88,7 @@ def fit(self, X, y=None):
         )
         for column, imputer in self.imputers.items():
-            imputer.fit(X[[column]])
+            imputer.fit(X.ww[[column]])
         return self
@@ -107,7 +107,7 @@ def transform(self, X, y=None):
         cols_to_drop = []
         for column, imputer in self.imputers.items():
-            transformed = imputer.transform(X_ww[[column]])
+            transformed = imputer.transform(X_ww.ww[[column]])
             if transformed.empty:
                 cols_to_drop.append(column)
             else:
diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py
index 70ad593019..693d2268b6 100644
--- a/evalml/pipelines/utils.py
+++ b/evalml/pipelines/utils.py
@@ -71,7 +71,8 @@ def _get_label_encoder(X, y, problem_type, estimator_class, sampler_name=None):
 def _get_drop_all_null(X, y, problem_type, estimator_class, sampler_name=None):
     component = []
-    all_null_cols = X.columns[X.isnull().all()]
+    non_index_unknown = X.ww.select(exclude=["index", "unknown"])
+    all_null_cols = non_index_unknown.columns[non_index_unknown.isnull().all()]
     if len(all_null_cols) > 0:
         component.append(DropNullColumns)
     return component
diff --git a/evalml/tests/component_tests/test_per_column_imputer.py b/evalml/tests/component_tests/test_per_column_imputer.py
index caa6311fba..7b0c383768 100644
--- a/evalml/tests/component_tests/test_per_column_imputer.py
+++ b/evalml/tests/component_tests/test_per_column_imputer.py
@@ -198,6 +198,7 @@ def test_fit_transform_drop_all_nan_columns():
             "another_col": [0, 1, 2],
         }
     )
+    X.ww.init(logical_types={"all_nan": "Double"})
     strategies = {
         "all_nan": {"impute_strategy": "most_frequent"},
         "some_nan": {"impute_strategy": "most_frequent"},
@@ -212,7 +213,7 @@
         pd.DataFrame(
             {
                 "all_nan": [np.nan, np.nan, np.nan],
-                "some_nan": [np.nan, 1, 0],
+                "some_nan": [0., 1., 0.],
                 "another_col": [0, 1, 2],
             }
         ),
@@ -227,6 +228,7 @@ def test_transform_drop_all_nan_columns():
             "another_col": [0, 1, 2],
         }
     )
+    X.ww.init(logical_types={"all_nan": "Double"})
     strategies = {
         "all_nan": {"impute_strategy": "most_frequent"},
         "some_nan": {"impute_strategy": "most_frequent"},
@@ -243,7 +245,7 @@ def test_transform_drop_all_nan_columns():
         pd.DataFrame(
             {
                 "all_nan": [np.nan, np.nan, np.nan],
-                "some_nan": [np.nan, 1, 0],
+                "some_nan": [0., 1., 0.],
                 "another_col": [0, 1, 2],
             }
         ),
@@ -255,6 +257,7 @@ def test_transform_drop_all_nan_columns_empty():
     strategies = {
         "0": {"impute_strategy": "most_frequent"},
     }
+    X.ww.init(logical_types={0: "Double", 1: "Double", 2: "Double"})
     transformer = PerColumnImputer(impute_strategies=strategies)
     assert transformer.fit_transform(X).empty
     assert_frame_equal(X, pd.DataFrame([[np.nan, np.nan, np.nan]]))
@@ -335,6 +338,8 @@ def test_per_column_imputer_impute_all_is_false():
             "column_with_nan_included": "double",
         }
     )
+    X.ww.init(logical_types={"all_nan_included": "Double",
+                             "all_nan_not_included": "Double"})
     X_t = transformer.fit_transform(X)
     assert_frame_equal(X_expected, X_t)
     assert_frame_equal(
@@ -344,7 +349,8 @@ def test_per_column_imputer_impute_all_is_false():
                 "all_nan_not_included": [np.nan, np.nan, np.nan],
                 "all_nan_included": [np.nan, np.nan, np.nan],
                 "column_with_nan_not_included": [np.nan, 1, 0],
-                "column_with_nan_included": [0, 1, np.nan],
+                # Because of https://github.com/alteryx/evalml/issues/2055
+                "column_with_nan_included": [0., 1., 0.],
             }
         ),
     )
diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py
index 7d0de8c978..a1b8336feb 100644
--- a/evalml/tests/component_tests/test_simple_imputer.py
+++ b/evalml/tests/component_tests/test_simple_imputer.py
@@ -165,7 +165,7 @@ def test_simple_imputer_fit_transform_drop_all_nan_columns():
             "another_col": [0, 1, 2],
         }
     )
-
+    X.ww.init(logical_types={"all_nan": "Double"})
     transformer = SimpleImputer(impute_strategy="most_frequent")
     X_expected_arr = pd.DataFrame({"some_nan": [0, 1, 0], "another_col": [0, 1, 2]})
     X_t = transformer.fit_transform(X)
@@ -190,6 +190,7 @@ def test_simple_imputer_transform_drop_all_nan_columns():
             "another_col": [0, 1, 2],
         }
     )
+    X.ww.init(logical_types={"all_nan": "Double"})
     transformer = SimpleImputer(impute_strategy="most_frequent")
     transformer.fit(X)
     X_expected_arr = pd.DataFrame({"some_nan": [0, 1, 0], "another_col": [0, 1, 2]})
@@ -208,6 +209,7 @@ def test_simple_imputer_transform_drop_all_nan_columns_empty():
     X = pd.DataFrame([[np.nan, np.nan, np.nan]])
+    X.ww.init(logical_types={0: "Double", 1: "Double", 2: "Double"})
     transformer = SimpleImputer(impute_strategy="most_frequent")
     assert transformer.fit_transform(X).empty
     assert_frame_equal(X, pd.DataFrame([[np.nan, np.nan, np.nan]]))
@@ -219,12 +221,12 @@ def test_simple_imputer_numpy_input():
-    X = np.array([[np.nan, 0, 1, np.nan], [np.nan, 2, 3, 2], [np.nan, 2, 3, 0]])
+    X = np.array([[1, 0, 1, np.nan], [np.nan, 2, 3, 2], [np.nan, 2, 3, 0]])
     transformer = SimpleImputer(impute_strategy="mean")
-    X_expected_arr = np.array([[0, 1, 1], [2, 3, 2], [2, 3, 0]])
+    X_expected_arr = np.array([[1, 0, 1, 1], [1, 2, 3, 2], [1, 2, 3, 0]])
     assert np.allclose(X_expected_arr, transformer.fit_transform(X))
     np.testing.assert_almost_equal(
-        X, np.array([[np.nan, 0, 1, np.nan], [np.nan, 2, 3, 2], [np.nan, 2, 3, 0]])
+        X, np.array([[1, 0, 1, np.nan], [np.nan, 2, 3, 2], [np.nan, 2, 3, 0]])
     )
@@ -305,11 +307,12 @@ def test_simple_imputer_does_not_reset_index():
 def test_simple_imputer_with_none():
+    # No all-None column here because ww's default inference will treat
+    # it as Unknown, which is not a supported feature type.
     X = pd.DataFrame(
         {
             "int with None": [1, 0, 5, None],
             "float with None": [0.1, 0.0, 0.5, None],
-            "all None": [None, None, None, None],
         }
     )
     y = pd.Series([0, 0, 1, 0, 1])
@@ -440,7 +443,7 @@ def test_component_handles_pre_init_ww():
     df = pd.DataFrame(
         {"part_null": [0, 1, 2, None], "all_null": [None, None, None, None]}
     )
-    df.ww.init()
+    df.ww.init(logical_types={'all_null': "Double"})
     imputed = SimpleImputer().fit_transform(df)
     assert "all_null" not in imputed.columns
diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py
index 86a71cb646..669e03bdb1 100644
--- a/evalml/tests/model_understanding_tests/test_partial_dependence.py
+++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py
@@ -1147,6 +1147,7 @@ def test_partial_dependence_all_nan_value_error(
     logistic_regression_binary_pipeline.fit(X, y)
     pred_df = pd.DataFrame({"a": [None] * 5, "b": [1, 2, 3, 4, 4], "c": [None] * 5})
+    pred_df.ww.init(logical_types={"a": "Double", "c": "Double", "b": "Integer"})
     message = "The following features have all NaN values and so the partial dependence cannot be computed: {}"
     with pytest.raises(PartialDependenceError, match=message.format("'a'")) as e:
         partial_dependence(
@@ -1181,7 +1182,7 @@ def test_partial_dependence_all_nan_value_error(
     )
     assert e.value.code == PartialDependenceErrorCode.FEATURE_IS_ALL_NANS
-    pred_df = pred_df.rename(columns={"a": 0})
+    pred_df = pred_df.ww.rename(columns={"a": 0})
     with pytest.raises(PartialDependenceError, match=message.format("'0'")) as e:
         partial_dependence(
             logistic_regression_binary_pipeline, pred_df, features=0, grid_resolution=10
         )
diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py
index 7c0379c10c..12d57b5f0e 100644
--- a/evalml/tests/pipeline_tests/test_pipeline_utils.py
+++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -131,7 +131,7 @@ def test_make_pipeline(
         if estimator_class.model_family == ModelFamily.LINEAR_MODEL
         else []
     )
-    drop_null = [DropNullColumns] if "all_null" in column_names else []
+    drop_null = [DropColumns] if "all_null" in column_names else []
     replace_null = (
         [ReplaceNullableTypes]
         if (
@@ -147,7 +147,7 @@ def test_make_pipeline(
     )
     email_featurizer = [EmailFeaturizer] if "email" in column_names else []
     url_featurizer = [URLFeaturizer] if "url" in column_names else []
-    imputer = [] if (column_names in [["ip"]]) else [Imputer]
+    imputer = [] if (column_names in [["ip"], ["all_null"]]) else [Imputer]
     if is_time_series(problem_type):
         expected_components = (
diff --git a/evalml/tests/utils_tests/test_woodwork_utils.py b/evalml/tests/utils_tests/test_woodwork_utils.py
index 59df5e747a..5fd4446ab3 100644
--- a/evalml/tests/utils_tests/test_woodwork_utils.py
+++ b/evalml/tests/utils_tests/test_woodwork_utils.py
@@ -221,8 +221,7 @@ def test_infer_feature_types_NA_to_nan(null_col, already_inited):
         df.ww.init()
     inferred_df = infer_feature_types(df)
     if all(df["unknown"].isnull()):
-        assert isinstance(inferred_df.ww.logical_types["unknown"], Double)
-        assert all([isinstance(x, type(np.nan)) for x in inferred_df["unknown"]])
+        assert all([isinstance(x, type(pd.NA)) for x in inferred_df["unknown"]])
     else:
         assert all([isinstance(x, str) for x in df["unknown"]])
diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py
index 3331edb3f7..c1359a1ad3 100644
--- a/evalml/utils/woodwork_utils.py
+++ b/evalml/utils/woodwork_utils.py
@@ -49,21 +49,21 @@ def infer_feature_types(data, feature_types=None):
         data = _numpy_to_pandas(data)
     def convert_all_nan_unknown_to_double(data):
-        def is_column_pd_na(data, col):
-            return data[col].isna().all()
-
-        def is_column_unknown(data, col):
-            return isinstance(data.ww.logical_types[col], Unknown)
-
-        if isinstance(data, pd.DataFrame):
-            all_null_unk_cols = [
-                col
-                for col in data.columns
-                if (is_column_unknown(data, col) and is_column_pd_na(data, col))
-            ]
-            if len(all_null_unk_cols):
-                for col in all_null_unk_cols:
-                    data.ww.set_types({col: "Double"})
+        # def is_column_pd_na(data, col):
+        #     return data[col].isna().all()
+        #
+        # def is_column_unknown(data, col):
+        #     return isinstance(data.ww.logical_types[col], Unknown)
+        #
+        # if isinstance(data, pd.DataFrame):
+        #     all_null_unk_cols = [
+        #         col
+        #         for col in data.columns
+        #         if (is_column_unknown(data, col) and is_column_pd_na(data, col))
+        #     ]
+        #     if len(all_null_unk_cols):
+        #         for col in all_null_unk_cols:
+        #             data.ww.set_types({col: "Double"})
         return data
     if data.ww.schema is not None:
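Note on the behavior change in this patch: with the conversion above commented out (and fully removed in the next patch), infer_feature_types leaves an all-null column with woodwork's default Unknown inference instead of quietly retyping it to Double, so callers that want numeric typing have to request it explicitly, which is exactly what the ww.init(logical_types=...) lines added to the tests do. A minimal sketch of the intended behavior, assuming evalml with this patch applied and woodwork's default inference; the column names are illustrative, not from the patch:

import pandas as pd

from evalml.utils import infer_feature_types

X = pd.DataFrame({"part_null": [0, 1, 2, None], "all_null": [None, None, None, None]})

# The all-null object column keeps woodwork's default Unknown inference;
# it is no longer silently retyped to Double by infer_feature_types.
X_ww = infer_feature_types(X)
print(X_ww.ww.logical_types["all_null"])  # expected: Unknown

# Callers that still want numeric typing for such a column now opt in.
X_double = infer_feature_types(X, feature_types={"all_null": "Double"})
print(X_double.ww.logical_types["all_null"])  # expected: Double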
From 9ed3ead2e5c07e6339dc86aa2880fd7ae4cdca50 Mon Sep 17 00:00:00 2001
From: freddyaboulton
Date: Thu, 6 Jan 2022 17:13:07 -0500
Subject: [PATCH 2/4] Fix lint

---
 .../test_per_column_imputer.py             | 11 +++++----
 .../component_tests/test_simple_imputer.py |  2 +-
 .../pipeline_tests/test_pipeline_utils.py  |  1 -
 evalml/utils/woodwork_utils.py             | 23 ++-----------------
 4 files changed, 9 insertions(+), 28 deletions(-)

diff --git a/evalml/tests/component_tests/test_per_column_imputer.py b/evalml/tests/component_tests/test_per_column_imputer.py
index 7b0c383768..8f828f5521 100644
--- a/evalml/tests/component_tests/test_per_column_imputer.py
+++ b/evalml/tests/component_tests/test_per_column_imputer.py
@@ -213,7 +213,7 @@ def test_fit_transform_drop_all_nan_columns():
         pd.DataFrame(
             {
                 "all_nan": [np.nan, np.nan, np.nan],
-                "some_nan": [0., 1., 0.],
+                "some_nan": [0.0, 1.0, 0.0],
                 "another_col": [0, 1, 2],
             }
         ),
@@ -245,7 +245,7 @@ def test_transform_drop_all_nan_columns():
         pd.DataFrame(
             {
                 "all_nan": [np.nan, np.nan, np.nan],
-                "some_nan": [0., 1., 0.],
+                "some_nan": [0.0, 1.0, 0.0],
                 "another_col": [0, 1, 2],
             }
         ),
@@ -338,8 +338,9 @@ def test_per_column_imputer_impute_all_is_false():
             "column_with_nan_included": "double",
         }
     )
-    X.ww.init(logical_types={"all_nan_included": "Double",
-                             "all_nan_not_included": "Double"})
+    X.ww.init(
+        logical_types={"all_nan_included": "Double", "all_nan_not_included": "Double"}
+    )
     X_t = transformer.fit_transform(X)
     assert_frame_equal(X_expected, X_t)
     assert_frame_equal(
@@ -350,7 +351,7 @@ def test_per_column_imputer_impute_all_is_false():
                 "all_nan_included": [np.nan, np.nan, np.nan],
                 "column_with_nan_not_included": [np.nan, 1, 0],
                 # Because of https://github.com/alteryx/evalml/issues/2055
-                "column_with_nan_included": [0., 1., 0.],
+                "column_with_nan_included": [0.0, 1.0, 0.0],
             }
         ),
     )
diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py
index a1b8336feb..2ba8b6d5f8 100644
--- a/evalml/tests/component_tests/test_simple_imputer.py
+++ b/evalml/tests/component_tests/test_simple_imputer.py
@@ -443,7 +443,7 @@ def test_component_handles_pre_init_ww():
     df = pd.DataFrame(
         {"part_null": [0, 1, 2, None], "all_null": [None, None, None, None]}
     )
-    df.ww.init(logical_types={'all_null': "Double"})
+    df.ww.init(logical_types={"all_null": "Double"})
     imputed = SimpleImputer().fit_transform(df)
     assert "all_null" not in imputed.columns
diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py
index 12d57b5f0e..b44a6fde9f 100644
--- a/evalml/tests/pipeline_tests/test_pipeline_utils.py
+++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -14,7 +14,6 @@
 from evalml.pipelines.components import (
     DateTimeFeaturizer,
     DropColumns,
-    DropNullColumns,
     DropRowsTransformer,
     EmailFeaturizer,
     Estimator,
diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py
index c1359a1ad3..d1aa8a1bb5 100644
--- a/evalml/utils/woodwork_utils.py
+++ b/evalml/utils/woodwork_utils.py
@@ -2,7 +2,6 @@
 import numpy as np
 import pandas as pd
 import woodwork as ww
-from woodwork.logical_types import Unknown
 from evalml.utils.gen_utils import is_all_numeric
@@ -48,24 +47,6 @@ def infer_feature_types(data, feature_types=None):
     elif isinstance(data, np.ndarray):
         data = _numpy_to_pandas(data)
-    def convert_all_nan_unknown_to_double(data):
-        # def is_column_pd_na(data, col):
-        #     return data[col].isna().all()
-        #
-        # def is_column_unknown(data, col):
-        #     return isinstance(data.ww.logical_types[col], Unknown)
-        #
-        # if isinstance(data, pd.DataFrame):
-        #     all_null_unk_cols = [
-        #         col
-        #         for col in data.columns
-        #         if (is_column_unknown(data, col) and is_column_pd_na(data, col))
-        #     ]
-        #     if len(all_null_unk_cols):
-        #         for col in all_null_unk_cols:
-        #             data.ww.set_types({col: "Double"})
-        return data
-
     if data.ww.schema is not None:
@@ -81,7 +62,7 @@ def convert_all_nan_unknown_to_double(data):
             ww_error = f"{ww_error}. Please initialize ww with df.ww.init() to get rid of this message."
             raise ValueError(ww_error)
         data.ww.init(schema=data.ww.schema)
-        return convert_all_nan_unknown_to_double(data)
+        return data
     if isinstance(data, pd.Series):
         if all(data.isna()):
@@ -91,7 +72,7 @@ def convert_all_nan_unknown_to_double(data):
     else:
         ww_data = data.copy()
         ww_data.ww.init(logical_types=feature_types)
-        return convert_all_nan_unknown_to_double(ww_data)
+        return ww_data
 def _convert_numeric_dataset_pandas(X, y):
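Note: the _get_drop_all_null change in patch 1 means an all-null column that woodwork types as Unknown is no longer what triggers DropNullColumns; it is filtered out up front and handled along with the other unknown-typed columns, which is why test_make_pipeline now expects DropColumns and no Imputer for an all_null-only input. A rough sketch of the selection that change relies on, assuming woodwork's default inference; the column names are illustrative:

import pandas as pd
import woodwork as ww  # noqa: F401, registers the DataFrame.ww accessor

X = pd.DataFrame({"all_null": [None, None, None], "num": [1.0, 2.0, 3.0]})
X.ww.init()  # all_null is expected to be inferred as Unknown

# Index and unknown-typed columns are excluded before looking for all-null
# columns, mirroring the new X.ww.select(...) call in _get_drop_all_null.
non_index_unknown = X.ww.select(exclude=["index", "unknown"])
all_null_cols = non_index_unknown.columns[non_index_unknown.isnull().all()]
print(list(non_index_unknown.columns))  # expected: ["num"]
print(list(all_null_cols))  # expected: [], so DropNullColumns is not added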
From 4f0e10d027d5130bb487f6c018a1044ae114e3c1 Mon Sep 17 00:00:00 2001
From: freddyaboulton
Date: Fri, 7 Jan 2022 11:22:02 -0500
Subject: [PATCH 3/4] Add to release notes

---
 docs/source/release_notes.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 7519909165..75b1efbb9a 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -30,6 +30,7 @@ Release Notes
         * Removed usage of scikit-learn's ``LabelEncoder`` in favor of ours :pr:`3161`
         * Removed nullable types checking from ``infer_feature_types`` :pr:`3156`
         * Fixed ``mean_cv_data`` and ``validation_score`` values in AutoMLSearch.rankings to reflect cv score or ``NaN`` when appropriate :pr:`3162`
+        * Removed all-nan Unknown to Double logical conversion in ``infer_feature_types`` :pr:`3196`
     * Documentation Changes
     * Testing Changes
         * Updated tests to use new pipeline API instead of defining custom pipeline classes :pr:`3172`

From c530d4a4441079f0b6873c68c14b7c1c8108f35d Mon Sep 17 00:00:00 2001
From: freddyaboulton
Date: Mon, 10 Jan 2022 14:36:56 -0500
Subject: [PATCH 4/4] Move 3196 to future releases section

---
 docs/source/release_notes.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 75b1efbb9a..7bf22ca132 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -9,6 +9,7 @@ Release Notes
         * Standardized names of featurization components :pr:`3192`
     * Changes
         * Changed the default objective to ``MedianAE`` from ``R2`` for time series regression :pr:`3205`
+        * Removed all-nan Unknown to Double logical conversion in ``infer_feature_types`` :pr:`3196`
     * Documentation Changes
     * Testing Changes
@@ -30,7 +31,6 @@ Release Notes
         * Removed usage of scikit-learn's ``LabelEncoder`` in favor of ours :pr:`3161`
         * Removed nullable types checking from ``infer_feature_types`` :pr:`3156`
         * Fixed ``mean_cv_data`` and ``validation_score`` values in AutoMLSearch.rankings to reflect cv score or ``NaN`` when appropriate :pr:`3162`
-        * Removed all-nan Unknown to Double logical conversion in ``infer_feature_types`` :pr:`3196`
     * Documentation Changes
     * Testing Changes
         * Updated tests to use new pipeline API instead of defining custom pipeline classes :pr:`3172`
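For context on the X.ww[[column]] change in patch 1: selecting columns through the woodwork accessor keeps the typing information that the tests above now set explicitly, while plain pandas indexing returns a frame with no woodwork schema at all. A minimal sketch, assuming a woodwork-initialized frame; the column names are illustrative, not from the patch:

import numpy as np
import pandas as pd
import woodwork as ww  # noqa: F401, registers the DataFrame.ww accessor

X = pd.DataFrame({"all_nan": [np.nan, np.nan, np.nan], "some_nan": [np.nan, 1.0, 0.0]})
X.ww.init(logical_types={"all_nan": "Double"})

# Plain pandas selection drops the woodwork schema, so the explicit Double
# type on the all-NaN column would have to be re-inferred downstream.
print(X[["all_nan"]].ww.schema)  # expected: None

# Selecting through the accessor carries the typed schema along, which is
# what the per-column imputer's fit/transform now rely on.
print(X.ww[["all_nan"]].ww.logical_types["all_nan"])  # expected: Double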