diff --git a/.github/meta.yaml b/.github/meta.yaml
index 8c50b73728..914d140c2e 100644
--- a/.github/meta.yaml
+++ b/.github/meta.yaml
@@ -37,7 +37,7 @@ outputs:
         - requirements-parser >=0.2.0
         - shap >=0.40.0
         - texttable >=1.6.2
-        - woodwork <=0.19.0
+        - woodwork >=0.21.1
         - featuretools>=1.16.0
         - nlp-primitives>=2.9.0
         - python >=3.8.*
diff --git a/core-requirements.txt b/core-requirements.txt
index 0b27f96e28..9afe031b46 100644
--- a/core-requirements.txt
+++ b/core-requirements.txt
@@ -11,7 +11,7 @@ requirements-parser>=0.2.0
 shap>=0.40.0
 statsmodels>=0.12.2
 texttable>=1.6.2
-woodwork<=0.19.0
+woodwork>=0.21.1
 dask>=2022.2.0, !=2022.10.1
 nlp-primitives>=2.9.0
 featuretools>=1.16.0
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 08b779e018..023332b6fc 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -9,6 +9,7 @@ Release Notes
         * Fix ARIMA not accounting for gap in prediction from end of training data :pr:`3884`
     * Changes
         * Added a threshold to ``DateTimeFormatDataCheck`` to account for too many duplicate or nan values :pr:`3883`
+        * Changed treatment of ``Boolean`` columns for ``SimpleImputer`` and ``ClassImbalanceDataCheck`` to be compatible with new Woodwork inference :pr:`3892`
        * Split decomposer ``seasonal_period`` parameter into ``seasonal_smoother`` and ``period`` parameters :pr:`3896`
     * Documentation Changes
         * Hid non-essential warning messages in time series docs :pr:`3890`
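Context for the release note above: Woodwork 0.21 broadened boolean inference, so targets that 0.19 typed as Integer or Categorical (for example 0/1 or "yes"/"no" values) can now come back as Boolean or BooleanNullable, with the underlying values converted to True/False. A minimal sketch of the behavior this PR accommodates (illustrative only; the exact inference rules belong to Woodwork, not evalml):

```python
import pandas as pd
import woodwork as ww

# Under the new pin, a two-value string target may be inferred as Boolean,
# with the values themselves converted to True/False.
y = ww.init_series(pd.Series(["no", "yes", "yes", "yes"]))
print(y.ww.logical_type, list(y))  # e.g. Boolean [False, True, True, True]
```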
diff --git a/evalml/data_checks/class_imbalance_data_check.py b/evalml/data_checks/class_imbalance_data_check.py
index 15fe20ae22..223906bd64 100644
--- a/evalml/data_checks/class_imbalance_data_check.py
+++ b/evalml/data_checks/class_imbalance_data_check.py
@@ -3,6 +3,7 @@
 Use for classification problems.
 """
 import numpy as np
+import pandas as pd
 
 from evalml.data_checks import (
     DataCheck,
@@ -146,7 +147,23 @@ def validate(self, X, y):
         """
         messages = []
 
+        original_vc = pd.Series(y).value_counts(sort=True)
         y = infer_feature_types(y)
+        new_vc = y.value_counts(sort=True)
+        if str(y.ww.logical_type) not in ["Boolean", "BooleanNullable"]:
+            # If the inferred logical type is not Boolean/BooleanNullable, then a
+            # mapping to the original values is not necessary.
+            after_to_before_inference_mapping = {new: new for new in new_vc.keys()}
+        else:
+            # If the inferred logical type is Boolean/BooleanNullable, then a
+            # mapping to the original values is needed for the data check messages.
+            after_to_before_inference_mapping = {
+                new: old for old, new in zip(original_vc.keys(), new_vc.keys())
+            }
+        # Needed when checking severe imbalance to verify values present below threshold
+        before_to_after_inference_mapping = {
+            old: new for new, old in after_to_before_inference_mapping.items()
+        }
 
         fold_counts = y.value_counts(normalize=False, sort=True)
         fold_counts = np.floor(fold_counts * self.test_size).astype(int)
@@ -155,7 +172,10 @@ def validate(self, X, y):
         # search for targets that occur less than twice the number of cv folds first
         below_threshold_folds = fold_counts.where(fold_counts < self.cv_folds).dropna()
         if len(below_threshold_folds):
-            below_threshold_values = below_threshold_folds.index.tolist()
+            below_threshold_values = [
+                after_to_before_inference_mapping.get(each)
+                for each in below_threshold_folds.index.tolist()
+            ]
             error_msg = "The number of instances of these targets is less than 2 * the number of cross folds = {} instances: {}"
             messages.append(
                 DataCheckError(
@@ -173,7 +193,10 @@ def validate(self, X, y):
         below_threshold = counts.where(counts < self.threshold).dropna()
         # if there are items that occur less than the threshold, add them to the list of results
         if len(below_threshold):
-            below_threshold_values = below_threshold.index.tolist()
+            below_threshold_values = [
+                after_to_before_inference_mapping.get(each)
+                for each in below_threshold.index.tolist()
+            ]
             warning_msg = "The following labels fall below {:.0f}% of the target: {}"
             messages.append(
                 DataCheckWarning(
@@ -188,8 +211,15 @@ def validate(self, X, y):
             )
         sample_counts = fold_counts.where(fold_counts < self.min_samples).dropna()
         if len(below_threshold) and len(sample_counts):
-            sample_count_values = sample_counts.index.tolist()
-            severe_imbalance = [v for v in sample_count_values if v in below_threshold]
+            sample_count_values = [
+                after_to_before_inference_mapping.get(each)
+                for each in sample_counts.index.tolist()
+            ]
+            severe_imbalance = [
+                v
+                for v in sample_count_values
+                if before_to_after_inference_mapping.get(v) in below_threshold
+            ]
             warning_msg = "The following labels in the target have severe class imbalance because they fall under {:.0f}% of the target and have less than {} samples: {}"
             messages.append(
                 DataCheckWarning(
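The mapping above leans on the fact that inference changes the values but not their frequencies: both `value_counts` results are frequency-sorted, so zipping their keys pairs each inferred value with the original label it replaced. A self-contained sketch of that idea (toy data standing in for Woodwork's conversion):

```python
import pandas as pd

original = pd.Series(["no", "yes", "yes", "yes"])  # labels as the user passed them
inferred = pd.Series([False, True, True, True])    # stand-in for the post-inference series

original_vc = original.value_counts(sort=True)
new_vc = inferred.value_counts(sort=True)
# Both counts are sorted descending by frequency, so zipping their keys
# aligns each inferred value with the original label it replaced.
after_to_before = {new: old for old, new in zip(original_vc.keys(), new_vc.keys())}
assert after_to_before == {True: "yes", False: "no"}
```

Note that the alignment relies on the two frequency orderings agreeing; classes with exactly tied counts could pair up arbitrarily.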
diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py
index 46c86b420f..48344ed562 100644
--- a/evalml/pipelines/components/transformers/imputers/simple_imputer.py
+++ b/evalml/pipelines/components/transformers/imputers/simple_imputer.py
@@ -102,6 +102,7 @@ def transform(self, X, y=None):
         """
         X = infer_feature_types(X)
         original_schema = X.ww.schema
+        X = set_boolean_columns_to_categorical(X)
 
         # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
         if (X.dtypes == bool).all():
diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py
index 998ef387b4..412aa1c29f 100644
--- a/evalml/pipelines/components/utils.py
+++ b/evalml/pipelines/components/utils.py
@@ -198,9 +198,9 @@ def set_boolean_columns_to_categorical(X):
     X = X.ww.copy()
     X_schema = X.ww.schema
     original_X_schema = X_schema.get_subset_schema(
-        subset_cols=X_schema._filter_cols(exclude=["Boolean"]),
+        subset_cols=X_schema._filter_cols(exclude=["Boolean", "BooleanNullable"]),
     )
-    X_boolean_cols = X_schema._filter_cols(include=["Boolean"])
+    X_boolean_cols = X_schema._filter_cols(include=["Boolean", "BooleanNullable"])
     new_ltypes_for_boolean_cols = {col: "Categorical" for col in X_boolean_cols}
     X.ww.init(schema=original_X_schema, logical_types=new_ltypes_for_boolean_cols)
     return X
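Together these two changes route nullable booleans through the imputer's existing categorical path: plain `bool` columns cannot hold NaN, and relabeling `Boolean`/`BooleanNullable` columns as `Categorical` lets missing values be imputed like any other category. Roughly what the relabel means for the underlying data (a pandas-only sketch with a hypothetical column):

```python
import pandas as pd

col = pd.Series([True, pd.NA, False, True], dtype="boolean")  # BooleanNullable-style data
as_category = col.astype("category")
# True/False are now ordinary categories, so a most-frequent fill works
# without tripping over a NaN-less bool dtype.
filled = as_category.fillna(as_category.mode()[0])
print(filled.tolist())  # [True, True, False, True]
```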
diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py
index 0197150a46..75c9c385bd 100644
--- a/evalml/tests/component_tests/test_simple_imputer.py
+++ b/evalml/tests/component_tests/test_simple_imputer.py
@@ -7,9 +7,11 @@
 from pandas.testing import assert_frame_equal
 from woodwork.logical_types import (
     Boolean,
+    BooleanNullable,
     Categorical,
     Double,
     Integer,
+    IntegerNullable,
     NaturalLanguage,
 )
 
@@ -459,12 +461,18 @@ def test_simple_imputer_woodwork_custom_overrides_returned_by_components(
         "categorical col": imputer_test_data[["categorical col"]],
         "bool col": imputer_test_data[["bool col"]],
     }[data]
+    if str(logical_type) == "Boolean" and has_nan == "has_nan":
+        logical_type = "BooleanNullable"
+    if str(logical_type) == "Integer" and has_nan == "has_nan":
+        logical_type = "IntegerNullable"
     logical_type = {
         "Integer": Integer,
+        "IntegerNullable": IntegerNullable,
         "Double": Double,
         "Categorical": Categorical,
         "NaturalLanguage": NaturalLanguage,
         "Boolean": Boolean,
+        "BooleanNullable": BooleanNullable,
     }[logical_type]
     y = pd.Series([1, 2, 1])
@@ -486,6 +494,8 @@ def test_simple_imputer_woodwork_custom_overrides_returned_by_components(
     imputer = SimpleImputer(impute_strategy=impute_strategy_to_use)
     transformed = imputer.fit_transform(X, y)
     assert isinstance(transformed, pd.DataFrame)
+    if str(logical_type) == "IntegerNullable":
+        logical_type = Double
     assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == {
         data: logical_type,
     }
diff --git a/evalml/tests/component_tests/test_utils.py b/evalml/tests/component_tests/test_utils.py
index ef6baa6299..56cfe27964 100644
--- a/evalml/tests/component_tests/test_utils.py
+++ b/evalml/tests/component_tests/test_utils.py
@@ -284,12 +284,17 @@ def test_set_boolean_columns_to_categorical():
     )
     X_e = pd.DataFrame(
         {
+            "bool with nan": pd.Series(
+                [True, pd.NA, False, pd.NA, False],
+                dtype="boolean",
+            ),
             "bool no nan": pd.Series([False, False, False, False, True], dtype=bool),
         },
     )
     X_e = infer_feature_types(X_e)
     X_e.ww.set_types(
         logical_types={
+            "bool with nan": "Categorical",
             "bool no nan": "Categorical",
         },
     )
@@ -298,7 +303,7 @@ def test_set_boolean_columns_to_categorical():
 
     X = set_boolean_columns_to_categorical(X)
 
-    assert len(X.ww.select(["Categorical"]).columns) == 1
+    assert len(X.ww.select(["Categorical"]).columns) == 2
     assert len(X.ww.select(["Categorical"])) == 5
 
     assert_frame_equal(
diff --git a/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py b/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py
index 7c3d5e6b18..aaf8a712cb 100644
--- a/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py
+++ b/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py
@@ -454,6 +454,24 @@ def test_class_imbalance_severe(test_size, min_samples, input_type):
         * 50
         * int(1 / test_size),
     )
+    y_values_binary_str = pd.Series(
+        [
+            "no",
+            "yes",
+            "yes",
+            "yes",
+            "yes",
+            "yes",
+            "yes",
+            "yes",
+            "yes",
+            "yes",
+            "yes",
+            "yes",
+        ]
+        * 50
+        * int(1 / test_size),
+    )
     if input_type == "ww":
         X.ww.init()
         y_values_binary = ww.init_series(y_values_binary, logical_type="integer")
@@ -487,6 +505,26 @@ def test_class_imbalance_severe(test_size, min_samples, input_type):
     assert class_imbalance_check.validate(X, y_values_binary) == warnings
     assert class_imbalance_check.validate(X, y_values_multiclass) == warnings
 
+    warnings = [
+        DataCheckWarning(
+            message="The following labels fall below 10% of the target: ['no']",
+            data_check_name=class_imbalance_data_check_name,
+            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD,
+            details={"target_values": ["no"]},
+        ).to_dict(),
+    ]
+    if min_samples > 50:
+        warnings.append(
+            DataCheckWarning(
+                message=f"The following labels in the target have severe class imbalance because they fall under 10% of the target and have less than {min_samples} samples: ['no']",
+                data_check_name=class_imbalance_data_check_name,
+                message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE,
+                details={"target_values": ["no"]},
+            ).to_dict(),
+        )
+
+    assert class_imbalance_check.validate(X, y_values_binary_str) == warnings
+
 
 @pytest.mark.parametrize("test_size", [1, 0.5, 0.2])
 def test_class_imbalance_large_multiclass(test_size):
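The new string-target assertions pin down the user-visible outcome: even when Woodwork re-types the target internally, the data check messages report the original labels. A rough usage sketch under the same assumptions as the test data above (a roughly 12:1 "yes"/"no" split; the exact messages returned depend on the check's thresholds):

```python
import pandas as pd
from evalml.data_checks import ClassImbalanceDataCheck

y = pd.Series((["no"] + ["yes"] * 11) * 50)  # ~8% minority class
X = pd.DataFrame({"feature": range(len(y))})
check = ClassImbalanceDataCheck(threshold=0.10)
# The mapping restores the original label in every message, e.g.
# details={"target_values": ["no"]} rather than [False].
for message in check.validate(X, y):
    print(message["message"])
```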
diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt
index 9d59c30fac..9fed4c4334 100644
--- a/evalml/tests/dependency_update_check/latest_dependency_versions.txt
+++ b/evalml/tests/dependency_update_check/latest_dependency_versions.txt
@@ -29,5 +29,5 @@ sktime==0.14.1
 statsmodels==0.13.5
 texttable==1.6.7
 vowpalwabbit==9.6.0
-woodwork==0.19.0
+woodwork==0.21.1
 xgboost==1.7.2
diff --git a/evalml/tests/dependency_update_check/minimum_requirements.txt b/evalml/tests/dependency_update_check/minimum_requirements.txt
index 15e0ddd286..389ac5880f 100644
--- a/evalml/tests/dependency_update_check/minimum_requirements.txt
+++ b/evalml/tests/dependency_update_check/minimum_requirements.txt
@@ -29,5 +29,5 @@ sktime==0.13.3
 statsmodels==0.12.2
 texttable==1.6.2
 vowpalwabbit==8.11.0
-woodwork==0.19.0
+woodwork==0.21.1
 xgboost==1.5.1
diff --git a/evalml/tests/utils_tests/test_woodwork_utils.py b/evalml/tests/utils_tests/test_woodwork_utils.py
index 3858f4a719..56fd5f2833 100644
--- a/evalml/tests/utils_tests/test_woodwork_utils.py
+++ b/evalml/tests/utils_tests/test_woodwork_utils.py
@@ -84,22 +84,36 @@ def test_infer_feature_types_dataframe():
 
 
 def test_infer_feature_types_series():
-    X_pd = pd.Series([1, 2, 3, 4])
-    X_expected = X_pd.astype("int64")
-    pd.testing.assert_series_equal(X_expected, infer_feature_types(X_pd))
+    y_pd = pd.Series([1, 2, 3, 4])
+    y_expected = y_pd.astype("int64")
+    pd.testing.assert_series_equal(y_expected, infer_feature_types(y_pd))
 
-    X_pd = pd.Series([1, 2, 3, 4], dtype="int64")
-    pd.testing.assert_series_equal(X_pd, infer_feature_types(X_pd))
+    y_pd = pd.Series([1, 2, 3, 4], dtype="int64")
+    pd.testing.assert_series_equal(y_pd, infer_feature_types(y_pd))
 
-    X_pd = pd.Series([1, 2, 3, 4], dtype="int64")
-    X_expected = X_pd.astype("category")
-    pd.testing.assert_series_equal(X_expected, infer_feature_types(X_pd, "categorical"))
+    y_pd = pd.Series([1, 2, 3, 4], dtype="int64")
+    y_expected = y_pd.astype("category")
+    pd.testing.assert_series_equal(y_expected, infer_feature_types(y_pd, "categorical"))
 
-    X_pd = pd.Series([1, 2, 3, 4], dtype="int64")
-    X_expected = X_pd.astype("category")
+    y_pd = pd.Series([1, 2, 3, 4], dtype="int64")
+    y_expected = y_pd.astype("category")
     pd.testing.assert_series_equal(
-        X_expected,
-        infer_feature_types(X_pd, ww.logical_types.Categorical),
+        y_expected,
+        infer_feature_types(y_pd, ww.logical_types.Categorical),
+    )
+
+    y_pd = pd.Series([1, 0, 1, 0], dtype="int64")
+    y_expected = y_pd.astype("bool")
+    pd.testing.assert_series_equal(
+        y_expected,
+        infer_feature_types(y_pd, ww.logical_types.Boolean),
+    )
+
+    y_pd = pd.Series([1, 0, 1, None], dtype="object")
+    y_expected = y_pd.astype("boolean")
+    pd.testing.assert_series_equal(
+        y_expected,
+        infer_feature_types(y_pd, ww.logical_types.BooleanNullable),
     )
diff --git a/setup.cfg b/setup.cfg
index a936bdae83..8ed88f0123 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -53,7 +53,7 @@ install_requires =
     shap >= 0.40.0
     statsmodels >= 0.12.2
     texttable >= 1.6.2
-    woodwork <= 0.19.0
+    woodwork >= 0.21.1
     dask >= 2022.2.0, !=2022.10.1
     featuretools >= 1.16.0
     nlp-primitives >= 2.9.0
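A quick smoke test of the new lower bound, mirroring the `BooleanNullable` case added to `test_infer_feature_types_series`:

```python
import pandas as pd
import woodwork as ww
from evalml.utils import infer_feature_types

# A 0/1 target with a missing value should round-trip through evalml's
# type inference as a pandas nullable boolean under woodwork >=0.21.1.
y = infer_feature_types(
    pd.Series([1, 0, 1, None], dtype="object"),
    ww.logical_types.BooleanNullable,
)
assert y.dtype == "boolean"  # pandas nullable boolean, as the new test expects
```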