2 changes: 1 addition & 1 deletion .github/meta.yaml
@@ -37,7 +37,7 @@ outputs:
- requirements-parser >=0.2.0
- shap >=0.40.0
- texttable >=1.6.2
- woodwork <=0.19.0
- woodwork >=0.21.1
- featuretools>=1.16.0
- nlp-primitives>=2.9.0
- python >=3.8.*
Empty file added =
Empty file.
2 changes: 1 addition & 1 deletion core-requirements.txt
@@ -11,7 +11,7 @@ requirements-parser>=0.2.0
shap>=0.40.0
statsmodels>=0.12.2
texttable>=1.6.2
woodwork<=0.19.0
woodwork>=0.21.1
dask>=2022.2.0, !=2022.10.1
nlp-primitives>=2.9.0
featuretools>=1.16.0
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -9,6 +9,7 @@ Release Notes
* Fix ARIMA not accounting for gap in prediction from end of training data :pr:`3884`
* Changes
* Added a threshold to ``DateTimeFormatDataCheck`` to account for too many duplicate or nan values :pr:`3883`
* Changed treatment of ``Boolean`` columns for ``SimpleImputer`` and ``ClassImbalanceDataCheck`` to be compatible with new Woodwork inference :pr:`3892`
* Split decomposer ``seasonal_period`` parameter into ``seasonal_smoother`` and ``period`` parameters :pr:`3896`
* Documentation Changes
* Hid non-essential warning messages in time series docs :pr:`3890`
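
The "new Woodwork inference" referenced in the change entry above alters how boolean-like targets are typed. A minimal sketch of the behavior this PR adapts to, assuming Woodwork >= 0.21, where boolean-like strings such as "yes"/"no" may be inferred as Boolean and recast to True/False:

import pandas as pd
import woodwork as ww

# Hypothetical target; the exact inferred type depends on the Woodwork version.
y = pd.Series(["yes", "no", "yes", "yes"])
y = ww.init_series(y)

# Under Woodwork >= 0.21 this may come back as Boolean, with the values
# recast to True/False. Code that reports the original labels therefore
# needs a mapping back to the pre-inference values (see the data check
# changes below).
print(y.ww.logical_type)  # e.g. Boolean
print(y.tolist())         # e.g. [True, False, True, True]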
38 changes: 34 additions & 4 deletions evalml/data_checks/class_imbalance_data_check.py
@@ -3,6 +3,7 @@
Use for classification problems.
"""
import numpy as np
import pandas as pd

from evalml.data_checks import (
DataCheck,
@@ -146,7 +147,23 @@ def validate(self, X, y):
"""
messages = []

original_vc = pd.Series(y).value_counts(sort=True)
y = infer_feature_types(y)
new_vc = y.value_counts(sort=True)
if str(y.ww.logical_type) not in ["Boolean", "BooleanNullable"]:
# If the inferred logical type is not in Boolean/BooleanNullable, then a
# mapping to the original values is not necessary.
after_to_before_inference_mapping = {new: new for new in new_vc.keys()}
else:
# If the inferred logical type is in Boolean/BooleanNullable, then a
# mapping to the original values will be needed for the data check messages
after_to_before_inference_mapping = {
new: old for old, new in zip(original_vc.keys(), new_vc.keys())
}
# Needed for checking severe imbalance to verify values present below threshold
before_to_after_inference_mapping = {
old: new for new, old in after_to_before_inference_mapping.items()
}

fold_counts = y.value_counts(normalize=False, sort=True)
fold_counts = np.floor(fold_counts * self.test_size).astype(int)
@@ -155,7 +172,10 @@
# search for targets that occur less than twice the number of cv folds first
below_threshold_folds = fold_counts.where(fold_counts < self.cv_folds).dropna()
if len(below_threshold_folds):
below_threshold_values = below_threshold_folds.index.tolist()
below_threshold_values = [
after_to_before_inference_mapping.get(each)
for each in below_threshold_folds.index.tolist()
]
error_msg = "The number of instances of these targets is less than 2 * the number of cross folds = {} instances: {}"
messages.append(
DataCheckError(
@@ -173,7 +193,10 @@
below_threshold = counts.where(counts < self.threshold).dropna()
# if there are items that occur less than the threshold, add them to the list of results
if len(below_threshold):
below_threshold_values = below_threshold.index.tolist()
below_threshold_values = [
after_to_before_inference_mapping.get(each)
for each in below_threshold.index.tolist()
]
warning_msg = "The following labels fall below {:.0f}% of the target: {}"
messages.append(
DataCheckWarning(
Expand All @@ -188,8 +211,15 @@ def validate(self, X, y):
)
sample_counts = fold_counts.where(fold_counts < self.min_samples).dropna()
if len(below_threshold) and len(sample_counts):
sample_count_values = sample_counts.index.tolist()
severe_imbalance = [v for v in sample_count_values if v in below_threshold]
sample_count_values = [
after_to_before_inference_mapping.get(each)
for each in sample_counts.index.tolist()
]
severe_imbalance = [
v
for v in sample_count_values
if before_to_after_inference_mapping.get(v) in below_threshold
]
warning_msg = "The following labels in the target have severe class imbalance because they fall under {:.0f}% of the target and have less than {} samples: {}"
messages.append(
DataCheckWarning(
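A standalone sketch of the label-mapping logic added in this data check. The values are hypothetical; the zip pairing relies on inference relabeling the target without regrouping it, so the two count-sorted orderings line up:

import pandas as pd

# Hypothetical: Woodwork recasts "yes"/"no" to True/False during inference.
original = pd.Series(["yes", "yes", "yes", "no"])
inferred = pd.Series([True, True, True, False])

original_vc = original.value_counts(sort=True)  # yes: 3, no: 1
new_vc = inferred.value_counts(sort=True)       # True: 3, False: 1

# Pair the two count-sorted orderings to recover the original labels.
after_to_before = {new: old for old, new in zip(original_vc.keys(), new_vc.keys())}
assert after_to_before == {True: "yes", False: "no"}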
@@ -102,6 +102,7 @@ def transform(self, X, y=None):
"""
X = infer_feature_types(X)
original_schema = X.ww.schema
X = set_boolean_columns_to_categorical(X)

# Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
if (X.dtypes == bool).all():
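A quick illustration of the comment above: plain bool columns cannot hold missing values, which is part of why boolean columns are recast before imputation. This is a sketch; the exact upcasting behavior depends on the pandas version:

import numpy as np
import pandas as pd

s = pd.Series([True, False], dtype=bool)
s.loc[2] = np.nan  # enlarging with a missing value
print(s.dtype)     # object -- the bool dtype is silently lost

# The nullable "boolean" dtype (Woodwork's BooleanNullable) keeps missing
# values without upcasting.
s2 = pd.Series([True, False, pd.NA], dtype="boolean")
print(s2.dtype)    # boolean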
4 changes: 2 additions & 2 deletions evalml/pipelines/components/utils.py
@@ -198,9 +198,9 @@ def set_boolean_columns_to_categorical(X):
X = X.ww.copy()
X_schema = X.ww.schema
original_X_schema = X_schema.get_subset_schema(
subset_cols=X_schema._filter_cols(exclude=["Boolean"]),
subset_cols=X_schema._filter_cols(exclude=["Boolean", "BooleanNullable"]),
)
X_boolean_cols = X_schema._filter_cols(include=["Boolean"])
X_boolean_cols = X_schema._filter_cols(include=["Boolean", "BooleanNullable"])
new_ltypes_for_boolean_cols = {col: "Categorical" for col in X_boolean_cols}
X.ww.init(schema=original_X_schema, logical_types=new_ltypes_for_boolean_cols)
return X
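A usage sketch of the updated helper. The column names are made up; the import path matches the file above, and the behavior shown assumes the BooleanNullable handling just added:

import pandas as pd
import woodwork as ww
from evalml.pipelines.components.utils import set_boolean_columns_to_categorical

X = pd.DataFrame(
    {
        "bool no nan": pd.Series([True, False, True], dtype=bool),          # Boolean
        "bool with nan": pd.Series([True, pd.NA, False], dtype="boolean"),  # BooleanNullable
    },
)
X.ww.init()

X = set_boolean_columns_to_categorical(X)
# Both columns should now be Categorical.
print(X.ww.logical_types)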
10 changes: 10 additions & 0 deletions evalml/tests/component_tests/test_simple_imputer.py
@@ -7,9 +7,11 @@
from pandas.testing import assert_frame_equal
from woodwork.logical_types import (
Boolean,
BooleanNullable,
Categorical,
Double,
Integer,
IntegerNullable,
NaturalLanguage,
)

@@ -459,12 +461,18 @@ def test_simple_imputer_woodwork_custom_overrides_returned_by_components(
"categorical col": imputer_test_data[["categorical col"]],
"bool col": imputer_test_data[["bool col"]],
}[data]
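# Under the new Woodwork inference, columns containing missing values map
# to the nullable logical types rather than plain Boolean/Integer.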
if str(logical_type) == "Boolean" and has_nan == "has_nan":
logical_type = "BooleanNullable"
if str(logical_type) == "Integer" and has_nan == "has_nan":
logical_type = "IntegerNullable"
logical_type = {
"Integer": Integer,
"IntegerNullable": IntegerNullable,
"Double": Double,
"Categorical": Categorical,
"NaturalLanguage": NaturalLanguage,
"Boolean": Boolean,
"BooleanNullable": BooleanNullable,
}[logical_type]
y = pd.Series([1, 2, 1])

@@ -486,6 +494,8 @@
imputer = SimpleImputer(impute_strategy=impute_strategy_to_use)
transformed = imputer.fit_transform(X, y)
assert isinstance(transformed, pd.DataFrame)
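# Imputing a nullable integer column yields floats, so Double is the
# expected logical type on the way back out.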
if str(logical_type) == "IntegerNullable":
logical_type = Double
assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == {
data: logical_type,
}
7 changes: 6 additions & 1 deletion evalml/tests/component_tests/test_utils.py
@@ -284,12 +284,17 @@ def test_set_boolean_columns_to_categorical():
)
X_e = pd.DataFrame(
{
"bool with nan": pd.Series(
[True, pd.NA, False, pd.NA, False],
dtype="boolean",
),
"bool no nan": pd.Series([False, False, False, False, True], dtype=bool),
},
)
X_e = infer_feature_types(X_e)
X_e.ww.set_types(
logical_types={
"bool with nan": "Categorical",
"bool no nan": "Categorical",
},
)
@@ -298,7 +303,7 @@

X = set_boolean_columns_to_categorical(X)

assert len(X.ww.select(["Categorical"]).columns) == 1
assert len(X.ww.select(["Categorical"]).columns) == 2
assert len(X.ww.select(["Categorical"])) == 5

assert_frame_equal(
38 changes: 38 additions & 0 deletions evalml/tests/data_checks_tests/test_class_imbalance_data_check.py
@@ -454,6 +454,24 @@ def test_class_imbalance_severe(test_size, min_samples, input_type):
* 50
* int(1 / test_size),
)
y_values_binary_str = pd.Series(
[
"no",
"yes",
"yes",
"yes",
"yes",
"yes",
"yes",
"yes",
"yes",
"yes",
"yes",
"yes",
]
* 50
* int(1 / test_size),
)
if input_type == "ww":
X.ww.init()
y_values_binary = ww.init_series(y_values_binary, logical_type="integer")
@@ -487,6 +505,26 @@
assert class_imbalance_check.validate(X, y_values_binary) == warnings
assert class_imbalance_check.validate(X, y_values_multiclass) == warnings

warnings = [
DataCheckWarning(
message="The following labels fall below 10% of the target: ['no']",
data_check_name=class_imbalance_data_check_name,
message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD,
details={"target_values": ["no"]},
).to_dict(),
]
if min_samples > 50:
warnings.append(
DataCheckWarning(
message=f"The following labels in the target have severe class imbalance because they fall under 10% of the target and have less than {min_samples} samples: ['no']",
data_check_name=class_imbalance_data_check_name,
message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE,
details={"target_values": ["no"]},
).to_dict(),
)

assert class_imbalance_check.validate(X, y_values_binary_str) == warnings


@pytest.mark.parametrize("test_size", [1, 0.5, 0.2])
def test_class_imbalance_large_multiclass(test_size):
@@ -29,5 +29,5 @@ sktime==0.14.1
statsmodels==0.13.5
texttable==1.6.7
vowpalwabbit==9.6.0
woodwork==0.19.0
woodwork==0.21.1
xgboost==1.7.2
@@ -29,5 +29,5 @@ sktime==0.13.3
statsmodels==0.12.2
texttable==1.6.2
vowpalwabbit==8.11.0
woodwork==0.19.0
woodwork==0.21.1
xgboost==1.5.1
38 changes: 26 additions & 12 deletions evalml/tests/utils_tests/test_woodwork_utils.py
@@ -84,22 +84,36 @@ def test_infer_feature_types_dataframe():


def test_infer_feature_types_series():
X_pd = pd.Series([1, 2, 3, 4])
X_expected = X_pd.astype("int64")
pd.testing.assert_series_equal(X_expected, infer_feature_types(X_pd))
y_pd = pd.Series([1, 2, 3, 4])
y_expected = y_pd.astype("int64")
pd.testing.assert_series_equal(y_expected, infer_feature_types(y_pd))

X_pd = pd.Series([1, 2, 3, 4], dtype="int64")
pd.testing.assert_series_equal(X_pd, infer_feature_types(X_pd))
y_pd = pd.Series([1, 2, 3, 4], dtype="int64")
pd.testing.assert_series_equal(y_pd, infer_feature_types(y_pd))

X_pd = pd.Series([1, 2, 3, 4], dtype="int64")
X_expected = X_pd.astype("category")
pd.testing.assert_series_equal(X_expected, infer_feature_types(X_pd, "categorical"))
y_pd = pd.Series([1, 2, 3, 4], dtype="int64")
y_expected = y_pd.astype("category")
pd.testing.assert_series_equal(y_expected, infer_feature_types(y_pd, "categorical"))

X_pd = pd.Series([1, 2, 3, 4], dtype="int64")
X_expected = X_pd.astype("category")
y_pd = pd.Series([1, 2, 3, 4], dtype="int64")
y_expected = y_pd.astype("category")
pd.testing.assert_series_equal(
X_expected,
infer_feature_types(X_pd, ww.logical_types.Categorical),
y_expected,
infer_feature_types(y_pd, ww.logical_types.Categorical),
)

y_pd = pd.Series([1, 0, 1, 0], dtype="int64")
y_expected = y_pd.astype("bool")
pd.testing.assert_series_equal(
y_expected,
infer_feature_types(y_pd, ww.logical_types.Boolean),
)

y_pd = pd.Series([1, 0, 1, None], dtype="object")
y_expected = y_pd.astype("boolean")
pd.testing.assert_series_equal(
y_expected,
infer_feature_types(y_pd, ww.logical_types.BooleanNullable),
)


2 changes: 1 addition & 1 deletion setup.cfg
@@ -53,7 +53,7 @@ install_requires =
shap >= 0.40.0
statsmodels >= 0.12.2
texttable >= 1.6.2
woodwork <= 0.19.0
woodwork >= 0.21.1
dask >= 2022.2.0, !=2022.10.1
featuretools >= 1.16.0
nlp-primitives >= 2.9.0