Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update InvalidTargetDataCheck to validate time series regression problems #3251

Merged
merged 3 commits into from
Jan 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
* Removed empty cell in text_input.ipynb :pr:`3234`
* Removed potential prediction explanations failure when pipelines predicted a class with probability 1 :pr:`3221`
* Dropped NaNs before partial dependence grid generation :pr:`3235`
* Fixed bug where ``InvalidTargetDataCheck`` would not check time series regression targets :pr:`3251`
* Changes
* Raised lowest compatible numpy version to 1.21.0 to address security concerns :pr:`3207`
* Changed the default objective to ``MedianAE`` from ``R2`` for time series regression :pr:`3205`
Expand Down
6 changes: 1 addition & 5 deletions evalml/data_checks/invalid_target_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
)
from evalml.objectives import get_objective
from evalml.problem_types import (
ProblemTypes,
handle_problem_types,
is_binary,
is_multiclass,
Expand Down Expand Up @@ -329,10 +328,7 @@ def _check_for_non_positive_target(self, y, results):
return results

def _check_regression_target(self, y, results):
if (
self.problem_type == ProblemTypes.REGRESSION
and "numeric" not in y.ww.semantic_tags
):
if is_regression(self.problem_type) and "numeric" not in y.ww.semantic_tags:
DataCheck._add_message(
DataCheckError(
message="Target data type should be numeric for regression type problems.",
Expand Down
18 changes: 18 additions & 0 deletions evalml/tests/data_checks_tests/test_data_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,24 @@ def test_errors_warnings_in_invalid_target_data_check(objective, ts_data):
"errors": [data_check_error],
"actions": [],
}
y = ww.init_series(y, logical_type="Categorical")
default_data_check = DefaultDataChecks(
problem_type="time series regression",
objective=objective,
problem_configuration=problem_config,
).data_checks
data_check_error_type = DataCheckError(
message=f"Target data type should be numeric for regression type problems.",
data_check_name="InvalidTargetDataCheck",
message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
).to_dict()
for check in default_data_check:
if check.name == "InvalidTargetDataCheck":
assert check.validate(X, y) == {
"warnings": [],
"errors": [data_check_error_type],
"actions": [],
}


def test_data_checks_do_not_duplicate_actions(X_y_binary):
Expand Down
21 changes: 16 additions & 5 deletions evalml/tests/data_checks_tests/test_invalid_target_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,10 @@ def test_invalid_target_data_check_initialize_with_none_objective():
)


def test_invalid_target_data_check_regression_problem_nonnumeric_data():
@pytest.mark.parametrize(
"problem_type", [ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.REGRESSION]
)
def test_invalid_target_data_check_regression_problem_nonnumeric_data(problem_type):
y_categorical = pd.Series(["Peace", "Is", "A", "Lie"] * 100)
y_mixed_cat_numeric = pd.Series(["Peace", 2, "A", 4] * 100)
y_integer = pd.Series([1, 2, 3, 4])
Expand All @@ -508,7 +511,7 @@ def test_invalid_target_data_check_regression_problem_nonnumeric_data():
).to_dict()

invalid_targets_check = InvalidTargetDataCheck(
"regression", get_default_primary_search_objective("regression")
problem_type, get_default_primary_search_objective(problem_type)
)
assert invalid_targets_check.validate(
X=pd.DataFrame({"col": range(len(y_categorical))}), y=y_categorical
Expand All @@ -527,7 +530,10 @@ def test_invalid_target_data_check_regression_problem_nonnumeric_data():
) == {"warnings": [], "errors": [], "actions": []}


@pytest.mark.parametrize(
    "problem_type", [ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS]
)

[Reviewer note — Contributor Author: Unrelated to the issue, but just adding coverage for time series classification. Same below.]
def test_invalid_target_data_check_multiclass_problem_binary_data(problem_type):
y_multiclass = pd.Series([1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3] * 25)
y_binary = pd.Series([0, 1, 1, 1, 0, 0] * 25)

Expand Down Expand Up @@ -702,11 +708,16 @@ def test_invalid_target_data_check_different_lengths():
}


def test_invalid_target_data_check_numeric_binary_does_not_return_warnings():
@pytest.mark.parametrize(
"problem_type", [ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY]
)
def test_invalid_target_data_check_numeric_binary_does_not_return_warnings(
problem_type,
):
y = pd.Series([1, 5, 1, 5, 1, 1])
X = pd.DataFrame({"col": range(len(y))})
invalid_targets_check = InvalidTargetDataCheck(
"binary", get_default_primary_search_objective("binary")
problem_type, get_default_primary_search_objective(problem_type)
)
assert invalid_targets_check.validate(X, y) == {
"warnings": [],
Expand Down