Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow dropping rows for InvalidTargetDataCheck #4116

Merged
merged 12 commits into from
Apr 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Allow ``InvalidTargetDataCheck`` to return a ``DROP_ROWS`` ``DataCheckActionOption`` :pr:`4116`
* Fixes
* Changes
* Removed unnecessary logic from imputer components prior to nullable type handling :pr:`4038`, :pr:`4043`
Expand Down
7 changes: 4 additions & 3 deletions evalml/data_checks/data_check_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ class DataCheckMessage:

Args:
message (str): Message string.
data_check_name (str): Name of data check.
message_code (DataCheckMessageCode): Message code associated with message. Defaults to None.
details (dict): Additional useful information associated with the message. Defaults to None.
data_check_name (str): Name of the associated data check.
message_code (DataCheckMessageCode, optional): Message code associated with the message. Defaults to None.
details (dict, optional): Additional useful information associated with the message. Defaults to None.
action_options (list, optional): A list of `DataCheckActionOption`s associated with the message. Defaults to None.
"""

message_type = None
Expand Down
62 changes: 42 additions & 20 deletions evalml/data_checks/invalid_target_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,25 @@ class InvalidTargetDataCheck(DataCheck):
objective (str or ObjectiveBase): Name or instance of the objective class.
n_unique (int): Number of unique target values to store when problem type is binary and target
incorrectly has more than 2 unique values. Positive integer. If None, stores all unique values. Defaults to 100.
null_strategy (str): The type of action option that should be returned if the target is partially null.
The options are `impute` (default) and `drop`; the value is matched case-insensitively.
`impute` - Will return a `DataCheckActionOption` for imputing the target column.
`drop` - Will return a `DataCheckActionOption` for dropping the null rows in the target column.
"""

multiclass_continuous_threshold = 0.05

def __init__(self, problem_type, objective, n_unique=100):
def __init__(self, problem_type, objective, n_unique=100, null_strategy="impute"):
    """Store the check's configuration after validating the arguments.

    Args:
        problem_type: Problem type, normalized through `handle_problem_types`.
        objective: Objective name or instance, resolved through `get_objective`.
        n_unique (int or None): Stored as given; must be greater than zero
            unless it is None. Defaults to 100.
        null_strategy (str): Either "impute" or "drop", compared
            case-insensitively. Stored exactly as passed. Defaults to "impute".

    Raises:
        ValueError: If `n_unique` is not None and is less than or equal to
            zero, or if `null_strategy` is not a string naming one of the
            supported strategies.
    """
    self.problem_type = handle_problem_types(problem_type)
    self.objective = get_objective(objective)
    if n_unique is not None and n_unique <= 0:
        raise ValueError("`n_unique` must be a non-negative integer value.")
    self.n_unique = n_unique
    # Anything that is not a string (None, numbers, ...) has no `.lower()`;
    # reject it with the same ValueError rather than leaking an AttributeError.
    if not isinstance(null_strategy, str) or null_strategy.lower() not in (
        "impute",
        "drop",
    ):
        raise ValueError(
            "The acceptable values for 'null_strategy' are 'impute' and 'drop'.",
        )
    self.null_strategy = null_strategy
ParthivNaresh marked this conversation as resolved.
Show resolved Hide resolved

def validate(self, X, y):
"""Check if the target data is considered invalid. If the input features argument is not None, it will be used to check that the target and features have the same dimensions and indices.
Expand Down Expand Up @@ -243,6 +252,37 @@ def _check_target_has_nan(self, y, messages):
elif null_rows.any():
num_null_rows = null_rows.sum()
pct_null_rows = null_rows.mean() * 100
rows_to_drop = null_rows.loc[null_rows].index.tolist()

action_options = []
impute_action_option = DataCheckActionOption(
ParthivNaresh marked this conversation as resolved.
Show resolved Hide resolved
DataCheckActionCode.IMPUTE_COL,
data_check_name=self.name,
parameters={
"impute_strategy": {
"parameter_type": DCAOParameterType.GLOBAL,
"type": "category",
"categories": ["mean", "most_frequent"]
if is_regression(self.problem_type)
else ["most_frequent"],
"default_value": "mean"
if is_regression(self.problem_type)
else "most_frequent",
},
},
metadata={"is_target": True},
)
drop_action_option = DataCheckActionOption(
DataCheckActionCode.DROP_ROWS,
data_check_name=self.name,
metadata={"is_target": True, "rows": rows_to_drop},
)

if self.null_strategy.lower() == "impute":
action_options.append(impute_action_option)
elif self.null_strategy.lower() == "drop":
action_options.append(drop_action_option)

messages.append(
DataCheckError(
message="{} row(s) ({}%) of target values are null".format(
Expand All @@ -255,25 +295,7 @@ def _check_target_has_nan(self, y, messages):
"num_null_rows": num_null_rows,
"pct_null_rows": pct_null_rows,
},
action_options=[
DataCheckActionOption(
DataCheckActionCode.IMPUTE_COL,
data_check_name=self.name,
parameters={
"impute_strategy": {
"parameter_type": DCAOParameterType.GLOBAL,
"type": "category",
"categories": ["mean", "most_frequent"]
if is_regression(self.problem_type)
else ["most_frequent"],
"default_value": "mean"
if is_regression(self.problem_type)
else "most_frequent",
},
},
metadata={"is_target": True},
),
],
action_options=action_options,
).to_dict(),
)

Expand Down
62 changes: 62 additions & 0 deletions evalml/tests/data_checks_tests/test_invalid_target_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,19 @@ def test_invalid_target_data_check_invalid_n_unique():
)


@pytest.mark.parametrize("null_strategy", ["invalid", None])
def test_invalid_target_data_check_invalid_null_strategy(null_strategy):
    """An unsupported `null_strategy` is rejected when the check is constructed."""
    expected_error = (
        "The acceptable values for 'null_strategy' are 'impute' and 'drop'."
    )
    regression_objective = get_default_primary_search_objective("regression")

    with pytest.raises(ValueError, match=expected_error):
        InvalidTargetDataCheck(
            "regression",
            regression_objective,
            null_strategy=null_strategy,
        )


def test_invalid_target_data_check_nan_error():
X = pd.DataFrame({"col": [1, 2, 3]})
invalid_targets_check = InvalidTargetDataCheck(
Expand Down Expand Up @@ -148,6 +161,55 @@ def test_invalid_target_y_none():
]


@pytest.mark.parametrize("null_strategy", ["Impute", "DROP"])
def test_invalid_target_data_null_strategies(null_strategy):
    """Each null strategy yields its own single action option, regardless of case."""
    check = InvalidTargetDataCheck(
        "regression",
        get_default_primary_search_objective("regression"),
        null_strategy=null_strategy,
    )

    # One candidate option per supported (lower-cased) strategy name.
    option_for_strategy = {
        "impute": DataCheckActionOption(
            DataCheckActionCode.IMPUTE_COL,
            data_check_name=invalid_targets_data_check_name,
            parameters={
                "impute_strategy": {
                    "parameter_type": DCAOParameterType.GLOBAL,
                    "type": "category",
                    "categories": ["mean", "most_frequent"],
                    "default_value": "mean",
                },
            },
            metadata={"is_target": True},
        ),
        "drop": DataCheckActionOption(
            DataCheckActionCode.DROP_ROWS,
            data_check_name=invalid_targets_data_check_name,
            metadata={"is_target": True, "rows": [0, 3]},
        ),
    }
    chosen_option = option_for_strategy.get(null_strategy.lower())
    expected_action_options = [] if chosen_option is None else [chosen_option]

    null_error = DataCheckError(
        message="2 row(s) (40.0%) of target values are null",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_HAS_NULL,
        details={"num_null_rows": 2, "pct_null_rows": 40.0},
        action_options=expected_action_options,
    )
    expected = [null_error.to_dict()]

    # Rows 0 and 3 of the target are null: 2 of 5 rows, i.e. 40%.
    y = pd.Series([None, 3.5, 2.8, None, 0])
    X = pd.DataFrame({"col": range(len(y))})

    messages = check.validate(X, y)
    assert messages == expected


def test_invalid_target_data_input_formats():
invalid_targets_check = InvalidTargetDataCheck(
"binary",
Expand Down