Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename LabelLeakageDataCheck to TargetLeakageDataCheck. #1319

Merged
merged 4 commits into from
Oct 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/api_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ Data Check Classes
InvalidTargetDataCheck
HighlyNullDataCheck
IDColumnsDataCheck
LabelLeakageDataCheck
TargetLeakageDataCheck
OutliersDataCheck
NoVarianceDataCheck
ClassImbalanceDataCheck
Expand Down
5 changes: 5 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ Release Notes
* Cleaned up ``make_pipeline`` tests to test for all estimators :pr:`1257`
* Added a test to check conda build after merge to main :pr:`1247`

.. warning::

**Breaking Changes**
* Renamed ``LabelLeakageDataCheck`` to ``TargetLeakageDataCheck`` :pr:`1319`


**v0.14.1 Sep. 29, 2020**
* Enhancements
Expand Down
2 changes: 1 addition & 1 deletion evalml/data_checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .invalid_targets_data_check import InvalidTargetDataCheck
from .highly_null_data_check import HighlyNullDataCheck
from .id_columns_data_check import IDColumnsDataCheck
from .label_leakage_data_check import LabelLeakageDataCheck
from .target_leakage_data_check import TargetLeakageDataCheck
from .outliers_data_check import OutliersDataCheck
from .no_variance_data_check import NoVarianceDataCheck
from .class_imbalance_data_check import ClassImbalanceDataCheck
Expand Down
6 changes: 3 additions & 3 deletions evalml/data_checks/default_data_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@
from .highly_null_data_check import HighlyNullDataCheck
from .id_columns_data_check import IDColumnsDataCheck
from .invalid_targets_data_check import InvalidTargetDataCheck
from .label_leakage_data_check import LabelLeakageDataCheck
from .no_variance_data_check import NoVarianceDataCheck
from .target_leakage_data_check import TargetLeakageDataCheck


class DefaultDataChecks(DataChecks):
"""A collection of basic data checks that is used by AutoML by default.
Includes HighlyNullDataCheck, IDColumnsDataCheck, LabelLeakageDataCheck, InvalidTargetDataCheck,
Includes HighlyNullDataCheck, IDColumnsDataCheck, TargetLeakageDataCheck, InvalidTargetDataCheck,
and NoVarianceDataCheck."""

_DEFAULT_DATA_CHECK_CLASSES = [HighlyNullDataCheck, IDColumnsDataCheck,
LabelLeakageDataCheck, InvalidTargetDataCheck, NoVarianceDataCheck]
TargetLeakageDataCheck, InvalidTargetDataCheck, NoVarianceDataCheck]

def __init__(self, problem_type):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from evalml.utils.gen_utils import numeric_and_boolean_dtypes


class LabelLeakageDataCheck(DataCheck):
class TargetLeakageDataCheck(DataCheck):
"""Check if any of the features are highly correlated with the target."""

def __init__(self, pct_corr_threshold=0.95):
Expand All @@ -32,7 +32,7 @@ def validate(self, X, y):
y (pd.Series): The target data

Returns:
list (DataCheckWarning): List with a DataCheckWarning if there is label leakage detected.
list (DataCheckWarning): List with a DataCheckWarning if target leakage is detected.

Example:
>>> X = pd.DataFrame({
Expand All @@ -41,8 +41,8 @@ def validate(self, X, y):
... 'y': [12, 5, 13, 74, 24],
... })
>>> y = pd.Series([10, 42, 31, 51, 40])
>>> label_leakage_check = LabelLeakageDataCheck(pct_corr_threshold=0.8)
>>> assert label_leakage_check.validate(X, y) == [DataCheckWarning("Column 'leak' is 80.0% or more correlated with the target", "LabelLeakageDataCheck")]
>>> target_leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8)
>>> assert target_leakage_check.validate(X, y) == [DataCheckWarning("Column 'leak' is 80.0% or more correlated with the target", "TargetLeakageDataCheck")]
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
Expand Down
2 changes: 1 addition & 1 deletion evalml/tests/data_checks_tests/test_data_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def test_default_data_checks_classification():
y = pd.Series([0, 1, np.nan, 1, 0])
data_checks = DefaultDataChecks("binary")

leakage = [DataCheckWarning("Column 'has_label_leakage' is 95.0% or more correlated with the target", "LabelLeakageDataCheck")]
leakage = [DataCheckWarning("Column 'has_label_leakage' is 95.0% or more correlated with the target", "TargetLeakageDataCheck")]

assert data_checks.validate(X, y) == messages[:3] + leakage + messages[3:]

Expand Down
71 changes: 0 additions & 71 deletions evalml/tests/data_checks_tests/test_label_leakage_data_check.py

This file was deleted.

71 changes: 71 additions & 0 deletions evalml/tests/data_checks_tests/test_target_leakage_data_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import pandas as pd
import pytest

from evalml.data_checks.data_check_message import DataCheckWarning
from evalml.data_checks.target_leakage_data_check import TargetLeakageDataCheck


def test_target_leakage_data_check_init():
    """Verify the default threshold, the accepted range, and rejection of out-of-range values."""
    # With no arguments the check falls back to a 0.95 correlation threshold.
    assert TargetLeakageDataCheck().pct_corr_threshold == 0.95

    # Both boundaries (0 and 1) and an interior value must be stored unchanged.
    for accepted in (0.0, 0.5, 1.0):
        check = TargetLeakageDataCheck(pct_corr_threshold=accepted)
        assert check.pct_corr_threshold == accepted

    # Anything outside [0, 1] must be rejected with the documented message.
    out_of_range_msg = "pct_corr_threshold must be a float between 0 and 1, inclusive."
    for rejected in (-0.1, 1.1):
        with pytest.raises(ValueError, match=out_of_range_msg):
            TargetLeakageDataCheck(pct_corr_threshold=rejected)


def test_target_leakage_data_check_warnings():
    """Verify that every column derived from the target is flagged and the constant column is not."""
    target = pd.Series([1, 0, 1, 1])
    # Columns "a" through "d" are simple transformations of the target;
    # "e" is constant and is expected to produce no warning.
    features = pd.DataFrame({
        "a": target * 3,
        "b": target - 1,
        "c": target / 10,
        "d": ~target,
        "e": [0, 0, 0, 0],
    })
    target = target.astype(bool)

    message_template = "Column '{}' is 50.0% or more correlated with the target"
    expected_warnings = [
        DataCheckWarning(message_template.format(column), "TargetLeakageDataCheck")
        for column in ("a", "b", "c", "d")
    ]

    check = TargetLeakageDataCheck(pct_corr_threshold=0.5)
    assert check.validate(features, target) == expected_warnings


def test_target_leakage_data_check_input_formats():
    """Verify that validate() handles empty inputs, a NumPy target, and NumPy features."""
    check = TargetLeakageDataCheck(pct_corr_threshold=0.8)

    def warnings_for(column_names):
        # Build the expected warning list for the given column labels.
        template = "Column '{}' is 80.0% or more correlated with the target"
        return [
            DataCheckWarning(template.format(name), "TargetLeakageDataCheck")
            for name in column_names
        ]

    # Empty DataFrame with an empty Series: nothing to report.
    assert check.validate(pd.DataFrame(), pd.Series()) == []

    target = pd.Series([1, 0, 1, 1])
    # Columns "a" through "d" are derived from the target; "e" is constant.
    features = pd.DataFrame({
        "a": target * 3,
        "b": target - 1,
        "c": target / 10,
        "d": ~target,
        "e": [0, 0, 0, 0],
    })
    target = target.astype(bool)

    # Target passed as a NumPy array (Series.values) rather than a pd.Series.
    assert check.validate(features, target.values) == warnings_for(["a", "b", "c", "d"])

    # Features passed as a NumPy array: columns are reported by positional label.
    assert check.validate(features.to_numpy(), target) == warnings_for(["0", "1", "2", "3"])