Skip to content

Commit

Permalink
Rename LabelLeakageDataCheck to TargetLeakageDataCheck. (#1319)
Browse files Browse the repository at this point in the history
* Rename LabelLeakageDataCheck to TargetLeakageDataCheck.

* Adding PR 1319 to release notes.

* Sorting imports in default_data_checks.py

* Renaming tests from _label_ to _target_
  • Loading branch information
freddyaboulton committed Oct 20, 2020
1 parent dec69b1 commit 1f8a964
Show file tree
Hide file tree
Showing 8 changed files with 86 additions and 81 deletions.
2 changes: 1 addition & 1 deletion docs/source/api_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ Data Check Classes
InvalidTargetDataCheck
HighlyNullDataCheck
IDColumnsDataCheck
LabelLeakageDataCheck
TargetLeakageDataCheck
OutliersDataCheck
NoVarianceDataCheck
ClassImbalanceDataCheck
Expand Down
5 changes: 5 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ Release Notes
* Cleaned up ``make_pipeline`` tests to test for all estimators :pr:`1257`
* Added a test to check conda build after merge to main :pr:`1247`

.. warning::

**Breaking Changes**
* Renamed ``LabelLeakageDataCheck`` to ``TargetLeakageDataCheck`` :pr:`1319`


**v0.14.1 Sep. 29, 2020**
* Enhancements
Expand Down
2 changes: 1 addition & 1 deletion evalml/data_checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .invalid_targets_data_check import InvalidTargetDataCheck
from .highly_null_data_check import HighlyNullDataCheck
from .id_columns_data_check import IDColumnsDataCheck
from .label_leakage_data_check import LabelLeakageDataCheck
from .target_leakage_data_check import TargetLeakageDataCheck
from .outliers_data_check import OutliersDataCheck
from .no_variance_data_check import NoVarianceDataCheck
from .class_imbalance_data_check import ClassImbalanceDataCheck
Expand Down
6 changes: 3 additions & 3 deletions evalml/data_checks/default_data_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@
from .highly_null_data_check import HighlyNullDataCheck
from .id_columns_data_check import IDColumnsDataCheck
from .invalid_targets_data_check import InvalidTargetDataCheck
from .label_leakage_data_check import LabelLeakageDataCheck
from .no_variance_data_check import NoVarianceDataCheck
from .target_leakage_data_check import TargetLeakageDataCheck


class DefaultDataChecks(DataChecks):
"""A collection of basic data checks that is used by AutoML by default.
Includes HighlyNullDataCheck, IDColumnsDataCheck, LabelLeakageDataCheck, InvalidTargetDataCheck,
Includes HighlyNullDataCheck, IDColumnsDataCheck, TargetLeakageDataCheck, InvalidTargetDataCheck,
and NoVarianceDataCheck."""

_DEFAULT_DATA_CHECK_CLASSES = [HighlyNullDataCheck, IDColumnsDataCheck,
LabelLeakageDataCheck, InvalidTargetDataCheck, NoVarianceDataCheck]
TargetLeakageDataCheck, InvalidTargetDataCheck, NoVarianceDataCheck]

def __init__(self, problem_type):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from evalml.utils.gen_utils import numeric_and_boolean_dtypes


class LabelLeakageDataCheck(DataCheck):
class TargetLeakageDataCheck(DataCheck):
"""Check if any of the features are highly correlated with the target."""

def __init__(self, pct_corr_threshold=0.95):
Expand All @@ -32,7 +32,7 @@ def validate(self, X, y):
y (pd.Series): The target data
Returns:
list (DataCheckWarning): List with a DataCheckWarning if there is label leakage detected.
list (DataCheckWarning): List with a DataCheckWarning if target leakage is detected.
Example:
>>> X = pd.DataFrame({
Expand All @@ -41,8 +41,8 @@ def validate(self, X, y):
... 'y': [12, 5, 13, 74, 24],
... })
>>> y = pd.Series([10, 42, 31, 51, 40])
>>> label_leakage_check = LabelLeakageDataCheck(pct_corr_threshold=0.8)
>>> assert label_leakage_check.validate(X, y) == [DataCheckWarning("Column 'leak' is 80.0% or more correlated with the target", "LabelLeakageDataCheck")]
>>> target_leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8)
>>> assert target_leakage_check.validate(X, y) == [DataCheckWarning("Column 'leak' is 80.0% or more correlated with the target", "TargetLeakageDataCheck")]
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
Expand Down
2 changes: 1 addition & 1 deletion evalml/tests/data_checks_tests/test_data_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def test_default_data_checks_classification():
y = pd.Series([0, 1, np.nan, 1, 0])
data_checks = DefaultDataChecks("binary")

leakage = [DataCheckWarning("Column 'has_label_leakage' is 95.0% or more correlated with the target", "LabelLeakageDataCheck")]
leakage = [DataCheckWarning("Column 'has_label_leakage' is 95.0% or more correlated with the target", "TargetLeakageDataCheck")]

assert data_checks.validate(X, y) == messages[:3] + leakage + messages[3:]

Expand Down
71 changes: 0 additions & 71 deletions evalml/tests/data_checks_tests/test_label_leakage_data_check.py

This file was deleted.

71 changes: 71 additions & 0 deletions evalml/tests/data_checks_tests/test_target_leakage_data_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import pandas as pd
import pytest

from evalml.data_checks.data_check_message import DataCheckWarning
from evalml.data_checks.target_leakage_data_check import TargetLeakageDataCheck


def test_target_leakage_data_check_init():
    """Verify the default threshold, accepted boundary values, and range validation."""
    # With no arguments the check falls back to its default threshold.
    assert TargetLeakageDataCheck().pct_corr_threshold == 0.95

    # Thresholds inside [0, 1], endpoints included, are kept as passed in.
    for accepted in (0.0, 0.5, 1.0):
        check = TargetLeakageDataCheck(pct_corr_threshold=accepted)
        assert check.pct_corr_threshold == accepted

    # Thresholds outside [0, 1] must be rejected with a ValueError.
    range_error = "pct_corr_threshold must be a float between 0 and 1, inclusive."
    for rejected in (-0.1, 1.1):
        with pytest.raises(ValueError, match=range_error):
            TargetLeakageDataCheck(pct_corr_threshold=rejected)


def test_target_leakage_data_check_warnings():
    """Check that each feature derived from the target is reported at a 0.5 threshold."""
    base = pd.Series([1, 0, 1, 1])
    # Columns "a" through "d" are deterministic transforms of the target values;
    # column "e" is constant and is expected not to appear in the warnings.
    X = pd.DataFrame({
        "a": base * 3,
        "b": base - 1,
        "c": base / 10,
        "d": ~base,
        "e": [0, 0, 0, 0],
    })
    y = base.astype(bool)

    expected = [
        DataCheckWarning("Column 'a' is 50.0% or more correlated with the target", "TargetLeakageDataCheck"),
        DataCheckWarning("Column 'b' is 50.0% or more correlated with the target", "TargetLeakageDataCheck"),
        DataCheckWarning("Column 'c' is 50.0% or more correlated with the target", "TargetLeakageDataCheck"),
        DataCheckWarning("Column 'd' is 50.0% or more correlated with the target", "TargetLeakageDataCheck"),
    ]

    check = TargetLeakageDataCheck(pct_corr_threshold=0.5)
    assert check.validate(X, y) == expected


def test_target_leakage_data_check_input_formats():
    """Exercise validate() with empty inputs, a numpy target, and a numpy feature matrix."""
    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8)

    # Empty pd.DataFrame and empty pd.Series: nothing to report. The dtype is
    # passed explicitly because building an empty Series without one is
    # deprecated in pandas; float64 matches the historical implicit default.
    assert leakage_check.validate(pd.DataFrame(), pd.Series(dtype="float64")) == []

    y = pd.Series([1, 0, 1, 1])
    X = pd.DataFrame()
    X["a"] = y * 3
    X["b"] = y - 1
    X["c"] = y / 10
    X["d"] = ~y
    X["e"] = [0, 0, 0, 0]
    y = y.astype(bool)

    expected_messages = [
        DataCheckWarning("Column 'a' is 80.0% or more correlated with the target", "TargetLeakageDataCheck"),
        DataCheckWarning("Column 'b' is 80.0% or more correlated with the target", "TargetLeakageDataCheck"),
        DataCheckWarning("Column 'c' is 80.0% or more correlated with the target", "TargetLeakageDataCheck"),
        DataCheckWarning("Column 'd' is 80.0% or more correlated with the target", "TargetLeakageDataCheck"),
    ]

    # y as a numpy array (``y.values``) instead of a pd.Series
    assert leakage_check.validate(X, y.values) == expected_messages

    # X as a numpy array: the original column names are gone, so the expected
    # warnings refer to the positional labels '0'..'3'.
    assert leakage_check.validate(X.to_numpy(), y) == [
        DataCheckWarning("Column '0' is 80.0% or more correlated with the target", "TargetLeakageDataCheck"),
        DataCheckWarning("Column '1' is 80.0% or more correlated with the target", "TargetLeakageDataCheck"),
        DataCheckWarning("Column '2' is 80.0% or more correlated with the target", "TargetLeakageDataCheck"),
        DataCheckWarning("Column '3' is 80.0% or more correlated with the target", "TargetLeakageDataCheck"),
    ]

0 comments on commit 1f8a964

Please sign in to comment.