Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port over highly-null Data Check and define BasicDataChecks and DisableDataChecks classes #745

Merged
merged 18 commits into from May 8, 2020
Merged
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Expand Up @@ -4,6 +4,7 @@ Changelog
---------
**Future Releases**
* Enhancements
* Port over highly-null guardrail as a data check and define `DefaultDataChecks` and `EmptyDataChecks` classes :pr:`745`
* Fixes
* Changes
* Cleanup pipeline `score` code, and cleanup codecov :pr:`711`
Expand Down
3 changes: 3 additions & 0 deletions evalml/data_checks/__init__.py
Expand Up @@ -3,3 +3,6 @@
from .data_checks import DataChecks
from .data_check_message import DataCheckMessage, DataCheckWarning, DataCheckError
from .data_check_message_type import DataCheckMessageType
from .detect_highly_null_data_check import DetectHighlyNullDataCheck
from .default_data_checks import DefaultDataChecks
from .utils import EmptyDataChecks
13 changes: 13 additions & 0 deletions evalml/data_checks/default_data_checks.py
@@ -0,0 +1,13 @@
from .data_checks import DataChecks
from .detect_highly_null_data_check import DetectHighlyNullDataCheck


class DefaultDataChecks(DataChecks):
    def __init__(self, data_checks=None):
        """
        The default collection of data checks.

        The collection always consists of a single DetectHighlyNullDataCheck
        built with its default settings.

        Arguments:
            data_checks (list (DataCheck)): Ignored.
        """
        highly_null_check = DetectHighlyNullDataCheck()
        self.data_checks = [highly_null_check]
53 changes: 53 additions & 0 deletions evalml/data_checks/detect_highly_null_data_check.py
@@ -0,0 +1,53 @@
import pandas as pd

from .data_check import DataCheck
from .data_check_message import DataCheckWarning


class DetectHighlyNullDataCheck(DataCheck):
    """Data check that warns about input columns with a high fraction of null values."""

    def __init__(self, pct_null_threshold=0.95):
        """Checks if there are any highly-null columns in the input.

        Arguments:
            pct_null_threshold (float): If the fraction of null values in an input feature is at
                or above this amount, that feature will be considered highly-null. Two special
                cases: a threshold of 0.0 flags any column with at least one null value, and a
                threshold of 1.0 flags only columns that are entirely null. Defaults to 0.95.

        Raises:
            ValueError: If pct_null_threshold is not within [0, 1] (this includes NaN).
        """
        # A chained comparison (rather than `< 0 or > 1`) also rejects NaN, which would
        # otherwise pass validation and silently disable the check.
        if not 0 <= pct_null_threshold <= 1:
            raise ValueError("pct_null_threshold must be a float between 0 and 1, inclusive.")
        self.pct_null_threshold = pct_null_threshold

    def validate(self, X, y=None):
        """Checks if there are any highly-null columns in the input.

        Arguments:
            X (pd.DataFrame, pd.Series, np.array, list) : features
            y : Ignored.

        Returns:
            list (DataCheckWarning): one DataCheckWarning per highly-null column, in column order;
                empty if there are no highly-null columns.

        Example:
            >>> df = pd.DataFrame({
            ...    'lots_of_null': [None, None, None, None, 5],
            ...    'no_null': [1, 2, 3, 4, 5]
            ... })
            >>> null_check = DetectHighlyNullDataCheck(pct_null_threshold=0.8)
            >>> assert null_check.validate(df) == [DataCheckWarning("Column 'lots_of_null' is 80.0% or more null", "DetectHighlyNullDataCheck")]
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # Maps each column label to its fraction of null entries (0.0 - 1.0).
        percent_null = X.isnull().mean().to_dict()
        threshold = self.pct_null_threshold

        if threshold == 0.0:
            # ">=" would flag every column at a zero threshold, so require strictly more than 0.
            flagged_cols = [col_name for col_name, pct in percent_null.items() if pct > threshold]
            warning_msg = "Column '{}' is more than 0% null"
            return [DataCheckWarning(warning_msg.format(col_name), self.name) for col_name in flagged_cols]

        if threshold == 1.0:
            # Only columns that are entirely null can reach a threshold of 1.0.
            flagged_cols = [col_name for col_name, pct in percent_null.items() if pct == threshold]
            warning_msg = "Column '{}' is 100% null"
            return [DataCheckWarning(warning_msg.format(col_name), self.name) for col_name in flagged_cols]

        flagged_cols = [col_name for col_name, pct in percent_null.items() if pct >= threshold]
        warning_msg = "Column '{}' is {}% or more null"
        return [DataCheckWarning(warning_msg.format(col_name, threshold * 100), self.name) for col_name in flagged_cols]
angela97lin marked this conversation as resolved.
Show resolved Hide resolved
12 changes: 12 additions & 0 deletions evalml/data_checks/utils.py
@@ -0,0 +1,12 @@
from .data_checks import DataChecks


class EmptyDataChecks(DataChecks):
    def __init__(self, data_checks=None):
        """
        A collection that holds no data checks at all.

        Arguments:
            data_checks (list (DataCheck)): Ignored.
        """
        no_checks = []
        self.data_checks = no_checks
65 changes: 65 additions & 0 deletions evalml/tests/data_checks_tests/test_data_check.py
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd
import pytest

Expand All @@ -6,6 +7,9 @@
DataCheckError,
DataCheckWarning
)
from evalml.data_checks.detect_highly_null_data_check import (
DetectHighlyNullDataCheck
)


@pytest.fixture
Expand Down Expand Up @@ -62,3 +66,64 @@ def validate(self, X, y=None):
data_check = MockDataCheckWithParam(num=0)
errors_warnings = data_check.validate(X, y=None)
assert errors_warnings == [DataCheckError("Expected num == 10", "MockDataCheckWithParam")]


def test_highly_null_data_check_init():
    """The threshold is stored as given, defaults to 0.95, and values outside [0, 1] are rejected."""
    default_check = DetectHighlyNullDataCheck()
    assert default_check.pct_null_threshold == 0.95

    # Valid thresholds, including both inclusive bounds.
    for threshold in (0.0, 0.5, 1.0):
        check = DetectHighlyNullDataCheck(pct_null_threshold=threshold)
        assert check.pct_null_threshold == threshold

    # Thresholds just outside either bound must raise.
    expected_error = "pct_null_threshold must be a float between 0 and 1, inclusive."
    for out_of_range in (-0.1, 1.1):
        with pytest.raises(ValueError, match=expected_error):
            DetectHighlyNullDataCheck(pct_null_threshold=out_of_range)


def test_highly_null_data_check_warnings():
    """Warnings produced at thresholds 0.0, 0.5 and 1.0 for mostly-null, all-null and null-free columns."""
    check_name = "DetectHighlyNullDataCheck"
    frame = pd.DataFrame({
        'lots_of_null': [None, None, None, None, 5],
        'all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 4, 5],
    })

    # Threshold 0.0: every column containing any null is reported.
    observed = DetectHighlyNullDataCheck(pct_null_threshold=0.0).validate(frame)
    assert observed == [
        DataCheckWarning("Column 'lots_of_null' is more than 0% null", check_name),
        DataCheckWarning("Column 'all_null' is more than 0% null", check_name),
    ]

    # Intermediate threshold: columns at or above 50% null are reported.
    observed = DetectHighlyNullDataCheck(pct_null_threshold=0.5).validate(frame)
    assert observed == [
        DataCheckWarning("Column 'lots_of_null' is 50.0% or more null", check_name),
        DataCheckWarning("Column 'all_null' is 50.0% or more null", check_name),
    ]

    # Threshold 1.0: only the entirely-null column is reported.
    observed = DetectHighlyNullDataCheck(pct_null_threshold=1.0).validate(frame)
    assert observed == [
        DataCheckWarning("Column 'all_null' is 100% null", check_name),
    ]


def test_highly_null_data_check_input_formats():
    """validate accepts an empty DataFrame, 1D/2D lists, a pd.Series and a np.array."""
    check_name = "DetectHighlyNullDataCheck"
    checker = DetectHighlyNullDataCheck(pct_null_threshold=0.8)

    # Expected result for the 1D inputs: they become a single column labelled 0.
    single_column = [
        DataCheckWarning("Column '0' is 80.0% or more null", check_name),
    ]
    # Expected result for the 2D inputs: the first three columns are entirely null.
    first_three_columns = [
        DataCheckWarning("Column '0' is 80.0% or more null", check_name),
        DataCheckWarning("Column '1' is 80.0% or more null", check_name),
        DataCheckWarning("Column '2' is 80.0% or more null", check_name),
    ]

    # empty pd.DataFrame
    assert checker.validate(pd.DataFrame()) == []

    # list
    flat_values = [None, None, None, None, 5]
    assert checker.validate(flat_values) == single_column

    # pd.Series
    assert checker.validate(pd.Series(flat_values)) == single_column

    # 2D list
    nested_values = [[None, None, None, None, 0], [None, None, None, "hi", 5]]
    assert checker.validate(nested_values) == first_three_columns

    # np.array
    assert checker.validate(np.array(nested_values)) == first_three_columns
angela97lin marked this conversation as resolved.
Show resolved Hide resolved
32 changes: 27 additions & 5 deletions evalml/tests/data_checks_tests/test_data_checks.py
@@ -1,9 +1,13 @@
import pandas as pd

from evalml.data_checks.data_check import DataCheck
from evalml.data_checks.data_check_message import (
DataCheckError,
DataCheckWarning
)
from evalml.data_checks.data_checks import DataChecks
from evalml.data_checks.default_data_checks import DefaultDataChecks
from evalml.data_checks.utils import EmptyDataChecks


def test_data_checks(X_y):
Expand All @@ -27,8 +31,26 @@ def validate(self, X, y):

data_checks_list = [MockDataCheck(), MockDataCheckWarning(), MockDataCheckError(), MockDataCheckErrorAndWarning()]
data_checks = DataChecks(data_checks=data_checks_list)
errors_warnings = data_checks.validate(X, y)
assert errors_warnings == [DataCheckWarning("warning one", "MockDataCheckWarning"),
DataCheckError("error one", "MockDataCheckError"),
DataCheckError("error two", "MockDataCheckErrorAndWarning"),
DataCheckWarning("warning two", "MockDataCheckErrorAndWarning")]
messages = data_checks.validate(X, y)
assert messages == [DataCheckWarning("warning one", "MockDataCheckWarning"),
DataCheckError("error one", "MockDataCheckError"),
DataCheckError("error two", "MockDataCheckErrorAndWarning"),
DataCheckWarning("warning two", "MockDataCheckErrorAndWarning")]


def test_empty_data_checks(X_y):
    """An EmptyDataChecks collection reports no messages for the fixture data."""
    features, target = X_y
    assert EmptyDataChecks().validate(features, target) == []


def test_default_data_checks(X_y):
    """DefaultDataChecks flags only the entirely-null columns at its default 0.95 threshold."""
    check_name = "DetectHighlyNullDataCheck"
    # 'lots_of_null' is 80% null, below the default threshold, so it is not expected in the output.
    frame = pd.DataFrame({
        'lots_of_null': [None, None, None, None, 5],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 4, 5],
    })
    expected = [
        DataCheckWarning("Column 'all_null' is 95.0% or more null", check_name),
        DataCheckWarning("Column 'also_all_null' is 95.0% or more null", check_name),
    ]
    observed = DefaultDataChecks().validate(frame)
    assert observed == expected