alteryx · angela97lin · May 8, 2020 · May 5, 2020 · May 5, 2020 · May 5, 2020
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -4,6 +4,7 @@ Changelog
 ---------
 **Future Releases**
     * Enhancements
+        * Port over highly-null guardrail as a data check and define `DefaultDataChecks` and `EmptyDataChecks` classes :pr:`745`
     * Fixes
     * Changes
         * Cleanup pipeline `score` code, and cleanup codecov :pr:`711`

diff --git a/evalml/data_checks/__init__.py b/evalml/data_checks/__init__.py
@@ -3,3 +3,6 @@
 from .data_checks import DataChecks
 from .data_check_message import DataCheckMessage, DataCheckWarning, DataCheckError
 from .data_check_message_type import DataCheckMessageType
+from .detect_highly_null_data_check import DetectHighlyNullDataCheck
+from .default_data_checks import DefaultDataChecks
+from .utils import EmptyDataChecks
diff --git a/evalml/data_checks/default_data_checks.py b/evalml/data_checks/default_data_checks.py
@@ -0,0 +1,13 @@
+from .data_checks import DataChecks
+from .detect_highly_null_data_check import DetectHighlyNullDataCheck
+
+
+class DefaultDataChecks(DataChecks):
+    def __init__(self, data_checks=None):
+        """
+        A fixed collection of data checks: a single DetectHighlyNullDataCheck with its default threshold.
+
+        Arguments:
+            data_checks (list (DataCheck)): Ignored; the collection is always the fixed list set below.
+        """
+        self.data_checks = [DetectHighlyNullDataCheck()]  # base-class __init__ is not called here
diff --git a/evalml/data_checks/detect_highly_null_data_check.py b/evalml/data_checks/detect_highly_null_data_check.py
@@ -0,0 +1,44 @@
+import pandas as pd
+
+from .data_check import DataCheck
+from .data_check_message import DataCheckWarning
+
+
+class DetectHighlyNullDataCheck(DataCheck):
+
+    def __init__(self, percent_threshold=0.95):
+        """Checks if there are any highly-null columns in the input.
+
+        Arguments:
+            percent_threshold(float): If the fraction of null values in an input feature is greater than or
+                equal to this amount, that feature will be considered highly-null. Defaults to 0.95.
+
+        """
+        if not 0 <= percent_threshold <= 1:  # written this way so that NaN is rejected as well
+            raise ValueError("percent_threshold must be a float between 0 and 1, inclusive.")
+        self.percent_threshold = percent_threshold
+
+    def validate(self, X, y=None):
+        """Checks if there are any highly-null columns in the input.
+
+        Arguments:
+            X (pd.DataFrame, pd.Series, np.array, list) : features
+            y : Ignored.
+
+        Returns:
+            list (DataCheckWarning): one DataCheckWarning per highly-null column; empty if there are none.
+
+        Example:
+            >>> df = pd.DataFrame({
+            ...    'lots_of_null': [None, None, None, None, 5],
+            ...    'no_null': [1, 2, 3, 4, 5]
+            ... })
+            >>> null_check = DetectHighlyNullDataCheck(percent_threshold=0.8)
+            >>> assert null_check.validate(df) == [DataCheckWarning("Column 'lots_of_null' is 80.0% or more null", "DetectHighlyNullDataCheck")]
+        """
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X)
+        percent_null = X.isnull().mean().to_dict()  # column name -> fraction of null rows
+        highly_null_cols = [col_name for col_name, frac in percent_null.items() if frac >= self.percent_threshold]
+        warning_msg = "Column '{}' is {}% or more null"
+        return [DataCheckWarning(warning_msg.format(col_name, self.percent_threshold * 100), self.name) for col_name in highly_null_cols]
diff --git a/evalml/data_checks/utils.py b/evalml/data_checks/utils.py
@@ -0,0 +1,12 @@
+from .data_checks import DataChecks
+
+
+class EmptyDataChecks(DataChecks):
+    def __init__(self, data_checks=None):
+        """
+        An empty collection of data checks.
+
+        Arguments:
+            data_checks (list (DataCheck)): Ignored; the collection is always empty.
+        """
+        self.data_checks = []  # base-class __init__ is not called here
diff --git a/evalml/tests/data_checks_tests/test_data_check.py b/evalml/tests/data_checks_tests/test_data_check.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 import pytest
 
@@ -6,6 +7,9 @@
     DataCheckError,
     DataCheckWarning
 )
+from evalml.data_checks.detect_highly_null_data_check import (
+    DetectHighlyNullDataCheck
+)
 
 
 @pytest.fixture
@@ -62,3 +66,65 @@ def validate(self, X, y=None):
     data_check = MockDataCheckWithParam(num=0)
     errors_warnings = data_check.validate(X, y=None)
     assert errors_warnings == [DataCheckError("Expected num == 10", "MockDataCheckWithParam")]
+
+
+def test_highly_null_data_check_init():  # constructor stores the threshold and validates its range
+    highly_null_check = DetectHighlyNullDataCheck()
+    assert highly_null_check.percent_threshold == 0.95  # default value
+
+    highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.0)  # lower bound is accepted
+    assert highly_null_check.percent_threshold == 0
+
+    highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.5)
+    assert highly_null_check.percent_threshold == 0.5
+
+    highly_null_check = DetectHighlyNullDataCheck(percent_threshold=1.0)  # upper bound is accepted
+    assert highly_null_check.percent_threshold == 1.0
+
+    with pytest.raises(ValueError, match="percent_threshold must be a float between 0 and 1, inclusive."):
+        DetectHighlyNullDataCheck(percent_threshold=-0.1)  # just below the allowed range
+    with pytest.raises(ValueError, match="percent_threshold must be a float between 0 and 1, inclusive."):
+        DetectHighlyNullDataCheck(percent_threshold=1.1)  # just above the allowed range
+
+
+def test_highly_null_data_check_warnings():  # one warning per flagged column, in column order
+    data = pd.DataFrame({'lots_of_null': [None, None, None, None, 5],  # 80% null
+                         'all_null': [None, None, None, None, None],  # 100% null
+                         'no_null': [1, 2, 3, 4, 5]})  # 0% null
+    no_null_check = DetectHighlyNullDataCheck(percent_threshold=0.0)  # comparison is >=, so every column is flagged
+    assert no_null_check.validate(data) == [DataCheckWarning("Column 'lots_of_null' is 0.0% or more null", "DetectHighlyNullDataCheck"),
+                                            DataCheckWarning("Column 'all_null' is 0.0% or more null", "DetectHighlyNullDataCheck"),
+                                            DataCheckWarning("Column 'no_null' is 0.0% or more null", "DetectHighlyNullDataCheck")]
+    some_null_check = DetectHighlyNullDataCheck(percent_threshold=0.5)  # flags columns that are at least half null
+    assert some_null_check.validate(data) == [DataCheckWarning("Column 'lots_of_null' is 50.0% or more null", "DetectHighlyNullDataCheck"),
+                                              DataCheckWarning("Column 'all_null' is 50.0% or more null", "DetectHighlyNullDataCheck")]
+    all_null_check = DetectHighlyNullDataCheck(percent_threshold=1.0)  # flags only completely null columns
+    assert all_null_check.validate(data) == [DataCheckWarning("Column 'all_null' is 100.0% or more null", "DetectHighlyNullDataCheck")]
+
+
+def test_highly_null_data_check_input_formats():
+    highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.8)
+
+    # empty pd.DataFrame: there are no columns, so no warnings are expected
+    messages = highly_null_check.validate(pd.DataFrame())
+    assert messages == []
+
+    # flat list: converted to a single column named 0, which is 80% null
+    messages = highly_null_check.validate([None, None, None, None, 5])
+    assert messages == [DataCheckWarning("Column '0' is 80.0% or more null", "DetectHighlyNullDataCheck")]
+
+    # pd.Series: likewise reported as a single column named 0
+    messages = highly_null_check.validate(pd.Series([None, None, None, None, 5]))
+    assert messages == [DataCheckWarning("Column '0' is 80.0% or more null", "DetectHighlyNullDataCheck")]
+
+    # 2D list: each inner list is a row, so columns 0-2 are fully null, column 3 is half null, column 4 has no nulls
+    messages = highly_null_check.validate([[None, None, None, None, 0], [None, None, None, "hi", 5]])
+    assert messages == [DataCheckWarning("Column '0' is 80.0% or more null", "DetectHighlyNullDataCheck"),
+                        DataCheckWarning("Column '1' is 80.0% or more null", "DetectHighlyNullDataCheck"),
+                        DataCheckWarning("Column '2' is 80.0% or more null", "DetectHighlyNullDataCheck")]
+
+    # np.array: same data as the 2D list case, with the same expected warnings
+    messages = highly_null_check.validate(np.array([[None, None, None, None, 0], [None, None, None, "hi", 5]]))
+    assert messages == [DataCheckWarning("Column '0' is 80.0% or more null", "DetectHighlyNullDataCheck"),
+                        DataCheckWarning("Column '1' is 80.0% or more null", "DetectHighlyNullDataCheck"),
+                        DataCheckWarning("Column '2' is 80.0% or more null", "DetectHighlyNullDataCheck")]
diff --git a/evalml/tests/data_checks_tests/test_data_checks.py b/evalml/tests/data_checks_tests/test_data_checks.py
@@ -1,9 +1,13 @@
+import pandas as pd
+
 from evalml.data_checks.data_check import DataCheck
 from evalml.data_checks.data_check_message import (
     DataCheckError,
     DataCheckWarning
 )
 from evalml.data_checks.data_checks import DataChecks
+from evalml.data_checks.default_data_checks import DefaultDataChecks
+from evalml.data_checks.utils import EmptyDataChecks
 
 
 def test_data_checks(X_y):
@@ -27,8 +31,26 @@ def validate(self, X, y):
 
     data_checks_list = [MockDataCheck(), MockDataCheckWarning(), MockDataCheckError(), MockDataCheckErrorAndWarning()]
     data_checks = DataChecks(data_checks=data_checks_list)
-    errors_warnings = data_checks.validate(X, y)
-    assert errors_warnings == [DataCheckWarning("warning one", "MockDataCheckWarning"),
-                               DataCheckError("error one", "MockDataCheckError"),
-                               DataCheckError("error two", "MockDataCheckErrorAndWarning"),
-                               DataCheckWarning("warning two", "MockDataCheckErrorAndWarning")]
+    messages = data_checks.validate(X, y)
+    assert messages == [DataCheckWarning("warning one", "MockDataCheckWarning"),
+                        DataCheckError("error one", "MockDataCheckError"),
+                        DataCheckError("error two", "MockDataCheckErrorAndWarning"),
+                        DataCheckWarning("warning two", "MockDataCheckErrorAndWarning")]
+
+
+def test_empty_data_checks(X_y):
+    X, y = X_y
+    data_checks = EmptyDataChecks()  # holds an empty list of checks
+    messages = data_checks.validate(X, y)
+    assert messages == []  # expected: nothing is reported when no checks are registered
+
+
+def test_default_data_checks(X_y):  # NOTE(review): the X_y fixture is requested but never used here
+    X = pd.DataFrame({'lots_of_null': [None, None, None, None, 5],  # 80% null: below the 0.95 default threshold
+                      'all_null': [None, None, None, None, None],
+                      'also_all_null': [None, None, None, None, None],
+                      'no_null': [1, 2, 3, 4, 5]})
+    data_checks = DefaultDataChecks()
+    messages = data_checks.validate(X)  # y is omitted; presumably DataChecks.validate defaults it — confirm
+    assert messages == [DataCheckWarning("Column 'all_null' is 95.0% or more null", "DetectHighlyNullDataCheck"),
+                        DataCheckWarning("Column 'also_all_null' is 95.0% or more null", "DetectHighlyNullDataCheck")]