Merged
Changes from all commits (23 commits)
2cb87e7 - updates (ParthivNaresh, Oct 3, 2021)
d05bbba - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 6, 2021)
8681b54 - updates (ParthivNaresh, Oct 14, 2021)
5bb8106 - release notes (ParthivNaresh, Oct 14, 2021)
af130d2 - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 14, 2021)
456f7d1 - release notes (ParthivNaresh, Oct 14, 2021)
3bbc85b - release notes (ParthivNaresh, Oct 14, 2021)
f4137b3 - rely on statsmodels for medcouple (ParthivNaresh, Oct 14, 2021)
22f912b - docstring update and import module in function (ParthivNaresh, Oct 14, 2021)
f492f0b - lint fixes (ParthivNaresh, Oct 14, 2021)
1dd7314 - update to reflect nulls being counted as values in no variance data c… (ParthivNaresh, Oct 14, 2021)
5b056c4 - set bounds to woodwork approach (ParthivNaresh, Oct 14, 2021)
6226441 - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 14, 2021)
77aad04 - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 15, 2021)
a8dd18c - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 20, 2021)
a908d04 - changes (ParthivNaresh, Oct 20, 2021)
c278d53 - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 21, 2021)
a1a3000 - Merge branch 'Match-DataHealth-Functions' of https://github.com/alter… (ParthivNaresh, Oct 21, 2021)
354784d - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 21, 2021)
11edf48 - no message (ParthivNaresh, Oct 21, 2021)
7ec0a05 - update returned data (ParthivNaresh, Oct 21, 2021)
8e95d51 - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 21, 2021)
edd73a4 - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 21, 2021)
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -13,6 +13,7 @@ Release Notes
* Fixed ``SelectColumns`` to only select available features for feature selection in ``DefaultAlgorithm`` :pr:`2944`
* Changes
* Changed ``make_pipeline`` function to place the ``DateTimeFeaturizer`` prior to the ``Imputer`` so that ``NaN`` dates can be imputed :pr:`2909`
* Refactored ``OutliersDataCheck`` and ``HighlyNullDataCheck`` to add more descriptive metadata :pr:`2907`
* Documentation Changes
* Added back Future Release section to release notes :pr:`2927`
* Updated CI to run doctest (docstring tests) and apply necessary fixes to docstrings :pr:`2933`
6 changes: 5 additions & 1 deletion evalml/data_checks/highly_null_data_check.py
@@ -71,7 +71,7 @@ def validate(self, X, y=None):
... "data_check_name": "HighlyNullDataCheck",
... "level": "warning",
... "code": "HIGHLY_NULL_COLS",
... "details": {"columns": ["lots_of_null"], "pct_null_rows": {"lots_of_null": 0.8}, "rows": None}}],
... "details": {"columns": ["lots_of_null"], "pct_null_rows": {"lots_of_null": 0.8}, "null_row_indices": {"lots_of_null": [0, 1, 2, 3]}, "rows": None}}],
... "actions": [{"code": "DROP_ROWS", "metadata": {"rows": [0, 1, 2, 3], "columns": None}},
... {"code": "DROP_COL", "metadata": {"columns": ["lots_of_null"], "rows": None}}]}
"""
@@ -109,6 +109,9 @@ def validate(self, X, y=None):
for key, value in percent_null_cols.items()
if value >= self.pct_null_col_threshold and value != 0
}
highly_null_cols_indices = {
col_: X[col_][X[col_].isnull()].index.tolist() for col_ in highly_null_cols
}
warning_msg = "Columns {} are {}% or more null"
if highly_null_cols:
results["warnings"].append(
Expand All @@ -124,6 +127,7 @@ def validate(self, X, y=None):
details={
"columns": list(highly_null_cols),
"pct_null_rows": highly_null_cols,
"null_row_indices": highly_null_cols_indices,
},
).to_dict()
)
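To make the new null_row_indices detail concrete, here is a minimal consumer sketch (not part of this diff): the DataFrame contents and the 0.8 threshold are invented, the pct_null_col_threshold keyword is inferred from the attribute used in validate above, and it assumes HighlyNullDataCheck is importable from evalml.data_checks as in the tests further down.

import pandas as pd
from evalml.data_checks import HighlyNullDataCheck

X = pd.DataFrame({
    "lots_of_null": [None, None, None, None, 5],
    "no_null": [1, 2, 3, 4, 5],
})

results = HighlyNullDataCheck(pct_null_col_threshold=0.8).validate(X)
warning = results["warnings"][0]

# New in this change: per-column indices of the null rows, alongside pct_null_rows.
null_rows = warning["details"]["null_row_indices"]["lots_of_null"]  # expected [0, 1, 2, 3]
X_dropped = X.drop(index=null_rows)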
59 changes: 48 additions & 11 deletions evalml/data_checks/outliers_data_check.py
@@ -56,18 +56,15 @@ def validate(self, X, y=None):
has_outliers = []
outlier_row_indices = {}
for col in X.columns:
box_plot_dict = X.ww[col].ww.box_plot_dict()
num_records = len(X[col])
pct_outliers = (
len(box_plot_dict["low_values"]) + len(box_plot_dict["high_values"])
) / num_records
if (
pct_outliers > 0
and OutliersDataCheck._no_outlier_prob(num_records, pct_outliers) <= 0.9
):
box_plot_dict = OutliersDataCheck.get_boxplot_data(X.ww[col])
box_plot_dict_values = box_plot_dict["values"]

pct_outliers = box_plot_dict["pct_outliers"]
if pct_outliers > 0 and box_plot_dict["score"] <= 0.9:
has_outliers.append(col)
outlier_row_indices[col] = (
box_plot_dict["low_indices"] + box_plot_dict["high_indices"]
box_plot_dict_values["low_indices"]
+ box_plot_dict_values["high_indices"]
)

if not len(has_outliers):
@@ -103,6 +100,46 @@
)
return results

@staticmethod
def get_boxplot_data(data_):
"""Returns box plot information for the given data.

Args:
data_ (pd.Series, np.ndarray): Input data.

Returns:
dict: A payload of box plot statistics.
"""
if not data_.ww._schema:
data_.ww.init()
num_records = data_.count()
box_plot_dict = data_.ww.box_plot_dict()
quantiles = box_plot_dict["quantiles"]

q1, q2, q3 = quantiles[0.25], quantiles[0.5], quantiles[0.75]

pct_outliers = (
len(box_plot_dict["low_values"]) + len(box_plot_dict["high_values"])
) / num_records
score = OutliersDataCheck._no_outlier_prob(num_records, pct_outliers)

payload = {
"score": score,
"pct_outliers": pct_outliers,
"values": {
"q1": q1,
"median": q2,
"q3": q3,
"low_bound": box_plot_dict["low_bound"],
"high_bound": box_plot_dict["high_bound"],
"low_values": box_plot_dict["low_values"],
"high_values": box_plot_dict["high_values"],
"low_indices": box_plot_dict["low_indices"],
"high_indices": box_plot_dict["high_indices"],
},
}
return payload

@staticmethod
def _no_outlier_prob(num_records: int, pct_outliers: float) -> float:
"""Calculate the probability that there are no true outliers in a numeric (integer or float) column.
@@ -148,7 +185,7 @@ def _no_outlier_prob(num_records: int, pct_outliers: float) -> float:
shape_param = np.exp(log_shape)
log_scale = (
-19.8196822259052
+ 8.5359212447622 * log_n
+ 18.5359212447622 * log_n
ParthivNaresh (Contributor, Author) commented on Oct 14, 2021:
Not sure if we missed this before on purpose, but this was part of the original log scale.

+ -8.80487628113388 * log_n ** 2
+ 2.27711870991327 * log_n ** 3
+ -0.344443407676357 * log_n ** 4
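For a rough sense of what this coefficient change does, assuming log_n is the natural log of the record count and that log_scale is exponentiated into the Weibull scale parameter the same way log_shape is above: moving from 8.536 * log_n to 18.536 * log_n adds 10 * log_n to log_scale, i.e. it multiplies the fitted scale by roughly n^10, so the correction grows quickly with column length.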
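As a rough usage sketch of the get_boxplot_data helper added above (not from the PR itself): the series values are invented, and it assumes OutliersDataCheck is importable from evalml.data_checks, with woodwork available, as in the tests below.

import pandas as pd
from evalml.data_checks import OutliersDataCheck

series = pd.Series([1, 2, 32, 33, 34, 36, 37, 96])
payload = OutliersDataCheck.get_boxplot_data(series)

# Top-level keys introduced by this change: "score", "pct_outliers", and "values".
print(payload["pct_outliers"])            # fraction of points outside the IQR fences
print(payload["score"])                   # output of _no_outlier_prob for this column
print(payload["values"]["high_values"])   # outlying values, with matching "high_indices"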
8 changes: 8 additions & 0 deletions evalml/tests/data_checks_tests/test_data_checks.py
@@ -119,6 +119,10 @@ def validate(self, X, y):
details={
"columns": ["all_null", "also_all_null"],
"pct_null_rows": {"all_null": 1.0, "also_all_null": 1.0},
"null_row_indices": {
"all_null": [0, 1, 2, 3, 4],
"also_all_null": [0, 1, 2, 3, 4],
},
},
).to_dict(),
DataCheckWarning(
@@ -434,6 +438,10 @@ def __eq__(self, series_2):
details={
"columns": ["all_null", "also_all_null"],
"pct_null_rows": {"all_null": 1.0, "also_all_null": 1.0},
"null_row_indices": {
"all_null": [0, 1, 2, 3, 4],
"also_all_null": [0, 1, 2, 3, 4],
},
},
).to_dict(),
],
25 changes: 23 additions & 2 deletions evalml/tests/data_checks_tests/test_highly_null_data_check.py
@@ -96,6 +96,10 @@ def test_highly_null_data_check_warnings():
details={
"columns": ["lots_of_null", "all_null"],
"pct_null_rows": {"all_null": 1.0, "lots_of_null": 0.8},
"null_row_indices": {
"all_null": [0, 1, 2, 3, 4],
"lots_of_null": [0, 1, 2, 3],
},
},
).to_dict(),
],
@@ -134,6 +138,10 @@ def test_highly_null_data_check_warnings():
details={
"columns": ["lots_of_null", "all_null"],
"pct_null_rows": {"all_null": 1.0, "lots_of_null": 0.8},
"null_row_indices": {
"all_null": [0, 1, 2, 3, 4],
"lots_of_null": [0, 1, 2, 3],
},
},
).to_dict(),
],
@@ -158,7 +166,11 @@ def test_highly_null_data_check_warnings():
message="Columns 'all_null' are 100.0% or more null",
data_check_name=highly_null_data_check_name,
message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
details={"columns": ["all_null"], "pct_null_rows": {"all_null": 1.0}},
details={
"columns": ["all_null"],
"pct_null_rows": {"all_null": 1.0},
"null_row_indices": {"all_null": [0, 1, 2, 3, 4]},
},
).to_dict()
],
"errors": [],
@@ -198,7 +210,11 @@ def test_highly_null_data_check_separate_rows_cols():
message="Columns 'all_null' are 90.0% or more null",
data_check_name=highly_null_data_check_name,
message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
details={"columns": ["all_null"], "pct_null_rows": {"all_null": 1.0}},
details={
"columns": ["all_null"],
"pct_null_rows": {"all_null": 1.0},
"null_row_indices": {"all_null": [0, 1, 2, 3, 4]},
},
).to_dict(),
],
"errors": [],
@@ -225,6 +241,10 @@ def test_highly_null_data_check_separate_rows_cols():
details={
"columns": ["lots_of_null", "all_null"],
"pct_null_rows": {"lots_of_null": 0.8, "all_null": 1.0},
"null_row_indices": {
"all_null": [0, 1, 2, 3, 4],
"lots_of_null": [0, 1, 2, 3],
},
},
).to_dict(),
],
@@ -266,6 +286,7 @@ def test_highly_null_data_check_input_formats():
details={
"columns": [0, 1, 2],
"pct_null_rows": {0: 1.0, 1: 1.0, 2: 1.0},
"null_row_indices": {0: [0, 1], 1: [0, 1], 2: [0, 1]},
},
).to_dict(),
],
32 changes: 32 additions & 0 deletions evalml/tests/data_checks_tests/test_outliers_data_check.py
@@ -2,6 +2,7 @@

import numpy as np
import pandas as pd
import pytest

from evalml.data_checks import (
DataCheckAction,
@@ -226,3 +227,34 @@ def test_outliers_data_check_warnings_has_nan():
).to_dict()
],
}


@pytest.mark.parametrize("data_type", ["int", "mixed"])
def test_boxplot_stats(data_type):
test = pd.Series(
[32, 33, 34, None, 96, 36, 37, 1.5 if data_type == "mixed" else 1, 2]
)

quantiles = test.quantile([0.25, 0.5, 0.75]).to_dict()
iqr = quantiles[0.75] - quantiles[0.25]
field_bounds = (quantiles[0.25] - (iqr * 1.5), quantiles[0.75] + (iqr * 1.5))
pct_outliers = (
len(test[test <= field_bounds[0]].tolist())
+ len(test[test >= field_bounds[1]].tolist())
) / test.count()

assert OutliersDataCheck.get_boxplot_data(test) == {
"score": OutliersDataCheck._no_outlier_prob(test.count(), pct_outliers),
"pct_outliers": pct_outliers,
"values": {
"q1": quantiles[0.25],
"median": quantiles[0.5],
"q3": quantiles[0.75],
"low_bound": field_bounds[0],
"high_bound": field_bounds[1],
"low_values": test[test < field_bounds[0]].tolist(),
"high_values": test[test > field_bounds[1]].tolist(),
"low_indices": test[test < field_bounds[0]].index.tolist(),
"high_indices": test[test > field_bounds[1]].index.tolist(),
},
}
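As a worked check of the "int" parametrization above: the non-null values are [1, 2, 32, 33, 34, 36, 37, 96] (count 8), so with pandas' default linear-interpolation quantiles q1 = 24.5, median = 33.5, and q3 = 36.25. That gives IQR = 11.75, fences at 24.5 - 17.625 = 6.875 and 36.25 + 17.625 = 53.875, low outliers [1, 2], a high outlier [96], and pct_outliers = 3 / 8 = 0.375.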