Merged
Changes from all commits (23 commits)
2cb87e7 - updates (ParthivNaresh, Oct 3, 2021)
d05bbba - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 6, 2021)
8681b54 - updates (ParthivNaresh, Oct 14, 2021)
5bb8106 - release notes (ParthivNaresh, Oct 14, 2021)
af130d2 - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 14, 2021)
456f7d1 - release notes (ParthivNaresh, Oct 14, 2021)
3bbc85b - release notes (ParthivNaresh, Oct 14, 2021)
f4137b3 - rely on statsmodels for medcouple (ParthivNaresh, Oct 14, 2021)
22f912b - docstring update and import module in function (ParthivNaresh, Oct 14, 2021)
f492f0b - lint fixes (ParthivNaresh, Oct 14, 2021)
1dd7314 - update to reflect nulls being counted as values in no variance data c… (ParthivNaresh, Oct 14, 2021)
5b056c4 - set bounds to woodwork approach (ParthivNaresh, Oct 14, 2021)
6226441 - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 14, 2021)
77aad04 - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 15, 2021)
a8dd18c - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 20, 2021)
a908d04 - changes (ParthivNaresh, Oct 20, 2021)
c278d53 - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 21, 2021)
a1a3000 - Merge branch 'Match-DataHealth-Functions' of https://github.com/alter… (ParthivNaresh, Oct 21, 2021)
354784d - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 21, 2021)
11edf48 - no message (ParthivNaresh, Oct 21, 2021)
7ec0a05 - update returned data (ParthivNaresh, Oct 21, 2021)
8e95d51 - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 21, 2021)
edd73a4 - Merge branch 'main' into Match-DataHealth-Functions (ParthivNaresh, Oct 21, 2021)
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -13,6 +13,7 @@ Release Notes
* Fixed ``SelectColumns`` to only select available features for feature selection in ``DefaultAlgorithm`` :pr:`2944`
* Changes
* Changed ``make_pipeline`` function to place the ``DateTimeFeaturizer`` prior to the ``Imputer`` so that ``NaN`` dates can be imputed :pr:`2909`
* Refactored ``OutliersDataCheck`` and ``HighlyNullDataCheck`` to add more descriptive metadata :pr:`2907`
* Documentation Changes
* Added back Future Release section to release notes :pr:`2927`
* Updated CI to run doctest (docstring tests) and apply necessary fixes to docstrings :pr:`2933`
6 changes: 5 additions & 1 deletion evalml/data_checks/highly_null_data_check.py
@@ -71,7 +71,7 @@ def validate(self, X, y=None):
... "data_check_name": "HighlyNullDataCheck",
... "level": "warning",
... "code": "HIGHLY_NULL_COLS",
... "details": {"columns": ["lots_of_null"], "pct_null_rows": {"lots_of_null": 0.8}, "rows": None}}],
... "details": {"columns": ["lots_of_null"], "pct_null_rows": {"lots_of_null": 0.8}, "null_row_indices": {"lots_of_null": [0, 1, 2, 3]}, "rows": None}}],
... "actions": [{"code": "DROP_ROWS", "metadata": {"rows": [0, 1, 2, 3], "columns": None}},
... {"code": "DROP_COL", "metadata": {"columns": ["lots_of_null"], "rows": None}}]}
"""
@@ -109,6 +109,9 @@ def validate(self, X, y=None):
for key, value in percent_null_cols.items()
if value >= self.pct_null_col_threshold and value != 0
}
highly_null_cols_indices = {
col_: X[col_][X[col_].isnull()].index.tolist() for col_ in highly_null_cols
}
warning_msg = "Columns {} are {}% or more null"
if highly_null_cols:
results["warnings"].append(
Expand All @@ -124,6 +127,7 @@ def validate(self, X, y=None):
details={
"columns": list(highly_null_cols),
"pct_null_rows": highly_null_cols,
"null_row_indices": highly_null_cols_indices,
},
).to_dict()
)
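To make the new null_row_indices detail concrete, here is a minimal consumer sketch (not part of this diff): the DataFrame contents and the 0.8 threshold are invented, the pct_null_col_threshold keyword is inferred from the attribute used in validate above, and it assumes HighlyNullDataCheck is importable from evalml.data_checks as in the tests further down.

import pandas as pd
from evalml.data_checks import HighlyNullDataCheck

X = pd.DataFrame({
    "lots_of_null": [None, None, None, None, 5],
    "no_null": [1, 2, 3, 4, 5],
})

results = HighlyNullDataCheck(pct_null_col_threshold=0.8).validate(X)
warning = results["warnings"][0]

# New in this change: per-column indices of the null rows, alongside pct_null_rows.
null_rows = warning["details"]["null_row_indices"]["lots_of_null"]  # expected [0, 1, 2, 3]
X_dropped = X.drop(index=null_rows)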
59 changes: 48 additions & 11 deletions evalml/data_checks/outliers_data_check.py
@@ -56,18 +56,15 @@ def validate(self, X, y=None):
has_outliers = []
outlier_row_indices = {}
for col in X.columns:
box_plot_dict = X.ww[col].ww.box_plot_dict()
num_records = len(X[col])
pct_outliers = (
len(box_plot_dict["low_values"]) + len(box_plot_dict["high_values"])
) / num_records
if (
pct_outliers > 0
and OutliersDataCheck._no_outlier_prob(num_records, pct_outliers) <= 0.9
):
box_plot_dict = OutliersDataCheck.get_boxplot_data(X.ww[col])
box_plot_dict_values = box_plot_dict["values"]

pct_outliers = box_plot_dict["pct_outliers"]
if pct_outliers > 0 and box_plot_dict["score"] <= 0.9:
has_outliers.append(col)
outlier_row_indices[col] = (
box_plot_dict["low_indices"] + box_plot_dict["high_indices"]
box_plot_dict_values["low_indices"]
+ box_plot_dict_values["high_indices"]
)

if not len(has_outliers):
@@ -103,6 +100,46 @@
)
return results

@staticmethod
def get_boxplot_data(data_):
"""Returns box plot information for the given data.

Args:
data_ (pd.Series, np.ndarray): Input data.

Returns:
dict: A payload of box plot statistics.
"""
if not data_.ww._schema:
data_.ww.init()
num_records = data_.count()
box_plot_dict = data_.ww.box_plot_dict()
quantiles = box_plot_dict["quantiles"]

q1, q2, q3 = quantiles[0.25], quantiles[0.5], quantiles[0.75]

pct_outliers = (
len(box_plot_dict["low_values"]) + len(box_plot_dict["high_values"])
) / num_records
score = OutliersDataCheck._no_outlier_prob(num_records, pct_outliers)

payload = {
"score": score,
"pct_outliers": pct_outliers,
"values": {
"q1": q1,
"median": q2,
"q3": q3,
"low_bound": box_plot_dict["low_bound"],
"high_bound": box_plot_dict["high_bound"],
"low_values": box_plot_dict["low_values"],
"high_values": box_plot_dict["high_values"],
"low_indices": box_plot_dict["low_indices"],
"high_indices": box_plot_dict["high_indices"],
},
}
return payload

@staticmethod
def _no_outlier_prob(num_records: int, pct_outliers: float) -> float:
"""Calculate the probability that there are no true outliers in a numeric (integer or float) column.
@@ -148,7 +185,7 @@ def _no_outlier_prob(num_records: int, pct_outliers: float) -> float:
shape_param = np.exp(log_shape)
log_scale = (
-19.8196822259052
+ 8.5359212447622 * log_n
+ 18.5359212447622 * log_n
ParthivNaresh (Contributor, Author) commented on Oct 14, 2021:
Not sure if we missed this before on purpose, but this was part of the original log scale.

+ -8.80487628113388 * log_n ** 2
+ 2.27711870991327 * log_n ** 3
+ -0.344443407676357 * log_n ** 4
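For a rough sense of what this coefficient change does, assuming log_n is the natural log of the record count and that log_scale is exponentiated into the Weibull scale parameter the same way log_shape is above: moving from 8.536 * log_n to 18.536 * log_n adds 10 * log_n to log_scale, i.e. it multiplies the fitted scale by roughly n^10, so the correction grows quickly with column length.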
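As a rough usage sketch of the get_boxplot_data helper added above (not from the PR itself): the series values are invented, and it assumes OutliersDataCheck is importable from evalml.data_checks, with woodwork available, as in the tests below.

import pandas as pd
from evalml.data_checks import OutliersDataCheck

series = pd.Series([1, 2, 32, 33, 34, 36, 37, 96])
payload = OutliersDataCheck.get_boxplot_data(series)

# Top-level keys introduced by this change: "score", "pct_outliers", and "values".
print(payload["pct_outliers"])            # fraction of points outside the IQR fences
print(payload["score"])                   # output of _no_outlier_prob for this column
print(payload["values"]["high_values"])   # outlying values, with matching "high_indices"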
8 changes: 8 additions & 0 deletions evalml/tests/data_checks_tests/test_data_checks.py
@@ -119,6 +119,10 @@ def validate(self, X, y):
details={
"columns": ["all_null", "also_all_null"],
"pct_null_rows": {"all_null": 1.0, "also_all_null": 1.0},
"null_row_indices": {
"all_null": [0, 1, 2, 3, 4],
"also_all_null": [0, 1, 2, 3, 4],
},
},
).to_dict(),
DataCheckWarning(
@@ -434,6 +438,10 @@ def __eq__(self, series_2):
details={
"columns": ["all_null", "also_all_null"],
"pct_null_rows": {"all_null": 1.0, "also_all_null": 1.0},
"null_row_indices": {
"all_null": [0, 1, 2, 3, 4],
"also_all_null": [0, 1, 2, 3, 4],
},
},
).to_dict(),
],
25 changes: 23 additions & 2 deletions evalml/tests/data_checks_tests/test_highly_null_data_check.py
@@ -96,6 +96,10 @@ def test_highly_null_data_check_warnings():
details={
"columns": ["lots_of_null", "all_null"],
"pct_null_rows": {"all_null": 1.0, "lots_of_null": 0.8},
"null_row_indices": {
"all_null": [0, 1, 2, 3, 4],
"lots_of_null": [0, 1, 2, 3],
},
},
).to_dict(),
],
@@ -134,6 +138,10 @@ def test_highly_null_data_check_warnings():
details={
"columns": ["lots_of_null", "all_null"],
"pct_null_rows": {"all_null": 1.0, "lots_of_null": 0.8},
"null_row_indices": {
"all_null": [0, 1, 2, 3, 4],
"lots_of_null": [0, 1, 2, 3],
},
},
).to_dict(),
],
@@ -158,7 +166,11 @@ def test_highly_null_data_check_warnings():
message="Columns 'all_null' are 100.0% or more null",
data_check_name=highly_null_data_check_name,
message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
details={"columns": ["all_null"], "pct_null_rows": {"all_null": 1.0}},
details={
"columns": ["all_null"],
"pct_null_rows": {"all_null": 1.0},
"null_row_indices": {"all_null": [0, 1, 2, 3, 4]},
},
).to_dict()
],
"errors": [],
@@ -198,7 +210,11 @@ def test_highly_null_data_check_separate_rows_cols():
message="Columns 'all_null' are 90.0% or more null",
data_check_name=highly_null_data_check_name,
message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
details={"columns": ["all_null"], "pct_null_rows": {"all_null": 1.0}},
details={
"columns": ["all_null"],
"pct_null_rows": {"all_null": 1.0},
"null_row_indices": {"all_null": [0, 1, 2, 3, 4]},
},
).to_dict(),
],
"errors": [],
@@ -225,6 +241,10 @@ def test_highly_null_data_check_separate_rows_cols():
details={
"columns": ["lots_of_null", "all_null"],
"pct_null_rows": {"lots_of_null": 0.8, "all_null": 1.0},
"null_row_indices": {
"all_null": [0, 1, 2, 3, 4],
"lots_of_null": [0, 1, 2, 3],
},
},
).to_dict(),
],
@@ -266,6 +286,7 @@ def test_highly_null_data_check_input_formats():
details={
"columns": [0, 1, 2],
"pct_null_rows": {0: 1.0, 1: 1.0, 2: 1.0},
"null_row_indices": {0: [0, 1], 1: [0, 1], 2: [0, 1]},
},
).to_dict(),
],
32 changes: 32 additions & 0 deletions evalml/tests/data_checks_tests/test_outliers_data_check.py
@@ -2,6 +2,7 @@

import numpy as np
import pandas as pd
import pytest

from evalml.data_checks import (
DataCheckAction,
@@ -226,3 +227,34 @@ def test_outliers_data_check_warnings_has_nan():
).to_dict()
],
}


@pytest.mark.parametrize("data_type", ["int", "mixed"])
def test_boxplot_stats(data_type):
test = pd.Series(
[32, 33, 34, None, 96, 36, 37, 1.5 if data_type == "mixed" else 1, 2]
)

quantiles = test.quantile([0.25, 0.5, 0.75]).to_dict()
iqr = quantiles[0.75] - quantiles[0.25]
field_bounds = (quantiles[0.25] - (iqr * 1.5), quantiles[0.75] + (iqr * 1.5))
pct_outliers = (
len(test[test <= field_bounds[0]].tolist())
+ len(test[test >= field_bounds[1]].tolist())
) / test.count()

assert OutliersDataCheck.get_boxplot_data(test) == {
"score": OutliersDataCheck._no_outlier_prob(test.count(), pct_outliers),
"pct_outliers": pct_outliers,
"values": {
"q1": quantiles[0.25],
"median": quantiles[0.5],
"q3": quantiles[0.75],
"low_bound": field_bounds[0],
"high_bound": field_bounds[1],
"low_values": test[test < field_bounds[0]].tolist(),
"high_values": test[test > field_bounds[1]].tolist(),
"low_indices": test[test < field_bounds[0]].index.tolist(),
"high_indices": test[test > field_bounds[1]].index.tolist(),
},
}
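As a worked check of the "int" parametrization above: the non-null values are [1, 2, 32, 33, 34, 36, 37, 96] (count 8), so with pandas' default linear-interpolation quantiles q1 = 24.5, median = 33.5, and q3 = 36.25. That gives IQR = 11.75, fences at 24.5 - 17.625 = 6.875 and 36.25 + 17.625 = 53.875, low outliers [1, 2], a high outlier [96], and pct_outliers = 3 / 8 = 0.375.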