Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
684e419
Add highly null row data check
jeremyliweishih May 3, 2021
49f55fb
Merge branch 'main' of github.com:alteryx/evalml into js_2220_row_dat…
jeremyliweishih May 3, 2021
6182bad
RL
jeremyliweishih May 3, 2021
03cb3b7
Add to API reference
jeremyliweishih May 3, 2021
faa569f
Fix doctest
jeremyliweishih May 4, 2021
7918034
Merge branch 'main' of github.com:alteryx/evalml into js_2220_row_dat…
jeremyliweishih May 4, 2021
752b2c4
Add to default datachecks
jeremyliweishih May 4, 2021
1cc5bf3
Merge branch 'main' of github.com:alteryx/evalml into js_2220_row_dat…
jeremyliweishih May 4, 2021
85a540e
Fix tests
jeremyliweishih May 4, 2021
ffa7b5a
Fix docstring
jeremyliweishih May 5, 2021
813fa4f
Merge branch 'main' of github.com:alteryx/evalml into js_2220_row_dat…
jeremyliweishih May 5, 2021
2a3a1ef
Fix RL
jeremyliweishih May 5, 2021
41dc0fb
merge > 0 and 0 case
jeremyliweishih May 5, 2021
4e8139e
Bump default to 0.75
jeremyliweishih May 5, 2021
823ee83
Test with None and np.nan
jeremyliweishih May 5, 2021
2bdc3a3
Merge branch 'main' of github.com:alteryx/evalml into js_2220_row_dat…
jeremyliweishih May 10, 2021
26d12fd
Simplify highly null data check
jeremyliweishih May 10, 2021
b4e7e88
Combine row and cols check
jeremyliweishih May 10, 2021
c65a1e7
Fix default datacheck tests
jeremyliweishih May 10, 2021
23bea89
Fix removals
jeremyliweishih May 10, 2021
5e366f2
Add null row example to default data checks
jeremyliweishih May 10, 2021
1aab69a
Merge branch 'main' of github.com:alteryx/evalml into js_2220_row_dat…
jeremyliweishih May 13, 2021
8c96390
Merge branch 'main' into js_2220_row_datacheck
jeremyliweishih May 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Release Notes
**Future Releases**
* Enhancements
* Integrated ``ARIMARegressor`` into AutoML :pr:`2009`
* Updated ``HighlyNullDataCheck`` to also perform a null row check :pr:`2222`
* Set ``max_depth`` to 1 in calls to featuretools dfs :pr:`2231`
* Fixes
* Removed data splitter sampler calls during training :pr:`2253`
Expand Down
3 changes: 3 additions & 0 deletions evalml/data_checks/data_check_action_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,8 @@ class DataCheckActionCode(Enum):
DROP_COL = "drop_col"
"""Action code for dropping a column."""

DROP_ROWS = "drop_rows"
"""Action code for dropping rows."""

IMPUTE_COL = "impute_col"
"""Action code for imputing a column."""
5 changes: 4 additions & 1 deletion evalml/data_checks/data_check_message_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
class DataCheckMessageCode(Enum):
"""Enum for data check message code."""

HIGHLY_NULL = "highly_null"
HIGHLY_NULL_COLS = "highly_null_cols"
"""Message code for highly null columns."""

HIGHLY_NULL_ROWS = "highly_null_rows"
"""Message code for highly null rows."""

HAS_ID_COLUMN = "has_id_column"
"""Message code for data that has ID columns."""

Expand Down
1 change: 1 addition & 0 deletions evalml/data_checks/default_data_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class DefaultDataChecks(DataChecks):
Includes:

- `HighlyNullDataCheck` (checks for both highly-null columns and highly-null rows)
- `IDColumnsDataCheck`
- `TargetLeakageDataCheck`
- `InvalidTargetDataCheck`
Expand Down
83 changes: 50 additions & 33 deletions evalml/data_checks/highly_null_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,45 +9,62 @@


class HighlyNullDataCheck(DataCheck):
"""Checks if there are any highly-null columns in the input."""
"""Checks if there are any highly-null columns and rows in the input."""

def __init__(self, pct_null_threshold=0.95):
"""Checks if there are any highly-null columns in the input.
"""Checks if there are any highly-null columns and rows in the input.

Arguments:
pct_null_threshold(float): If the percentage of NaN values in a column or row is greater than or equal to this amount,
that feature will be considered highly-null. Defaults to 0.95.
that column/row will be considered highly-null. Defaults to 0.95.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I do think it would be helpful to have a separate threshold for rows vs columns.

I filed #2270 to track that separately.


"""
if pct_null_threshold < 0 or pct_null_threshold > 1:
raise ValueError("pct_null_threshold must be a float between 0 and 1, inclusive.")
self.pct_null_threshold = pct_null_threshold

def validate(self, X, y=None):
"""Checks if there are any highly-null columns in the input.
"""Checks if there are any highly-null columns or rows in the input.

Arguments:
X (ww.DataTable, pd.DataFrame, np.ndarray): Features
X (ww.DataTable, pd.DataFrame, np.ndarray): Data
y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

Returns:
dict: dict with a DataCheckWarning if there are any highly-null columns.
dict: dict with a DataCheckWarning if there are any highly-null columns or rows.

Example:
>>> import pandas as pd
>>> class SeriesWrap():
... def __init__(self, series):
... self.series = series
...
... def __eq__(self, series_2):
... return all(self.series.eq(series_2.series))
...
>>> df = pd.DataFrame({
... 'lots_of_null': [None, None, None, None, 5],
... 'no_null': [1, 2, 3, 4, 5]
... })
>>> null_check = HighlyNullDataCheck(pct_null_threshold=0.8)
>>> assert null_check.validate(df) == {"errors": [],\
"warnings": [{"message": "Column 'lots_of_null' is 80.0% or more null",\
"data_check_name": "HighlyNullDataCheck",\
"level": "warning",\
"code": "HIGHLY_NULL",\
"details": {"column": "lots_of_null", "pct_null_rows": 0.8}}],\
"actions": [{"code": "DROP_COL",\
"metadata": {"column": "lots_of_null"}}]}
>>> null_check = HighlyNullDataCheck(pct_null_threshold=0.50)
>>> validation_results = null_check.validate(df)
>>> validation_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validation_results['warnings'][0]['details']['pct_null_cols'])
>>> highly_null_rows = SeriesWrap(pd.Series([0.5, 0.5, 0.5, 0.5]))
>>> assert validation_results== {"errors": [],\
"warnings": [{"message": "4 out of 5 rows are more than 50.0% null",\
"data_check_name": "HighlyNullDataCheck",\
"level": "warning",\
"code": "HIGHLY_NULL_ROWS",\
"details": {"pct_null_cols": highly_null_rows}},\
{"message": "Column 'lots_of_null' is 50.0% or more null",\
"data_check_name": "HighlyNullDataCheck",\
"level": "warning",\
"code": "HIGHLY_NULL_COLS",\
"details": {"column": "lots_of_null", "pct_null_rows": 0.8}}],\
"actions": [{"code": "DROP_ROWS", "metadata": {"rows": [0, 1, 2, 3]}},\
{"code": "DROP_COL",\
"metadata": {"column": "lots_of_null"}}]}

"""
results = {
"warnings": [],
Expand All @@ -58,25 +75,25 @@ def validate(self, X, y=None):
X = infer_feature_types(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())

percent_null = (X.isnull().mean()).to_dict()
highly_null_cols = []
if self.pct_null_threshold == 0.0:
highly_null_cols = {key: value for key, value in percent_null.items() if value > 0.0}
warning_msg = "Column '{}' is more than 0% null"
results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name),
data_check_name=self.name,
message_code=DataCheckMessageCode.HIGHLY_NULL,
details={"column": col_name, "pct_null_rows": highly_null_cols[col_name]}).to_dict()
for col_name in highly_null_cols])
else:
highly_null_cols = {key: value for key, value in percent_null.items() if value >= self.pct_null_threshold}
warning_msg = "Column '{}' is {}% or more null"
results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name, self.pct_null_threshold * 100),
data_check_name=self.name,
message_code=DataCheckMessageCode.HIGHLY_NULL,
details={"column": col_name, "pct_null_rows": highly_null_cols[col_name]}).to_dict()
for col_name in highly_null_cols])
percent_null_rows = X.isnull().mean(axis=1)
highly_null_rows = percent_null_rows[percent_null_rows >= self.pct_null_threshold]
if len(highly_null_rows) > 0:
warning_msg = f"{len(highly_null_rows)} out of {len(X)} rows are more than {self.pct_null_threshold*100}% null"
results["warnings"].append(DataCheckWarning(message=warning_msg,
data_check_name=self.name,
message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
details={"pct_null_cols": highly_null_rows}).to_dict())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Looks great

results["actions"].append(DataCheckAction(DataCheckActionCode.DROP_ROWS,
metadata={"rows": highly_null_rows.index.tolist()}).to_dict())

percent_null_cols = (X.isnull().mean()).to_dict()
highly_null_cols = {key: value for key, value in percent_null_cols.items() if value >= self.pct_null_threshold and value != 0}
warning_msg = "Column '{}' is {}% or more null"
results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name, self.pct_null_threshold * 100),
data_check_name=self.name,
message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
details={"column": col_name, "pct_null_rows": highly_null_cols[col_name]}).to_dict()
for col_name in highly_null_cols])
results["actions"].extend([DataCheckAction(DataCheckActionCode.DROP_COL,
metadata={"column": col_name}).to_dict()
for col_name in highly_null_cols])
Expand Down
26 changes: 13 additions & 13 deletions evalml/tests/data_checks_tests/test_data_check_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,31 @@
def data_check_message():
return DataCheckMessage(message="test message",
data_check_name="test data check message name",
message_code=DataCheckMessageCode.HIGHLY_NULL,
message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
details={"message detail": "some message detail"})


@pytest.fixture
def data_check_warning():
return DataCheckWarning(message="test warning",
data_check_name="test data check warning name",
message_code=DataCheckMessageCode.HIGHLY_NULL,
message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
details={"warning detail": "some warning detail"})


@pytest.fixture
def data_check_error():
return DataCheckError(message="test error",
data_check_name="test data check error name",
message_code=DataCheckMessageCode.HIGHLY_NULL,
message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
details={"error detail": "some error detail"})


def test_data_check_message_attributes(data_check_message):
assert data_check_message.message == "test message"
assert data_check_message.data_check_name == "test data check message name"
assert data_check_message.message_type is None
assert data_check_message.message_code == DataCheckMessageCode.HIGHLY_NULL
assert data_check_message.message_code == DataCheckMessageCode.HIGHLY_NULL_COLS
assert data_check_message.details == {"message detail": "some message detail"}


Expand All @@ -46,7 +46,7 @@ def test_data_check_message_str(data_check_message):


def test_data_check_message_eq(data_check_message):
equal_msg = DataCheckMessage("test message", "test data check message name", DataCheckMessageCode.HIGHLY_NULL, {"message detail": "some message detail"})
equal_msg = DataCheckMessage("test message", "test data check message name", DataCheckMessageCode.HIGHLY_NULL_COLS, {"message detail": "some message detail"})
assert data_check_message == equal_msg

equal_msg = DataCheckMessage("different test message", "different test data check message name")
Expand All @@ -57,7 +57,7 @@ def test_data_check_warning_attributes(data_check_warning):
assert data_check_warning.message == "test warning"
assert data_check_warning.data_check_name == "test data check warning name"
assert data_check_warning.message_type == DataCheckMessageType.WARNING
assert data_check_warning.message_code == DataCheckMessageCode.HIGHLY_NULL
assert data_check_warning.message_code == DataCheckMessageCode.HIGHLY_NULL_COLS
assert data_check_warning.details == {"warning detail": "some warning detail"}


Expand All @@ -66,7 +66,7 @@ def test_data_check_warning_str(data_check_warning):


def test_data_check_warning_eq(data_check_warning):
equal_msg = DataCheckWarning("test warning", "test data check warning name", DataCheckMessageCode.HIGHLY_NULL, {"warning detail": "some warning detail"})
equal_msg = DataCheckWarning("test warning", "test data check warning name", DataCheckMessageCode.HIGHLY_NULL_COLS, {"warning detail": "some warning detail"})
assert data_check_warning == equal_msg

equal_msg = DataCheckWarning("different test warning", "different test data check warning name")
Expand All @@ -77,7 +77,7 @@ def test_data_check_error_attributes(data_check_error):
assert data_check_error.message == "test error"
assert data_check_error.data_check_name == "test data check error name"
assert data_check_error.message_type == DataCheckMessageType.ERROR
assert data_check_error.message_code == DataCheckMessageCode.HIGHLY_NULL
assert data_check_error.message_code == DataCheckMessageCode.HIGHLY_NULL_COLS
assert data_check_error.details == {"error detail": "some error detail"}


Expand All @@ -86,7 +86,7 @@ def test_data_check_error_str(data_check_error):


def test_data_check_error_eq(data_check_error):
equal_msg = DataCheckError("test error", "test data check error name", DataCheckMessageCode.HIGHLY_NULL, {"error detail": "some error detail"})
equal_msg = DataCheckError("test error", "test data check error name", DataCheckMessageCode.HIGHLY_NULL_COLS, {"error detail": "some error detail"})
assert data_check_error == equal_msg

equal_msg = DataCheckError("different test warning", "different test data check error name")
Expand Down Expand Up @@ -120,24 +120,24 @@ def test_warning_error_eq():
def test_data_check_message_to_dict():
error = DataCheckError(message="test message",
data_check_name="same test name",
message_code=DataCheckMessageCode.HIGHLY_NULL,
message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
details={"detail 1": "error info"})
assert error.to_dict() == {
"message": "test message",
"level": "error",
"data_check_name": "same test name",
"code": DataCheckMessageCode.HIGHLY_NULL.name,
"code": DataCheckMessageCode.HIGHLY_NULL_COLS.name,
"details": {"detail 1": "error info"}
}
warning = DataCheckWarning(message="test message",
data_check_name="same test name",
message_code=DataCheckMessageCode.HIGHLY_NULL,
message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
details={"detail 1": "warning info"})
assert warning.to_dict() == {
"message": "test message",
"level": "warning",
"data_check_name": "same test name",
"code": DataCheckMessageCode.HIGHLY_NULL.name,
"code": DataCheckMessageCode.HIGHLY_NULL_COLS.name,
"details": {"detail 1": "warning info"}
}

Expand Down
Loading