diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 1686c565fb..1192908a4c 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -3,6 +3,7 @@ Release Notes
 **Future Releases**
     * Enhancements
         * Integrated ``ARIMARegressor`` into AutoML :pr:`2009`
+        * Updated ``HighlyNullDataCheck`` to also perform a null row check :pr:`2222`
         * Set ``max_depth`` to 1 in calls to featuretools dfs :pr:`2231`
     * Fixes
         * Removed data splitter sampler calls during training :pr:`2253`
diff --git a/evalml/data_checks/data_check_action_code.py b/evalml/data_checks/data_check_action_code.py
index dc1b949bac..f2b13fca2d 100644
--- a/evalml/data_checks/data_check_action_code.py
+++ b/evalml/data_checks/data_check_action_code.py
@@ -7,5 +7,8 @@ class DataCheckActionCode(Enum):
     DROP_COL = "drop_col"
     """Action code for dropping a column."""
 
+    DROP_ROWS = "drop_rows"
+    """Action code for dropping rows."""
+
     IMPUTE_COL = "impute_col"
     """Action code for imputing a column."""
diff --git a/evalml/data_checks/data_check_message_code.py b/evalml/data_checks/data_check_message_code.py
index ac9efb5bc6..4329cdf377 100644
--- a/evalml/data_checks/data_check_message_code.py
+++ b/evalml/data_checks/data_check_message_code.py
@@ -4,9 +4,12 @@
 class DataCheckMessageCode(Enum):
     """Enum for data check message code."""
 
-    HIGHLY_NULL = "highly_null"
+    HIGHLY_NULL_COLS = "highly_null_cols"
     """Message code for highly null columns."""
 
+    HIGHLY_NULL_ROWS = "highly_null_rows"
+    """Message code for highly null rows."""
+
     HAS_ID_COLUMN = "has_id_column"
     """Message code for data that has ID columns."""
 
diff --git a/evalml/data_checks/default_data_checks.py b/evalml/data_checks/default_data_checks.py
index db7b57b574..95b13974fd 100644
--- a/evalml/data_checks/default_data_checks.py
+++ b/evalml/data_checks/default_data_checks.py
@@ -16,6 +16,7 @@ class DefaultDataChecks(DataChecks):
     Includes:
 
         - `HighlyNullDataCheck`
+        - `HighlyNullRowsDataCheck`
        - `IDColumnsDataCheck`
         - `TargetLeakageDataCheck`
         - `InvalidTargetDataCheck`
diff --git a/evalml/data_checks/highly_null_data_check.py b/evalml/data_checks/highly_null_data_check.py
index 8a7deca482..d74f5624b6 100644
--- a/evalml/data_checks/highly_null_data_check.py
+++ b/evalml/data_checks/highly_null_data_check.py
@@ -9,14 +9,14 @@
 
 
 class HighlyNullDataCheck(DataCheck):
-    """Checks if there are any highly-null columns in the input."""
+    """Checks if there are any highly-null columns and rows in the input."""
 
     def __init__(self, pct_null_threshold=0.95):
-        """Checks if there are any highly-null columns in the input.
+        """Checks if there are any highly-null columns and rows in the input.
 
         Arguments:
             pct_null_threshold(float): If the percentage of NaN values in an input feature exceeds this amount,
-                that feature will be considered highly-null. Defaults to 0.95.
+                that column/row will be considered highly-null. Defaults to 0.95.
 
         """
         if pct_null_threshold < 0 or pct_null_threshold > 1:
@@ -24,30 +24,47 @@ def __init__(self, pct_null_threshold=0.95):
         self.pct_null_threshold = pct_null_threshold
 
     def validate(self, X, y=None):
-        """Checks if there are any highly-null columns in the input.
+        """Checks if there are any highly-null columns or rows in the input.
 
         Arguments:
-            X (ww.DataTable, pd.DataFrame, np.ndarray): Features
+            X (ww.DataTable, pd.DataFrame, np.ndarray): Data
             y (ww.DataColumn, pd.Series, np.ndarray): Ignored.
 
         Returns:
-            dict: dict with a DataCheckWarning if there are any highly-null columns.
+            dict: dict with a DataCheckWarning if there are any highly-null columns or rows.
 
         Example:
             >>> import pandas as pd
+            >>> class SeriesWrap():
+            ...     def __init__(self, series):
+            ...         self.series = series
+            ...
+            ...     def __eq__(self, series_2):
+            ...         return all(self.series.eq(series_2.series))
+            ...
             >>> df = pd.DataFrame({
             ...    'lots_of_null': [None, None, None, None, 5],
             ...    'no_null': [1, 2, 3, 4, 5]
             ... })
-            >>> null_check = HighlyNullDataCheck(pct_null_threshold=0.8)
-            >>> assert null_check.validate(df) == {"errors": [],\
-                                                   "warnings": [{"message": "Column 'lots_of_null' is 80.0% or more null",\
-                                                                 "data_check_name": "HighlyNullDataCheck",\
-                                                                 "level": "warning",\
-                                                                 "code": "HIGHLY_NULL",\
-                                                                 "details": {"column": "lots_of_null", "pct_null_rows": 0.8}}],\
-                                                   "actions": [{"code": "DROP_COL",\
-                                                                "metadata": {"column": "lots_of_null"}}]}
+            >>> null_check = HighlyNullDataCheck(pct_null_threshold=0.50)
+            >>> validation_results = null_check.validate(df)
+            >>> validation_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validation_results['warnings'][0]['details']['pct_null_cols'])
+            >>> highly_null_rows = SeriesWrap(pd.Series([0.5, 0.5, 0.5, 0.5]))
+            >>> assert validation_results == {"errors": [],\
+                                              "warnings": [{"message": "4 out of 5 rows are more than 50.0% null",\
+                                                            "data_check_name": "HighlyNullDataCheck",\
+                                                            "level": "warning",\
+                                                            "code": "HIGHLY_NULL_ROWS",\
+                                                            "details": {"pct_null_cols": highly_null_rows}},\
+                                                           {"message": "Column 'lots_of_null' is 50.0% or more null",\
+                                                            "data_check_name": "HighlyNullDataCheck",\
+                                                            "level": "warning",\
+                                                            "code": "HIGHLY_NULL_COLS",\
+                                                            "details": {"column": "lots_of_null", "pct_null_rows": 0.8}}],\
+                                              "actions": [{"code": "DROP_ROWS", "metadata": {"rows": [0, 1, 2, 3]}},\
+                                                          {"code": "DROP_COL",\
+                                                           "metadata": {"column": "lots_of_null"}}]}
+
         """
         results = {
             "warnings": [],
@@ -58,25 +75,25 @@ def validate(self, X, y=None):
         X = infer_feature_types(X)
         X = _convert_woodwork_types_wrapper(X.to_dataframe())
 
-        percent_null = (X.isnull().mean()).to_dict()
-        highly_null_cols = []
-        if self.pct_null_threshold == 0.0:
-            highly_null_cols = {key: value for key, value in percent_null.items() if value > 0.0}
-            warning_msg = "Column '{}' is more than 0% null"
-            results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name),
-                                                         data_check_name=self.name,
-                                                         message_code=DataCheckMessageCode.HIGHLY_NULL,
-                                                         details={"column": col_name, "pct_null_rows": highly_null_cols[col_name]}).to_dict()
-                                        for col_name in highly_null_cols])
-        else:
-            highly_null_cols = {key: value for key, value in percent_null.items() if value >= self.pct_null_threshold}
-            warning_msg = "Column '{}' is {}% or more null"
-            results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name, self.pct_null_threshold * 100),
-                                                         data_check_name=self.name,
-                                                         message_code=DataCheckMessageCode.HIGHLY_NULL,
-                                                         details={"column": col_name, "pct_null_rows": highly_null_cols[col_name]}).to_dict()
-                                        for col_name in highly_null_cols])
+        percent_null_rows = X.isnull().mean(axis=1)
+        highly_null_rows = percent_null_rows[percent_null_rows >= self.pct_null_threshold]
+        if len(highly_null_rows) > 0:
+            warning_msg = f"{len(highly_null_rows)} out of {len(X)} rows are more than {self.pct_null_threshold*100}% null"
+            results["warnings"].append(DataCheckWarning(message=warning_msg,
+                                                        data_check_name=self.name,
+                                                        message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
+                                                        details={"pct_null_cols": highly_null_rows}).to_dict())
+            results["actions"].append(DataCheckAction(DataCheckActionCode.DROP_ROWS,
+                                                      metadata={"rows": highly_null_rows.index.tolist()}).to_dict())
+        percent_null_cols = (X.isnull().mean()).to_dict()
+        highly_null_cols = {key: value for key, value in percent_null_cols.items() if value >= self.pct_null_threshold and value != 0}
+        warning_msg = "Column '{}' is {}% or more null"
+        results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name, self.pct_null_threshold * 100),
+                                                     data_check_name=self.name,
+                                                     message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
+                                                     details={"column": col_name, "pct_null_rows": highly_null_cols[col_name]}).to_dict()
+                                    for col_name in highly_null_cols])
 
         results["actions"].extend([DataCheckAction(DataCheckActionCode.DROP_COL,
                                                    metadata={"column": col_name}).to_dict()
                                    for col_name in highly_null_cols])
diff --git a/evalml/tests/data_checks_tests/test_data_check_message.py b/evalml/tests/data_checks_tests/test_data_check_message.py
index b9a8112801..96cd24e493 100644
--- a/evalml/tests/data_checks_tests/test_data_check_message.py
+++ b/evalml/tests/data_checks_tests/test_data_check_message.py
@@ -13,7 +13,7 @@ def data_check_message():
     return DataCheckMessage(message="test message",
                             data_check_name="test data check message name",
-                            message_code=DataCheckMessageCode.HIGHLY_NULL,
+                            message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                             details={"message detail": "some message detail"})
 
 
@@ -21,7 +21,7 @@ def data_check_message():
 def data_check_warning():
     return DataCheckWarning(message="test warning",
                             data_check_name="test data check warning name",
-                            message_code=DataCheckMessageCode.HIGHLY_NULL,
+                            message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                             details={"warning detail": "some warning detail"})
 
 
@@ -29,7 +29,7 @@ def data_check_warning():
 def data_check_error():
     return DataCheckError(message="test error",
                           data_check_name="test data check error name",
-                          message_code=DataCheckMessageCode.HIGHLY_NULL,
+                          message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                           details={"error detail": "some error detail"})
 
 
@@ -37,7 +37,7 @@ def test_data_check_message_attributes(data_check_message):
     assert data_check_message.message == "test message"
     assert data_check_message.data_check_name == "test data check message name"
     assert data_check_message.message_type is None
-    assert data_check_message.message_code == DataCheckMessageCode.HIGHLY_NULL
+    assert data_check_message.message_code == DataCheckMessageCode.HIGHLY_NULL_COLS
     assert data_check_message.details == {"message detail": "some message detail"}
 
 
@@ -46,7 +46,7 @@ def test_data_check_message_str(data_check_message):
 
 
 def test_data_check_message_eq(data_check_message):
-    equal_msg = DataCheckMessage("test message", "test data check message name", DataCheckMessageCode.HIGHLY_NULL, {"message detail": "some message detail"})
+    equal_msg = DataCheckMessage("test message", "test data check message name", DataCheckMessageCode.HIGHLY_NULL_COLS, {"message detail": "some message detail"})
     assert data_check_message == equal_msg
 
     equal_msg = DataCheckMessage("different test message", "different test data check message name")
@@ -57,7 +57,7 @@ def test_data_check_warning_attributes(data_check_warning):
     assert data_check_warning.message == "test warning"
     assert data_check_warning.data_check_name == "test data check warning name"
     assert data_check_warning.message_type == DataCheckMessageType.WARNING
-    assert data_check_warning.message_code == DataCheckMessageCode.HIGHLY_NULL
+    assert data_check_warning.message_code == DataCheckMessageCode.HIGHLY_NULL_COLS
     assert data_check_warning.details == {"warning detail": "some warning detail"}
 
 
@@ -66,7 +66,7 @@ def test_data_check_warning_str(data_check_warning):
 
 
 def test_data_check_warning_eq(data_check_warning):
-    equal_msg = DataCheckWarning("test warning", "test data check warning name", DataCheckMessageCode.HIGHLY_NULL, {"warning detail": "some warning detail"})
+    equal_msg = DataCheckWarning("test warning", "test data check warning name", DataCheckMessageCode.HIGHLY_NULL_COLS, {"warning detail": "some warning detail"})
     assert data_check_warning == equal_msg
 
     equal_msg = DataCheckWarning("different test warning", "different test data check warning name")
@@ -77,7 +77,7 @@ def test_data_check_error_attributes(data_check_error):
     assert data_check_error.message == "test error"
     assert data_check_error.data_check_name == "test data check error name"
     assert data_check_error.message_type == DataCheckMessageType.ERROR
-    assert data_check_error.message_code == DataCheckMessageCode.HIGHLY_NULL
+    assert data_check_error.message_code == DataCheckMessageCode.HIGHLY_NULL_COLS
     assert data_check_error.details == {"error detail": "some error detail"}
 
 
@@ -86,7 +86,7 @@ def test_data_check_error_str(data_check_error):
 
 
 def test_data_check_error_eq(data_check_error):
-    equal_msg = DataCheckError("test error", "test data check error name", DataCheckMessageCode.HIGHLY_NULL, {"error detail": "some error detail"})
+    equal_msg = DataCheckError("test error", "test data check error name", DataCheckMessageCode.HIGHLY_NULL_COLS, {"error detail": "some error detail"})
     assert data_check_error == equal_msg
 
     equal_msg = DataCheckError("different test warning", "different test data check error name")
@@ -120,24 +120,24 @@ def test_warning_error_eq():
 def test_data_check_message_to_dict():
     error = DataCheckError(message="test message",
                            data_check_name="same test name",
-                           message_code=DataCheckMessageCode.HIGHLY_NULL,
+                           message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                            details={"detail 1": "error info"})
     assert error.to_dict() == {
         "message": "test message",
         "level": "error",
         "data_check_name": "same test name",
-        "code": DataCheckMessageCode.HIGHLY_NULL.name,
+        "code": DataCheckMessageCode.HIGHLY_NULL_COLS.name,
         "details": {"detail 1": "error info"}
     }
 
     warning = DataCheckWarning(message="test message",
                                data_check_name="same test name",
-                               message_code=DataCheckMessageCode.HIGHLY_NULL,
+                               message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                details={"detail 1": "warning info"})
     assert warning.to_dict() == {
         "message": "test message",
         "level": "warning",
         "data_check_name": "same test name",
-        "code": DataCheckMessageCode.HIGHLY_NULL.name,
+        "code": DataCheckMessageCode.HIGHLY_NULL_COLS.name,
         "details": {"detail 1": "warning info"}
     }
diff --git a/evalml/tests/data_checks_tests/test_data_checks.py b/evalml/tests/data_checks_tests/test_data_checks.py
index cf3a44a19d..bdf1c52037 100644
--- a/evalml/tests/data_checks_tests/test_data_checks.py
+++ b/evalml/tests/data_checks_tests/test_data_checks.py
@@ -76,11 +76,11 @@ def test_empty_data_checks(input_type, X_y_binary):
 
     messages = [DataCheckWarning(message="Column 'all_null' is 95.0% or more null",
                                  data_check_name="HighlyNullDataCheck",
-                                 message_code=DataCheckMessageCode.HIGHLY_NULL,
+                                 message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                  details={"column": "all_null", "pct_null_rows": 1.0}).to_dict(),
                 DataCheckWarning(message="Column 'also_all_null' is 95.0% or more null",
                                  data_check_name="HighlyNullDataCheck",
-                                 message_code=DataCheckMessageCode.HIGHLY_NULL,
+                                 message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                  details={"column": "also_all_null", "pct_null_rows": 1.0}).to_dict(),
                 DataCheckWarning(message="Column 'id' is 100.0% or more likely to be an ID column",
                                  data_check_name="IDColumnsDataCheck",
@@ -243,6 +243,53 @@ def test_default_data_checks_regression(input_type):
                "actions": expected_actions_with_drop_and_impute}
 
 
+def test_default_data_checks_null_rows():
+    class SeriesWrap():
+        def __init__(self, series):
+            self.series = series
+
+        def __eq__(self, series_2):
+            return all(self.series.eq(series_2.series))
+
+    X = pd.DataFrame({'all_null': [None, None, None, None, None],
+                      'also_all_null': [None, None, None, None, None]})
+    y = pd.Series([0, 1, np.nan, 1, 0])
+    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
+    highly_null_rows = SeriesWrap(pd.Series([1.0, 1.0, 1.0, 1.0, 1.0]))
+    expected = {
+        "warnings": [DataCheckWarning(message="5 out of 5 rows are more than 95.0% null",
+                                      data_check_name="HighlyNullDataCheck",
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
+                                      details={"pct_null_cols": highly_null_rows}).to_dict(),
+                     DataCheckWarning(message="Column 'all_null' is 95.0% or more null",
+                                      data_check_name="HighlyNullDataCheck",
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
+                                      details={"column": 'all_null', "pct_null_rows": 1.0}).to_dict(),
+                     DataCheckWarning(message="Column 'also_all_null' is 95.0% or more null",
+                                      data_check_name="HighlyNullDataCheck",
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
+                                      details={"column": 'also_all_null', "pct_null_rows": 1.0}).to_dict()],
+        "errors": [DataCheckError(message="1 row(s) (20.0%) of target values are null",
+                                  data_check_name="InvalidTargetDataCheck",
+                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
+                                  details={"num_null_rows": 1, "pct_null_rows": 20.0}).to_dict(),
+                   DataCheckError(message="all_null has 0 unique value.",
+                                  data_check_name="NoVarianceDataCheck",
+                                  message_code=DataCheckMessageCode.NO_VARIANCE,
+                                  details={"column": "all_null"}).to_dict(),
+                   DataCheckError(message="also_all_null has 0 unique value.",
+                                  data_check_name="NoVarianceDataCheck",
+                                  message_code=DataCheckMessageCode.NO_VARIANCE,
+                                  details={"column": "also_all_null"}).to_dict()],
+        "actions": [DataCheckAction(DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3, 4]}).to_dict(),
+                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict(),
+                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'also_all_null'}).to_dict(),
+                    DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": "mean"}).to_dict()]}
+    validation_results = data_checks.validate(X, y)
+    validation_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validation_results['warnings'][0]['details']['pct_null_cols'])
+    assert validation_results == expected
+
+
 def test_default_data_checks_time_series_regression():
     regression_data_check_classes = [check.__class__ for check in DefaultDataChecks("regression", get_default_primary_search_objective("regression")).data_checks]
     ts_regression_data_check_classes = [check.__class__ for check in DefaultDataChecks("time series regression", get_default_primary_search_objective("time series regression")).data_checks]
diff --git a/evalml/tests/data_checks_tests/test_highly_null_data_check.py b/evalml/tests/data_checks_tests/test_highly_null_data_check.py
index f19b3cacbe..b11d736324 100644
--- a/evalml/tests/data_checks_tests/test_highly_null_data_check.py
+++ b/evalml/tests/data_checks_tests/test_highly_null_data_check.py
@@ -14,6 +14,14 @@
 highly_null_data_check_name = HighlyNullDataCheck.name
 
 
+class SeriesWrap():
+    def __init__(self, series):
+        self.series = series
+
+    def __eq__(self, series_2):
+        return all(self.series.eq(series_2.series))
+
+
 def test_highly_null_data_check_init():
     highly_null_check = HighlyNullDataCheck()
     assert highly_null_check.pct_null_threshold == 0.95
@@ -38,32 +46,48 @@ def test_highly_null_data_check_warnings():
                          'all_null': [None, None, None, None, None],
                          'no_null': [1, 2, 3, 4, 5]})
     no_null_check = HighlyNullDataCheck(pct_null_threshold=0.0)
-    assert no_null_check.validate(data) == {
-        "warnings": [DataCheckWarning(message="Column 'lots_of_null' is more than 0% null",
+    highly_null_rows = SeriesWrap(pd.Series([2 / 3, 2 / 3, 2 / 3, 2 / 3, 1 / 3]))
+    validate_results = no_null_check.validate(data)
+    validate_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validate_results['warnings'][0]['details']['pct_null_cols'])
+    assert validate_results == {
+        "warnings": [DataCheckWarning(message="5 out of 5 rows are more than 0.0% null",
                                       data_check_name=highly_null_data_check_name,
-                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
+                                      details={"pct_null_cols": highly_null_rows}).to_dict(),
+                     DataCheckWarning(message="Column 'lots_of_null' is 0.0% or more null",
+                                      data_check_name=highly_null_data_check_name,
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                       details={"column": "lots_of_null", "pct_null_rows": 0.8}).to_dict(),
-                     DataCheckWarning(message="Column 'all_null' is more than 0% null",
+                     DataCheckWarning(message="Column 'all_null' is 0.0% or more null",
                                       data_check_name=highly_null_data_check_name,
-                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                       details={"column": "all_null", "pct_null_rows": 1.0}).to_dict()],
         "errors": [],
-        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'lots_of_null'}).to_dict(),
+        "actions": [DataCheckAction(DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3, 4]}).to_dict(),
+                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'lots_of_null'}).to_dict(),
                     DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict()]
     }
 
     some_null_check = HighlyNullDataCheck(pct_null_threshold=0.5)
-    assert some_null_check.validate(data) == {
-        "warnings": [DataCheckWarning(message="Column 'lots_of_null' is 50.0% or more null",
+    highly_null_rows = SeriesWrap(pd.Series([2 / 3, 2 / 3, 2 / 3, 2 / 3]))
+    validate_results = some_null_check.validate(data)
+    validate_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validate_results['warnings'][0]['details']['pct_null_cols'])
+    assert validate_results == {
+        "warnings": [DataCheckWarning(message="4 out of 5 rows are more than 50.0% null",
+                                      data_check_name=highly_null_data_check_name,
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
+                                      details={"pct_null_cols": highly_null_rows}).to_dict(),
+                     DataCheckWarning(message="Column 'lots_of_null' is 50.0% or more null",
                                       data_check_name=highly_null_data_check_name,
-                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                       details={"column": "lots_of_null", "pct_null_rows": 0.8}).to_dict(),
                      DataCheckWarning(message="Column 'all_null' is 50.0% or more null",
                                       data_check_name=highly_null_data_check_name,
-                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                       details={"column": "all_null", "pct_null_rows": 1.0}).to_dict()],
         "errors": [],
-        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'lots_of_null'}).to_dict(),
+        "actions": [DataCheckAction(DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3]}).to_dict(),
+                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'lots_of_null'}).to_dict(),
                     DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict()]
     }
@@ -72,7 +96,7 @@ def test_highly_null_data_check_warnings():
     assert all_null_check.validate(data) == {
         "warnings": [DataCheckWarning(message="Column 'all_null' is 100.0% or more null",
                                       data_check_name=highly_null_data_check_name,
-                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                       details={"column": "all_null", "pct_null_rows": 1.0}).to_dict()],
         "errors": [],
         "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict()]
     }
@@ -85,30 +109,42 @@ def test_highly_null_data_check_input_formats():
     # test empty pd.DataFrame
     assert highly_null_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []}
 
+    highly_null_rows = SeriesWrap(pd.Series([0.8]))
     expected = {
-        "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more null",
+        "warnings": [DataCheckWarning(message="1 out of 2 rows are more than 80.0% null",
+                                      data_check_name=highly_null_data_check_name,
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
+                                      details={"pct_null_cols": highly_null_rows}).to_dict(),
+                     DataCheckWarning(message="Column '0' is 80.0% or more null",
                                       data_check_name=highly_null_data_check_name,
-                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                       details={"column": 0, "pct_null_rows": 1.0}).to_dict(),
                      DataCheckWarning(message="Column '1' is 80.0% or more null",
                                       data_check_name=highly_null_data_check_name,
-                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                       details={"column": 1, "pct_null_rows": 1.0}).to_dict(),
                      DataCheckWarning(message="Column '2' is 80.0% or more null",
                                       data_check_name=highly_null_data_check_name,
-                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
+                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                       details={"column": 2, "pct_null_rows": 1.0}).to_dict()],
         "errors": [],
-        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 0}).to_dict(),
+        "actions": [DataCheckAction(DataCheckActionCode.DROP_ROWS, metadata={"rows": [0]}).to_dict(),
+                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 0}).to_dict(),
                     DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 1}).to_dict(),
                     DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 2}).to_dict()]
     }
 
     # test Woodwork
     ww_input = ww.DataTable(pd.DataFrame([[None, None, None, None, 0], [None, None, None, "hi", 5]]))
-    assert highly_null_check.validate(ww_input) == expected
+    validate_results = highly_null_check.validate(ww_input)
+    validate_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validate_results['warnings'][0]['details']['pct_null_cols'])
+    assert validate_results == expected
 
-    # test 2D list
-    assert highly_null_check.validate([[None, None, None, None, 0], [None, None, None, "hi", 5]]) == expected
+    # test 2D list
+    validate_results = highly_null_check.validate([[None, None, None, None, 0], [None, None, None, "hi", 5]])
+    validate_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validate_results['warnings'][0]['details']['pct_null_cols'])
+    assert validate_results == expected
 
     # test np.array
-    assert highly_null_check.validate(np.array([[None, None, None, None, 0], [None, None, None, "hi", 5]])) == expected
+    validate_results = highly_null_check.validate(np.array([[None, None, None, None, 0], [None, None, None, "hi", 5]]))
+    validate_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validate_results['warnings'][0]['details']['pct_null_cols'])
+    assert validate_results == expected