From c267dff530c79867baed1b5fbb781abbcfe19530 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Tue, 5 May 2020 17:35:00 -0400 Subject: [PATCH 01/14] init --- evalml/data_checks/__init__.py | 1 + .../detect_highly_null_data_check.py | 45 ++++++++++++++++ .../data_checks_tests/test_data_check.py | 51 +++++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 evalml/data_checks/detect_highly_null_data_check.py diff --git a/evalml/data_checks/__init__.py b/evalml/data_checks/__init__.py index bdd9897e07..17ec2cdbd5 100644 --- a/evalml/data_checks/__init__.py +++ b/evalml/data_checks/__init__.py @@ -3,3 +3,4 @@ from .data_checks import DataChecks from .data_check_message import DataCheckMessage, DataCheckWarning, DataCheckError from .data_check_message_type import DataCheckMessageType +from .detect_highly_null_data_check import DetectHighlyNullDataCheck diff --git a/evalml/data_checks/detect_highly_null_data_check.py b/evalml/data_checks/detect_highly_null_data_check.py new file mode 100644 index 0000000000..c94dad007d --- /dev/null +++ b/evalml/data_checks/detect_highly_null_data_check.py @@ -0,0 +1,45 @@ +import pandas as pd + +from .data_check import DataCheck +from .data_check_message import DataCheckWarning + + +class DetectHighlyNullDataCheck(DataCheck): + + def __init__(self, percent_threshold=0.95): + """TODO + + Arguments: + percent_threshold(float): Require that percentage of null values to be considered "highly-null", defaults to 0.95 + """ + if percent_threshold < 0 or percent_threshold > 1: + raise ValueError("percent_threshold must be a float between 0 and 1, inclusive.") + self.percent_threshold = percent_threshold + + def validate(self, X, y=None): + """ Checks if there are any highly-null columns in a pd.Dataframe. + + Arguments: + X (pd.DataFrame) : features + y : Ignored. + + Returns: + Example: + >>> df = pd.DataFrame({ + ... 'lots_of_null': [None, None, None, None, 5], + ... 'no_null': [1, 2, 3, 4, 5] + ... }) + >>> null_check = DetectHighlyNullDataCheck(percent_threshold=0.8) + >>> null_check.validate(df) + """ + messages = [] + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) + percent_null = (X.isnull().mean()).to_dict() + highly_null_cols = {key: value for key, value in percent_null.items() if value >= self.percent_threshold} + if len(highly_null_cols) > 0: + col_names_str = ', '.join([f"'{name}'" for name in list(highly_null_cols.keys())]) + warning_msg = "Columns {} are more than {}% null".format(col_names_str, self.percent_threshold * 100.) + warning = DataCheckWarning(warning_msg, self.name) + messages.append(warning) + return messages diff --git a/evalml/tests/data_checks_tests/test_data_check.py b/evalml/tests/data_checks_tests/test_data_check.py index b0ea6ed11a..85714224e6 100644 --- a/evalml/tests/data_checks_tests/test_data_check.py +++ b/evalml/tests/data_checks_tests/test_data_check.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd import pytest @@ -6,6 +7,9 @@ DataCheckError, DataCheckWarning ) +from evalml.data_checks.detect_highly_null_data_check import ( + DetectHighlyNullDataCheck +) @pytest.fixture @@ -62,3 +66,50 @@ def validate(self, X, y=None): data_check = MockDataCheckWithParam(num=0) errors_warnings = data_check.validate(X, y=None) assert errors_warnings == [DataCheckError("Expected num == 10", "MockDataCheckWithParam")] + + +def test_highly_null_data_check_init(): + with pytest.raises(ValueError, match="percent_threshold must be a float between 0 and 1, inclusive."): + DetectHighlyNullDataCheck(percent_threshold=-0.1) + with pytest.raises(ValueError, match="percent_threshold must be a float between 0 and 1, inclusive."): + DetectHighlyNullDataCheck(percent_threshold=1.1) + + +def test_highly_null_data_check_empty_df(): + highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.1) + messages = highly_null_check.validate(pd.DataFrame()) + assert messages == [] + + +def test_highly_null_data_check_no_warnings(): + highly_null_check = DetectHighlyNullDataCheck(percent_threshold=1.0) + messages = highly_null_check.validate(pd.DataFrame({'lots_of_null': [None, None, None, None, 5], 'no_null': [1, 2, 3, 4, 5]})) + assert messages == [] + + +def test_highly_null_data_check_has_warnings(): + highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.8) + messages = highly_null_check.validate(pd.DataFrame({'lots_of_null': [None, None, None, None, 5], + 'all_null': [None, None, None, None, None], + 'no_null': [1, 2, 3, 4, 5]})) + assert messages == [DataCheckWarning("Columns 'lots_of_null', 'all_null' are more than 80.0% null", "DetectHighlyNullDataCheck")] + + +def test_highly_null_data_check_input_formats(): + highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.8) + + # test list + messages = highly_null_check.validate([None, None, None, None, 5]) + assert messages == [DataCheckWarning("Columns '0' are more than 80.0% null", "DetectHighlyNullDataCheck")] + + # test pd.Series + messages = highly_null_check.validate(pd.Series([None, None, None, None, 5])) + assert messages == [DataCheckWarning("Columns '0' are more than 80.0% null", "DetectHighlyNullDataCheck")] + + # test 2D list + messages = highly_null_check.validate([[None, None, None, None, 0], [None, None, None, "hi", 5]]) + assert messages == [DataCheckWarning("Columns '0', '1', '2' are more than 80.0% null", "DetectHighlyNullDataCheck")] + + # test np.array + messages = highly_null_check.validate(np.array([[None, None, None, None, 0], [None, None, None, "hi", 5]])) + assert messages == [DataCheckWarning("Columns '0', '1', '2' are more than 80.0% null", "DetectHighlyNullDataCheck")] From 92b6c94e3e04295990ae8ef078f1e65d1d64d4db Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Tue, 5 May 2020 17:37:33 -0400 Subject: [PATCH 02/14] changelog --- docs/source/changelog.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 4fb23f1cea..f12805c45f 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -4,6 +4,7 @@ Changelog --------- **Future Releases** * Enhancements + * Port over highly-null guardrail as a data check and define `BasicDataChecks` and `DisableDataChecks` classes :pr:`745` * Fixes * Changes * Cleanup pipeline `score` code, and cleanup codecov :pr:`711` From 1de7c70df7ad2e1ce97026d46add8c9dbcaa4cb6 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Tue, 5 May 2020 17:47:51 -0400 Subject: [PATCH 03/14] docstr test' --- evalml/data_checks/detect_highly_null_data_check.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evalml/data_checks/detect_highly_null_data_check.py b/evalml/data_checks/detect_highly_null_data_check.py index c94dad007d..d3e536dd03 100644 --- a/evalml/data_checks/detect_highly_null_data_check.py +++ b/evalml/data_checks/detect_highly_null_data_check.py @@ -30,7 +30,8 @@ def validate(self, X, y=None): ... 'no_null': [1, 2, 3, 4, 5] ... }) >>> null_check = DetectHighlyNullDataCheck(percent_threshold=0.8) - >>> null_check.validate(df) + >>> null_check.validate(df) == [DataCheckWarning("Columns 'lots_of_null' are more than 80.0% null", "DetectHighlyNullDataCheck")] + True """ messages = [] if not isinstance(X, pd.DataFrame): From 0095684ad5d9752276a9ed6f454fb4f143879121 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Tue, 5 May 2020 17:52:15 -0400 Subject: [PATCH 04/14] cleanup --- evalml/data_checks/detect_highly_null_data_check.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/evalml/data_checks/detect_highly_null_data_check.py b/evalml/data_checks/detect_highly_null_data_check.py index d3e536dd03..8c602c4ff6 100644 --- a/evalml/data_checks/detect_highly_null_data_check.py +++ b/evalml/data_checks/detect_highly_null_data_check.py @@ -7,23 +7,26 @@ class DetectHighlyNullDataCheck(DataCheck): def __init__(self, percent_threshold=0.95): - """TODO + """Checks if there are any highly-null columns in the input. Arguments: percent_threshold(float): Require that percentage of null values to be considered "highly-null", defaults to 0.95 + """ if percent_threshold < 0 or percent_threshold > 1: raise ValueError("percent_threshold must be a float between 0 and 1, inclusive.") self.percent_threshold = percent_threshold def validate(self, X, y=None): - """ Checks if there are any highly-null columns in a pd.Dataframe. + """Checks if there are any highly-null columns in the input. Arguments: - X (pd.DataFrame) : features + X (pd.DataFrame, pd.Series, np.array, list) : features y : Ignored. Returns: + list (DataCheckWarning): list with a DataCheckWarning if there are any highly-null columns. + Example: >>> df = pd.DataFrame({ ... 'lots_of_null': [None, None, None, None, 5], From fd660e97687d9849919af2b3c4e600384a9e0992 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Tue, 5 May 2020 18:21:22 -0400 Subject: [PATCH 05/14] add basic and empty data checks --- evalml/data_checks/__init__.py | 1 + evalml/data_checks/basic_data_checks.py | 13 ++++++++ .../detect_highly_null_data_check.py | 2 +- evalml/data_checks/empty_data_checks.py | 12 ++++++++ .../data_checks_tests/test_data_checks.py | 30 +++++++++++++++---- 5 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 evalml/data_checks/basic_data_checks.py create mode 100644 evalml/data_checks/empty_data_checks.py diff --git a/evalml/data_checks/__init__.py b/evalml/data_checks/__init__.py index 17ec2cdbd5..1f75ba334b 100644 --- a/evalml/data_checks/__init__.py +++ b/evalml/data_checks/__init__.py @@ -4,3 +4,4 @@ from .data_check_message import DataCheckMessage, DataCheckWarning, DataCheckError from .data_check_message_type import DataCheckMessageType from .detect_highly_null_data_check import DetectHighlyNullDataCheck +from .basic_data_checks import BasicDataChecks diff --git a/evalml/data_checks/basic_data_checks.py b/evalml/data_checks/basic_data_checks.py new file mode 100644 index 0000000000..bcc88276fa --- /dev/null +++ b/evalml/data_checks/basic_data_checks.py @@ -0,0 +1,13 @@ +from .data_checks import DataChecks +from .detect_highly_null_data_check import DetectHighlyNullDataCheck + + +class BasicDataChecks(DataChecks): + def __init__(self, data_checks=None): + """ + A collection of data checks. + + Arguments: + data_checks (list (DataCheck)): Ignored. + """ + self.data_checks = [DetectHighlyNullDataCheck()] diff --git a/evalml/data_checks/detect_highly_null_data_check.py b/evalml/data_checks/detect_highly_null_data_check.py index 8c602c4ff6..8051e70e4d 100644 --- a/evalml/data_checks/detect_highly_null_data_check.py +++ b/evalml/data_checks/detect_highly_null_data_check.py @@ -43,7 +43,7 @@ def validate(self, X, y=None): highly_null_cols = {key: value for key, value in percent_null.items() if value >= self.percent_threshold} if len(highly_null_cols) > 0: col_names_str = ', '.join([f"'{name}'" for name in list(highly_null_cols.keys())]) - warning_msg = "Columns {} are more than {}% null".format(col_names_str, self.percent_threshold * 100.) + warning_msg = "Columns {} are more than {}% null".format(col_names_str, self.percent_threshold * 100) warning = DataCheckWarning(warning_msg, self.name) messages.append(warning) return messages diff --git a/evalml/data_checks/empty_data_checks.py b/evalml/data_checks/empty_data_checks.py new file mode 100644 index 0000000000..259531e0e0 --- /dev/null +++ b/evalml/data_checks/empty_data_checks.py @@ -0,0 +1,12 @@ +from .data_checks import DataChecks + + +class EmptyDataChecks(DataChecks): + def __init__(self, data_checks=None): + """ + An empty collection of data checks. + + Arguments: + data_checks (list (DataCheck)): Ignored. + """ + self.data_checks = [] diff --git a/evalml/tests/data_checks_tests/test_data_checks.py b/evalml/tests/data_checks_tests/test_data_checks.py index c24f339a31..5737e5514d 100644 --- a/evalml/tests/data_checks_tests/test_data_checks.py +++ b/evalml/tests/data_checks_tests/test_data_checks.py @@ -1,9 +1,13 @@ +import pandas as pd + +from evalml.data_checks.basic_data_checks import BasicDataChecks from evalml.data_checks.data_check import DataCheck from evalml.data_checks.data_check_message import ( DataCheckError, DataCheckWarning ) from evalml.data_checks.data_checks import DataChecks +from evalml.data_checks.empty_data_checks import EmptyDataChecks def test_data_checks(X_y): @@ -27,8 +31,24 @@ def validate(self, X, y): data_checks_list = [MockDataCheck(), MockDataCheckWarning(), MockDataCheckError(), MockDataCheckErrorAndWarning()] data_checks = DataChecks(data_checks=data_checks_list) - errors_warnings = data_checks.validate(X, y) - assert errors_warnings == [DataCheckWarning("warning one", "MockDataCheckWarning"), - DataCheckError("error one", "MockDataCheckError"), - DataCheckError("error two", "MockDataCheckErrorAndWarning"), - DataCheckWarning("warning two", "MockDataCheckErrorAndWarning")] + messages = data_checks.validate(X, y) + assert messages == [DataCheckWarning("warning one", "MockDataCheckWarning"), + DataCheckError("error one", "MockDataCheckError"), + DataCheckError("error two", "MockDataCheckErrorAndWarning"), + DataCheckWarning("warning two", "MockDataCheckErrorAndWarning")] + + +def test_empty_data_checks(X_y): + X, y = X_y + data_checks = EmptyDataChecks() + messages = data_checks.validate(X, y) + assert messages == [] + + +def test_basic_data_checks(X_y): + X = pd.DataFrame({'lots_of_null': [None, None, None, None, 5], + 'all_null': [None, None, None, None, None], + 'no_null': [1, 2, 3, 4, 5]}) + data_checks = BasicDataChecks() + messages = data_checks.validate(X) + assert messages == [DataCheckWarning("Columns 'all_null' are more than 95.0% null", "DetectHighlyNullDataCheck")] From 583ca0e104a9e3e881c538b492942751509cba50 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Tue, 5 May 2020 18:38:38 -0400 Subject: [PATCH 06/14] codecov From 312d7f024e7dfe81592ba81f1347d34ca8f28adf Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Thu, 7 May 2020 13:08:56 -0400 Subject: [PATCH 07/14] address PR comments --- docs/source/changelog.rst | 2 +- evalml/data_checks/__init__.py | 3 +- ..._data_checks.py => default_data_checks.py} | 2 +- .../detect_highly_null_data_check.py | 15 ++--- .../{empty_data_checks.py => utils.py} | 0 .../data_checks_tests/test_data_check.py | 59 ++++++++++++------- .../data_checks_tests/test_data_checks.py | 12 ++-- 7 files changed, 53 insertions(+), 40 deletions(-) rename evalml/data_checks/{basic_data_checks.py => default_data_checks.py} (90%) rename evalml/data_checks/{empty_data_checks.py => utils.py} (100%) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index f12805c45f..deb5e66413 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -4,7 +4,7 @@ Changelog --------- **Future Releases** * Enhancements - * Port over highly-null guardrail as a data check and define `BasicDataChecks` and `DisableDataChecks` classes :pr:`745` + * Port over highly-null guardrail as a data check and define `DefaultDataChecks` and `DisableDataChecks` classes :pr:`745` * Fixes * Changes * Cleanup pipeline `score` code, and cleanup codecov :pr:`711` diff --git a/evalml/data_checks/__init__.py b/evalml/data_checks/__init__.py index 1f75ba334b..1c9262462f 100644 --- a/evalml/data_checks/__init__.py +++ b/evalml/data_checks/__init__.py @@ -4,4 +4,5 @@ from .data_check_message import DataCheckMessage, DataCheckWarning, DataCheckError from .data_check_message_type import DataCheckMessageType from .detect_highly_null_data_check import DetectHighlyNullDataCheck -from .basic_data_checks import BasicDataChecks +from .default_data_checks import DefaultDataChecks +from .utils import EmptyDataChecks diff --git a/evalml/data_checks/basic_data_checks.py b/evalml/data_checks/default_data_checks.py similarity index 90% rename from evalml/data_checks/basic_data_checks.py rename to evalml/data_checks/default_data_checks.py index bcc88276fa..8bd8d352be 100644 --- a/evalml/data_checks/basic_data_checks.py +++ b/evalml/data_checks/default_data_checks.py @@ -2,7 +2,7 @@ from .detect_highly_null_data_check import DetectHighlyNullDataCheck -class BasicDataChecks(DataChecks): +class DefaultDataChecks(DataChecks): def __init__(self, data_checks=None): """ A collection of data checks. diff --git a/evalml/data_checks/detect_highly_null_data_check.py b/evalml/data_checks/detect_highly_null_data_check.py index 8051e70e4d..1c622f2ac8 100644 --- a/evalml/data_checks/detect_highly_null_data_check.py +++ b/evalml/data_checks/detect_highly_null_data_check.py @@ -10,7 +10,8 @@ def __init__(self, percent_threshold=0.95): """Checks if there are any highly-null columns in the input. Arguments: - percent_threshold(float): Require that percentage of null values to be considered "highly-null", defaults to 0.95 + percent_threshold(float): If the percentage of values in an input feature exceeds this amount, + that feature will be considered highly-null. Defaults to 0.95. """ if percent_threshold < 0 or percent_threshold > 1: @@ -33,17 +34,11 @@ def validate(self, X, y=None): ... 'no_null': [1, 2, 3, 4, 5] ... }) >>> null_check = DetectHighlyNullDataCheck(percent_threshold=0.8) - >>> null_check.validate(df) == [DataCheckWarning("Columns 'lots_of_null' are more than 80.0% null", "DetectHighlyNullDataCheck")] - True + >>> assert null_check.validate(df) == [DataCheckWarning("Columns 'lots_of_null' are more than 80.0% null", "DetectHighlyNullDataCheck")] """ - messages = [] if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) percent_null = (X.isnull().mean()).to_dict() highly_null_cols = {key: value for key, value in percent_null.items() if value >= self.percent_threshold} - if len(highly_null_cols) > 0: - col_names_str = ', '.join([f"'{name}'" for name in list(highly_null_cols.keys())]) - warning_msg = "Columns {} are more than {}% null".format(col_names_str, self.percent_threshold * 100) - warning = DataCheckWarning(warning_msg, self.name) - messages.append(warning) - return messages + warning_msg = "Column '{}' is {}% or more null" + return [DataCheckWarning(warning_msg.format(col_name, self.percent_threshold * 100), self.name) for col_name in highly_null_cols] diff --git a/evalml/data_checks/empty_data_checks.py b/evalml/data_checks/utils.py similarity index 100% rename from evalml/data_checks/empty_data_checks.py rename to evalml/data_checks/utils.py diff --git a/evalml/tests/data_checks_tests/test_data_check.py b/evalml/tests/data_checks_tests/test_data_check.py index 85714224e6..a92d5ccc90 100644 --- a/evalml/tests/data_checks_tests/test_data_check.py +++ b/evalml/tests/data_checks_tests/test_data_check.py @@ -69,47 +69,62 @@ def validate(self, X, y=None): def test_highly_null_data_check_init(): - with pytest.raises(ValueError, match="percent_threshold must be a float between 0 and 1, inclusive."): - DetectHighlyNullDataCheck(percent_threshold=-0.1) - with pytest.raises(ValueError, match="percent_threshold must be a float between 0 and 1, inclusive."): - DetectHighlyNullDataCheck(percent_threshold=1.1) + highly_null_check = DetectHighlyNullDataCheck() + assert highly_null_check.percent_threshold == 0.95 + highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.0) + assert highly_null_check.percent_threshold == 0 -def test_highly_null_data_check_empty_df(): - highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.1) - messages = highly_null_check.validate(pd.DataFrame()) - assert messages == [] - + highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.5) + assert highly_null_check.percent_threshold == 0.5 -def test_highly_null_data_check_no_warnings(): highly_null_check = DetectHighlyNullDataCheck(percent_threshold=1.0) - messages = highly_null_check.validate(pd.DataFrame({'lots_of_null': [None, None, None, None, 5], 'no_null': [1, 2, 3, 4, 5]})) - assert messages == [] + assert highly_null_check.percent_threshold == 1.0 + with pytest.raises(ValueError, match="percent_threshold must be a float between 0 and 1, inclusive."): + DetectHighlyNullDataCheck(percent_threshold=-0.1) + with pytest.raises(ValueError, match="percent_threshold must be a float between 0 and 1, inclusive."): + DetectHighlyNullDataCheck(percent_threshold=1.1) -def test_highly_null_data_check_has_warnings(): - highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.8) - messages = highly_null_check.validate(pd.DataFrame({'lots_of_null': [None, None, None, None, 5], - 'all_null': [None, None, None, None, None], - 'no_null': [1, 2, 3, 4, 5]})) - assert messages == [DataCheckWarning("Columns 'lots_of_null', 'all_null' are more than 80.0% null", "DetectHighlyNullDataCheck")] + +def test_highly_null_data_check_warnings(): + data = pd.DataFrame({'lots_of_null': [None, None, None, None, 5], + 'all_null': [None, None, None, None, None], + 'no_null': [1, 2, 3, 4, 5]}) + no_null_check = DetectHighlyNullDataCheck(percent_threshold=0.0) + assert no_null_check.validate(data) == [DataCheckWarning("Column 'lots_of_null' is 0.0% or more null", "DetectHighlyNullDataCheck"), + DataCheckWarning("Column 'all_null' is 0.0% or more null", "DetectHighlyNullDataCheck"), + DataCheckWarning("Column 'no_null' is 0.0% or more null", "DetectHighlyNullDataCheck")] + some_null_check = DetectHighlyNullDataCheck(percent_threshold=0.5) + assert some_null_check.validate(data) == [DataCheckWarning("Column 'lots_of_null' is 50.0% or more null", "DetectHighlyNullDataCheck"), + DataCheckWarning("Column 'all_null' is 50.0% or more null", "DetectHighlyNullDataCheck")] + all_null_check = DetectHighlyNullDataCheck(percent_threshold=1.0) + assert all_null_check.validate(data) == [DataCheckWarning("Column 'all_null' is 100.0% or more null", "DetectHighlyNullDataCheck")] def test_highly_null_data_check_input_formats(): highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.8) + # test empty pd.DataFrame + messages = highly_null_check.validate(pd.DataFrame()) + assert messages == [] + # test list messages = highly_null_check.validate([None, None, None, None, 5]) - assert messages == [DataCheckWarning("Columns '0' are more than 80.0% null", "DetectHighlyNullDataCheck")] + assert messages == [DataCheckWarning("Column '0' is 80.0% or more null", "DetectHighlyNullDataCheck")] # test pd.Series messages = highly_null_check.validate(pd.Series([None, None, None, None, 5])) - assert messages == [DataCheckWarning("Columns '0' are more than 80.0% null", "DetectHighlyNullDataCheck")] + assert messages == [DataCheckWarning("Column '0' is 80.0% or more null", "DetectHighlyNullDataCheck")] # test 2D list messages = highly_null_check.validate([[None, None, None, None, 0], [None, None, None, "hi", 5]]) - assert messages == [DataCheckWarning("Columns '0', '1', '2' are more than 80.0% null", "DetectHighlyNullDataCheck")] + assert messages == [DataCheckWarning("Column '0' is 80.0% or more null", "DetectHighlyNullDataCheck"), + DataCheckWarning("Column '1' is 80.0% or more null", "DetectHighlyNullDataCheck"), + DataCheckWarning("Column '2' is 80.0% or more null", "DetectHighlyNullDataCheck")] # test np.array messages = highly_null_check.validate(np.array([[None, None, None, None, 0], [None, None, None, "hi", 5]])) - assert messages == [DataCheckWarning("Columns '0', '1', '2' are more than 80.0% null", "DetectHighlyNullDataCheck")] + assert messages == [DataCheckWarning("Column '0' is 80.0% or more null", "DetectHighlyNullDataCheck"), + DataCheckWarning("Column '1' is 80.0% or more null", "DetectHighlyNullDataCheck"), + DataCheckWarning("Column '2' is 80.0% or more null", "DetectHighlyNullDataCheck")] diff --git a/evalml/tests/data_checks_tests/test_data_checks.py b/evalml/tests/data_checks_tests/test_data_checks.py index 5737e5514d..094dfa2c9e 100644 --- a/evalml/tests/data_checks_tests/test_data_checks.py +++ b/evalml/tests/data_checks_tests/test_data_checks.py @@ -1,13 +1,13 @@ import pandas as pd -from evalml.data_checks.basic_data_checks import BasicDataChecks from evalml.data_checks.data_check import DataCheck from evalml.data_checks.data_check_message import ( DataCheckError, DataCheckWarning ) from evalml.data_checks.data_checks import DataChecks -from evalml.data_checks.empty_data_checks import EmptyDataChecks +from evalml.data_checks.default_data_checks import DefaultDataChecks +from evalml.data_checks.utils import EmptyDataChecks def test_data_checks(X_y): @@ -45,10 +45,12 @@ def test_empty_data_checks(X_y): assert messages == [] -def test_basic_data_checks(X_y): +def test_default_data_checks(X_y): X = pd.DataFrame({'lots_of_null': [None, None, None, None, 5], 'all_null': [None, None, None, None, None], + 'also_all_null': [None, None, None, None, None], 'no_null': [1, 2, 3, 4, 5]}) - data_checks = BasicDataChecks() + data_checks = DefaultDataChecks() messages = data_checks.validate(X) - assert messages == [DataCheckWarning("Columns 'all_null' are more than 95.0% null", "DetectHighlyNullDataCheck")] + assert messages == [DataCheckWarning("Column 'all_null' is 95.0% or more null", "DetectHighlyNullDataCheck"), + DataCheckWarning("Column 'also_all_null' is 95.0% or more null", "DetectHighlyNullDataCheck")] From d9d7a13aa657fc0dced3976d9a98ce028f5fe835 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Thu, 7 May 2020 14:36:26 -0400 Subject: [PATCH 08/14] add edge cases --- evalml/data_checks/detect_highly_null_data_check.py | 13 +++++++++++-- evalml/tests/data_checks_tests/test_data_check.py | 7 +++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/evalml/data_checks/detect_highly_null_data_check.py b/evalml/data_checks/detect_highly_null_data_check.py index 1c622f2ac8..1d5bfa9123 100644 --- a/evalml/data_checks/detect_highly_null_data_check.py +++ b/evalml/data_checks/detect_highly_null_data_check.py @@ -39,6 +39,15 @@ def validate(self, X, y=None): if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) percent_null = (X.isnull().mean()).to_dict() - highly_null_cols = {key: value for key, value in percent_null.items() if value >= self.percent_threshold} - warning_msg = "Column '{}' is {}% or more null" + if self.percent_threshold == 0.0: + has_null_cols = {key: value for key, value in percent_null.items() if value > self.percent_threshold} + warning_msg = "Column '{}' is more than 0% null" + return [DataCheckWarning(warning_msg.format(col_name), self.name) for col_name in has_null_cols] + elif self.percent_threshold == 1.0: + all_null_cols = {key: value for key, value in percent_null.items() if value == self.percent_threshold} + warning_msg = "Column '{}' is 100% null" + return [DataCheckWarning(warning_msg.format(col_name), self.name) for col_name in all_null_cols] + else: + highly_null_cols = {key: value for key, value in percent_null.items() if value >= self.percent_threshold} + warning_msg = "Column '{}' is {}% or more null" return [DataCheckWarning(warning_msg.format(col_name, self.percent_threshold * 100), self.name) for col_name in highly_null_cols] diff --git a/evalml/tests/data_checks_tests/test_data_check.py b/evalml/tests/data_checks_tests/test_data_check.py index a92d5ccc90..45e6aa3037 100644 --- a/evalml/tests/data_checks_tests/test_data_check.py +++ b/evalml/tests/data_checks_tests/test_data_check.py @@ -92,14 +92,13 @@ def test_highly_null_data_check_warnings(): 'all_null': [None, None, None, None, None], 'no_null': [1, 2, 3, 4, 5]}) no_null_check = DetectHighlyNullDataCheck(percent_threshold=0.0) - assert no_null_check.validate(data) == [DataCheckWarning("Column 'lots_of_null' is 0.0% or more null", "DetectHighlyNullDataCheck"), - DataCheckWarning("Column 'all_null' is 0.0% or more null", "DetectHighlyNullDataCheck"), - DataCheckWarning("Column 'no_null' is 0.0% or more null", "DetectHighlyNullDataCheck")] + assert no_null_check.validate(data) == [DataCheckWarning("Column 'lots_of_null' is more than 0% null", "DetectHighlyNullDataCheck"), + DataCheckWarning("Column 'all_null' is more than 0% null", "DetectHighlyNullDataCheck")] some_null_check = DetectHighlyNullDataCheck(percent_threshold=0.5) assert some_null_check.validate(data) == [DataCheckWarning("Column 'lots_of_null' is 50.0% or more null", "DetectHighlyNullDataCheck"), DataCheckWarning("Column 'all_null' is 50.0% or more null", "DetectHighlyNullDataCheck")] all_null_check = DetectHighlyNullDataCheck(percent_threshold=1.0) - assert all_null_check.validate(data) == [DataCheckWarning("Column 'all_null' is 100.0% or more null", "DetectHighlyNullDataCheck")] + assert all_null_check.validate(data) == [DataCheckWarning("Column 'all_null' is 100% null", "DetectHighlyNullDataCheck")] def test_highly_null_data_check_input_formats(): From 0389f6baa1926a6fc825e4f2f29219df3583fcc5 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Thu, 7 May 2020 14:43:41 -0400 Subject: [PATCH 09/14] cleanup via pr --- .../detect_highly_null_data_check.py | 24 +++++++-------- evalml/guardrails/utils.py | 8 ++--- .../data_checks_tests/test_data_check.py | 30 +++++++++---------- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/evalml/data_checks/detect_highly_null_data_check.py b/evalml/data_checks/detect_highly_null_data_check.py index 1d5bfa9123..64e2503b18 100644 --- a/evalml/data_checks/detect_highly_null_data_check.py +++ b/evalml/data_checks/detect_highly_null_data_check.py @@ -6,17 +6,17 @@ class DetectHighlyNullDataCheck(DataCheck): - def __init__(self, percent_threshold=0.95): + def __init__(self, pct_null_threshold=0.95): """Checks if there are any highly-null columns in the input. Arguments: - percent_threshold(float): If the percentage of values in an input feature exceeds this amount, + pct_null_threshold(float): If the percentage of values in an input feature exceeds this amount, that feature will be considered highly-null. Defaults to 0.95. """ - if percent_threshold < 0 or percent_threshold > 1: - raise ValueError("percent_threshold must be a float between 0 and 1, inclusive.") - self.percent_threshold = percent_threshold + if pct_null_threshold < 0 or pct_null_threshold > 1: + raise ValueError("pct_null_threshold must be a float between 0 and 1, inclusive.") + self.pct_null_threshold = pct_null_threshold def validate(self, X, y=None): """Checks if there are any highly-null columns in the input. @@ -33,21 +33,21 @@ def validate(self, X, y=None): ... 'lots_of_null': [None, None, None, None, 5], ... 'no_null': [1, 2, 3, 4, 5] ... }) - >>> null_check = DetectHighlyNullDataCheck(percent_threshold=0.8) + >>> null_check = DetectHighlyNullDataCheck(pct_null_threshold=0.8) >>> assert null_check.validate(df) == [DataCheckWarning("Columns 'lots_of_null' are more than 80.0% null", "DetectHighlyNullDataCheck")] """ if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) percent_null = (X.isnull().mean()).to_dict() - if self.percent_threshold == 0.0: - has_null_cols = {key: value for key, value in percent_null.items() if value > self.percent_threshold} + if self.pct_null_threshold == 0.0: + has_null_cols = {key: value for key, value in percent_null.items() if value > self.pct_null_threshold} warning_msg = "Column '{}' is more than 0% null" return [DataCheckWarning(warning_msg.format(col_name), self.name) for col_name in has_null_cols] - elif self.percent_threshold == 1.0: - all_null_cols = {key: value for key, value in percent_null.items() if value == self.percent_threshold} + elif self.pct_null_threshold == 1.0: + all_null_cols = {key: value for key, value in percent_null.items() if value == self.pct_null_threshold} warning_msg = "Column '{}' is 100% null" return [DataCheckWarning(warning_msg.format(col_name), self.name) for col_name in all_null_cols] else: - highly_null_cols = {key: value for key, value in percent_null.items() if value >= self.percent_threshold} + highly_null_cols = {key: value for key, value in percent_null.items() if value >= self.pct_null_threshold} warning_msg = "Column '{}' is {}% or more null" - return [DataCheckWarning(warning_msg.format(col_name, self.percent_threshold * 100), self.name) for col_name in highly_null_cols] + return [DataCheckWarning(warning_msg.format(col_name, self.pct_null_threshold * 100), self.name) for col_name in highly_null_cols] diff --git a/evalml/guardrails/utils.py b/evalml/guardrails/utils.py index 5d246df92f..385a1fe72c 100644 --- a/evalml/guardrails/utils.py +++ b/evalml/guardrails/utils.py @@ -37,12 +37,12 @@ def detect_label_leakage(X, y, threshold=.95): return corrs -def detect_highly_null(X, percent_threshold=.95): +def detect_highly_null(X, pct_null_threshold=.95): """ Checks if there are any highly-null columns in a dataframe. Args: X (pd.DataFrame) : features - percent_threshold(float): Require that percentage of null values to be considered "highly-null", defaults to .95 + pct_null_threshold(float): Require that percentage of null values to be considered "highly-null", defaults to .95 Returns: A dictionary of features with column name or index and their percentage of null values @@ -52,14 +52,14 @@ def detect_highly_null(X, percent_threshold=.95): ... 'lots_of_null': [None, None, None, None, 5], ... 'no_null': [1, 2, 3, 4, 5] ... }) - >>> detect_highly_null(df, percent_threshold=0.8) + >>> detect_highly_null(df, pct_null_threshold=0.8) {'lots_of_null': 0.8} """ if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) percent_null = (X.isnull().mean()).to_dict() - highly_null_cols = {key: value for key, value in percent_null.items() if value >= percent_threshold} + highly_null_cols = {key: value for key, value in percent_null.items() if value >= pct_null_threshold} return highly_null_cols diff --git a/evalml/tests/data_checks_tests/test_data_check.py b/evalml/tests/data_checks_tests/test_data_check.py index 45e6aa3037..9f771f4aa7 100644 --- a/evalml/tests/data_checks_tests/test_data_check.py +++ b/evalml/tests/data_checks_tests/test_data_check.py @@ -70,39 +70,39 @@ def validate(self, X, y=None): def test_highly_null_data_check_init(): highly_null_check = DetectHighlyNullDataCheck() - assert highly_null_check.percent_threshold == 0.95 + assert highly_null_check.pct_null_threshold == 0.95 - highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.0) - assert highly_null_check.percent_threshold == 0 + highly_null_check = DetectHighlyNullDataCheck(pct_null_threshold=0.0) + assert highly_null_check.pct_null_threshold == 0 - highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.5) - assert highly_null_check.percent_threshold == 0.5 + highly_null_check = DetectHighlyNullDataCheck(pct_null_threshold=0.5) + assert highly_null_check.pct_null_threshold == 0.5 - highly_null_check = DetectHighlyNullDataCheck(percent_threshold=1.0) - assert highly_null_check.percent_threshold == 1.0 + highly_null_check = DetectHighlyNullDataCheck(pct_null_threshold=1.0) + assert highly_null_check.pct_null_threshold == 1.0 - with pytest.raises(ValueError, match="percent_threshold must be a float between 0 and 1, inclusive."): - DetectHighlyNullDataCheck(percent_threshold=-0.1) - with pytest.raises(ValueError, match="percent_threshold must be a float between 0 and 1, inclusive."): - DetectHighlyNullDataCheck(percent_threshold=1.1) + with pytest.raises(ValueError, match="pct_null_threshold must be a float between 0 and 1, inclusive."): + DetectHighlyNullDataCheck(pct_null_threshold=-0.1) + with pytest.raises(ValueError, match="pct_null_threshold must be a float between 0 and 1, inclusive."): + DetectHighlyNullDataCheck(pct_null_threshold=1.1) def test_highly_null_data_check_warnings(): data = pd.DataFrame({'lots_of_null': [None, None, None, None, 5], 'all_null': [None, None, None, None, None], 'no_null': [1, 2, 3, 4, 5]}) - no_null_check = DetectHighlyNullDataCheck(percent_threshold=0.0) + no_null_check = DetectHighlyNullDataCheck(pct_null_threshold=0.0) assert no_null_check.validate(data) == [DataCheckWarning("Column 'lots_of_null' is more than 0% null", "DetectHighlyNullDataCheck"), DataCheckWarning("Column 'all_null' is more than 0% null", "DetectHighlyNullDataCheck")] - some_null_check = DetectHighlyNullDataCheck(percent_threshold=0.5) + some_null_check = DetectHighlyNullDataCheck(pct_null_threshold=0.5) assert some_null_check.validate(data) == [DataCheckWarning("Column 'lots_of_null' is 50.0% or more null", "DetectHighlyNullDataCheck"), DataCheckWarning("Column 'all_null' is 50.0% or more null", "DetectHighlyNullDataCheck")] - all_null_check = DetectHighlyNullDataCheck(percent_threshold=1.0) + all_null_check = DetectHighlyNullDataCheck(pct_null_threshold=1.0) assert all_null_check.validate(data) == [DataCheckWarning("Column 'all_null' is 100% null", "DetectHighlyNullDataCheck")] def test_highly_null_data_check_input_formats(): - highly_null_check = DetectHighlyNullDataCheck(percent_threshold=0.8) + highly_null_check = DetectHighlyNullDataCheck(pct_null_threshold=0.8) # test empty pd.DataFrame messages = highly_null_check.validate(pd.DataFrame()) From e3b97f105add97e88784f862e2cc983e07bdd2dd Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Thu, 7 May 2020 15:45:41 -0400 Subject: [PATCH 10/14] oops, revert to master --- evalml/guardrails/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/evalml/guardrails/utils.py b/evalml/guardrails/utils.py index 385a1fe72c..5d246df92f 100644 --- a/evalml/guardrails/utils.py +++ b/evalml/guardrails/utils.py @@ -37,12 +37,12 @@ def detect_label_leakage(X, y, threshold=.95): return corrs -def detect_highly_null(X, pct_null_threshold=.95): +def detect_highly_null(X, percent_threshold=.95): """ Checks if there are any highly-null columns in a dataframe. Args: X (pd.DataFrame) : features - pct_null_threshold(float): Require that percentage of null values to be considered "highly-null", defaults to .95 + percent_threshold(float): Require that percentage of null values to be considered "highly-null", defaults to .95 Returns: A dictionary of features with column name or index and their percentage of null values @@ -52,14 +52,14 @@ def detect_highly_null(X, pct_null_threshold=.95): ... 'lots_of_null': [None, None, None, None, 5], ... 'no_null': [1, 2, 3, 4, 5] ... }) - >>> detect_highly_null(df, pct_null_threshold=0.8) + >>> detect_highly_null(df, percent_threshold=0.8) {'lots_of_null': 0.8} """ if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) percent_null = (X.isnull().mean()).to_dict() - highly_null_cols = {key: value for key, value in percent_null.items() if value >= pct_null_threshold} + highly_null_cols = {key: value for key, value in percent_null.items() if value >= percent_threshold} return highly_null_cols From 0f8f160aa35d5fdbd72379df049c1b3789dfd5af Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Thu, 7 May 2020 19:02:33 -0400 Subject: [PATCH 11/14] clean up docstr --- evalml/data_checks/detect_highly_null_data_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/data_checks/detect_highly_null_data_check.py b/evalml/data_checks/detect_highly_null_data_check.py index 64e2503b18..97bce77158 100644 --- a/evalml/data_checks/detect_highly_null_data_check.py +++ b/evalml/data_checks/detect_highly_null_data_check.py @@ -34,7 +34,7 @@ def validate(self, X, y=None): ... 'no_null': [1, 2, 3, 4, 5] ... }) >>> null_check = DetectHighlyNullDataCheck(pct_null_threshold=0.8) - >>> assert null_check.validate(df) == [DataCheckWarning("Columns 'lots_of_null' are more than 80.0% null", "DetectHighlyNullDataCheck")] + >>> assert null_check.validate(df) == [DataCheckWarning("Columns 'lots_of_null' is 80.0% or more null", "DetectHighlyNullDataCheck")] """ if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) From 59dcb4d191533880607adc4e943cb3b663698187 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Thu, 7 May 2020 23:37:32 -0400 Subject: [PATCH 12/14] cleanup --- evalml/data_checks/default_data_checks.py | 2 +- evalml/data_checks/detect_highly_null_data_check.py | 9 +++------ evalml/tests/data_checks_tests/test_data_check.py | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/evalml/data_checks/default_data_checks.py b/evalml/data_checks/default_data_checks.py index 8bd8d352be..d057fcc7d9 100644 --- a/evalml/data_checks/default_data_checks.py +++ b/evalml/data_checks/default_data_checks.py @@ -5,7 +5,7 @@ class DefaultDataChecks(DataChecks): def __init__(self, data_checks=None): """ - A collection of data checks. + A collection of basic data checks. Arguments: data_checks (list (DataCheck)): Ignored. diff --git a/evalml/data_checks/detect_highly_null_data_check.py b/evalml/data_checks/detect_highly_null_data_check.py index 97bce77158..3e48118269 100644 --- a/evalml/data_checks/detect_highly_null_data_check.py +++ b/evalml/data_checks/detect_highly_null_data_check.py @@ -40,14 +40,11 @@ def validate(self, X, y=None): X = pd.DataFrame(X) percent_null = (X.isnull().mean()).to_dict() if self.pct_null_threshold == 0.0: - has_null_cols = {key: value for key, value in percent_null.items() if value > self.pct_null_threshold} + all_null_cols = {key: value for key, value in percent_null.items() if value > 0.0} warning_msg = "Column '{}' is more than 0% null" - return [DataCheckWarning(warning_msg.format(col_name), self.name) for col_name in has_null_cols] - elif self.pct_null_threshold == 1.0: - all_null_cols = {key: value for key, value in percent_null.items() if value == self.pct_null_threshold} - warning_msg = "Column '{}' is 100% null" return [DataCheckWarning(warning_msg.format(col_name), self.name) for col_name in all_null_cols] else: highly_null_cols = {key: value for key, value in percent_null.items() if value >= self.pct_null_threshold} warning_msg = "Column '{}' is {}% or more null" - return [DataCheckWarning(warning_msg.format(col_name, self.pct_null_threshold * 100), self.name) for col_name in highly_null_cols] + + return [DataCheckWarning(warning_msg.format(col_name, self.pct_null_threshold * 100), self.name) for col_name in highly_null_cols] diff --git a/evalml/tests/data_checks_tests/test_data_check.py b/evalml/tests/data_checks_tests/test_data_check.py index 9f771f4aa7..d683a2545f 100644 --- a/evalml/tests/data_checks_tests/test_data_check.py +++ b/evalml/tests/data_checks_tests/test_data_check.py @@ -98,7 +98,7 @@ def test_highly_null_data_check_warnings(): assert some_null_check.validate(data) == [DataCheckWarning("Column 'lots_of_null' is 50.0% or more null", "DetectHighlyNullDataCheck"), DataCheckWarning("Column 'all_null' is 50.0% or more null", "DetectHighlyNullDataCheck")] all_null_check = DetectHighlyNullDataCheck(pct_null_threshold=1.0) - assert all_null_check.validate(data) == [DataCheckWarning("Column 'all_null' is 100% null", "DetectHighlyNullDataCheck")] + assert all_null_check.validate(data) == [DataCheckWarning("Column 'all_null' is 100.0% or more null", "DetectHighlyNullDataCheck")] def test_highly_null_data_check_input_formats(): From 865bce31499992163d7fd5d1e44dd6a2f9662b43 Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Thu, 7 May 2020 23:47:51 -0400 Subject: [PATCH 13/14] cleanup --- evalml/data_checks/detect_highly_null_data_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/data_checks/detect_highly_null_data_check.py b/evalml/data_checks/detect_highly_null_data_check.py index 3e48118269..fb10fe6804 100644 --- a/evalml/data_checks/detect_highly_null_data_check.py +++ b/evalml/data_checks/detect_highly_null_data_check.py @@ -34,7 +34,7 @@ def validate(self, X, y=None): ... 'no_null': [1, 2, 3, 4, 5] ... }) >>> null_check = DetectHighlyNullDataCheck(pct_null_threshold=0.8) - >>> assert null_check.validate(df) == [DataCheckWarning("Columns 'lots_of_null' is 80.0% or more null", "DetectHighlyNullDataCheck")] + >>> assert null_check.validate(df) == [DataCheckWarning("Column 'lots_of_null' is 80.0% or more null", "DetectHighlyNullDataCheck")] """ if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) From 41fc203a962b00e17477a603c35d0945fc084dbd Mon Sep 17 00:00:00 2001 From: Angela Lin Date: Fri, 8 May 2020 14:11:19 -0400 Subject: [PATCH 14/14] update docstr --- evalml/data_checks/detect_highly_null_data_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/data_checks/detect_highly_null_data_check.py b/evalml/data_checks/detect_highly_null_data_check.py index fb10fe6804..079c77fba6 100644 --- a/evalml/data_checks/detect_highly_null_data_check.py +++ b/evalml/data_checks/detect_highly_null_data_check.py @@ -10,7 +10,7 @@ def __init__(self, pct_null_threshold=0.95): """Checks if there are any highly-null columns in the input. Arguments: - pct_null_threshold(float): If the percentage of values in an input feature exceeds this amount, + pct_null_threshold(float): If the percentage of NaN values in an input feature exceeds this amount, that feature will be considered highly-null. Defaults to 0.95. """