From d63a71f6bcb35eefc2edbbe64ee6839db4f421a5 Mon Sep 17 00:00:00 2001 From: Nabil Fayak Date: Tue, 5 Sep 2023 11:34:19 -0400 Subject: [PATCH 1/8] distibution_data_check init --- .../checks/distribution_data_check.py | 158 ++++++++++++++++++ .../datacheck_meta/data_check_message_code.py | 3 + checkmates/pipelines/transformers.py | 50 ++++++ pyproject.toml | 2 + 4 files changed, 213 insertions(+) create mode 100644 checkmates/data_checks/checks/distribution_data_check.py diff --git a/checkmates/data_checks/checks/distribution_data_check.py b/checkmates/data_checks/checks/distribution_data_check.py new file mode 100644 index 0000000..de28a6b --- /dev/null +++ b/checkmates/data_checks/checks/distribution_data_check.py @@ -0,0 +1,158 @@ +"""Data check that checks if the target data contains certain distributions that may need to be transformed prior training to improve model performance.""" +import numpy as np +import woodwork as ww +import diptest + +from checkmates.data_checks import ( + DataCheck, + DataCheckActionCode, + DataCheckActionOption, + DataCheckError, + DataCheckMessageCode, + DataCheckWarning, +) +from checkmates.utils import infer_feature_types + + +class DistributionDataCheck(DataCheck): + """Check if the target data contains certain distributions that may need to be transformed prior training to improve model performance. Uses the skew test and yeojohnson transformation""" + + def validate(self, X, y): + """Check if the target data has a skewed or bimodal distribution. + + Args: + X (pd.DataFrame, np.ndarray): Features. Ignored. + y (pd.Series, np.ndarray): Target data to check for underlying distributions. + + Returns: + dict (DataCheckError): List with DataCheckErrors if certain distributions are found in the target data. + + Examples: + >>> import pandas as pd + + Targets that exhibit a skewed distribution will raise a warning for the user to transform the target. + + >>> y = [0.946, 0.972, 1.154, 0.954, 0.969, 1.222, 1.038, 0.999, 0.973, 0.897] + >>> target_check = DistributionDataCheck() + >>> assert target_check.validate(None, y) == [ + ... { + ... "message": "Target may have a skewed distribution.", + ... "data_check_name": "DistributionDataCheck", + ... "level": "warning", + ... "code": "TARGET_SKEWED_DISTRIBUTION", + ... "details": {"normalization_method": "shapiro", "statistic": 0.8, "p-value": 0.045, "columns": None, "rows": None}, + ... "action_options": [ + ... { + ... "code": "TRANSFORM_TARGET", + ... "data_check_name": "DistributionDataCheck", + ... "parameters": {}, + ... "metadata": { + ... "transformation_strategy": "yeojohnson", + ... "is_target": True, + ... "columns": None, + ... "rows": None + ... } + ... } + ... ] + ... } + ... ] + ... + >>> y = pd.Series([1, 1, 1, 2, 2, 3, 4, 4, 5, 5, 5]) + >>> assert target_check.validate(None, y) == [] + ... + ... + >>> y = pd.Series(pd.date_range("1/1/21", periods=10)) + >>> assert target_check.validate(None, y) == [ + ... { + ... "message": "Target is unsupported datetime type. Valid Woodwork logical types include: integer, double, age, age_fractional", + ... "data_check_name": "DistributionDataCheck", + ... "level": "error", + ... "details": {"columns": None, "rows": None, "unsupported_type": "datetime"}, + ... "code": "TARGET_UNSUPPORTED_TYPE", + ... "action_options": [] + ... } + ... ] + """ + messages = [] + + if y is None: + messages.append( + DataCheckError( + message="Target is None", + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_IS_NONE, + details={}, + ).to_dict(), + ) + return messages + + y = infer_feature_types(y) + allowed_types = [ + ww.logical_types.Integer.type_string, + ww.logical_types.Double.type_string, + ww.logical_types.Age.type_string, + ww.logical_types.AgeFractional.type_string, + ] + is_supported_type = y.ww.logical_type.type_string in allowed_types + + if not is_supported_type: + messages.append( + DataCheckError( + message="Target is unsupported {} type. Valid Woodwork logical types include: {}".format( + y.ww.logical_type.type_string, + ", ".join([ltype for ltype in allowed_types]), + ), + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, + details={"unsupported_type": y.ww.logical_type.type_string}, + ).to_dict(), + ) + return messages + + ( + is_skew, + distribution_type, + skew_value, + coef + ) = _detect_skew_distribution_helper(y) + + + if is_skew: + details = { + "distribution type": distribution_type, + "Skew Value": skew_value, + "Bimodal Coefficient": coef, + } + messages.append( + DataCheckWarning( + message="Target may have a skewed distribution.", + data_check_name=self.name, + message_code=DataCheckMessageCode.TARGET_SKEWED_DISTRIBUTION, + details=details, + action_options=[ + DataCheckActionOption( + DataCheckActionCode.TRANSFORM_TARGET, + data_check_name=self.name, + metadata={ + "is_target": True, + "transformation_strategy": "yeojohnson", + }, + ), + ], + ).to_dict(), + ) + return messages + + +def _detect_skew_distribution_helper(y): + """Helper method to detect skewed or bimodal distribution. Returns boolean, distribution type, the skew value, and bimodal coefficient.""" + skew_value = np.stats.skew(y) + coef = diptest.diptest(y)[1] + + if coef < 0.05: + return True, "bimodal distribution", skew_value, coef + if skew_value < -0.5: + return True, "negative skew", skew_value, coef + if skew_value > 0.5: + return True, "positive skew", skew_value, coef + return False, "no skew", skew_value, coef diff --git a/checkmates/data_checks/datacheck_meta/data_check_message_code.py b/checkmates/data_checks/datacheck_meta/data_check_message_code.py index a846581..12c8760 100644 --- a/checkmates/data_checks/datacheck_meta/data_check_message_code.py +++ b/checkmates/data_checks/datacheck_meta/data_check_message_code.py @@ -58,6 +58,9 @@ class DataCheckMessageCode(Enum): TARGET_LOGNORMAL_DISTRIBUTION = "target_lognormal_distribution" """Message code for target data with a lognormal distribution.""" + TARGET_SKEWED_DISTRIBUTION = "target_skewed_distribution" + """Message code for target data with a skewed distribution.""" + HIGH_VARIANCE = "high_variance" """Message code for when high variance is detected for cross-validation.""" diff --git a/checkmates/pipelines/transformers.py b/checkmates/pipelines/transformers.py index af4b4c4..ed25885 100644 --- a/checkmates/pipelines/transformers.py +++ b/checkmates/pipelines/transformers.py @@ -4,6 +4,7 @@ import pandas as pd import woodwork from sklearn.impute import SimpleImputer as SkImputer +from scipy.stats import yeojohnson from checkmates.exceptions import MethodPropertyNotFoundError from checkmates.pipelines import ComponentBase @@ -82,6 +83,55 @@ def fit_transform(self, X, y=None): def _get_feature_provenance(self): return {} +"""Component that normalizes skewed distributions using the Yeo-Johnson method""" +class SimpleNormalizer(Transformer): + """Normalizes skewed data according to the Yeo-Johnson method. + + Args: + impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for + numerical data, and "most_frequent", "constant" for object data types. + fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data. + Defaults to 0 when imputing numerical data and "missing_value" for strings or object data types. + random_seed (int): Seed for the random number generator. Defaults to 0. + + """ + + def __init__( + self + ): + super().__init__( + parameters=None, + ) + + def transform(self, X, y=None): + """Transforms input by normalizing distribution. + + Args: + X (pd.DataFrame): Data to transform. + y (pd.Series, optional): Ignored. + + Returns: + pd.DataFrame: Transformed X + """ + + # Transform the data + X_t = yeojohnson(X_t) + + # Reinit woodwork + X_t.ww.init() + + def fit_transform(self, X, y=None): + """Fits on X and transforms X. + + Args: + X (pd.DataFrame): Data to fit and transform + y (pd.Series, optional): Target data. + + Returns: + pd.DataFrame: Transformed X + """ + return self.fit(X, y).transform(X, y) + """Component that imputes missing data according to a specified imputation strategy.""" diff --git a/pyproject.toml b/pyproject.toml index 7c71982..93f4105 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,8 @@ dependencies = [ "woodwork>=0.22.0", "click>=8.0.0", "black[jupyter]>=22.3.0", + "diptest>=0.5.2", + "scipy>=1.9.3", ] requires-python = ">=3.8,<4.0" readme = "README.md" From 15e07f72cd12f6c1f348ef414a601497d000778a Mon Sep 17 00:00:00 2001 From: Nabil Fayak Date: Tue, 5 Sep 2023 11:40:11 -0400 Subject: [PATCH 2/8] release notes updated --- .../data_checks/checks/distribution_data_check.py | 7 +++---- checkmates/pipelines/transformers.py | 12 ++++++------ docs/source/release_notes.rst | 1 + 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/checkmates/data_checks/checks/distribution_data_check.py b/checkmates/data_checks/checks/distribution_data_check.py index de28a6b..d51946b 100644 --- a/checkmates/data_checks/checks/distribution_data_check.py +++ b/checkmates/data_checks/checks/distribution_data_check.py @@ -1,7 +1,7 @@ """Data check that checks if the target data contains certain distributions that may need to be transformed prior training to improve model performance.""" +import diptest import numpy as np import woodwork as ww -import diptest from checkmates.data_checks import ( DataCheck, @@ -15,7 +15,7 @@ class DistributionDataCheck(DataCheck): - """Check if the target data contains certain distributions that may need to be transformed prior training to improve model performance. Uses the skew test and yeojohnson transformation""" + """Check if the target data contains certain distributions that may need to be transformed prior training to improve model performance. Uses the skew test and yeojohnson transformation.""" def validate(self, X, y): """Check if the target data has a skewed or bimodal distribution. @@ -113,10 +113,9 @@ def validate(self, X, y): is_skew, distribution_type, skew_value, - coef + coef, ) = _detect_skew_distribution_helper(y) - if is_skew: details = { "distribution type": distribution_type, diff --git a/checkmates/pipelines/transformers.py b/checkmates/pipelines/transformers.py index ed25885..3fbce0a 100644 --- a/checkmates/pipelines/transformers.py +++ b/checkmates/pipelines/transformers.py @@ -3,8 +3,8 @@ import pandas as pd import woodwork -from sklearn.impute import SimpleImputer as SkImputer from scipy.stats import yeojohnson +from sklearn.impute import SimpleImputer as SkImputer from checkmates.exceptions import MethodPropertyNotFoundError from checkmates.pipelines import ComponentBase @@ -83,7 +83,10 @@ def fit_transform(self, X, y=None): def _get_feature_provenance(self): return {} + """Component that normalizes skewed distributions using the Yeo-Johnson method""" + + class SimpleNormalizer(Transformer): """Normalizes skewed data according to the Yeo-Johnson method. @@ -96,9 +99,7 @@ class SimpleNormalizer(Transformer): """ - def __init__( - self - ): + def __init__(self): super().__init__( parameters=None, ) @@ -113,9 +114,8 @@ def transform(self, X, y=None): Returns: pd.DataFrame: Transformed X """ - # Transform the data - X_t = yeojohnson(X_t) + X_t = yeojohnson(X) # Reinit woodwork X_t.ww.init() diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index a8b229d..cc70238 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,6 +2,7 @@ Release Notes ------------- **Future Releases** * Enhancements + * Created ``distribution_data_check`` to screen for positive and negative skews as well as bimodal distributions :pr:`21` * Fixes * Changes * Documentation Changes From 6ff5fbf340bdfa78a497fc7141b3da3b6dadf46f Mon Sep 17 00:00:00 2001 From: Nabil Fayak Date: Tue, 5 Sep 2023 16:47:47 -0400 Subject: [PATCH 3/8] adjusted to check for overall data not just target --- .../checks/distribution_data_check.py | 54 +++++++++---------- .../datacheck_meta/data_check_message_code.py | 4 +- checkmates/pipelines/transformers.py | 13 +---- 3 files changed, 30 insertions(+), 41 deletions(-) diff --git a/checkmates/data_checks/checks/distribution_data_check.py b/checkmates/data_checks/checks/distribution_data_check.py index d51946b..236b874 100644 --- a/checkmates/data_checks/checks/distribution_data_check.py +++ b/checkmates/data_checks/checks/distribution_data_check.py @@ -15,54 +15,52 @@ class DistributionDataCheck(DataCheck): - """Check if the target data contains certain distributions that may need to be transformed prior training to improve model performance. Uses the skew test and yeojohnson transformation.""" + """Check if the overall data contains certain distributions that may need to be transformed prior training to improve model performance. Uses the skew test and yeojohnson transformation.""" def validate(self, X, y): - """Check if the target data has a skewed or bimodal distribution. + """Check if the overall data has a skewed or bimodal distribution. Args: - X (pd.DataFrame, np.ndarray): Features. Ignored. + X (pd.DataFrame, np.ndarray): Overall data to check for skewed or bimodal distributions. y (pd.Series, np.ndarray): Target data to check for underlying distributions. Returns: - dict (DataCheckError): List with DataCheckErrors if certain distributions are found in the target data. + dict (DataCheckError): List with DataCheckErrors if certain distributions are found in the overall data. Examples: >>> import pandas as pd - Targets that exhibit a skewed distribution will raise a warning for the user to transform the target. + Features and target data that exhibit a skewed distribution will raise a warning for the user to transform the data. - >>> y = [0.946, 0.972, 1.154, 0.954, 0.969, 1.222, 1.038, 0.999, 0.973, 0.897] - >>> target_check = DistributionDataCheck() - >>> assert target_check.validate(None, y) == [ + >>> X = [5, 7, 8, 9, 10, 11, 12, 15, 20] + >>> data_check = DistributionDataCheck() + >>> assert data_check.validate(X, y) == [ ... { - ... "message": "Target may have a skewed distribution.", + ... "message": "Data may have a skewed distribution.", ... "data_check_name": "DistributionDataCheck", ... "level": "warning", - ... "code": "TARGET_SKEWED_DISTRIBUTION", - ... "details": {"normalization_method": "shapiro", "statistic": 0.8, "p-value": 0.045, "columns": None, "rows": None}, + ... "code": "SKEWED_DISTRIBUTION", + ... "details": {"distribution type": "positive skew", "Skew Value": 0.7939, "Bimodal Coefficient": 1.0,}, ... "action_options": [ ... { ... "code": "TRANSFORM_TARGET", ... "data_check_name": "DistributionDataCheck", ... "parameters": {}, ... "metadata": { - ... "transformation_strategy": "yeojohnson", - ... "is_target": True, - ... "columns": None, - ... "rows": None + "is_skew": True, + "transformation_strategy": "yeojohnson", ... } ... } ... ] ... } ... ] ... - >>> y = pd.Series([1, 1, 1, 2, 2, 3, 4, 4, 5, 5, 5]) - >>> assert target_check.validate(None, y) == [] + >>> X = pd.Series([1, 1, 1, 2, 2, 3, 4, 4, 5, 5, 5]) + >>> assert target_check.validate(X, y) == [] ... ... - >>> y = pd.Series(pd.date_range("1/1/21", periods=10)) - >>> assert target_check.validate(None, y) == [ + >>> X = pd.Series(pd.date_range("1/1/21", periods=10)) + >>> assert target_check.validate(X, y) == [ ... { ... "message": "Target is unsupported datetime type. Valid Woodwork logical types include: integer, double, age, age_fractional", ... "data_check_name": "DistributionDataCheck", @@ -78,7 +76,7 @@ def validate(self, X, y): if y is None: messages.append( DataCheckError( - message="Target is None", + message="Data is None", data_check_name=self.name, message_code=DataCheckMessageCode.TARGET_IS_NONE, details={}, @@ -104,7 +102,7 @@ def validate(self, X, y): ), data_check_name=self.name, message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, - details={"unsupported_type": y.ww.logical_type.type_string}, + details={"unsupported_type": X.ww.logical_type.type_string}, ).to_dict(), ) return messages @@ -114,7 +112,7 @@ def validate(self, X, y): distribution_type, skew_value, coef, - ) = _detect_skew_distribution_helper(y) + ) = _detect_skew_distribution_helper(X) if is_skew: details = { @@ -124,16 +122,16 @@ def validate(self, X, y): } messages.append( DataCheckWarning( - message="Target may have a skewed distribution.", + message="Data may have a skewed distribution.", data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_SKEWED_DISTRIBUTION, + message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION, details=details, action_options=[ DataCheckActionOption( DataCheckActionCode.TRANSFORM_TARGET, data_check_name=self.name, metadata={ - "is_target": True, + "is_skew": True, "transformation_strategy": "yeojohnson", }, ), @@ -143,10 +141,10 @@ def validate(self, X, y): return messages -def _detect_skew_distribution_helper(y): +def _detect_skew_distribution_helper(X): """Helper method to detect skewed or bimodal distribution. Returns boolean, distribution type, the skew value, and bimodal coefficient.""" - skew_value = np.stats.skew(y) - coef = diptest.diptest(y)[1] + skew_value = np.stats.skew(X) + coef = diptest.diptest(X)[1] if coef < 0.05: return True, "bimodal distribution", skew_value, coef diff --git a/checkmates/data_checks/datacheck_meta/data_check_message_code.py b/checkmates/data_checks/datacheck_meta/data_check_message_code.py index 12c8760..d8137f5 100644 --- a/checkmates/data_checks/datacheck_meta/data_check_message_code.py +++ b/checkmates/data_checks/datacheck_meta/data_check_message_code.py @@ -58,8 +58,8 @@ class DataCheckMessageCode(Enum): TARGET_LOGNORMAL_DISTRIBUTION = "target_lognormal_distribution" """Message code for target data with a lognormal distribution.""" - TARGET_SKEWED_DISTRIBUTION = "target_skewed_distribution" - """Message code for target data with a skewed distribution.""" + SKEWED_DISTRIBUTION = "skewed_distribution" + """Message code for data with a skewed distribution.""" HIGH_VARIANCE = "high_variance" """Message code for when high variance is detected for cross-validation.""" diff --git a/checkmates/pipelines/transformers.py b/checkmates/pipelines/transformers.py index 3fbce0a..f69a811 100644 --- a/checkmates/pipelines/transformers.py +++ b/checkmates/pipelines/transformers.py @@ -88,16 +88,7 @@ def _get_feature_provenance(self): class SimpleNormalizer(Transformer): - """Normalizes skewed data according to the Yeo-Johnson method. - - Args: - impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for - numerical data, and "most_frequent", "constant" for object data types. - fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data. - Defaults to 0 when imputing numerical data and "missing_value" for strings or object data types. - random_seed (int): Seed for the random number generator. Defaults to 0. - - """ + """Normalizes skewed data according to the Yeo-Johnson method.""" def __init__(self): super().__init__( @@ -109,7 +100,7 @@ def transform(self, X, y=None): Args: X (pd.DataFrame): Data to transform. - y (pd.Series, optional): Ignored. + y (pd.Series, optional): Returns: pd.DataFrame: Transformed X From 7c3e224c82e1cf9724b6de8ca62666d21c644ed5 Mon Sep 17 00:00:00 2001 From: Nabil Fayak Date: Tue, 5 Sep 2023 16:52:49 -0400 Subject: [PATCH 4/8] lint fix --- checkmates/pipelines/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/checkmates/pipelines/transformers.py b/checkmates/pipelines/transformers.py index f69a811..815ee2f 100644 --- a/checkmates/pipelines/transformers.py +++ b/checkmates/pipelines/transformers.py @@ -100,7 +100,7 @@ def transform(self, X, y=None): Args: X (pd.DataFrame): Data to transform. - y (pd.Series, optional): + y (pd.Series, optional): Target Data Returns: pd.DataFrame: Transformed X From 9abccac21e3457c0b020cef8e836357cf9573a1a Mon Sep 17 00:00:00 2001 From: Nabil Fayak Date: Fri, 8 Sep 2023 11:18:39 -0400 Subject: [PATCH 5/8] added data checking logic --- .../checks/distribution_data_check.py | 182 +++++++++--------- .../datacheck_meta/data_check_action_code.py | 3 + checkmates/pipelines/transformers.py | 6 + checkmates/pipelines/utils.py | 8 + pyproject.toml | 1 - 5 files changed, 109 insertions(+), 91 deletions(-) diff --git a/checkmates/data_checks/checks/distribution_data_check.py b/checkmates/data_checks/checks/distribution_data_check.py index 236b874..ce21bdf 100644 --- a/checkmates/data_checks/checks/distribution_data_check.py +++ b/checkmates/data_checks/checks/distribution_data_check.py @@ -1,18 +1,15 @@ -"""Data check that checks if the target data contains certain distributions that may need to be transformed prior training to improve model performance.""" -import diptest -import numpy as np +"""Data check that screens data for skewed or bimodal distrbutions prior to model training to ensure model performance is unaffected.""" import woodwork as ww from checkmates.data_checks import ( DataCheck, DataCheckActionCode, DataCheckActionOption, - DataCheckError, DataCheckMessageCode, DataCheckWarning, ) -from checkmates.utils import infer_feature_types - +from scipy.stats import skew +from diptest import diptest class DistributionDataCheck(DataCheck): """Check if the overall data contains certain distributions that may need to be transformed prior training to improve model performance. Uses the skew test and yeojohnson transformation.""" @@ -43,7 +40,7 @@ def validate(self, X, y): ... "details": {"distribution type": "positive skew", "Skew Value": 0.7939, "Bimodal Coefficient": 1.0,}, ... "action_options": [ ... { - ... "code": "TRANSFORM_TARGET", + ... "code": "TRANSFORM_FEATURES", ... "data_check_name": "DistributionDataCheck", ... "parameters": {}, ... "metadata": { @@ -54,97 +51,51 @@ def validate(self, X, y): ... ] ... } ... ] - ... - >>> X = pd.Series([1, 1, 1, 2, 2, 3, 4, 4, 5, 5, 5]) - >>> assert target_check.validate(X, y) == [] - ... - ... - >>> X = pd.Series(pd.date_range("1/1/21", periods=10)) - >>> assert target_check.validate(X, y) == [ - ... { - ... "message": "Target is unsupported datetime type. Valid Woodwork logical types include: integer, double, age, age_fractional", - ... "data_check_name": "DistributionDataCheck", - ... "level": "error", - ... "details": {"columns": None, "rows": None, "unsupported_type": "datetime"}, - ... "code": "TARGET_UNSUPPORTED_TYPE", - ... "action_options": [] - ... } - ... ] """ messages = [] - if y is None: - messages.append( - DataCheckError( - message="Data is None", - data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_IS_NONE, - details={}, - ).to_dict(), - ) - return messages - - y = infer_feature_types(y) - allowed_types = [ - ww.logical_types.Integer.type_string, - ww.logical_types.Double.type_string, - ww.logical_types.Age.type_string, - ww.logical_types.AgeFractional.type_string, - ] - is_supported_type = y.ww.logical_type.type_string in allowed_types - - if not is_supported_type: - messages.append( - DataCheckError( - message="Target is unsupported {} type. Valid Woodwork logical types include: {}".format( - y.ww.logical_type.type_string, - ", ".join([ltype for ltype in allowed_types]), - ), - data_check_name=self.name, - message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, - details={"unsupported_type": X.ww.logical_type.type_string}, - ).to_dict(), - ) - return messages - - ( - is_skew, - distribution_type, - skew_value, - coef, - ) = _detect_skew_distribution_helper(X) - - if is_skew: - details = { - "distribution type": distribution_type, - "Skew Value": skew_value, - "Bimodal Coefficient": coef, - } - messages.append( - DataCheckWarning( - message="Data may have a skewed distribution.", - data_check_name=self.name, - message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION, - details=details, - action_options=[ - DataCheckActionOption( - DataCheckActionCode.TRANSFORM_TARGET, - data_check_name=self.name, - metadata={ - "is_skew": True, - "transformation_strategy": "yeojohnson", - }, - ), - ], - ).to_dict(), - ) + numeric_X = X.ww.select(["Integer", "Double"]) + + for col in numeric_X: + ( + is_skew, + distribution_type, + skew_value, + coef, + ) = _detect_skew_distribution_helper(col) + + if is_skew: + details = { + "distribution type": distribution_type, + "Skew Value": skew_value, + "Bimodal Coefficient": coef, + } + messages.append( + DataCheckWarning( + message="Data may have a skewed distribution.", + data_check_name=self.name, + message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION, + details=details, + action_options=[ + DataCheckActionOption( + DataCheckActionCode.TRANSFORM_FEATURES, + data_check_name=self.name, + metadata={ + "is_skew": True, + "transformation_strategy": "yeojohnson", + "columns" : col + }, + ), + ], + ).to_dict(), + ) return messages def _detect_skew_distribution_helper(X): """Helper method to detect skewed or bimodal distribution. Returns boolean, distribution type, the skew value, and bimodal coefficient.""" - skew_value = np.stats.skew(X) - coef = diptest.diptest(X)[1] + skew_value = skew(X) + coef = diptest(X)[1] if coef < 0.05: return True, "bimodal distribution", skew_value, coef @@ -153,3 +104,54 @@ def _detect_skew_distribution_helper(X): if skew_value > 0.5: return True, "positive skew", skew_value, coef return False, "no skew", skew_value, coef + + +# Testing Data to make sure skews are recognized-- successful +# import numpy as np +# import pandas as pd +# data = { +# 'Column1': np.random.normal(0, 1, 1000), # Normally distributed data +# 'Column2': np.random.exponential(1, 1000), # Right-skewed data +# 'Column3': np.random.gamma(2, 2, 1000) # Right-skewed data +# } + +# df = pd.DataFrame(data) +# df.ww.init() +# messages = [] + +# numeric_X = df.ww.select(["Integer", "Double"]) +# print(numeric_X) +# for col in numeric_X: +# ( +# is_skew, +# distribution_type, +# skew_value, +# coef, +# ) = _detect_skew_distribution_helper(numeric_X['Column2']) + +# if is_skew: +# details = { +# "distribution type": distribution_type, +# "Skew Value": skew_value, +# "Bimodal Coefficient": coef, +# } +# messages.append( +# DataCheckWarning( +# message="Data may have a skewed distribution.", +# data_check_name="Distribution Data Check", +# message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION, +# details=details, +# action_options=[ +# DataCheckActionOption( +# DataCheckActionCode.TRANSFORM_FEATURES, +# data_check_name="Distribution Data Check", +# metadata={ +# "is_skew": True, +# "transformation_strategy": "yeojohnson", +# "columns" : col +# }, +# ), +# ], +# ).to_dict(), +# ) +# print(messages) \ No newline at end of file diff --git a/checkmates/data_checks/datacheck_meta/data_check_action_code.py b/checkmates/data_checks/datacheck_meta/data_check_action_code.py index 0106221..7558195 100644 --- a/checkmates/data_checks/datacheck_meta/data_check_action_code.py +++ b/checkmates/data_checks/datacheck_meta/data_check_action_code.py @@ -19,6 +19,9 @@ class DataCheckActionCode(Enum): TRANSFORM_TARGET = "transform_target" """Action code for transforming the target data.""" + TRANSFORM_FEATURES = "transform_features" + """Action code for transforming the features data.""" + REGULARIZE_AND_IMPUTE_DATASET = "regularize_and_impute_dataset" """Action code for regularizing and imputing all features and target time series data.""" diff --git a/checkmates/pipelines/transformers.py b/checkmates/pipelines/transformers.py index 815ee2f..2e26fee 100644 --- a/checkmates/pipelines/transformers.py +++ b/checkmates/pipelines/transformers.py @@ -105,6 +105,12 @@ def transform(self, X, y=None): Returns: pd.DataFrame: Transformed X """ + + # If there are no columns to normalize, return early + if not self._cols_to_normalize: + return self + + X = X[self._cols_to_normalize] # Transform the data X_t = yeojohnson(X) diff --git a/checkmates/pipelines/utils.py b/checkmates/pipelines/utils.py index 5f4e555..00776cd 100644 --- a/checkmates/pipelines/utils.py +++ b/checkmates/pipelines/utils.py @@ -15,6 +15,7 @@ TimeSeriesRegularizer, ) from checkmates.pipelines.training_validation_split import TrainingValidationSplit +from checkmates.pipelines.transformers import SimpleNormalizer from checkmates.problem_types import is_classification, is_regression, is_time_series from checkmates.utils import infer_feature_types @@ -31,6 +32,8 @@ def _make_component_list_from_actions(actions): components = [] cols_to_drop = [] indices_to_drop = [] + cols_to_normalize = [] + for action in actions: if action.action_code == DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET: @@ -47,6 +50,8 @@ def _make_component_list_from_actions(actions): ) elif action.action_code == DataCheckActionCode.DROP_COL: cols_to_drop.extend(action.metadata["columns"]) + elif action.action_code == DataCheckActionCode.TRANSFORM_FEATURES: + cols_to_normalize.extend(action.metadata["columns"]) elif action.action_code == DataCheckActionCode.IMPUTE_COL: metadata = action.metadata parameters = metadata.get("parameters", {}) @@ -65,6 +70,9 @@ def _make_component_list_from_actions(actions): if indices_to_drop: indices_to_drop = sorted(set(indices_to_drop)) components.append(DropRowsTransformer(indices_to_drop=indices_to_drop)) + if cols_to_normalize: + cols_to_normalize = sorted(set(cols_to_normalize)) + components.append(SimpleNormalizer(columns=cols_to_normalize)) return components diff --git a/pyproject.toml b/pyproject.toml index 93f4105..d19a80d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,6 @@ dependencies = [ "click>=8.0.0", "black[jupyter]>=22.3.0", "diptest>=0.5.2", - "scipy>=1.9.3", ] requires-python = ">=3.8,<4.0" readme = "README.md" From d40d5f51b0aacff38f1e305b3c4b15fad907feab Mon Sep 17 00:00:00 2001 From: Nabil Fayak Date: Fri, 8 Sep 2023 11:19:03 -0400 Subject: [PATCH 6/8] linter --- .../data_checks/checks/distribution_data_check.py | 11 ++++++----- checkmates/pipelines/transformers.py | 1 - checkmates/pipelines/utils.py | 1 - 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/checkmates/data_checks/checks/distribution_data_check.py b/checkmates/data_checks/checks/distribution_data_check.py index ce21bdf..8c6162e 100644 --- a/checkmates/data_checks/checks/distribution_data_check.py +++ b/checkmates/data_checks/checks/distribution_data_check.py @@ -1,5 +1,7 @@ """Data check that screens data for skewed or bimodal distrbutions prior to model training to ensure model performance is unaffected.""" -import woodwork as ww + +from diptest import diptest +from scipy.stats import skew from checkmates.data_checks import ( DataCheck, @@ -8,8 +10,7 @@ DataCheckMessageCode, DataCheckWarning, ) -from scipy.stats import skew -from diptest import diptest + class DistributionDataCheck(DataCheck): """Check if the overall data contains certain distributions that may need to be transformed prior training to improve model performance. Uses the skew test and yeojohnson transformation.""" @@ -83,7 +84,7 @@ def validate(self, X, y): metadata={ "is_skew": True, "transformation_strategy": "yeojohnson", - "columns" : col + "columns": col, }, ), ], @@ -154,4 +155,4 @@ def _detect_skew_distribution_helper(X): # ], # ).to_dict(), # ) -# print(messages) \ No newline at end of file +# print(messages) diff --git a/checkmates/pipelines/transformers.py b/checkmates/pipelines/transformers.py index 2e26fee..3930318 100644 --- a/checkmates/pipelines/transformers.py +++ b/checkmates/pipelines/transformers.py @@ -105,7 +105,6 @@ def transform(self, X, y=None): Returns: pd.DataFrame: Transformed X """ - # If there are no columns to normalize, return early if not self._cols_to_normalize: return self diff --git a/checkmates/pipelines/utils.py b/checkmates/pipelines/utils.py index 00776cd..fe1efec 100644 --- a/checkmates/pipelines/utils.py +++ b/checkmates/pipelines/utils.py @@ -34,7 +34,6 @@ def _make_component_list_from_actions(actions): indices_to_drop = [] cols_to_normalize = [] - for action in actions: if action.action_code == DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET: metadata = action.metadata From 2fe1d7e9c5bd3849f1d9d021eafcd88add348fa6 Mon Sep 17 00:00:00 2001 From: Nabil Fayak Date: Fri, 8 Sep 2023 17:00:06 -0400 Subject: [PATCH 7/8] added some tests for normalizer and data check --- checkmates/data_checks/__init__.py | 4 +- .../checks/distribution_data_check.py | 53 +------------ checkmates/pipelines/__init__.py | 2 +- checkmates/pipelines/transformers.py | 8 +- checkmates/pipelines/utils.py | 2 +- .../test_distribution_data_check.py | 75 +++++++++++++++++++ tests/data_checks_tests/test_normalizer.py | 40 ++++++++++ 7 files changed, 127 insertions(+), 57 deletions(-) create mode 100644 tests/data_checks_tests/test_distribution_data_check.py create mode 100644 tests/data_checks_tests/test_normalizer.py diff --git a/checkmates/data_checks/__init__.py b/checkmates/data_checks/__init__.py index 42b3254..54b62fb 100644 --- a/checkmates/data_checks/__init__.py +++ b/checkmates/data_checks/__init__.py @@ -52,6 +52,8 @@ from checkmates.data_checks.checks.invalid_target_data_check import ( InvalidTargetDataCheck, ) - +from checkmates.data_checks.checks.distribution_data_check import ( + DistributionDataCheck +) from checkmates.data_checks.datacheck_meta.utils import handle_data_check_action_code diff --git a/checkmates/data_checks/checks/distribution_data_check.py b/checkmates/data_checks/checks/distribution_data_check.py index 8c6162e..62722c6 100644 --- a/checkmates/data_checks/checks/distribution_data_check.py +++ b/checkmates/data_checks/checks/distribution_data_check.py @@ -104,55 +104,4 @@ def _detect_skew_distribution_helper(X): return True, "negative skew", skew_value, coef if skew_value > 0.5: return True, "positive skew", skew_value, coef - return False, "no skew", skew_value, coef - - -# Testing Data to make sure skews are recognized-- successful -# import numpy as np -# import pandas as pd -# data = { -# 'Column1': np.random.normal(0, 1, 1000), # Normally distributed data -# 'Column2': np.random.exponential(1, 1000), # Right-skewed data -# 'Column3': np.random.gamma(2, 2, 1000) # Right-skewed data -# } - -# df = pd.DataFrame(data) -# df.ww.init() -# messages = [] - -# numeric_X = df.ww.select(["Integer", "Double"]) -# print(numeric_X) -# for col in numeric_X: -# ( -# is_skew, -# distribution_type, -# skew_value, -# coef, -# ) = _detect_skew_distribution_helper(numeric_X['Column2']) - -# if is_skew: -# details = { -# "distribution type": distribution_type, -# "Skew Value": skew_value, -# "Bimodal Coefficient": coef, -# } -# messages.append( -# DataCheckWarning( -# message="Data may have a skewed distribution.", -# data_check_name="Distribution Data Check", -# message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION, -# details=details, -# action_options=[ -# DataCheckActionOption( -# DataCheckActionCode.TRANSFORM_FEATURES, -# data_check_name="Distribution Data Check", -# metadata={ -# "is_skew": True, -# "transformation_strategy": "yeojohnson", -# "columns" : col -# }, -# ), -# ], -# ).to_dict(), -# ) -# print(messages) + return False, "no skew", skew_value, coef \ No newline at end of file diff --git a/checkmates/pipelines/__init__.py b/checkmates/pipelines/__init__.py index 8a04168..d0d8322 100644 --- a/checkmates/pipelines/__init__.py +++ b/checkmates/pipelines/__init__.py @@ -2,7 +2,7 @@ from checkmates.pipelines.component_base_meta import ComponentBaseMeta from checkmates.pipelines.component_base import ComponentBase -from checkmates.pipelines.transformers import Transformer +from checkmates.pipelines.transformers import Transformer, SimpleNormalizer from checkmates.pipelines.components import ( # noqa: F401 DropColumns, DropRowsTransformer, diff --git a/checkmates/pipelines/transformers.py b/checkmates/pipelines/transformers.py index 3930318..a680b4f 100644 --- a/checkmates/pipelines/transformers.py +++ b/checkmates/pipelines/transformers.py @@ -93,6 +93,7 @@ class SimpleNormalizer(Transformer): def __init__(self): super().__init__( parameters=None, + _cols_to_normalize=None, ) def transform(self, X, y=None): @@ -109,9 +110,12 @@ def transform(self, X, y=None): if not self._cols_to_normalize: return self - X = X[self._cols_to_normalize] + #Only select the skewed column to normalize + x_t = X[self._cols_to_normalize] + X_t = X + # Transform the data - X_t = yeojohnson(X) + X_t[self._cols_to_normalize] = yeojohnson(x_t) # Reinit woodwork X_t.ww.init() diff --git a/checkmates/pipelines/utils.py b/checkmates/pipelines/utils.py index fe1efec..5837697 100644 --- a/checkmates/pipelines/utils.py +++ b/checkmates/pipelines/utils.py @@ -70,7 +70,7 @@ def _make_component_list_from_actions(actions): indices_to_drop = sorted(set(indices_to_drop)) components.append(DropRowsTransformer(indices_to_drop=indices_to_drop)) if cols_to_normalize: - cols_to_normalize = sorted(set(cols_to_normalize)) + cols_to_normalize = set(cols_to_normalize) components.append(SimpleNormalizer(columns=cols_to_normalize)) return components diff --git a/tests/data_checks_tests/test_distribution_data_check.py b/tests/data_checks_tests/test_distribution_data_check.py new file mode 100644 index 0000000..c4c9e28 --- /dev/null +++ b/tests/data_checks_tests/test_distribution_data_check.py @@ -0,0 +1,75 @@ +# Testing Data to make sure skews are recognized-- successful +import numpy as np +import pandas as pd +from diptest import diptest +from scipy.stats import skew + +from checkmates.data_checks import ( + DataCheck, + DataCheckActionCode, + DataCheckActionOption, + DataCheckMessageCode, + DataCheckWarning, + DistributionDataCheck +) +def _detect_skew_distribution_helper(X): + """Helper method to detect skewed or bimodal distribution. Returns boolean, distribution type, the skew value, and bimodal coefficient.""" + skew_value = skew(X) + coef = diptest(X)[1] + + if coef < 0.05: + return True, "bimodal distribution", skew_value, coef + if skew_value < -0.5: + return True, "negative skew", skew_value, coef + if skew_value > 0.5: + return True, "positive skew", skew_value, coef + return False, "no skew", skew_value, coef + + +data = { + 'Column1': np.random.normal(0, 1, 1000), # Normally distributed data + 'Column2': np.random.exponential(1, 1000), # Right-skewed data + 'Column3': 1 / (np.random.gamma(2, 2, 1000)) # Left-skewed data +} + +df = pd.DataFrame(data) +df.ww.init() +messages = [] + +numeric_X = df.ww.select(["Integer", "Double"]) +print(numeric_X) +for col in numeric_X: + ( + is_skew, + distribution_type, + skew_value, + coef, + ) = _detect_skew_distribution_helper(numeric_X['Column2']) + + if is_skew: + details = { + "distribution type": distribution_type, + "Skew Value": skew_value, + "Bimodal Coefficient": coef, + } + messages.append( + DataCheckWarning( + message="Data may have a skewed distribution.", + data_check_name="Distribution Data Check", + message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION, + details=details, + action_options=[ + DataCheckActionOption( + DataCheckActionCode.TRANSFORM_FEATURES, + data_check_name="Distribution Data Check", + metadata={ + "is_skew": True, + "transformation_strategy": "yeojohnson", + "columns" : col + }, + ), + ], + ).to_dict(), + ) +print(messages) + diff --git a/tests/data_checks_tests/test_normalizer.py b/tests/data_checks_tests/test_normalizer.py new file mode 100644 index 0000000..8ad46d4 --- /dev/null +++ b/tests/data_checks_tests/test_normalizer.py @@ -0,0 +1,40 @@ +from scipy.stats import yeojohnson +import numpy as np +import woodwork as ww +import pandas as pd + +data = { + 'Column1': np.random.normal(0, 1, 1000), # Normally distributed data + 'Column2': np.random.exponential(1, 1000), # Right-skewed data + 'Column3': 1 / (np.random.gamma(2, 2, 1000)) # Left-skewed data +} + +X = pd.DataFrame(data) + +_cols_to_normalize = 'Column2' + +def transform(self, X, _cols_to_normalize): + """Transforms input by normalizing distribution. + + Args: + X (pd.DataFrame): Data to transform. + y (pd.Series, optional): Target Data + + Returns: + pd.DataFrame: Transformed X + """ + # If there are no columns to normalize, return early + if not _cols_to_normalize: + return self + + #Only select the skewed column to normalize + x_t = X[_cols_to_normalize] + X_t = X + + # Transform the data + X_t[_cols_to_normalize] = yeojohnson(x_t) + + # Reinit woodwork + X_t.ww.init() + +transform(X, _cols_to_normalize, None) \ No newline at end of file From 670df8088a39cc0f6f819c784d647279d45e4aa8 Mon Sep 17 00:00:00 2001 From: Nabil Fayak Date: Fri, 8 Sep 2023 17:02:27 -0400 Subject: [PATCH 8/8] lint fix --- checkmates/data_checks/__init__.py | 4 +- .../checks/distribution_data_check.py | 2 +- checkmates/pipelines/transformers.py | 2 +- .../test_distribution_data_check.py | 15 +++--- tests/data_checks_tests/test_normalizer.py | 49 ++++++++++--------- 5 files changed, 35 insertions(+), 37 deletions(-) diff --git a/checkmates/data_checks/__init__.py b/checkmates/data_checks/__init__.py index 54b62fb..9122cfd 100644 --- a/checkmates/data_checks/__init__.py +++ b/checkmates/data_checks/__init__.py @@ -52,8 +52,6 @@ from checkmates.data_checks.checks.invalid_target_data_check import ( InvalidTargetDataCheck, ) -from checkmates.data_checks.checks.distribution_data_check import ( - DistributionDataCheck -) +from checkmates.data_checks.checks.distribution_data_check import DistributionDataCheck from checkmates.data_checks.datacheck_meta.utils import handle_data_check_action_code diff --git a/checkmates/data_checks/checks/distribution_data_check.py b/checkmates/data_checks/checks/distribution_data_check.py index 62722c6..1a5b87f 100644 --- a/checkmates/data_checks/checks/distribution_data_check.py +++ b/checkmates/data_checks/checks/distribution_data_check.py @@ -104,4 +104,4 @@ def _detect_skew_distribution_helper(X): return True, "negative skew", skew_value, coef if skew_value > 0.5: return True, "positive skew", skew_value, coef - return False, "no skew", skew_value, coef \ No newline at end of file + return False, "no skew", skew_value, coef diff --git a/checkmates/pipelines/transformers.py b/checkmates/pipelines/transformers.py index a680b4f..9847e82 100644 --- a/checkmates/pipelines/transformers.py +++ b/checkmates/pipelines/transformers.py @@ -110,7 +110,7 @@ def transform(self, X, y=None): if not self._cols_to_normalize: return self - #Only select the skewed column to normalize + # Only select the skewed column to normalize x_t = X[self._cols_to_normalize] X_t = X diff --git a/tests/data_checks_tests/test_distribution_data_check.py b/tests/data_checks_tests/test_distribution_data_check.py index c4c9e28..167eb57 100644 --- a/tests/data_checks_tests/test_distribution_data_check.py +++ b/tests/data_checks_tests/test_distribution_data_check.py @@ -5,13 +5,13 @@ from scipy.stats import skew from checkmates.data_checks import ( - DataCheck, DataCheckActionCode, DataCheckActionOption, DataCheckMessageCode, DataCheckWarning, - DistributionDataCheck ) + + def _detect_skew_distribution_helper(X): """Helper method to detect skewed or bimodal distribution. Returns boolean, distribution type, the skew value, and bimodal coefficient.""" skew_value = skew(X) @@ -27,9 +27,9 @@ def _detect_skew_distribution_helper(X): data = { - 'Column1': np.random.normal(0, 1, 1000), # Normally distributed data - 'Column2': np.random.exponential(1, 1000), # Right-skewed data - 'Column3': 1 / (np.random.gamma(2, 2, 1000)) # Left-skewed data + "Column1": np.random.normal(0, 1, 1000), # Normally distributed data + "Column2": np.random.exponential(1, 1000), # Right-skewed data + "Column3": 1 / (np.random.gamma(2, 2, 1000)), # Left-skewed data } df = pd.DataFrame(data) @@ -44,7 +44,7 @@ def _detect_skew_distribution_helper(X): distribution_type, skew_value, coef, - ) = _detect_skew_distribution_helper(numeric_X['Column2']) + ) = _detect_skew_distribution_helper(numeric_X["Column2"]) if is_skew: details = { @@ -65,11 +65,10 @@ def _detect_skew_distribution_helper(X): metadata={ "is_skew": True, "transformation_strategy": "yeojohnson", - "columns" : col + "columns": col, }, ), ], ).to_dict(), ) print(messages) - diff --git a/tests/data_checks_tests/test_normalizer.py b/tests/data_checks_tests/test_normalizer.py index 8ad46d4..6d0acc7 100644 --- a/tests/data_checks_tests/test_normalizer.py +++ b/tests/data_checks_tests/test_normalizer.py @@ -1,40 +1,41 @@ -from scipy.stats import yeojohnson import numpy as np -import woodwork as ww import pandas as pd +from scipy.stats import yeojohnson data = { - 'Column1': np.random.normal(0, 1, 1000), # Normally distributed data - 'Column2': np.random.exponential(1, 1000), # Right-skewed data - 'Column3': 1 / (np.random.gamma(2, 2, 1000)) # Left-skewed data + "Column1": np.random.normal(0, 1, 1000), # Normally distributed data + "Column2": np.random.exponential(1, 1000), # Right-skewed data + "Column3": 1 / (np.random.gamma(2, 2, 1000)), # Left-skewed data } X = pd.DataFrame(data) -_cols_to_normalize = 'Column2' +_cols_to_normalize = "Column2" + def transform(self, X, _cols_to_normalize): - """Transforms input by normalizing distribution. + """Transforms input by normalizing distribution. + + Args: + X (pd.DataFrame): Data to transform. + y (pd.Series, optional): Target Data - Args: - X (pd.DataFrame): Data to transform. - y (pd.Series, optional): Target Data + Returns: + pd.DataFrame: Transformed X + """ + # If there are no columns to normalize, return early + if not _cols_to_normalize: + return self - Returns: - pd.DataFrame: Transformed X - """ - # If there are no columns to normalize, return early - if not _cols_to_normalize: - return self + # Only select the skewed column to normalize + x_t = X[_cols_to_normalize] + X_t = X - #Only select the skewed column to normalize - x_t = X[_cols_to_normalize] - X_t = X + # Transform the data + X_t[_cols_to_normalize] = yeojohnson(x_t) - # Transform the data - X_t[_cols_to_normalize] = yeojohnson(x_t) + # Reinit woodwork + X_t.ww.init() - # Reinit woodwork - X_t.ww.init() -transform(X, _cols_to_normalize, None) \ No newline at end of file +transform(X, _cols_to_normalize, None)