"""Data check that screens data for skewed or bimodal distributions prior to model training to ensure model performance is unaffected."""

import numpy as np
from diptest import diptest
from scipy.stats import skew

from checkmates.data_checks import (
    DataCheck,
    DataCheckActionCode,
    DataCheckActionOption,
    DataCheckMessageCode,
    DataCheckWarning,
)

# Absolute sample skewness above which a column is reported as skewed.
_SKEW_THRESHOLD = 0.5
# Significance level for the dip test; p-values below it are reported as bimodal.
_DIP_TEST_ALPHA = 0.05


class DistributionDataCheck(DataCheck):
    """Check if the overall data contains certain distributions that may need to be transformed prior training to improve model performance. Uses the skew test and yeojohnson transformation."""

    def validate(self, X, y=None):
        """Check every numeric feature for a skewed or bimodal distribution.

        Args:
            X (pd.DataFrame, np.ndarray): Features to check. Only columns typed
                ``Integer`` or ``Double`` by woodwork are inspected.
            y (pd.Series, np.ndarray, optional): Target data. Accepted for
                interface compatibility; it is not inspected by this check.

        Returns:
            list[dict]: One serialized ``DataCheckWarning`` per flagged column.
                Empty when no numeric column is skewed or bimodal.

        Examples:
            >>> import pandas as pd

            A feature with a long right tail raises a warning suggesting a transformation.

            >>> X = pd.DataFrame({"feature": [5, 7, 8, 9, 10, 11, 12, 15, 20]})
            >>> messages = DistributionDataCheck().validate(X)
            >>> [(message["code"], message["level"]) for message in messages]
            [('SKEWED_DISTRIBUTION', 'warning')]
        """
        # Imported lazily to keep this module's import graph unchanged.
        # NOTE(review): assumes infer_feature_types returns a
        # woodwork-initialised DataFrame -- confirm against checkmates.utils.
        from checkmates.utils import infer_feature_types

        X = infer_feature_types(X)
        numeric_X = X.ww.select(["Integer", "Double"])

        messages = []
        for col in numeric_X.columns:
            # Pass the column *values*; iterating a DataFrame yields only names.
            (
                is_skew,
                distribution_type,
                skew_value,
                coef,
            ) = _detect_skew_distribution_helper(numeric_X[col])

            if not is_skew:
                continue

            details = {
                "distribution type": distribution_type,
                "Skew Value": skew_value,
                # Despite the key name this is the dip-test p-value.
                "Bimodal Coefficient": coef,
            }
            messages.append(
                DataCheckWarning(
                    message="Data may have a skewed distribution.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION,
                    details=details,
                    action_options=[
                        DataCheckActionOption(
                            DataCheckActionCode.TRANSFORM_FEATURES,
                            data_check_name=self.name,
                            metadata={
                                "is_skew": True,
                                "transformation_strategy": "yeojohnson",
                                # A list, because the pipeline builder
                                # ``extend``s its column list with this value.
                                "columns": [col],
                            },
                        ),
                    ],
                ).to_dict(),
            )
        return messages


def _detect_skew_distribution_helper(
    X,
    skew_threshold=_SKEW_THRESHOLD,
    dip_alpha=_DIP_TEST_ALPHA,
):
    """Classify a one-dimensional sample as bimodal, skewed, or neither.

    Args:
        X (array-like): One-dimensional numeric sample. NaN entries are ignored.
        skew_threshold (float): Absolute skewness above which the sample is
            reported as skewed. Defaults to 0.5.
        dip_alpha (float): Dip-test p-value below which the sample is reported
            as bimodal. Defaults to 0.05.

    Returns:
        tuple: ``(is_flagged, distribution_type, skew_value, dip_p_value)`` where
            ``distribution_type`` is one of ``"bimodal distribution"``,
            ``"negative skew"``, ``"positive skew"`` or ``"no skew"``. Both
            numbers are NaN when the sample is empty or constant.
    """
    values = np.asarray(X, dtype="float64")
    values = values[~np.isnan(values)]

    # An empty or constant sample has no shape to assess; skewness would be 0/0.
    if values.size == 0 or values.min() == values.max():
        return False, "no skew", float("nan"), float("nan")

    skew_value = float(skew(values))
    coef = float(diptest(values)[1])

    # Bimodality takes precedence over skew, as in the original ordering.
    if coef < dip_alpha:
        return True, "bimodal distribution", skew_value, coef
    if skew_value < -skew_threshold:
        return True, "negative skew", skew_value, coef
    if skew_value > skew_threshold:
        return True, "positive skew", skew_value, coef
    return False, "no skew", skew_value, coef
"""Component that normalizes skewed distributions using the Yeo-Johnson method"""


class SimpleNormalizer(Transformer):
    """Normalizes skewed data according to the Yeo-Johnson method.

    Args:
        columns (str or iterable, optional): Name(s) of the columns to
            normalize. Defaults to None, in which case ``transform`` returns
            its input unchanged.
    """

    # NOTE(review): a display name is declared defensively; confirm whether the
    # component base class requires one and whether this label fits.
    name = "Simple Normalizer"

    def __init__(self, columns=None):
        if columns is None:
            cols = []
        elif isinstance(columns, str):
            # A bare string is one column name, not an iterable of characters.
            cols = [columns]
        else:
            cols = list(columns)

        super().__init__(parameters={"columns": cols})
        # Assigned after the base initializer so it cannot be reset by it.
        self._cols_to_normalize = cols

    def fit(self, X, y=None):
        """Fits the component; nothing is learned ahead of ``transform``.

        The Yeo-Johnson lambda is estimated from the data passed to
        ``transform``, so fitting holds no state.

        Args:
            X (pd.DataFrame): Ignored.
            y (pd.Series, optional): Ignored.

        Returns:
            self
        """
        return self

    def transform(self, X, y=None):
        """Transforms input by normalizing distribution.

        Args:
            X (pd.DataFrame): Data to transform.
            y (pd.Series, optional): Target Data

        Returns:
            pd.DataFrame: Transformed X. The input frame is not modified.

        Raises:
            KeyError: If a configured column is missing from X.
        """
        # If there are no columns to normalize, return the data untouched
        if not self._cols_to_normalize:
            return X

        # Work on a copy so the caller's frame is left as it was
        X_t = X.copy()

        for col in self._cols_to_normalize:
            # yeojohnson accepts 1-D input only and, when lmbda is omitted,
            # returns a (transformed, fitted_lambda) pair.
            transformed, _ = yeojohnson(X_t[col].to_numpy(dtype="float64"))
            X_t[col] = transformed

        # Reinit woodwork, since the transformed columns are now floats
        X_t.ww.init()
        return X_t

    def fit_transform(self, X, y=None):
        """Fits on X and transforms X.

        Args:
            X (pd.DataFrame): Data to fit and transform
            y (pd.Series, optional): Target data.

        Returns:
            pd.DataFrame: Transformed X
        """
        return self.fit(X, y).transform(X, y)
"""Tests that skewed and bimodal columns are recognised by the distribution check logic."""
import numpy as np
import pandas as pd
from diptest import diptest
from scipy.stats import skew

from checkmates.data_checks import (
    DataCheckActionCode,
    DataCheckActionOption,
    DataCheckMessageCode,
    DataCheckWarning,
)


def _detect_skew_distribution_helper(X):
    """Helper method to detect skewed or bimodal distribution.

    NOTE(review): this mirrors the helper of the same name in the data check
    module; consider importing that one so the production code is exercised.

    Args:
        X (array-like): One-dimensional numeric sample.

    Returns:
        tuple: boolean flag, distribution type, the skew value, and the
            dip-test p-value.
    """
    values = np.asarray(X, dtype="float64")
    skew_value = skew(values)
    coef = diptest(values)[1]

    if coef < 0.05:
        return True, "bimodal distribution", skew_value, coef
    if skew_value < -0.5:
        return True, "negative skew", skew_value, coef
    if skew_value > 0.5:
        return True, "positive skew", skew_value, coef
    return False, "no skew", skew_value, coef


def _make_frame(seed=0, n_rows=1000):
    """Build a seeded frame with one symmetric, one right- and one left-skewed column."""
    rng = np.random.default_rng(seed)
    frame = pd.DataFrame(
        {
            "Column1": rng.normal(0, 1, n_rows),  # Normally distributed data
            "Column2": rng.exponential(1, n_rows),  # Right-skewed data
            # Mirrored exponential: the long tail points left. (A reciprocal
            # gamma sample, used previously, is right-skewed.)
            "Column3": -rng.exponential(1, n_rows),  # Left-skewed data
        },
    )
    frame.ww.init()
    return frame


def _collect_messages(frame):
    """Run the detection over every numeric column and build the warning dicts."""
    messages = []
    numeric_X = frame.ww.select(["Integer", "Double"])
    for col in numeric_X.columns:
        # Analyse the column currently being iterated, not a fixed one.
        (
            is_skew,
            distribution_type,
            skew_value,
            coef,
        ) = _detect_skew_distribution_helper(numeric_X[col])

        if not is_skew:
            continue

        details = {
            "distribution type": distribution_type,
            "Skew Value": skew_value,
            "Bimodal Coefficient": coef,
        }
        messages.append(
            DataCheckWarning(
                message="Data may have a skewed distribution.",
                data_check_name="Distribution Data Check",
                message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION,
                details=details,
                action_options=[
                    DataCheckActionOption(
                        DataCheckActionCode.TRANSFORM_FEATURES,
                        data_check_name="Distribution Data Check",
                        metadata={
                            "is_skew": True,
                            "transformation_strategy": "yeojohnson",
                            "columns": [col],
                        },
                    ),
                ],
            ).to_dict(),
        )
    return messages


def test_helper_classifies_each_distribution():
    frame = _make_frame()
    labels = {
        col: _detect_skew_distribution_helper(frame[col])[1] for col in frame.columns
    }
    assert labels == {
        "Column1": "no skew",
        "Column2": "positive skew",
        "Column3": "negative skew",
    }


def test_helper_flags_bimodal_sample():
    rng = np.random.default_rng(0)
    two_peaks = np.concatenate([rng.normal(-5, 1, 500), rng.normal(5, 1, 500)])
    is_flagged, label, _, p_value = _detect_skew_distribution_helper(two_peaks)
    assert is_flagged
    assert label == "bimodal distribution"
    assert p_value < 0.05


def test_one_warning_per_skewed_column():
    messages = _collect_messages(_make_frame())
    assert [m["details"]["distribution type"] for m in messages] == [
        "positive skew",
        "negative skew",
    ]
    assert all(m["level"] == "warning" for m in messages)
    assert all(m["code"] == "SKEWED_DISTRIBUTION" for m in messages)
import numpy as np
import pandas as pd
from scipy.stats import skew, yeojohnson


def transform(self, X, _cols_to_normalize):
    """Transforms input by normalizing distribution.

    NOTE(review): this is a local stand-in for ``SimpleNormalizer.transform``;
    consider exercising the component itself from this test module.

    Args:
        self: Unused placeholder kept so the signature matches the method it
            stands in for; pass ``None``.
        X (pd.DataFrame): Data to transform.
        _cols_to_normalize (str or iterable, optional): Column name(s) to
            normalize. When empty or None, X is returned unchanged.

    Returns:
        pd.DataFrame: Transformed X. The input frame is not modified.
    """
    # If there are no columns to normalize, return early
    if not _cols_to_normalize:
        return X

    if isinstance(_cols_to_normalize, str):
        columns = [_cols_to_normalize]
    else:
        columns = list(_cols_to_normalize)

    # Copy so the caller's frame keeps its original values
    X_t = X.copy()

    for col in columns:
        # yeojohnson takes 1-D input and, without an explicit lmbda, returns
        # a (transformed, fitted_lambda) pair.
        transformed, _ = yeojohnson(X_t[col].to_numpy(dtype="float64"))
        X_t[col] = transformed

    # Reinit woodwork, but only when its DataFrame accessor is registered;
    # this module does not import woodwork itself.
    if hasattr(X_t, "ww"):
        X_t.ww.init()
    return X_t


def test_transform_normalizes_only_requested_column():
    rng = np.random.default_rng(0)
    X = pd.DataFrame(
        {
            "Column1": rng.normal(0, 1, 1000),  # Normally distributed data
            "Column2": rng.exponential(1, 1000),  # Right-skewed data
            "Column3": -rng.exponential(1, 1000),  # Left-skewed data
        },
    )
    before = X.copy()

    X_t = transform(None, X, "Column2")

    # The requested column is closer to symmetric than it was.
    assert abs(skew(X_t["Column2"])) < abs(skew(before["Column2"]))
    # Other columns and the caller's frame are untouched.
    pd.testing.assert_series_equal(X_t["Column1"], before["Column1"])
    pd.testing.assert_series_equal(X_t["Column3"], before["Column3"])
    pd.testing.assert_frame_equal(X, before)


def test_transform_without_columns_returns_input():
    X = pd.DataFrame({"Column1": [1.0, 2.0, 3.0]})
    assert transform(None, X, None) is X