Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Return details from TargetDistributionDataCheck as floats rather than strings #3085

Merged
merged 6 commits into from
Nov 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Release Notes
* Changes
* Delete ``predict_uses_y`` estimator attribute :pr:`3069`
* Change ``DateTimeFeaturizer`` to use corresponding Featuretools primitives :pr:`3081`
* Updated ``TargetDistributionDataCheck`` to return metadata details as floats rather than strings :pr:`3085`
* Documentation Changes
* Updated docs to use data check action methods rather than manually cleaning data :pr:`3050`
* Testing Changes
Expand Down
60 changes: 34 additions & 26 deletions evalml/data_checks/target_distribution_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def validate(self, X, y):
... "data_check_name": "TargetDistributionDataCheck",
... "level": "warning",
... "code": "TARGET_LOGNORMAL_DISTRIBUTION",
... "details": {"shapiro-statistic/pvalue": '0.8/0.045', "columns": None, "rows": None}}],
... "details": {"normalization_method": "shapiro", "statistic": 0.8, "p-value": 0.045, "columns": None, "rows": None}}],
... "actions": [{'code': 'TRANSFORM_TARGET',
... "data_check_name": "TargetDistributionDataCheck",
... 'metadata': {'transformation_strategy': 'lognormal',
Expand Down Expand Up @@ -95,32 +95,16 @@ def validate(self, X, y):
)
return results

normalization_test = shapiro if len(y) <= 5000 else jarque_bera
normalization_test_string = "shapiro" if len(y) <= 5000 else "jarque_bera"
# Check if a normal distribution is detected with p-value above 0.05
if normalization_test(y).pvalue >= 0.05:
return results

y_new = round(y, 6)
if any(y <= 0):
y_new = y + abs(y.min()) + 1

y_new = y_new[
y_new < (y_new.mean() + 3 * round(y.std(), 3))
] # Drop values greater than 3 standard deviations
norm_test_og = normalization_test(y_new)
norm_test_log = normalization_test(np.log(y_new))

log_detected = False

# If the p-value of the log transformed target is greater than or equal to the p-value of the original target
# with outliers dropped, then it would imply that the log transformed target has more of a normal distribution
if norm_test_log.pvalue >= norm_test_og.pvalue:
log_detected = True

if log_detected:
(
is_log_distribution,
normalization_test_string,
norm_test_og,
) = _detect_log_distribution_helper(y)
if is_log_distribution:
details = {
f"{normalization_test_string}-statistic/pvalue": f"{round(norm_test_og.statistic, 1)}/{round(norm_test_og.pvalue, 3)}"
"normalization_method": normalization_test_string,
"statistic": round(norm_test_og.statistic, 1),
"p-value": round(norm_test_og.pvalue, 3),
}
results["warnings"].append(
DataCheckWarning(
Expand All @@ -142,3 +126,27 @@ def validate(self, X, y):
)

return results


def _detect_log_distribution_helper(y):
    """Detect whether the target looks more normal after a log transform.

    The statistical logic lives in this helper so that ``validate`` stays
    easy to read.

    Args:
        y: Numeric target values. Must support ``len``, element-wise
            comparison, ``min``/``mean``/``std`` and boolean-mask indexing
            (e.g. a ``pandas.Series``).

    Returns:
        tuple: ``(is_log_distribution, normalization_test_string, norm_test_og)``.

        * ``is_log_distribution`` (bool) -- True when the log-transformed
          target has a p-value at least as large as the original target
          (outliers dropped in both cases).
        * ``normalization_test_string`` (str) -- ``"shapiro"`` when
          ``len(y) <= 5000``, otherwise ``"jarque_bera"``.
        * ``norm_test_og`` -- result of the normality test on the original
          target with outliers dropped, or ``None`` when ``y`` already
          passes the normality test and no further test is run.
    """
    # Pick the test once from the sample size.
    # NOTE(review): the 5000 cut-off presumably mirrors SciPy's caveat that
    # Shapiro-Wilk p-values may be inaccurate for N > 5000 -- confirm.
    if len(y) <= 5000:
        normalization_test, normalization_test_string = shapiro, "shapiro"
    else:
        normalization_test, normalization_test_string = jarque_bera, "jarque_bera"

    # Check if a normal distribution is detected with p-value above 0.05
    if normalization_test(y).pvalue >= 0.05:
        return False, normalization_test_string, None

    if (y <= 0).any():
        # Shift the target so its minimum becomes 1, keeping np.log defined.
        y_new = y + abs(y.min()) + 1
    else:
        y_new = round(y, 6)

    # Drop values greater than 3 standard deviations
    upper_bound = y_new.mean() + 3 * round(y.std(), 3)
    y_new = y_new[y_new < upper_bound]

    norm_test_og = normalization_test(y_new)
    norm_test_log = normalization_test(np.log(y_new))

    # If the p-value of the log transformed target is greater than or equal to
    # the p-value of the original target with outliers dropped, then it would
    # imply that the log transformed target has more of a normal distribution
    is_log_distribution = bool(norm_test_log.pvalue >= norm_test_og.pvalue)
    return is_log_distribution, normalization_test_string, norm_test_og
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,9 @@ def test_target_distribution_data_check_warning_action(
test_og = statistic(y)

details = {
f"{name}-statistic/pvalue": f"{round(test_og.statistic, 1)}/{round(test_og.pvalue, 3)}"
"normalization_method": name,
"statistic": round(test_og.statistic, 1),
"p-value": round(test_og.pvalue, 3),
}
assert target_dist_ == {
"warnings": [
Expand Down