
Add HighVarianceCVDataCheck #1254

Merged
merged 13 commits on Oct 8, 2020
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -9,6 +9,7 @@ Release Notes
* Added stacked ensemble component classes (StackedEnsembleClassifier, StackedEnsembleRegressor) :pr:`1134`
* Added parameter to ``OneHotEncoder`` to enable filtering for features to encode for :pr:`1249`
* Added percent-better-than-baseline for all objectives to automl.results :pr:`1244`
* Added ``HighVarianceCVDataCheck`` and replaced synonymous warning in ``AutoMLSearch`` :pr:`1254`
* Fixes
* Fixed ML performance issue with ordered datasets: always shuffle data in automl's default CV splits :pr:`1265`
* Changes
24 changes: 11 additions & 13 deletions evalml/automl/automl_search.py
@@ -1,6 +1,5 @@
import copy
import time
import warnings
from collections import OrderedDict, defaultdict

import cloudpickle
@@ -22,7 +21,8 @@
AutoMLDataChecks,
DataChecks,
DefaultDataChecks,
EmptyDataChecks
EmptyDataChecks,
HighVarianceCVDataCheck
)
from evalml.data_checks.data_check_message_type import DataCheckMessageType
from evalml.exceptions import (
@@ -395,7 +395,7 @@ def search(self, X, y, data_checks="auto", feature_types=None, show_iteration_pl
data_checks = self._validate_data_checks(data_checks)
data_check_results = data_checks.validate(X, y)

if len(data_check_results) > 0:
if data_check_results:
self._data_check_results = data_check_results
for message in self._data_check_results:
if message.message_type == DataCheckMessageType.WARNING:
@@ -694,16 +694,18 @@ def _add_result(self, trained_pipeline, parameters, training_time, cv_data, cv_s
self._baseline_cv_scores.get(obj_name, np.nan))
percent_better_than_baseline[obj_name] = percent_better

# calculate high_variance_cv
# if the coefficient of variance is greater than .2
with warnings.catch_warnings():
warnings.simplefilter('ignore')
high_variance_cv = (cv_scores.std() / cv_scores.mean()) > .2

pipeline_name = trained_pipeline.name
pipeline_summary = trained_pipeline.summary
pipeline_id = len(self._results['pipeline_results'])

high_variance_cv_check = HighVarianceCVDataCheck(threshold=0.2)
Contributor:
Should users be allowed to configure this threshold now that this is a DataCheck and we let users configure other data checks?

Contributor (author):
Good thought! Perhaps we can turn this on and off depending on whether `data_checks="auto"` or `data_checks="disabled"` is passed. But I don't think it'll fit within the existing API for parameterizing data checks, since all of those data checks run before search is called, whereas this check runs during search. I like the idea, but we'd need to think about what API changes to make to `AutoMLSearch.search()`.

Contributor:
👍 Yep @freddyaboulton, I agree this should be configurable/disable-able.

This PR is essentially porting existing behavior into a new API (data checks). I'll file an issue now to track making this configurable.
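For illustration only, a rough sketch of what a configurable version might look like if the threshold were surfaced on the `AutoMLSearch` constructor. The class and parameter names below (`ConfigurableSearchSketch`, `high_variance_threshold`) are invented for this sketch and are not part of this PR:

```python
# Hypothetical sketch, not the merged implementation: surface the
# threshold on the search object instead of hard-coding 0.2 in _add_result.
from evalml.data_checks import HighVarianceCVDataCheck


class ConfigurableSearchSketch:
    def __init__(self, high_variance_threshold=0.2):
        # Build the check once, with a user-supplied threshold.
        self._high_variance_cv_check = HighVarianceCVDataCheck(
            threshold=high_variance_threshold)

    def _check_cv_variance(self, pipeline_name, cv_scores):
        # Would run during search, after each pipeline's CV scores are computed.
        return self._high_variance_cv_check.validate(
            pipeline_name=pipeline_name, cv_scores=cv_scores)
```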

high_variance_cv_check_results = high_variance_cv_check.validate(pipeline_name=pipeline_name, cv_scores=cv_scores)
high_variance_cv = False

if high_variance_cv_check_results:
logger.warning(high_variance_cv_check_results[0])
Contributor:
@jeremyliweishih does this show up in the console in a well-formatted way? I've noticed that `str(check)` doesn't look great. You may have to call `.message`.
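A minimal sketch of that suggestion, assuming `DataCheckWarning` exposes the warning text via a `.message` attribute as the reviewer implies (pipeline name and scores are made up for the example):

```python
import logging

import pandas as pd

from evalml.data_checks import HighVarianceCVDataCheck

logger = logging.getLogger(__name__)

check = HighVarianceCVDataCheck(threshold=0.2)
results = check.validate(pipeline_name="MyPipeline",
                         cv_scores=pd.Series([0.2, 0.8, 0.9]))
if results:
    # Log the plain warning text; logging the DataCheckWarning object
    # itself would print its repr, which reads poorly in the console.
    logger.warning(results[0].message)
```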

high_variance_cv = True

self._results['pipeline_results'][pipeline_id] = {
"id": pipeline_id,
"pipeline_name": pipeline_name,
Expand Down Expand Up @@ -785,10 +787,6 @@ def describe_pipeline(self, pipeline_id, return_dict=False):
logger.info("Total training time (including CV): %.1f seconds" % pipeline_results["training_time"])
log_subtitle(logger, "Cross Validation", underline="-")

if pipeline_results["high_variance_cv"]:
Contributor (author):
I moved the logging behavior from `describe_pipeline` to `_add_result` but otherwise kept the same usage of `high_variance_cv` within rankings, etc. I feel it's more appropriate to notify during the search process, not just when describing a pipeline. Happy to discuss further!

Contributor:
I think this makes sense!

Contributor:
Agreed!

logger.warning("High variance within cross validation scores. " +
"Model may not perform as estimated on unseen data.")

all_objective_scores = [fold["all_objective_scores"] for fold in pipeline_results["cv_data"]]
all_objective_scores = pd.DataFrame(all_objective_scores)

1 change: 1 addition & 0 deletions evalml/data_checks/__init__.py
@@ -11,3 +11,4 @@
from .outliers_data_check import OutliersDataCheck
from .no_variance_data_check import NoVarianceDataCheck
from .class_imbalance_data_check import ClassImbalanceDataCheck
from .high_variance_cv_data_check import HighVarianceCVDataCheck
45 changes: 45 additions & 0 deletions evalml/data_checks/high_variance_cv_data_check.py
@@ -0,0 +1,45 @@
import pandas as pd

from .data_check import DataCheck
from .data_check_message import DataCheckWarning


class HighVarianceCVDataCheck(DataCheck):
"""Checks if the variance between folds in cross-validation is higher than an acceptable threshhold."""

def __init__(self, threshold=0.2):
"""Check if there is higher variance among cross-validation results.

Arguments:
            threshold (float): The maximum allowed coefficient of variation (std / mean across fold scores) before a warning is raised.
                Defaults to 0.2 and must be above 0.
"""
        if threshold <= 0:
raise ValueError(f"Provided threshold {threshold} needs to be greater than 0.")
self.threshold = threshold

def validate(self, pipeline_name, cv_scores):
"""Checks cross-validation scores and issues an warning if variance is higher than specified threshhold.

Arguments:
            pipeline_name (str): Name of the pipeline that produced cv_scores.
            cv_scores (pd.Series, np.ndarray, list): Scores of each cross-validation fold.

Returns:
            list (DataCheckWarning): List containing a DataCheckWarning if the coefficient of variation of the cross-validation scores exceeds the threshold.

Example:
>>> cv_scores = pd.Series([0, 1, 1, 1])
>>> check = HighVarianceCVDataCheck(threshold=0.10)
>>> assert check.validate("LogisticRegressionPipeline", cv_scores) == [DataCheckWarning("High coefficient of variation (cv >= 0.1) within cross validation scores. LogisticRegressionPipeline may not perform as estimated on unseen data.", "HighVarianceCVDataCheck")]
"""
if not isinstance(cv_scores, pd.Series):
cv_scores = pd.Series(cv_scores)

messages = []
high_variance_cv = abs(cv_scores.std() / cv_scores.mean()) > self.threshold
        # warn if the coefficient of variation of the fold scores exceeds the threshold
if high_variance_cv:
warning_msg = f"High coefficient of variation (cv >= {self.threshold}) within cross validation scores. {pipeline_name} may not perform as estimated on unseen data."
messages.append(DataCheckWarning(warning_msg, self.name))
return messages
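For reference, a quick standalone usage sketch of the new check; the pipeline name and per-fold scores below are made up for the example:

```python
import pandas as pd

from evalml.data_checks import HighVarianceCVDataCheck

check = HighVarianceCVDataCheck(threshold=0.2)
# Hypothetical fold scores; the spread is large relative to the mean,
# so the coefficient of variation (~0.28) exceeds the 0.2 threshold.
cv_scores = pd.Series([0.62, 0.91, 0.55])
for warning in check.validate(pipeline_name="MyPipeline", cv_scores=cv_scores):
    print(warning.message)
```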
2 changes: 1 addition & 1 deletion evalml/tests/automl_tests/test_automl.py
Expand Up @@ -73,7 +73,7 @@ def test_search_results(X_y_regression, X_y_binary, X_y_multi, automl_type):
assert isinstance(results['pipeline_summary'], str)
assert isinstance(results['parameters'], dict)
assert isinstance(results['score'], float)
assert isinstance(results['high_variance_cv'], np.bool_)
assert isinstance(results['high_variance_cv'], bool)
assert isinstance(results['cv_data'], list)
for cv_result in results['cv_data']:
assert cv_result.keys() == expected_cv_data_keys
28 changes: 28 additions & 0 deletions evalml/tests/data_checks_tests/test_high_variance_cv_data_check.py
@@ -0,0 +1,28 @@
import numpy as np
import pandas as pd
import pytest

from evalml.data_checks import DataCheckWarning, HighVarianceCVDataCheck


def test_high_variance_cv_data_check_invalid_threshold():
with pytest.raises(ValueError, match="needs to be greater than 0."):
HighVarianceCVDataCheck(threshold=-0.1).validate(pipeline_name='LogisticRegressionPipeline', cv_scores=pd.Series([0, 1, 1]))


def test_high_variance_cv_data_check():
high_variance_cv = HighVarianceCVDataCheck()

assert high_variance_cv.validate(pipeline_name='LogisticRegressionPipeline', cv_scores=[1, 1, 1]) == []
assert high_variance_cv.validate(pipeline_name='LogisticRegressionPipeline', cv_scores=pd.Series([1, 1, 1])) == []
assert high_variance_cv.validate(pipeline_name='LogisticRegressionPipeline', cv_scores=pd.Series([0, 1, 2, 3])) == [DataCheckWarning("High coefficient of variation (cv >= 0.2) within cross validation scores. LogisticRegressionPipeline may not perform as estimated on unseen data.", "HighVarianceCVDataCheck")]


def test_high_variance_cv_data_check_empty_nan():
high_variance_cv = HighVarianceCVDataCheck()
assert high_variance_cv.validate(pipeline_name='LogisticRegressionPipeline', cv_scores=pd.Series([0, 1, np.nan, np.nan])) == [DataCheckWarning("High coefficient of variation (cv >= 0.2) within cross validation scores. LogisticRegressionPipeline may not perform as estimated on unseen data.", "HighVarianceCVDataCheck")]


def test_high_variance_cv_data_check_negative():
high_variance_cv = HighVarianceCVDataCheck()
assert high_variance_cv.validate(pipeline_name='LogisticRegressionPipeline', cv_scores=pd.Series([0, -1, -1, -1])) == [DataCheckWarning("High coefficient of variation (cv >= 0.2) within cross validation scores. LogisticRegressionPipeline may not perform as estimated on unseen data.", "HighVarianceCVDataCheck")]
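As a sanity check on these fixtures, the coefficient of variation for `[0, 1, 2, 3]` works out as follows. Note that pandas uses the sample standard deviation (ddof=1) and skips NaN values, which is why the NaN fixture above still warns:

```python
import pandas as pd

scores = pd.Series([0, 1, 2, 3])
cv = abs(scores.std() / scores.mean())  # std (ddof=1) ~ 1.291, mean = 1.5
print(round(cv, 2))  # 0.86, well above the default 0.2 threshold
```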