
Implementation of max_batches parameter. #1087

Merged
merged 7 commits on Aug 21, 2020
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -11,6 +11,7 @@ Release Notes
* Standardized error when calling transform/predict before fit for pipelines :pr:`1048`
* Added `percent_better_than_baseline` to AutoML search rankings and full rankings table :pr:`1050`
* Added "Feature Value" column to prediction explanation reports. :pr:`1064`
* Added `max_batches` parameter to AutoMLSearch :pr:`1087`
* Fixes
* Updated TextFeaturizer component to no longer require an internet connection to run :pr:`1022`
* Fixed non-deterministic element of TextFeaturizer transformations :pr:`1022`
21 changes: 17 additions & 4 deletions evalml/automl/automl_search.py
@@ -76,7 +76,8 @@ def __init__(self,
n_jobs=-1,
tuner_class=None,
verbose=True,
optimize_thresholds=False):
optimize_thresholds=False,
_max_batches=None):
Contributor: Thanks, I think this is a good pattern for us to avoid breaking changes but still communicate clearly to users 👍
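
For illustration, a minimal usage sketch of the underscore-prefixed keyword, based on the tests added in this PR; the import path and the synthetic data are assumptions, not part of the diff:

```python
from sklearn.datasets import make_classification
from evalml.automl import AutoMLSearch  # import path assumed from this repo's layout

# Illustrative binary classification data
X, y = make_classification(n_samples=100, n_features=5, random_state=0)

# `_max_batches` is underscore-prefixed because it is still experimental, so the public
# name can change later without a breaking release (the pattern discussed above).
automl = AutoMLSearch(problem_type="binary", objective="log_loss_binary", _max_batches=3)
automl.search(X, y, data_checks=None)
```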

"""Automated pipeline search

Arguments:
@@ -129,6 +130,9 @@ def __init__(self,
None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.

verbose (boolean): If True, turn verbosity on. Defaults to True

_max_batches (int): The maximum number of batches of pipelines to search. If max_time or max_pipelines
is also set, those parameters take precedence in stopping the search.
Contributor: Thanks!

"""
try:
self.problem_type = handle_problem_types(problem_type)
@@ -166,7 +170,7 @@ def __init__(self,
raise TypeError("max_time must be a float, int, or string. Received a {}.".format(type(max_time)))

self.max_pipelines = max_pipelines
if self.max_pipelines is None and self.max_time is None:
if self.max_pipelines is None and self.max_time is None and _max_batches is None:
self.max_pipelines = 5
logger.info("Using default limit of max_pipelines=5.\n")

@@ -199,6 +203,13 @@ def __init__(self,
self._start = None
self._baseline_cv_score = None

if _max_batches is not None and _max_batches <= 0:
raise ValueError(f"Parameter max_batches must be None or greater than 0. Received {_max_batches}.")
self._max_batches = _max_batches
# This is the default value for IterativeAlgorithm; setting it explicitly here makes sure that
# the behavior of max_batches does not break if IterativeAlgorithm's default changes.
self._pipelines_per_batch = 5
Contributor: Ah got it, thanks.


self._validate_problem_type()

@property
@@ -365,6 +376,8 @@ def search(self, X, y, data_checks="auto", feature_types=None, show_iteration_pl

if self.allowed_pipelines == []:
raise ValueError("No allowed pipelines to search")
if self._max_batches and self.max_pipelines is None:
self.max_pipelines = 1 + len(self.allowed_pipelines) + (self._pipelines_per_batch * (self._max_batches - 1))
Contributor: Makes sense. When we change our automl algorithm we'll have to remember to update this. I spent some time considering if there's a way to have IterativeAlgorithm compute this for us, but I think this is fine.

Contributor: Can I clarify this calculation so I better understand? Is this 1 (baseline) + len(self.allowed_pipelines) for first batch + (self._pipelines_per_batch * (self._max_batches - 1)) for each batch thereafter?

Contributor Author: Yes you got it @angela97lin !

Contributor: ah gotcha, thanks @freddyaboulton! 😊
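
To spell out the calculation being discussed, a small worked example; the count of 7 allowed pipelines is hypothetical, not from this PR:

```python
# Hypothetical inputs
n_allowed_pipelines = 7   # len(self.allowed_pipelines)
pipelines_per_batch = 5   # self._pipelines_per_batch
max_batches = 3           # self._max_batches

# 1 baseline pipeline + every allowed pipeline in the first batch
# + pipelines_per_batch for each remaining batch
max_pipelines = 1 + n_allowed_pipelines + pipelines_per_batch * (max_batches - 1)
assert max_pipelines == 18  # 1 + 7 + 5 * 2
```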


self.allowed_model_families = list(set([p.model_family for p in (self.allowed_pipelines)]))

@@ -377,7 +390,8 @@ def search(self, X, y, data_checks="auto", feature_types=None, show_iteration_pl
tuner_class=self.tuner_class,
random_state=self.random_state,
n_jobs=self.n_jobs,
number_features=X.shape[1]
number_features=X.shape[1],
pipelines_per_batch=self._pipelines_per_batch
Contributor: 👍

)

log_title(logger, "Beginning pipeline search")
@@ -555,7 +569,6 @@ def _compute_cv_scores(self, pipeline, X, y):
try:
X_threshold_tuning = None
y_threshold_tuning = None

if self.optimize_thresholds and self.objective.problem_type == ProblemTypes.BINARY and self.objective.can_optimize_threshold:
X_train, X_threshold_tuning, y_train, y_threshold_tuning = train_test_split(X_train, y_train, test_size=0.2, random_state=self.random_state)
cv_pipeline = pipeline.clone()
59 changes: 59 additions & 0 deletions evalml/tests/automl_tests/test_automl.py
@@ -1011,3 +1011,62 @@ class Pipeline2(DummyPipeline):
baseline_name: round(objective.calculate_percent_difference(baseline_score, baseline_score), 2)}
for name in answers:
np.testing.assert_almost_equal(scores[name], answers[name], decimal=3)


@pytest.mark.parametrize("max_batches", [None, 1, 5, 8, 9, 10, 12])
@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8})
@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
def test_max_batches_works(mock_pipeline_fit, mock_score, max_batches, X_y_binary):
X, y = X_y_binary

automl = AutoMLSearch(problem_type="binary", max_pipelines=None,
_max_batches=max_batches, objective="log_loss_binary")
automl.search(X, y, data_checks=None)

if max_batches is None:
n_results = 5
max_batches = 1
# _automl_algorithm will include all allowed_pipelines in the first batch even
# if they are not searched over. That is why n_automl_pipelines does not equal
# n_results when max_pipelines and max_batches are None
n_automl_pipelines = 1 + len(automl.allowed_pipelines)
else:
# So that the test does not break when new estimator classes are added
n_results = 1 + len(automl.allowed_pipelines) + (5 * (max_batches - 1))
n_automl_pipelines = n_results

assert automl._automl_algorithm.batch_number == max_batches
# We add 1 to pipeline_number because _automl_algorithm does not know about the baseline
assert automl._automl_algorithm.pipeline_number + 1 == n_automl_pipelines
assert len(automl.results["pipeline_results"]) == n_results
assert automl.rankings.shape[0] == min(1 + len(automl.allowed_pipelines), n_results)
assert automl.full_rankings.shape[0] == n_results
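
For concreteness, a helper that mirrors the expected counts asserted in the test above; the helper function and the pipeline count of 7 are illustrative, not part of the PR:

```python
def expected_n_results(max_batches, n_allowed_pipelines, pipelines_per_batch=5):
    # Mirrors the branches of test_max_batches_works (illustrative helper).
    if max_batches is None:
        return 5  # falls back to the old default of max_pipelines=5
    return 1 + n_allowed_pipelines + pipelines_per_batch * (max_batches - 1)

assert expected_n_results(1, n_allowed_pipelines=7) == 8    # baseline + first batch only
assert expected_n_results(5, n_allowed_pipelines=7) == 28   # 1 + 7 + 5 * 4
assert expected_n_results(None, n_allowed_pipelines=7) == 5
```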


@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8})
@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
def test_max_batches_plays_nice_with_other_stopping_criteria(mock_fit, mock_score, X_y_binary):
Contributor: Haha nice

X, y = X_y_binary

# Use the old default when all are None
automl = AutoMLSearch(problem_type="binary", objective="log_loss_binary")
automl.search(X, y, data_checks=None)
assert len(automl.results["pipeline_results"]) == 5

# Use max_pipelines when both max_pipelines and max_batches are set
automl = AutoMLSearch(problem_type="binary", objective="log_loss_binary", _max_batches=10,
max_pipelines=6)
automl.search(X, y, data_checks=None)
assert len(automl.results["pipeline_results"]) == 6

# Don't change max_pipelines when only max_pipelines is set
automl = AutoMLSearch(problem_type="binary", max_pipelines=4)
automl.search(X, y, data_checks=None)
assert len(automl.results["pipeline_results"]) == 4


@pytest.mark.parametrize("max_batches", [0, -1, -10, -np.inf])
def test_max_batches_must_be_non_negative(max_batches):

with pytest.raises(ValueError, match="Parameter max batches must be None or non-negative. Received {max_batches}."):
AutoMLSearch(problem_type="binary", _max_batches=max_batches)