New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implementation of max_batches parameter. #1087
Changes from all commits
15543f1
ae179b7
3b184db
6592ee7
7312dcb
823714f
49f28aa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -76,7 +76,8 @@ def __init__(self, | |
n_jobs=-1, | ||
tuner_class=None, | ||
verbose=True, | ||
optimize_thresholds=False): | ||
optimize_thresholds=False, | ||
_max_batches=None): | ||
"""Automated pipeline search | ||
|
||
Arguments: | ||
|
@@ -129,6 +130,9 @@ def __init__(self, | |
None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. | ||
|
||
verbose (boolean): If True, turn verbosity on. Defaults to True | ||
|
||
_max_batches (int): The maximum number of batches of pipelines to search. Parameters max_time and | ||
max_pipelines have precedence over stopping the search. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks! |
||
""" | ||
try: | ||
self.problem_type = handle_problem_types(problem_type) | ||
|
@@ -166,7 +170,7 @@ def __init__(self, | |
raise TypeError("max_time must be a float, int, or string. Received a {}.".format(type(max_time))) | ||
|
||
self.max_pipelines = max_pipelines | ||
if self.max_pipelines is None and self.max_time is None: | ||
if self.max_pipelines is None and self.max_time is None and _max_batches is None: | ||
self.max_pipelines = 5 | ||
logger.info("Using default limit of max_pipelines=5.\n") | ||
|
||
|
@@ -199,6 +203,13 @@ def __init__(self, | |
self._start = None | ||
self._baseline_cv_score = None | ||
|
||
if _max_batches is not None and _max_batches <= 0: | ||
raise ValueError(f"Parameter max_batches must be None or positive. Received {_max_batches}.") | ||
self._max_batches = _max_batches | ||
# This is the default value for IterativeAlgorithm - setting this explicitly makes sure that | ||
# the behavior of max_batches does not break if IterativeAlgorithm is changed. | ||
self._pipelines_per_batch = 5 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah got it, thanks. |
||
|
||
self._validate_problem_type() | ||
|
||
@property | ||
|
@@ -365,6 +376,8 @@ def search(self, X, y, data_checks="auto", feature_types=None, show_iteration_pl | |
|
||
if self.allowed_pipelines == []: | ||
raise ValueError("No allowed pipelines to search") | ||
if self._max_batches and self.max_pipelines is None: | ||
self.max_pipelines = 1 + len(self.allowed_pipelines) + (self._pipelines_per_batch * (self._max_batches - 1)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense. When we change our automl algorithm we'll have to remember to update this. I spent some time considering if there's a way to have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can I clarify this calculation so I better understand? Is this 1 (baseline) + len(self.allowed_pipelines) for first batch + (self._pipelines_per_batch * (self._max_batches - 1)) for each batch thereafter? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes you got it @angela97lin ! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah gotcha, thanks @freddyaboulton! 😊 |
||
|
||
self.allowed_model_families = list(set([p.model_family for p in (self.allowed_pipelines)])) | ||
|
||
|
@@ -377,7 +390,8 @@ def search(self, X, y, data_checks="auto", feature_types=None, show_iteration_pl | |
tuner_class=self.tuner_class, | ||
random_state=self.random_state, | ||
n_jobs=self.n_jobs, | ||
number_features=X.shape[1] | ||
number_features=X.shape[1], | ||
pipelines_per_batch=self._pipelines_per_batch | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
) | ||
|
||
log_title(logger, "Beginning pipeline search") | ||
|
@@ -555,7 +569,6 @@ def _compute_cv_scores(self, pipeline, X, y): | |
try: | ||
X_threshold_tuning = None | ||
y_threshold_tuning = None | ||
|
||
if self.optimize_thresholds and self.objective.problem_type == ProblemTypes.BINARY and self.objective.can_optimize_threshold: | ||
X_train, X_threshold_tuning, y_train, y_threshold_tuning = train_test_split(X_train, y_train, test_size=0.2, random_state=self.random_state) | ||
cv_pipeline = pipeline.clone() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1011,3 +1011,62 @@ class Pipeline2(DummyPipeline): | |
baseline_name: round(objective.calculate_percent_difference(baseline_score, baseline_score), 2)} | ||
for name in answers: | ||
np.testing.assert_almost_equal(scores[name], answers[name], decimal=3) | ||
|
||
|
||
@pytest.mark.parametrize("max_batches", [None, 1, 5, 8, 9, 10, 12]) | ||
@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8}) | ||
@patch('evalml.pipelines.BinaryClassificationPipeline.fit') | ||
def test_max_batches_works(mock_pipeline_fit, mock_score, max_batches, X_y_binary): | ||
X, y = X_y_binary | ||
|
||
automl = AutoMLSearch(problem_type="binary", max_pipelines=None, | ||
_max_batches=max_batches, objective="log_loss_binary") | ||
automl.search(X, y, data_checks=None) | ||
|
||
if max_batches is None: | ||
n_results = 5 | ||
max_batches = 1 | ||
# _automl_algorithm will include all allowed_pipelines in the first batch even | ||
# if they are not searched over. That is why n_automl_pipelines does not equal | ||
# n_results when max_pipelines and max_batches are None | ||
n_automl_pipelines = 1 + len(automl.allowed_pipelines) | ||
else: | ||
# So that the test does not break when new estimator classes are added | ||
n_results = 1 + len(automl.allowed_pipelines) + (5 * (max_batches - 1)) | ||
n_automl_pipelines = n_results | ||
|
||
assert automl._automl_algorithm.batch_number == max_batches | ||
# We add 1 to pipeline_number because _automl_algorithm does not know about the baseline | ||
assert automl._automl_algorithm.pipeline_number + 1 == n_automl_pipelines | ||
assert len(automl.results["pipeline_results"]) == n_results | ||
assert automl.rankings.shape[0] == min(1 + len(automl.allowed_pipelines), n_results) | ||
assert automl.full_rankings.shape[0] == n_results | ||
|
||
|
||
@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8}) | ||
@patch('evalml.pipelines.BinaryClassificationPipeline.fit') | ||
def test_max_batches_plays_nice_with_other_stopping_criteria(mock_fit, mock_score, X_y_binary): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Haha nice |
||
X, y = X_y_binary | ||
|
||
# Use the old default when all are None | ||
automl = AutoMLSearch(problem_type="binary", objective="log_loss_binary") | ||
automl.search(X, y, data_checks=None) | ||
assert len(automl.results["pipeline_results"]) == 5 | ||
|
||
# Use max_pipelines when both max_pipelines and max_batches are set | ||
automl = AutoMLSearch(problem_type="binary", objective="log_loss_binary", _max_batches=10, | ||
max_pipelines=6) | ||
automl.search(X, y, data_checks=None) | ||
assert len(automl.results["pipeline_results"]) == 6 | ||
|
||
# Don't change max_pipelines when only max_pipelines is set | ||
automl = AutoMLSearch(problem_type="binary", max_pipelines=4) | ||
automl.search(X, y, data_checks=None) | ||
assert len(automl.results["pipeline_results"]) == 4 | ||
|
||
|
||
@pytest.mark.parametrize("max_batches", [0, -1, -10, -np.inf]) | ||
def test_max_batches_must_be_non_negative(max_batches): | ||
|
||
with pytest.raises(ValueError, match="Parameter max_batches must be None or positive."): | ||
AutoMLSearch(problem_type="binary", _max_batches=max_batches) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I think this is a good pattern for us to avoid breaking changes but still communicate clearly to users 👍