Added _pipelines_per_batch as a private argument to AutoMLSearch #1355

Merged: 2 commits, Oct 27, 2020
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -14,6 +14,7 @@ Release Notes
 * Updated ``AutoMLSearch`` to support ``Woodwork`` data structures :pr:`1299`
 * Added cv_folds to ``ClassImbalanceDataCheck`` and added this check to ``DefaultDataChecks`` :pr:`1333`
 * Make ``max_batches`` argument to ``AutoMLSearch.search`` public :pr:`1320`
+* Added ``_pipelines_per_batch`` as a private argument to ``AutoMLSearch`` :pr:`1355`
 * Fixes
 * Fixed ML performance issue with ordered datasets: always shuffle data in automl's default CV splits :pr:`1265`
 * Fixed broken ``evalml info`` CLI command :pr:`1293`
10 changes: 6 additions & 4 deletions evalml/automl/automl_search.py
@@ -89,7 +89,8 @@ def __init__(self,
 verbose=True,
 optimize_thresholds=False,
 ensembling=False,
-max_batches=None):
+max_batches=None,
+_pipelines_per_batch=5):
 """Automated pipeline search

 Arguments:
@@ -149,6 +150,9 @@ def __init__(self,

 max_batches (int): The maximum number of batches of pipelines to search. Parameters max_time, and
 max_iterations have precedence over stopping the search.
+
+_pipelines_per_batch (int): The number of pipelines to train for every batch after the first one.
+The first batch will train a baseline pipeline + one of each pipeline family allowed in the search.
 """
 try:
 self.problem_type = handle_problem_types(problem_type)
@@ -191,9 +195,7 @@ def __init__(self,
 if max_batches is not None and max_batches <= 0:
 raise ValueError(f"Parameter max batches must be None or non-negative. Received {max_batches}.")
 self.max_batches = max_batches
-# This is the default value for IterativeAlgorithm - setting this explicitly makes sure that
-# the behavior of max_batches does not break if IterativeAlgorithm is changed.
-self._pipelines_per_batch = 5
+self._pipelines_per_batch = _pipelines_per_batch

 self.max_iterations = max_iterations
 if not self.max_iterations and not self.max_time and not self.max_batches:
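As a quick illustration of the new argument, here is a minimal usage sketch (not part of this diff): the synthetic sklearn dataset and the _pipelines_per_batch value of 3 are chosen only for the example, and the expected count assumes the search runs all batches to completion.

from sklearn.datasets import make_classification
from evalml.automl import AutoMLSearch

# Placeholder binary classification data for illustration only.
X, y = make_classification(n_samples=100, n_classes=2, random_state=0)

# With max_batches=2 and the private _pipelines_per_batch=3, the first batch trains
# a baseline pipeline plus one pipeline per allowed pipeline family, and the second
# batch trains 3 more pipelines.
automl = AutoMLSearch(problem_type='binary', max_batches=2, _pipelines_per_batch=3)
automl.search(X, y)
expected = 1 + len(automl.allowed_pipelines) + (2 - 1) * 3
assert len(automl.full_rankings) == expected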
30 changes: 30 additions & 0 deletions evalml/tests/automl_tests/test_automl.py
@@ -1458,3 +1458,33 @@ def test_input_not_woodwork_logs_warning(mock_fit, mock_score, caplog, X_y_binar
 automl.search(X, y)
 assert "`X` passed was not a DataTable. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead." in caplog.text
 assert "`y` passed was not a DataColumn. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead." in caplog.text


+@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8})
+@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
+def test_pipelines_per_batch(mock_fit, mock_score, X_y_binary):
+    def total_pipelines(automl, num_batches, batch_size):
+        total = 1 + len(automl.allowed_pipelines)
+        total += ((num_batches - 1) * batch_size)
+        return total
Contributor review comment on the helper above: This is neat, nice to see this math condensed

+    X, y = X_y_binary
+
+    # Checking for default of _pipelines_per_batch
+    automl = AutoMLSearch(problem_type='binary', max_batches=2)
+    automl.search(X, y)
+    assert automl._pipelines_per_batch == 5
+    assert automl._automl_algorithm.pipelines_per_batch == 5
+    assert total_pipelines(automl, 2, 5) == len(automl.full_rankings)
+
+    automl = AutoMLSearch(problem_type='binary', max_batches=1, _pipelines_per_batch=2)
+    automl.search(X, y)
+    assert automl._pipelines_per_batch == 2
+    assert automl._automl_algorithm.pipelines_per_batch == 2
+    assert total_pipelines(automl, 1, 2) == len(automl.full_rankings)
+
+    automl = AutoMLSearch(problem_type='binary', max_batches=2, _pipelines_per_batch=10)
+    automl.search(X, y)
+    assert automl._pipelines_per_batch == 10
+    assert automl._automl_algorithm.pipelines_per_batch == 10
+    assert total_pipelines(automl, 2, 10) == len(automl.full_rankings)
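
To make the count used by total_pipelines concrete, here is a small worked example; the number of allowed pipeline families is hypothetical, since the real value depends on the problem type and evalml version.

# Hypothetical: 6 allowed pipeline families, max_batches=2, default _pipelines_per_batch=5
num_allowed_pipelines = 6                # hypothetical count, varies by problem type / version
num_batches, pipelines_per_batch = 2, 5
total = 1 + num_allowed_pipelines + (num_batches - 1) * pipelines_per_batch
print(total)  # 12: baseline + one pipeline per family in batch 1, then 5 more in batch 2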