Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Release Notes
* Set max value for plotly and xgboost versions while we debug CI failures with newer versions :pr:`1532`
* Undo version pinning for plotly :pr:`1533`
* Fix ReadTheDocs build by updating the version of ``setuptools`` :pr:`1561`
* Set ``random_state`` of the data splitter in AutoMLSearch to accept an int, keeping the resulting splits consistent :pr:`1579`
* Changes
* Update circleci badge to apply to ``main`` :pr:`1489`
* Added script to generate github markdown for releases :pr:`1487`
Expand Down
5 changes: 3 additions & 2 deletions evalml/automl/automl_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
from evalml.pipelines.utils import make_pipeline
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.tuners import SKOptTuner
from evalml.utils import convert_to_seconds, get_random_state
from evalml.utils import convert_to_seconds, get_random_seed, get_random_state
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
Expand Down Expand Up @@ -229,6 +229,7 @@ def __init__(self,
'errors': []
}
self.random_state = get_random_state(random_state)
self.random_seed = get_random_seed(self.random_state)
self.n_jobs = n_jobs

self.plot = None
Expand Down Expand Up @@ -372,7 +373,7 @@ def _set_data_split(self, X, y):
y (pd.Series, ww.DataColumn): The target training data of length [n_samples].
"""
default_data_split = make_data_splitter(X, y, self.problem_type, self.problem_configuration,
n_splits=3, shuffle=True, random_state=self.random_state)
n_splits=3, shuffle=True, random_state=self.random_seed)
self.data_split = self.data_split or default_data_split

def search(self, X, y, data_checks="auto", show_iteration_plot=True):
Expand Down
33 changes: 33 additions & 0 deletions evalml/tests/automl_tests/test_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1887,3 +1887,36 @@ class Pipeline2(TimeSeriesRegressionPipeline):
continue
assert result['parameters']['Delayed Feature Transformer'] == configuration
assert result['parameters']['pipeline'] == configuration


@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION])
@patch('evalml.pipelines.RegressionPipeline.fit')
@patch('evalml.pipelines.RegressionPipeline.score')
@patch('evalml.pipelines.MulticlassClassificationPipeline.fit')
@patch('evalml.pipelines.MulticlassClassificationPipeline.score')
@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
@patch('evalml.pipelines.BinaryClassificationPipeline.score')
def test_automl_data_split_consistent(mock_binary_score, mock_binary_fit, mock_multi_score, mock_multi_fit,
                                      mock_regression_score, mock_regression_fit, problem_type,
                                      X_y_binary, X_y_multi, X_y_regression):
    """Verify that AutoMLSearch's data splitter yields identical folds for equal
    random states and different folds for different random states, and that
    re-splitting with the same data_split object reproduces the same folds."""
    dataset_for = {
        ProblemTypes.BINARY: X_y_binary,
        ProblemTypes.MULTICLASS: X_y_multi,
        ProblemTypes.REGRESSION: X_y_regression,
    }
    X, y = dataset_for[problem_type]

    splits = []
    for seed in (0, 0, 1):
        automl = AutoMLSearch(problem_type=problem_type, random_state=seed, max_iterations=1)
        automl.search(X, y)
        splits.append([[set(train), set(test)] for train, test in automl.data_split.split(X, y)])
    # Split once more with the last search's splitter: same object, so the
    # folds must match the previous call exactly.
    splits.append([[set(train), set(test)] for train, test in automl.data_split.split(X, y)])

    assert splits[0] == splits[1]
    assert splits[1] != splits[2]
    assert splits[2] == splits[3]