diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 2be33184c8..acaa7a3a31 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -29,6 +29,7 @@ Release Notes
         * Set max value for plotly and xgboost versions while we debug CI failures with newer versions :pr:`1532`
         * Undo version pinning for plotly :pr:`1533`
         * Fix ReadTheDocs build by updating the version of ``setuptools`` :pr:`1561`
+        * Set ``random_state`` of data splitter in AutoMLSearch to take int to keep consistency in the resulting splits :pr:`1579`
     * Changes
         * Update circleci badge to apply to ``main`` :pr:`1489`
         * Added script to generate github markdown for releases :pr:`1487`
diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py
index 935a20ad17..55fa892046 100644
--- a/evalml/automl/automl_search.py
+++ b/evalml/automl/automl_search.py
@@ -49,7 +49,7 @@
 from evalml.pipelines.utils import make_pipeline
 from evalml.problem_types import ProblemTypes, handle_problem_types
 from evalml.tuners import SKOptTuner
-from evalml.utils import convert_to_seconds, get_random_state
+from evalml.utils import convert_to_seconds, get_random_seed, get_random_state
 from evalml.utils.gen_utils import (
     _convert_to_woodwork_structure,
     _convert_woodwork_types_wrapper
@@ -229,6 +229,7 @@ def __init__(self,
             'errors': []
         }
         self.random_state = get_random_state(random_state)
+        self.random_seed = get_random_seed(self.random_state)
         self.n_jobs = n_jobs
         self.plot = None
@@ -372,7 +373,7 @@ def _set_data_split(self, X, y):
             y (pd.Series, ww.DataColumn): The target training data of length [n_samples].
         """
         default_data_split = make_data_splitter(X, y, self.problem_type, self.problem_configuration,
-                                                n_splits=3, shuffle=True, random_state=self.random_state)
+                                                n_splits=3, shuffle=True, random_state=self.random_seed)
         self.data_split = self.data_split or default_data_split
 
     def search(self, X, y, data_checks="auto", show_iteration_plot=True):
diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py
index 14f99b9884..21ff4f82b9 100644
--- a/evalml/tests/automl_tests/test_automl.py
+++ b/evalml/tests/automl_tests/test_automl.py
@@ -1887,3 +1887,36 @@ class Pipeline2(TimeSeriesRegressionPipeline):
             continue
         assert result['parameters']['Delayed Feature Transformer'] == configuration
         assert result['parameters']['pipeline'] == configuration
+
+
+@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION])
+@patch('evalml.pipelines.RegressionPipeline.fit')
+@patch('evalml.pipelines.RegressionPipeline.score')
+@patch('evalml.pipelines.MulticlassClassificationPipeline.fit')
+@patch('evalml.pipelines.MulticlassClassificationPipeline.score')
+@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
+@patch('evalml.pipelines.BinaryClassificationPipeline.score')
+def test_automl_data_split_consistent(mock_binary_score, mock_binary_fit, mock_multi_score, mock_multi_fit,
+                                      mock_regression_score, mock_regression_fit, problem_type,
+                                      X_y_binary, X_y_multi, X_y_regression):
+    if problem_type == ProblemTypes.BINARY:
+        X, y = X_y_binary
+
+    elif problem_type == ProblemTypes.MULTICLASS:
+        X, y = X_y_multi
+
+    elif problem_type == ProblemTypes.REGRESSION:
+        X, y = X_y_regression
+
+    data_splits = []
+    random_state = [0, 0, 1]
+    for state in random_state:
+        a = AutoMLSearch(problem_type=problem_type, random_state=state, max_iterations=1)
+        a.search(X, y)
+        data_splits.append([[set(train), set(test)] for train, test in a.data_split.split(X, y)])
+    # append split from last random state again, should be referencing same datasplit object
+    data_splits.append([[set(train), set(test)] for train, test in a.data_split.split(X, y)])
+
+    assert data_splits[0] == data_splits[1]
+    assert data_splits[1] != data_splits[2]
+    assert data_splits[2] == data_splits[3]
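
Editor's note, not part of the diff above: a minimal sketch of why the splitter now receives an int seed (self.random_seed) instead of the np.random.RandomState object held in self.random_state. scikit-learn splitters reuse a RandomState instance across calls, so its internal state advances and repeated split() calls can yield different folds; an int seed rebuilds the same generator on every call, which is what the new test checks with data_splits[2] == data_splits[3]. The sketch uses scikit-learn's KFold directly as a stand-in for evalml's make_data_splitter.

    # Sketch under the assumption above; illustrative only, not evalml code.
    import numpy as np
    from sklearn.model_selection import KFold

    X = np.arange(40).reshape(20, 2)

    # Int seed: split() rebuilds the RNG each call, so repeated splits match.
    kf_int = KFold(n_splits=3, shuffle=True, random_state=0)
    first = [test.tolist() for _, test in kf_int.split(X)]
    second = [test.tolist() for _, test in kf_int.split(X)]
    assert first == second

    # RandomState object: the same generator is consumed on every split() call,
    # so a second call typically produces different folds.
    kf_obj = KFold(n_splits=3, shuffle=True, random_state=np.random.RandomState(0))
    first_obj = [test.tolist() for _, test in kf_obj.split(X)]
    second_obj = [test.tolist() for _, test in kf_obj.split(X)]
    print(first_obj == second_obj)  # usually False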