Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Release Notes
* Set max value for plotly and xgboost versions while we debug CI failures with newer versions :pr:`1532`
* Undo version pinning for plotly :pr:`1533`
* Fix ReadTheDocs build by updating the version of ``setuptools`` :pr:`1561`
* Set ``random_state`` of the data splitter in AutoMLSearch to accept an int, keeping the resulting splits consistent :pr:`1579`
* Changes
* Update circleci badge to apply to ``main`` :pr:`1489`
* Added script to generate github markdown for releases :pr:`1487`
Expand Down
5 changes: 3 additions & 2 deletions evalml/automl/automl_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
from evalml.pipelines.utils import make_pipeline
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.tuners import SKOptTuner
from evalml.utils import convert_to_seconds, get_random_state
from evalml.utils import convert_to_seconds, get_random_seed, get_random_state
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
Expand Down Expand Up @@ -229,6 +229,7 @@ def __init__(self,
'errors': []
}
self.random_state = get_random_state(random_state)
self.random_seed = get_random_seed(self.random_state)
self.n_jobs = n_jobs

self.plot = None
Expand Down Expand Up @@ -372,7 +373,7 @@ def _set_data_split(self, X, y):
y (pd.Series, ww.DataColumn): The target training data of length [n_samples].
"""
default_data_split = make_data_splitter(X, y, self.problem_type, self.problem_configuration,
n_splits=3, shuffle=True, random_state=self.random_state)
n_splits=3, shuffle=True, random_state=self.random_seed)
self.data_split = self.data_split or default_data_split

def search(self, X, y, data_checks="auto", show_iteration_plot=True):
Expand Down
33 changes: 33 additions & 0 deletions evalml/tests/automl_tests/test_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1887,3 +1887,36 @@ class Pipeline2(TimeSeriesRegressionPipeline):
continue
assert result['parameters']['Delayed Feature Transformer'] == configuration
assert result['parameters']['pipeline'] == configuration


@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION])
@patch('evalml.pipelines.RegressionPipeline.fit')
@patch('evalml.pipelines.RegressionPipeline.score')
@patch('evalml.pipelines.MulticlassClassificationPipeline.fit')
@patch('evalml.pipelines.MulticlassClassificationPipeline.score')
@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
@patch('evalml.pipelines.BinaryClassificationPipeline.score')
def test_automl_data_split_consistent(mock_binary_score, mock_binary_fit, mock_multi_score, mock_multi_fit,
                                      mock_regression_score, mock_regression_fit, problem_type,
                                      X_y_binary, X_y_multi, X_y_regression):
    """Verify that AutoMLSearch's data splitter yields identical folds for equal
    random states and different folds for different random states, and that
    re-splitting with the same data_split object reproduces the same folds."""
    dataset_for = {
        ProblemTypes.BINARY: X_y_binary,
        ProblemTypes.MULTICLASS: X_y_multi,
        ProblemTypes.REGRESSION: X_y_regression,
    }
    X, y = dataset_for[problem_type]

    splits = []
    for seed in (0, 0, 1):
        automl = AutoMLSearch(problem_type=problem_type, random_state=seed, max_iterations=1)
        automl.search(X, y)
        splits.append([[set(train), set(test)] for train, test in automl.data_split.split(X, y)])
    # Split once more with the last search's splitter: same object, so the
    # folds must match the previous call exactly.
    splits.append([[set(train), set(test)] for train, test in automl.data_split.split(X, y)])

    assert splits[0] == splits[1]
    assert splits[1] != splits[2]
    assert splits[2] == splits[3]