alteryx · bchen1116 · Feb 7, 2022 · Feb 1, 2022 · Feb 1, 2022 · Feb 1, 2022
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -5,6 +5,7 @@
     * Enhancements
     * Fixes
     * Changes
+        * Added an ``is_cv`` property to the data splitters :pr:`3297`
     * Documentation Changes
     * Testing Changes
 

diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py
@@ -2,7 +2,6 @@
 from collections import namedtuple
 
 import pandas as pd
-from sklearn.model_selection import KFold, StratifiedKFold
 
 from evalml.objectives import get_objective
 from evalml.pipelines import (
@@ -14,6 +13,8 @@
     TimeSeriesRegressionPipeline,
 )
 from evalml.preprocessing.data_splitters import (
+    KFold,
+    StratifiedKFold,
     TimeSeriesSplit,
     TrainingValidationSplit,
 )

diff --git a/evalml/preprocessing/data_splitters/__init__.py b/evalml/preprocessing/data_splitters/__init__.py
@@ -2,3 +2,4 @@
 from .no_split import NoSplit
 from .training_validation_split import TrainingValidationSplit
 from .time_series_split import TimeSeriesSplit
+from .sk_splitters import KFold, StratifiedKFold
diff --git a/evalml/preprocessing/data_splitters/no_split.py b/evalml/preprocessing/data_splitters/no_split.py
@@ -29,6 +29,15 @@ def get_n_splits():
         """
         return 0
 
+    @property
+    def is_cv(self):
+        """Returns whether or not the data splitter is a cross-validation data splitter.
+
+        Returns:
+            bool: Always False for this splitter, which reports zero splits.
+        """
+        return False
+
     def split(self, X, y=None):
         """Divide the data into training and testing sets, where the testing set is empty.
 

diff --git a/evalml/preprocessing/data_splitters/sk_splitters.py b/evalml/preprocessing/data_splitters/sk_splitters.py
@@ -0,0 +1,28 @@
+"""SKLearn data splitter wrapper classes."""
+from sklearn.model_selection import KFold, StratifiedKFold
+
+
+class KFold(KFold):
+    """Wrapper for sklearn's KFold that adds an ``is_cv`` property."""
+
+    @property
+    def is_cv(self):
+        """Returns whether or not the data splitter is a cross-validation data splitter.
+
+        Returns:
+            bool: Always True for this splitter.
+        """
+        return True
+
+
+class StratifiedKFold(StratifiedKFold):
+    """Wrapper for sklearn's StratifiedKFold that adds an ``is_cv`` property."""
+
+    @property
+    def is_cv(self):
+        """Returns whether or not the data splitter is a cross-validation data splitter.
+
+        Returns:
+            bool: Always True for this splitter.
+        """
+        return True
diff --git a/evalml/preprocessing/data_splitters/time_series_split.py b/evalml/preprocessing/data_splitters/time_series_split.py
@@ -79,6 +79,15 @@ def get_n_splits(self, X=None, y=None, groups=None):
     def _check_if_empty(data):
         return data is None or data.empty
 
+    @property
+    def is_cv(self):
+        """Returns whether or not the data splitter is a cross-validation data splitter.
+
+        Returns:
+            bool: True if ``self._splitter.n_splits`` is greater than 1, else False.
+        """
+        return self._splitter.n_splits > 1
+
     def split(self, X, y=None, groups=None):
         """Get the time series splits.
 

diff --git a/evalml/preprocessing/data_splitters/training_validation_split.py b/evalml/preprocessing/data_splitters/training_validation_split.py
@@ -72,6 +72,15 @@ def get_n_splits():
         """
         return 1
 
+    @property
+    def is_cv(self):
+        """Returns whether or not the data splitter is a cross-validation data splitter.
+
+        Returns:
+            bool: Always False, as this splitter makes only a single split.
+        """
+        return False
+
     def split(self, X, y=None):
         """Divide the data into training and testing sets.
 

diff --git a/evalml/tests/automl_tests/test_automl_utils.py b/evalml/tests/automl_tests/test_automl_utils.py
@@ -107,19 +107,22 @@ def test_make_data_splitter_default(problem_type, large_data):
         assert data_splitter.random_seed == 0
         assert data_splitter.shuffle
         assert data_splitter.test_size == _LARGE_DATA_PERCENT_VALIDATION
+        assert not data_splitter.is_cv
         return
 
     if problem_type == ProblemTypes.REGRESSION:
         assert isinstance(data_splitter, KFold)
         assert data_splitter.n_splits == 3
         assert data_splitter.shuffle
         assert data_splitter.random_state == 0
+        assert data_splitter.is_cv
 
     if problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
         assert isinstance(data_splitter, StratifiedKFold)
         assert data_splitter.n_splits == 3
         assert data_splitter.shuffle
         assert data_splitter.random_state == 0
+        assert data_splitter.is_cv
 
     if problem_type in [
         ProblemTypes.TIME_SERIES_REGRESSION,
@@ -132,6 +135,7 @@ def test_make_data_splitter_default(problem_type, large_data):
         assert data_splitter.max_delay == 7
         assert data_splitter.forecast_horizon == 4
         assert data_splitter.time_index == "foo"
+        assert data_splitter.is_cv
 
 
 @pytest.mark.parametrize(
@@ -155,6 +159,7 @@ def test_make_data_splitter_parameters(problem_type, expected_data_splitter):
     assert data_splitter.n_splits == 5
     assert data_splitter.shuffle
     assert data_splitter.random_state == random_seed
+    assert data_splitter.is_cv
 
 
 def test_make_data_splitter_parameters_time_series():

diff --git a/evalml/tests/preprocessing_tests/test_no_split.py b/evalml/tests/preprocessing_tests/test_no_split.py
@@ -6,6 +6,7 @@
 
 def test_nosplit_nsplits():
     assert NoSplit().get_n_splits() == 0
+    assert not NoSplit().is_cv
 
 
 def test_nosplit_default():

diff --git a/evalml/tests/preprocessing_tests/test_sk_splitters.py b/evalml/tests/preprocessing_tests/test_sk_splitters.py
@@ -0,0 +1,33 @@
+import numpy as np
+import pytest
+from sklearn.model_selection import KFold as sk_kfold
+from sklearn.model_selection import StratifiedKFold as sk_stratified
+
+from evalml.preprocessing.data_splitters import KFold, StratifiedKFold
+
+
+@pytest.mark.parametrize(
+    "sk_splitter,splitter", [[sk_kfold, KFold], [sk_stratified, StratifiedKFold]]
+)
+@pytest.mark.parametrize("problem_type", ["binary", "multiclass"])
+def test_splitters_equal(problem_type, sk_splitter, splitter, X_y_binary, X_y_multi):
+    parameters = {"shuffle": True, "random_state": 0, "n_splits": 4}
+    sk_split = splitter(**parameters)
+    evalml_split = splitter(**parameters)
+    if problem_type == "binary":
+        X, y = X_y_binary
+    else:
+        X, y = X_y_multi
+
+    skt, skv = [], []
+    evt, evv = [], []
+
+    for t, v in sk_split.split(X, y):
+        skt.append(t)
+        skv.append(v)
+    for t, v in evalml_split.split(X, y):
+        evt.append(t)
+        evv.append(v)
+    np.testing.assert_array_equal(skt, evt)
+    np.testing.assert_array_equal(skv, evv)
+    assert evalml_split.is_cv
diff --git a/evalml/tests/preprocessing_tests/test_training_validation_split.py b/evalml/tests/preprocessing_tests/test_training_validation_split.py
@@ -7,6 +7,7 @@
 
 def test_tvsplit_nsplits():
     assert TrainingValidationSplit().get_n_splits() == 1
+    assert not TrainingValidationSplit().is_cv
 
 
 def test_tvsplit_default():