alteryx · christopherbunn · Oct 9, 2019 · Oct 1, 2019 · Oct 2, 2019 · Oct 7, 2019
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -15,10 +15,11 @@
 import os
 import sys
 
+import evalml
+
 path = os.path.join('..', '..')
 sys.path.insert(0, os.path.abspath(path))
 
-import evalml
 
 # -- Project information -----------------------------------------------------
 

diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py
@@ -1,6 +1,6 @@
 import pandas as pd
 from dask import dataframe as dd
-from sklearn.model_selection import StratifiedShuffleSplit
+from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
 
 
 def load_data(path, index, label, drop=None, verbose=True, **kwargs):
@@ -43,24 +43,29 @@ def load_data(path, index, label, drop=None, verbose=True, **kwargs):
     return X, y
 
 
-def split_data(X, y, test_size=.2, random_state=None):
+def split_data(X, y, regression=False, test_size=.2, random_state=None):
     """Splits data into train and test sets.
 
     Args:
         X (DataFrame) : features
         y (Series) : labels
+        regression (bool) : if True, use a plain shuffle split instead of a stratified one
         test_size (float) : percent of train set to holdout for testing
         random_state (int) : seed for the random number generator
 
     Returns:
         DataFrame, DataFrame, Series, Series : features and labels each split into train and test sets
     """
-    stratified = StratifiedShuffleSplit(
-        n_splits=1,
-        test_size=test_size,
-        random_state=random_state,
-    )
-    train, test = next(stratified.split(X, y))
+    # Stratifying on a continuous regression target is not meaningful, so
+    # regression problems fall back to a plain (non-stratified) shuffle split.
+    splitter_class = ShuffleSplit if regression else StratifiedShuffleSplit
+    splitter = splitter_class(
+        n_splits=1,
+        test_size=test_size,
+        # Honor the caller's seed on both paths (was hard-coded to 0 before).
+        random_state=random_state,
+    )
+    train, test = next(splitter.split(X, y))
     X_train = X.iloc[train]
     X_test = X.iloc[test]
     y_train = y.iloc[train]

diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py
@@ -13,6 +13,13 @@ def X_y():
     return X, y
 
 
+@pytest.fixture
+def X_y_reg():
+    """Return a seeded synthetic regression dataset as an (X, y) tuple."""
+    return datasets.make_regression(n_samples=100, n_features=20,
+                                    n_informative=3, random_state=0)
+
+
 @pytest.fixture
 def X_y_multi():
     X, y = datasets.make_classification(n_samples=100, n_features=20, n_classes=3,

diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py
@@ -0,0 +1,31 @@
+import pandas as pd
+
+from evalml.preprocessing import split_data
+
+
+def test_split_regression(X_y_reg):
+    """Non-stratified split holds out exactly `test_size` of the rows."""
+    features, target = X_y_reg
+    features, target = pd.DataFrame(features), pd.Series(target)
+    holdout = 0.25
+    X_train, X_test, y_train, y_test = split_data(
+        features, target, test_size=holdout, regression=True)
+    n_test = len(features) * holdout
+    n_train = len(features) - n_test
+    for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
+        assert len(train_part) == n_train
+        assert len(test_part) == n_test
+
+
+def test_split_classification(X_y):
+    """Default (stratified) split holds out exactly `test_size` of the rows."""
+    X, y = X_y
+    X = pd.DataFrame(X)
+    y = pd.Series(y)
+    test_pct = 0.25
+    X_train, X_test, y_train, y_test = split_data(X, y, test_size=test_pct)
+    # Derive the expected size from test_pct so the two cannot drift apart.
+    test_size = len(X) * test_pct
+    train_size = len(X) - test_size
+    assert len(X_train) == len(y_train) == train_size
+    assert len(X_test) == len(y_test) == test_size