diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 2d2e135845..f239753bdb 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -2,18 +2,19 @@ Release Notes
 -------------
 **Future Releases**
     * Enhancements
-        * Add CI testing environment in Mac for install workflow :pr:`3646`
         * Updated to run with Woodwork >= 0.17.2 :pr:`3626`
-        * Add ``exclude_featurizers`` parameter to ``AutoMLSearch`` to specify featurizers that should be excluded from all pipelines :pr:`3631`
-        * Add ``fit_transform`` method to pipelines and component graphs :pr:`3640`
+        * Added ``exclude_featurizers`` parameter to ``AutoMLSearch`` to specify featurizers that should be excluded from all pipelines :pr:`3631`
+        * Added ``fit_transform`` method to pipelines and component graphs :pr:`3640`
+        * Add CI testing environment in Mac for install workflow :pr:`3646`
     * Fixes
         * Reverted the Woodwork 0.17.x compatibility work due to performance regression :pr:`3664`
     * Changes
-        * Disable holdout set in AutoML search by default :pr:`3659`
+        * Disabled holdout set in AutoML search by default :pr:`3659`
         * Pinned ``sktime`` at >=0.7.0,<0.13.1 due to slowdowns with time series modeling :pr:`3658`
+        * Reduced the default test size in ``split_data`` to 0.1 for time series problems :pr:`3650`
     * Documentation Changes
         * Updated broken link checker to exclude stackoverflow domain :pr:`3633`
-        * Add instructions to add new users to evalml-core-feedstock :pr:`3636`
+        * Added instructions to add new users to evalml-core-feedstock :pr:`3636`
     * Testing Changes

 .. warning::
diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py
index dba31c5d49..a5022afa12 100644
--- a/evalml/preprocessing/utils.py
+++ b/evalml/preprocessing/utils.py
@@ -47,7 +47,7 @@ def split_data(
     y,
     problem_type,
     problem_configuration=None,
-    test_size=0.2,
+    test_size=None,
     random_seed=0,
 ):
     """Split data into train and test sets.
@@ -58,7 +58,8 @@ def split_data(
         problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
         problem_configuration (dict): Additional parameters needed to configure the search. For example, in time series problems, values should be passed in for the time_index, gap, and max_delay variables.
-        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
+        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%) for non-timeseries problems and 0.1
+            (10%) for timeseries problems.
         random_seed (int): Seed for the random number generator. Defaults to 0.

     Returns:
@@ -95,24 +96,35 @@ def split_data(

     data_splitter = None
     if is_time_series(problem_type):
+        if test_size is None:
+            test_size = 0.1
+            if (
+                problem_configuration is not None
+                and "forecast_horizon" in problem_configuration
+            ):
+                fh_pct = problem_configuration["forecast_horizon"] / len(X)
+                test_size = max(test_size, fh_pct)
         data_splitter = TrainingValidationSplit(
             test_size=test_size,
             shuffle=False,
             stratify=None,
             random_seed=random_seed,
         )
-    elif is_regression(problem_type):
-        data_splitter = ShuffleSplit(
-            n_splits=1,
-            test_size=test_size,
-            random_state=random_seed,
-        )
-    elif is_classification(problem_type):
-        data_splitter = StratifiedShuffleSplit(
-            n_splits=1,
-            test_size=test_size,
-            random_state=random_seed,
-        )
+    else:
+        if test_size is None:
+            test_size = 0.2
+        if is_regression(problem_type):
+            data_splitter = ShuffleSplit(
+                n_splits=1,
+                test_size=test_size,
+                random_state=random_seed,
+            )
+        elif is_classification(problem_type):
+            data_splitter = StratifiedShuffleSplit(
+                n_splits=1,
+                test_size=test_size,
+                random_state=random_seed,
+            )

     train, test = next(data_splitter.split(X, y))
diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py
index a802afa48d..b16c60a5eb 100644
--- a/evalml/tests/conftest.py
+++ b/evalml/tests/conftest.py
@@ -96,17 +96,42 @@ def _get_test_data_from_configuration(
     problem_type,
     column_names=None,
     nullable_target=False,
+    scale=2,
 ):
     X_all = pd.DataFrame(
         {
-            "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
-            * 2,
-            "int_null": [0, 1, 2, np.nan, 4, np.nan, 6] * 2,
-            "age_null": [0, 1, 2, np.nan, 4, np.nan, 6] * 2,
-            "bool_null": [True, None, False, True, False, None, True] * 2,
-            "numerical": range(14),
-            "categorical": ["a", "b", "a", "b", "b", "a", "b"] * 2,
-            "dates": pd.date_range("2000-02-03", periods=14, freq="W"),
+            "all_null": [
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+            ]
+            * scale,
+            "int_null": [0, 1, 2, np.nan, 4, np.nan, 6, 7, 8, 9] * scale,
+            "age_null": [0, 1, 2, np.nan, 4, np.nan, 6, 7, 8, 9] * scale,
+            "bool_null": [
+                True,
+                None,
+                False,
+                True,
+                False,
+                None,
+                True,
+                True,
+                False,
+                True,
+            ]
+            * scale,
+            "numerical": range(10 * scale),
+            "categorical": ["a", "b", "a", "b", "b", "a", "b", "a", "a", "b"]
+            * scale,
+            "dates": pd.date_range("2000-02-03", periods=10 * scale, freq="W"),
             "text": [
                 "this is a string",
                 "this is another string",
@@ -115,8 +140,11 @@
                 "cats are gr8",
                 "hello world",
                 "evalml is gr8",
+                "more strings",
+                "here we go",
+                "wheeeee!!!",
             ]
-            * 2,
+            * scale,
             "email": [
                 "abalone_0@gmail.com",
                 "AbaloneRings@yahoo.com",
@@ -125,8 +153,11 @@
                 "fooEMAIL@email.org",
                 "evalml@evalml.org",
                 "evalml@alteryx.org",
+                "woodwork@alteryx.org",
+                "featuretools@alteryx.org",
+                "compose@alteryx.org",
             ]
-            * 2,
+            * scale,
             "url": [
                 "https://evalml.alteryx.com/en/stable/",
                 "https://woodwork.alteryx.com/en/stable/guides/statistical_insights.html",
@@ -135,8 +166,11 @@
                 "https://www.evalml.alteryx.com/en/stable/demos/text_input.html",
                 "https://github.com/alteryx/evalml",
                 "https://github.com/alteryx/featuretools",
+                "https://github.com/alteryx/woodwork",
+                "https://github.com/alteryx/compose",
+                "https://woodwork.alteryx.com/en/stable/",
             ]
-            * 2,
+            * scale,
             "ip": [
                 "0.0.0.0",
                 "1.1.1.101",
@@ -145,21 +179,28 @@
                 "101.1.1.1",
                 "192.168.1.1",
                 "255.255.255.255",
+                "2.1.1.101",
"2.1.101.1", + "2.101.1.1", ] - * 2, + * scale, }, ) - y = pd.Series([0, 0, 1, 0, 0, 1, 1] * 2) + y = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 0] * scale) if problem_type == ProblemTypes.MULTICLASS: - y = pd.Series([0, 2, 1, 2, 0, 2, 1] * 2) + y = pd.Series([0, 2, 1, 2, 0, 2, 1, 2, 1, 0] * scale) elif is_regression(problem_type): - y = pd.Series([1, 2, 3, 3, 3, 4, 5] * 2) + y = pd.Series([1, 2, 3, 3, 3, 4, 5, 5, 6, 6] * scale) if nullable_target: y.iloc[2] = None if input_type == "ww": y = ww.init_series(y, logical_type="integer_nullable") X = X_all[column_names] + if input_type == "np": + X = X.to_numpy() + y = y.to_numpy() + if input_type == "ww": logical_types = {} if "text" in column_names: diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py index 98d7caceae..8c34810a34 100644 --- a/evalml/tests/preprocessing_tests/test_split_data.py +++ b/evalml/tests/preprocessing_tests/test_split_data.py @@ -59,3 +59,72 @@ def test_split_data( y = pd.Series(y) pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False) pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False) + + +@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) +@pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) +def test_split_data_defaults(problem_type, data_type, get_test_data_from_configuration): + X, y = get_test_data_from_configuration( + data_type, + problem_type, + column_names=["numerical"], + scale=10, + ) + + problem_configuration = None + if is_time_series(problem_type): + problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "ts_data"} + test_pct = 0.1 + else: + test_pct = 0.2 + X_train, X_test, y_train, y_test = split_data( + X, + y, + problem_type=problem_type, + problem_configuration=problem_configuration, + ) + test_size = len(X) * test_pct + train_size = len(X) - test_size + assert len(X_train) == train_size + assert len(X_test) == test_size + assert len(y_train) == train_size + assert len(y_test) == test_size + + if is_time_series(problem_type): + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) + y = pd.Series(y) + pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False) + pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False) + + +@pytest.mark.parametrize("test", ["fh_limitation", "no_fh_limitation"]) +def test_split_data_ts(test, X_y_regression): + X, y = X_y_regression + + if test == "no_fh_limitation": + test_pct = 0.1 + fh = 5 + test_size = len(X) * test_pct + train_size = len(X) - test_size + elif test == "fh_limitation": + fh = 25 + test_size = fh + train_size = len(X) - fh + + problem_configuration = { + "gap": 1, + "max_delay": 7, + "forecast_horizon": fh, + "time_index": "ts_data", + } + X_train, X_test, y_train, y_test = split_data( + X, + y, + problem_type="time series regression", + problem_configuration=problem_configuration, + ) + assert len(X_train) == train_size + assert len(X_test) == test_size + assert len(y_train) == train_size + assert len(y_test) == test_size