Gap separation #3208

Merged
merged 56 commits into main from Gap_Separation on Jan 18, 2022
Changes from 40 commits
Commits
56 commits
51ef066
Initial commit
ParthivNaresh Dec 20, 2021
b2f2ca4
Merge branch 'main' into Gap_Separated_Training_Test
ParthivNaresh Dec 20, 2021
58d4dcd
release notes
ParthivNaresh Dec 20, 2021
ad6eb04
Update tests and pin min woodwork to 0.10.0
ParthivNaresh Dec 20, 2021
21a9e4d
update weather demo to fill in missing values
ParthivNaresh Dec 20, 2021
56bb4d4
update demo test
ParthivNaresh Dec 21, 2021
052fc8b
add nltk==3.6.5
ParthivNaresh Dec 21, 2021
e9f4bd9
nltk==3.6.5
ParthivNaresh Dec 21, 2021
b9ea305
plotly and nltk
ParthivNaresh Dec 21, 2021
9b98f74
no message
ParthivNaresh Dec 21, 2021
528da11
no message
ParthivNaresh Dec 21, 2021
d4d98c8
add ValueError for _are_datasets_separated_by_gap_time_index if train…
ParthivNaresh Dec 21, 2021
95e0bc5
no message
ParthivNaresh Dec 22, 2021
6e8d3e3
no message
ParthivNaresh Dec 22, 2021
95e08ea
Merge branch 'main' into Gap_Separated_Training_Test
ParthivNaresh Dec 22, 2021
b9fdd28
no message
ParthivNaresh Dec 22, 2021
7efde41
no message
ParthivNaresh Dec 22, 2021
599ce89
no message
ParthivNaresh Dec 22, 2021
81a8fc9
no message
ParthivNaresh Dec 22, 2021
8e42bf3
Trigger Build
ParthivNaresh Dec 22, 2021
b25d550
Merge branch 'main' into Gap_Separated_Training_Test
ParthivNaresh Jan 4, 2022
b105535
enhance tests
ParthivNaresh Jan 4, 2022
358bd2a
lint fix
ParthivNaresh Jan 4, 2022
8ccd12f
Merge branch 'main' into Gap_Separated_Training_Test
ParthivNaresh Jan 5, 2022
853fcf2
Merge branch 'main' into Gap_Separation
ParthivNaresh Jan 10, 2022
099bf32
Merge branch 'main' into Gap_Separation
ParthivNaresh Jan 11, 2022
035d9a6
no message
ParthivNaresh Jan 11, 2022
fc4c67a
Merge branch 'main' into Gap_Separation
ParthivNaresh Jan 11, 2022
98bcb4e
Merge branch 'Gap_Separation' of https://github.com/alteryx/evalml in…
ParthivNaresh Jan 11, 2022
d6932aa
Merge branch 'main' into Gap_Separation
ParthivNaresh Jan 12, 2022
e4ce8c9
Merge branch 'main' into Gap_Separation
ParthivNaresh Jan 12, 2022
53546f2
circular import
ParthivNaresh Jan 12, 2022
6efc6db
lint
ParthivNaresh Jan 12, 2022
0ad9ed6
circular
ParthivNaresh Jan 12, 2022
2765e42
no message
ParthivNaresh Jan 13, 2022
6628bfc
no message
ParthivNaresh Jan 13, 2022
49f1e29
fix test
ParthivNaresh Jan 13, 2022
5d57abd
no message
ParthivNaresh Jan 13, 2022
361624b
Merge branch 'main' into Gap_Separation
ParthivNaresh Jan 13, 2022
6e13b71
release notes
ParthivNaresh Jan 13, 2022
ec682b0
Merge branch 'main' into Gap_Separation
ParthivNaresh Jan 14, 2022
1fd8a1d
update to not use partialdependence code
ParthivNaresh Jan 14, 2022
352e6ae
Merge branch 'Gap_Separation' of https://github.com/alteryx/evalml in…
ParthivNaresh Jan 14, 2022
69e4ca5
lint
ParthivNaresh Jan 14, 2022
7c6204c
no message
ParthivNaresh Jan 14, 2022
e985fae
no message
ParthivNaresh Jan 14, 2022
e04d628
move from pipeline utils to gen utils
ParthivNaresh Jan 14, 2022
3f3375b
no message
ParthivNaresh Jan 14, 2022
1db24ed
no message
ParthivNaresh Jan 14, 2022
3bb4b90
Merge branch 'main' into Gap_Separation
ParthivNaresh Jan 14, 2022
5b7974e
no message
ParthivNaresh Jan 14, 2022
7412f48
Merge branch 'main' into Gap_Separation
ParthivNaresh Jan 17, 2022
af1b7b9
Merge branch 'main' into Gap_Separation
ParthivNaresh Jan 18, 2022
adf67bf
test change
ParthivNaresh Jan 18, 2022
2f6c4d2
test fix
ParthivNaresh Jan 18, 2022
b176569
no message
ParthivNaresh Jan 18, 2022
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Required the separation of training and test data by ``gap`` + 1 units to be verified by ``time_index`` for time series problems :pr:`3208`
* Added support for boolean features for ``ARIMARegressor`` :pr:`3187`
* Updated dependency bot workflow to remove outdated description and add new configuration to delete branches automatically :pr:`3212`
* Fixes
@@ -12,6 +13,7 @@ Release Notes
* Changes
* Changed the default objective to ``MedianAE`` from ``R2`` for time series regression :pr:`3205`
* Removed all-nan Unknown to Double logical conversion in ``infer_feature_types`` :pr:`3196`
* Checking the validity of holdout data for time series problems can be performed by calling ``pipelines.utils.validate_holdout_datasets`` prior to calling ``predict`` :pr:`3208`
* Documentation Changes
* Testing Changes

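As a quick illustration of the ``pipelines.utils.validate_holdout_datasets`` entry above, a minimal sketch of calling it before ``predict`` might look like this. The parameter values and data frames are illustrative assumptions, and the raising behavior reflects this diff rather than any later revision:

```python
import pandas as pd

from evalml.pipelines.utils import validate_holdout_datasets

# Illustrative time series parameters; a real pipeline supplies its own.
params = {"gap": 1, "max_delay": 1, "forecast_horizon": 2, "time_index": "date"}

X_train = pd.DataFrame({"date": pd.date_range("2022-01-01", periods=10, freq="D")})
# With gap=1, the holdout data must start gap + 1 = 2 days after the last
# training date (2022-01-10) and contain at most forecast_horizon rows.
X_holdout = pd.DataFrame({"date": pd.date_range("2022-01-12", periods=2, freq="D")})

# As written in this diff, an invalid holdout set raises a PartialDependenceError
# with code INVALID_HOLDOUT_SET; a valid one passes silently.
validate_holdout_datasets(X_holdout, X_train, params)
```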
17 changes: 17 additions & 0 deletions evalml/demos/weather.py
@@ -1,6 +1,9 @@
"""The Australian daily-min-termperatures weather dataset."""
import pandas as pd

import evalml
from evalml.preprocessing import load_data
from evalml.utils import infer_feature_types


def load_weather():
@@ -15,4 +18,18 @@ def load_weather():
+ evalml.__version__
)
X, y = load_data(filename, index=None, target="Temp")

missing_date_1 = pd.DataFrame([pd.to_datetime("1984-12-31")], columns=["Date"])
missing_date_2 = pd.DataFrame([pd.to_datetime("1988-12-31")], columns=["Date"])
missing_y_1 = pd.Series([14.5], name="Temp")
missing_y_2 = pd.Series([14.5], name="Temp")

X = pd.concat([X.iloc[:1460], missing_date_1, X.iloc[1460:]]).reset_index(drop=True)
X = pd.concat([X.iloc[:2921], missing_date_2, X.iloc[2921:]]).reset_index(drop=True)
y = pd.concat([y.iloc[:1460], missing_y_1, y.iloc[1460:]]).reset_index(drop=True)
y = pd.concat([y.iloc[:2921], missing_y_2, y.iloc[2921:]]).reset_index(drop=True)

X = infer_feature_types(X)
y = infer_feature_types(y)

return X, y
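The filled-in dates above make the demo's daily frequency inferable. A small sanity-check sketch, assuming ``load_weather`` is exported from ``evalml.demos`` and the dataset downloads successfully:

```python
import pandas as pd

from evalml.demos import load_weather

X, y = load_weather()
# With the two missing dates (1984-12-31 and 1988-12-31) filled in, the Date
# column should form an unbroken daily series that pandas can infer.
print(pd.infer_freq(pd.DatetimeIndex(X["Date"])))  # expected: "D"
```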
2 changes: 2 additions & 0 deletions evalml/exceptions/exceptions.py
@@ -124,6 +124,8 @@ class PartialDependenceErrorCode(Enum):
"""ice_plot_requested_for_two_way_partial_dependence_plot"""
INVALID_CLASS_LABEL = "invalid_class_label_requested_for_plot"
"""invalid_class_label_requested_for_plot"""
INVALID_HOLDOUT_SET = "invalid_holdout_set"
"""invalid_holdout_set"""
ALL_OTHER_ERRORS = "all_other_errors"
"""all_other_errors"""

1 change: 0 additions & 1 deletion evalml/pipelines/time_series_classification_pipelines.py
@@ -119,7 +119,6 @@ def predict_proba(self, X, X_train=None, y_train=None):
X.index = self._move_index_forward(
X_train.index[-X.shape[0] :], self.gap + X.shape[0]
)
self._validate_holdout_datasets(X, X_train)
y_holdout = self._create_empty_series(y_train, X.shape[0])
y_holdout = infer_feature_types(y_holdout)
y_holdout.index = X.index
56 changes: 10 additions & 46 deletions evalml/pipelines/time_series_pipeline_base.py
@@ -36,11 +36,11 @@ def __init__(
"time_index, gap, max_delay, and forecast_horizon parameters cannot be omitted from the parameters dict. "
"Please specify them as a dictionary with the key 'pipeline'."
)
pipeline_params = parameters["pipeline"]
self.gap = pipeline_params["gap"]
self.max_delay = pipeline_params["max_delay"]
self.forecast_horizon = pipeline_params["forecast_horizon"]
self.time_index = pipeline_params["time_index"]
self.pipeline_params = parameters["pipeline"]
self.gap = self.pipeline_params["gap"]
self.max_delay = self.pipeline_params["max_delay"]
self.forecast_horizon = self.pipeline_params["forecast_horizon"]
self.time_index = self.pipeline_params["time_index"]
if self.time_index is None:
raise ValueError("Parameter time_index cannot be None!")
super().__init__(
@@ -66,55 +66,20 @@ def _move_index_forward(index, gap):
else:
return index + gap

@staticmethod
def _are_datasets_separated_by_gap(train_index, test_index, gap):
"""Determine if the train and test datasets are separated by gap number of units.

This will be true when users are predicting on unseen data but not during cross
validation since the target is known.
"""
gap_difference = gap + 1
index_difference = test_index[0] - train_index[-1]
if isinstance(
train_index, (pd.DatetimeIndex, pd.PeriodIndex, pd.TimedeltaIndex)
):
gap_difference *= test_index.freq
return index_difference == gap_difference

def _validate_holdout_datasets(self, X, X_train):
"""Validate the holdout datasets match out expectations.

Args:
X (pd.DataFrame): Data of shape [n_samples, n_features].
X_train (pd.DataFrame): Training data.

Raises:
ValueError: If holdout data does not have forecast_horizon entries or if datasets
are not separated by gap.
"""
right_length = len(X) <= self.forecast_horizon
X_separated_by_gap = self._are_datasets_separated_by_gap(
X_train.index, X.index, self.gap
)
if not (right_length and X_separated_by_gap):
raise ValueError(
f"Holdout data X must have {self.forecast_horizon} rows (value of forecast horizon) "
"and its index needs to "
f"start {self.gap + 1} values ahead of the training index. "
f"Data received - Length X: {len(X)}, "
f"X index start: {X.index[0]}, X_train index end {X_train.index[-1]}."
)

def _add_training_data_to_X_Y(self, X, y, X_train, y_train):
"""Append the training data to the holdout data.

Need to do this so that we have all the data we need to compute lagged features on the holdout set.
"""
from evalml.pipelines.utils import (
Contributor:

Can we move the import to top of file?

Contributor Author:

We end up running into a circular dependency issue unfortunately

Contributor:

Gotcha. Let's move it to gen_utils then? That's where are_ts_parameters_valid_for_split lives, so I think it's sensible to include it there.

are_datasets_separated_by_gap_time_index,
)

last_row_of_training = self.forecast_horizon + self.max_delay + self.gap
gap_features = pd.DataFrame()
gap_target = pd.Series()
if (
self._are_datasets_separated_by_gap(X_train.index, X.index, self.gap)
are_datasets_separated_by_gap_time_index(X_train, X, self.pipeline_params)
and self.gap
):
# The training data does not have the gap dates so don't need to include them
@@ -235,7 +200,6 @@ def predict(self, X, objective=None, X_train=None, y_train=None):
X.index = self._move_index_forward(
X_train.index[-X.shape[0] :], self.gap + X.shape[0]
)
self._validate_holdout_datasets(X, X_train)
Contributor Author:

Based on the third point of this: Rather than raising a ValueError in predict, let's refactor that logic into a helper function that could be used before calling predict. predict should no longer raise exceptions if the data violates our constraints.

y_holdout = self._create_empty_series(y_train, X.shape[0])
y_holdout = infer_feature_types(y_holdout)
y_holdout.index = X.index
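The comment above asks for the validation to move into a helper that runs before ``predict``; the rule it enforces is the same ``gap`` + 1 separation. A small, self-contained illustration of that arithmetic in plain pandas (no evalml required):

```python
import pandas as pd

# With a daily frequency and gap=2, the first holdout timestamp must fall
# gap + 1 = 3 days after the last training timestamp.
train_dates = pd.date_range("2022-01-01", periods=10, freq="D")
gap = 2
expected_holdout_start = train_dates[-1] + (gap + 1) * train_dates.freq
print(expected_holdout_start)  # 2022-01-13 00:00:00
```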
70 changes: 70 additions & 0 deletions evalml/pipelines/utils.py
@@ -4,6 +4,7 @@

from woodwork import logical_types

from ..exceptions import PartialDependenceError, PartialDependenceErrorCode
from . import (
TimeSeriesBinaryClassificationPipeline,
TimeSeriesMulticlassClassificationPipeline,
@@ -815,6 +816,75 @@ def make_timeseries_baseline_pipeline(problem_type, gap, forecast_horizon, time_
return baseline


def are_datasets_separated_by_gap_time_index(train, test, pipeline_params):
"""Determine if the train and test datasets are separated by gap number of units using the time_index.

This will be true when users are predicting on unseen data but not during cross
validation since the target is known.

Args:
train (pd.DataFrame): Training data.
test (pd.DataFrame): Data of shape [n_samples, n_features].
pipeline_params (dict): Dictionary of time series parameters.

Returns:
bool: True if the difference in time units is equal to gap + 1.

"""
gap_difference = pipeline_params["gap"] + 1

train_copy = train.copy()
test_copy = test.copy()
train_copy.ww.init(time_index=pipeline_params["time_index"])
test_copy.ww.init(time_index=pipeline_params["time_index"])

X_frequency_dict = train_copy.ww.infer_temporal_frequencies(
temporal_columns=[train_copy.ww.time_index]
)
freq = X_frequency_dict[test_copy.ww.time_index]
if freq is None:
return True
Contributor Author:

Based on the third point of this: If the training data does not have an inferable frequency, let's assume the datasets are correctly separated by the gap for now.


first_testing_date = test_copy[test_copy.ww.time_index].iloc[0]
last_training_date = train_copy[train_copy.ww.time_index].iloc[-1]
dt_difference = first_testing_date - last_training_date

try:
units_difference = dt_difference / freq
except ValueError:
units_difference = dt_difference / ("1" + freq)
return units_difference == gap_difference


def validate_holdout_datasets(X, X_train, pipeline_params):
"""Validate the holdout datasets match out expectations.

Args:
X (pd.DataFrame): Data of shape [n_samples, n_features].
X_train (pd.DataFrame): Training data.
pipeline_params (dict): Dictionary of time series parameters.

Raises:
PartialDependenceError: If holdout data does not have forecast_horizon entries or if datasets are not separated by gap.
"""
forecast_horizon = pipeline_params["forecast_horizon"]
gap = pipeline_params["gap"]
time_index = pipeline_params["time_index"]
right_length = len(X) <= forecast_horizon
X_separated_by_gap = are_datasets_separated_by_gap_time_index(
X_train, X, pipeline_params
)
if not (right_length and X_separated_by_gap):
raise PartialDependenceError(
Contributor Author:

Based on the second point of this: This helper function will return "something" if the test data violates our constraints. Users can then use this "something" to display warning messages prior to calling predict.

Contributor:

Maybe it'll be better if, rather than raising an exception, we return a tuple of bool and List[ValidationErrorCode]?

  • If the dataset is valid, return True, []
  • If the dataset does not have the right length but is separated by gap, return False, [NotRightLength]
  • If the dataset has the right length but is not separated by gap, return False, [NotSeparatedByGap]
  • If the dataset is not the right length and not separated by gap, return False, [NotRightLength, NotSeparatedByGap]

If we do it this way, it might be easier to communicate which of the two criteria was not met.
What do you think? FYI @fjlanasa

Contributor:

I think that makes sense.

f"Holdout data X must have {forecast_horizon} rows (value of forecast horizon) "
f"and the first value indicated by the column {time_index} needs to "
f"start {gap + 1} units ahead of the training data. "
f"Data received - Length X: {len(X)}, "
f"X value start: {X[time_index].iloc[0]}, X_train value end {X_train[time_index].iloc[-1]}.",
PartialDependenceErrorCode.INVALID_HOLDOUT_SET,
)


def rows_of_interest(
pipeline, X, y=None, threshold=None, epsilon=0.1, sort_values=True, types="all"
):
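For reference, a rough sketch of the reviewer's tuple-return suggestion from the thread above. The enum values, function name, and signature are hypothetical; they only illustrate the proposal, not the merged implementation:

```python
from enum import Enum
from typing import List, Tuple

import pandas as pd


class ValidationErrorCode(Enum):
    """Hypothetical codes mirroring the two criteria discussed above."""

    NOT_RIGHT_LENGTH = "not_right_length"
    NOT_SEPARATED_BY_GAP = "not_separated_by_gap"


def validate_holdout_datasets_sketch(
    X: pd.DataFrame, X_train: pd.DataFrame, pipeline_params: dict
) -> Tuple[bool, List[ValidationErrorCode]]:
    """Report which criteria failed instead of raising an exception."""
    # Imported lazily only to keep this standalone sketch self-contained;
    # inside evalml/pipelines/utils.py the helper is defined just above.
    from evalml.pipelines.utils import are_datasets_separated_by_gap_time_index

    errors = []
    if len(X) > pipeline_params["forecast_horizon"]:
        errors.append(ValidationErrorCode.NOT_RIGHT_LENGTH)
    if not are_datasets_separated_by_gap_time_index(X_train, X, pipeline_params):
        errors.append(ValidationErrorCode.NOT_SEPARATED_BY_GAP)
    return len(errors) == 0, errors
```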
6 changes: 3 additions & 3 deletions evalml/tests/automl_tests/test_engine_base.py
@@ -135,13 +135,13 @@ def test_train_pipeline_trains_and_tunes_threshold(

def test_train_pipeline_trains_and_tunes_threshold_ts(
ts_data,
dummy_ts_binary_linear_classifier_pipeline_class,
dummy_ts_binary_tree_classifier_pipeline_class,
):
X = pd.DataFrame([i for i in range(32)])
X = pd.DataFrame(pd.date_range("1/1/21", periods=32), columns=["date"])
y = pd.Series([0, 1, 0, 1] * 8)

params = {"gap": 1, "max_delay": 1, "forecast_horizon": 1, "time_index": "date"}
ts_binary = dummy_ts_binary_linear_classifier_pipeline_class(
ts_binary = dummy_ts_binary_tree_classifier_pipeline_class(
parameters={"pipeline": params}
)
assert ts_binary.threshold is None
6 changes: 3 additions & 3 deletions evalml/tests/conftest.py
@@ -826,11 +826,11 @@ def __init__(


@pytest.fixture
def dummy_ts_binary_linear_classifier_pipeline_class():
log_reg_classifier = LogisticRegressionClassifier
def dummy_ts_binary_tree_classifier_pipeline_class():
dec_tree_classifier = DecisionTreeClassifier

class MockBinaryClassificationPipeline(TimeSeriesBinaryClassificationPipeline):
estimator = log_reg_classifier
estimator = dec_tree_classifier
component_graph = [estimator]

def __init__(
26 changes: 26 additions & 0 deletions evalml/tests/demo_tests/test_datasets.py
@@ -86,5 +86,31 @@ def test_datasets(dataset_name, expected_shape, local_datasets):
def test_datasets_match_local(dataset_name, demo_method, local_datasets):
X, y = demo_method
X_local, y_local = local_datasets[dataset_name]

if dataset_name == "daily_temp":
missing_date_1 = pd.DataFrame([pd.to_datetime("1984-12-31")], columns=["Date"])
missing_date_2 = pd.DataFrame([pd.to_datetime("1988-12-31")], columns=["Date"])
missing_y_1 = pd.Series([14.5], name="Temp")
missing_y_2 = pd.Series([14.5], name="Temp")

X_local = pd.concat(
[
X_local.iloc[:1460],
missing_date_1,
X_local.iloc[1460:2920],
missing_date_2,
X_local.iloc[2920:],
]
).reset_index(drop=True)
y_local = pd.concat(
[
y_local.iloc[:1460],
missing_y_1,
y_local.iloc[1460:2920],
missing_y_2,
y_local.iloc[2920:],
]
).reset_index(drop=True)

pd.testing.assert_frame_equal(X, X_local)
pd.testing.assert_series_equal(y, y_local)