Fix resetting y_holdout index for time series (#4161)

* Add fix for when y_index is reset
alteryx · May 1, 2023 · 4aaf037 · 4aaf037
1 parent 2eee8cb
commit 4aaf037
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 0 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -3,6 +3,7 @@ Release Notes
 **Future Releases**
     * Enhancements
     * Fixes
+        * Fixed bug where resetting the holdout data indices would cause time series ``predict_in_sample`` to be wrong :pr:`4161`
     * Changes
         * Changed per-pipeline timings to store as a float :pr:`4160`
         * Update Dask install commands in ``pyproject.toml`` :pr:`4164`

diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py
@@ -251,6 +251,7 @@ def predict_in_sample(
             )
         X, y = self._drop_time_index(X, y)
         X_train, y_train = self._drop_time_index(X_train, y_train)
+        X, y = self._ensure_correct_indices(X, y, X_train)
         target = infer_feature_types(y)
         features = self.transform_all_but_final(
             X,
@@ -266,6 +267,18 @@ def predict_in_sample(
         predictions = predictions.rename(self.input_target_name)
         return infer_feature_types(predictions)
 
+    def _ensure_correct_indices(self, X, y, X_train):
+        """Ensures that X and y holdout's indices are the correct integer or time units w.r.t the training data.
+
+        For predict in sample where the holdout is known to follow the training data.
+        """
+        if X_train.index.is_numeric():
+            starting_index = X_train.index[-1] + 1 + self.gap
+            correct_index = range(starting_index, starting_index + len(y))
+            X.index = correct_index
+            y.index = correct_index
+        return X, y
+
     def _create_empty_series(self, y_train, size):
         return ww.init_series(
             pd.Series([y_train.iloc[0]] * size),

diff --git a/evalml/tests/pipeline_tests/test_time_series_pipeline.py b/evalml/tests/pipeline_tests/test_time_series_pipeline.py
@@ -668,6 +668,69 @@ def test_predict_and_predict_in_sample_with_time_index(
     assert (preds_in_sample.index == target.iloc[20:].index).all()
 
 
+def test_predict_in_sample_index_error(ts_data):
+    X_train, X_holdout, y_train = ts_data(
+        problem_type="time series regression",
+        train_target_index_dt=False,
+        train_features_index_dt=False,
+        no_features=True,
+    )
+    y_holdout = y_train[-len(X_holdout) :]
+    problem_configuration = {
+        "max_delay": 2,
+        "gap": 0,
+        "forecast_horizon": 5,
+        "time_index": "date",
+    }
+
+    pipeline = TimeSeriesRegressionPipeline(
+        component_graph={
+            "Imputer": ["Imputer", "X", "y"],
+            "Time Series Featurizer": ["Time Series Featurizer", "Imputer.x", "y"],
+            "STL Decomposer": ["STL Decomposer", "Time Series Featurizer.x", "y"],
+            "DateTime Featurizer": [
+                "DateTime Featurizer",
+                "Time Series Featurizer.x",
+                "STL Decomposer.y",
+            ],
+            "Drop NaN Rows Transformer": [
+                "Drop NaN Rows Transformer",
+                "DateTime Featurizer.x",
+                "STL Decomposer.y",
+            ],
+            "Extra Trees Regressor": [
+                "Extra Trees Regressor",
+                "Drop NaN Rows Transformer.x",
+                "Drop NaN Rows Transformer.y",
+            ],
+        },
+        parameters={
+            "pipeline": problem_configuration,
+            "Time Series Featurizer": problem_configuration,
+        },
+    )
+    pipeline.fit(X_train, y_train)
+
+    preds_in_sample = pipeline.predict_in_sample(
+        X_holdout,
+        y_holdout,
+        X_train=X_train,
+        y_train=y_train,
+    )
+
+    y_reset_holdout = y_holdout.reset_index(drop=True)
+    X_reset_holdout = X_holdout.reset_index(drop=True)
+    X_reset_holdout.ww.init(schema=X_holdout.ww.schema)
+    preds_in_sample_reset = pipeline.predict_in_sample(
+        X_reset_holdout,
+        y_reset_holdout,
+        X_train=X_train,
+        y_train=y_train,
+    )
+
+    assert_series_equal(preds_in_sample, preds_in_sample_reset)
+
+
 @pytest.mark.parametrize("only_use_y", [False])
 @pytest.mark.parametrize("include_delayed_features", [True, False])
 @pytest.mark.parametrize(