Skip to content

Commit

Permalink
Fix resetting y_holdout index for time series (#4161)
Browse files Browse the repository at this point in the history
* Add fix for when y_index is reset
  • Loading branch information
eccabay committed May 1, 2023
1 parent 2eee8cb commit 4aaf037
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Release Notes
**Future Releases**
* Enhancements
* Fixes
* Fixed bug where resetting the holdout data indices would cause time series ``predict_in_sample`` to return incorrect predictions :pr:`4161`
* Changes
* Changed per-pipeline timings to store as a float :pr:`4160`
* Update Dask install commands in ``pyproject.toml`` :pr:`4164`
Expand Down
13 changes: 13 additions & 0 deletions evalml/pipelines/time_series_pipeline_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ def predict_in_sample(
)
X, y = self._drop_time_index(X, y)
X_train, y_train = self._drop_time_index(X_train, y_train)
X, y = self._ensure_correct_indices(X, y, X_train)
target = infer_feature_types(y)
features = self.transform_all_but_final(
X,
Expand All @@ -266,6 +267,18 @@ def predict_in_sample(
predictions = predictions.rename(self.input_target_name)
return infer_feature_types(predictions)

def _ensure_correct_indices(self, X, y, X_train):
    """Realign the holdout index so it directly continues the training index.

    Called from ``predict_in_sample``, where the holdout is known to follow
    the training data. When the training index is numeric, the holdout is
    relabelled to start at ``X_train.index[-1] + 1 + self.gap`` and advance
    in steps of one, regardless of the labels it arrived with (for example
    after ``reset_index(drop=True)``). Any other kind of training index
    leaves the holdout untouched.

    Args:
        X (pd.DataFrame): Holdout features. Its index is overwritten in place
            when the training index is numeric.
        y (pd.Series): Holdout target. Its length determines how many labels
            are generated, and its index is overwritten in place as well.
        X_train (pd.DataFrame): Training features; only its index is read.

    Returns:
        tuple: The same ``X`` and ``y`` objects that were passed in.
    """
    # ``Index.is_numeric()`` is deprecated as of pandas 2.0. It was defined as
    # exactly this ``inferred_type`` membership test, so the set of indices
    # that get realigned is unchanged.
    index_kind = X_train.index.inferred_type
    if index_kind not in ("integer", "floating"):
        return X, y

    first_label = X_train.index[-1] + 1 + self.gap
    n_rows = len(y)
    if index_kind == "integer":
        correct_index = range(first_label, first_label + n_rows)
    else:
        # ``range`` rejects float bounds, so build the unit-step labels
        # explicitly and keep the floating dtype of the training index.
        correct_index = pd.Index(
            [first_label + offset for offset in range(n_rows)],
            dtype="float64",
        )

    # Deliberately assigned in place rather than on copies: copying would
    # hand back new objects to the caller instead of the ones it passed in.
    X.index = correct_index
    y.index = correct_index
    return X, y

def _create_empty_series(self, y_train, size):
return ww.init_series(
pd.Series([y_train.iloc[0]] * size),
Expand Down
63 changes: 63 additions & 0 deletions evalml/tests/pipeline_tests/test_time_series_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,69 @@ def test_predict_and_predict_in_sample_with_time_index(
assert (preds_in_sample.index == target.iloc[20:].index).all()


def test_predict_in_sample_index_error(ts_data):
    """Check that resetting the holdout indices does not change in-sample predictions.

    The pipeline is fit once, then ``predict_in_sample`` is called twice on the
    same holdout: first as produced by the fixture, then after
    ``reset_index(drop=True)`` on both the features and the target. The two
    prediction series must be equal.
    """
    train_features, holdout_features, train_target = ts_data(
        problem_type="time series regression",
        train_target_index_dt=False,
        train_features_index_dt=False,
        no_features=True,
    )
    # The holdout target is the tail of the training target, one row per holdout row.
    holdout_target = train_target[-len(holdout_features) :]

    # The same configuration object is shared by the pipeline and the featurizer.
    ts_settings = {
        "max_delay": 2,
        "gap": 0,
        "forecast_horizon": 5,
        "time_index": "date",
    }
    graph = {
        "Imputer": ["Imputer", "X", "y"],
        "Time Series Featurizer": ["Time Series Featurizer", "Imputer.x", "y"],
        "STL Decomposer": ["STL Decomposer", "Time Series Featurizer.x", "y"],
        "DateTime Featurizer": [
            "DateTime Featurizer",
            "Time Series Featurizer.x",
            "STL Decomposer.y",
        ],
        "Drop NaN Rows Transformer": [
            "Drop NaN Rows Transformer",
            "DateTime Featurizer.x",
            "STL Decomposer.y",
        ],
        "Extra Trees Regressor": [
            "Extra Trees Regressor",
            "Drop NaN Rows Transformer.x",
            "Drop NaN Rows Transformer.y",
        ],
    }
    pipeline = TimeSeriesRegressionPipeline(
        component_graph=graph,
        parameters={
            "pipeline": ts_settings,
            "Time Series Featurizer": ts_settings,
        },
    )
    pipeline.fit(train_features, train_target)

    def in_sample(features, target):
        # Both calls predict against the same training data.
        return pipeline.predict_in_sample(
            features,
            target,
            X_train=train_features,
            y_train=train_target,
        )

    baseline_preds = in_sample(holdout_features, holdout_target)

    # Drop the original labels, then re-attach the Woodwork schema that
    # ``reset_index`` does not carry over to the new frame.
    reset_target = holdout_target.reset_index(drop=True)
    reset_features = holdout_features.reset_index(drop=True)
    reset_features.ww.init(schema=holdout_features.ww.schema)
    reset_preds = in_sample(reset_features, reset_target)

    assert_series_equal(baseline_preds, reset_preds)


@pytest.mark.parametrize("only_use_y", [False])
@pytest.mark.parametrize("include_delayed_features", [True, False])
@pytest.mark.parametrize(
Expand Down

0 comments on commit 4aaf037

Please sign in to comment.