New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Gap separation #3208
Gap separation #3208
Changes from 40 commits
51ef066
b2f2ca4
58d4dcd
ad6eb04
21a9e4d
56bb4d4
052fc8b
e9f4bd9
b9ea305
9b98f74
528da11
d4d98c8
95e0bc5
6e8d3e3
95e08ea
b9fdd28
7efde41
599ce89
81a8fc9
8e42bf3
b25d550
b105535
358bd2a
8ccd12f
853fcf2
099bf32
035d9a6
fc4c67a
98bcb4e
d6932aa
e4ce8c9
53546f2
6efc6db
0ad9ed6
2765e42
6628bfc
49f1e29
5d57abd
361624b
6e13b71
ec682b0
1fd8a1d
352e6ae
69e4ca5
7c6204c
e985fae
e04d628
3f3375b
1db24ed
3bb4b90
5b7974e
7412f48
af1b7b9
adf67bf
2f6c4d2
b176569
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,11 +36,11 @@ def __init__( | |
"time_index, gap, max_delay, and forecast_horizon parameters cannot be omitted from the parameters dict. " | ||
"Please specify them as a dictionary with the key 'pipeline'." | ||
) | ||
pipeline_params = parameters["pipeline"] | ||
self.gap = pipeline_params["gap"] | ||
self.max_delay = pipeline_params["max_delay"] | ||
self.forecast_horizon = pipeline_params["forecast_horizon"] | ||
self.time_index = pipeline_params["time_index"] | ||
self.pipeline_params = parameters["pipeline"] | ||
self.gap = self.pipeline_params["gap"] | ||
self.max_delay = self.pipeline_params["max_delay"] | ||
self.forecast_horizon = self.pipeline_params["forecast_horizon"] | ||
self.time_index = self.pipeline_params["time_index"] | ||
if self.time_index is None: | ||
raise ValueError("Parameter time_index cannot be None!") | ||
super().__init__( | ||
|
@@ -66,55 +66,20 @@ def _move_index_forward(index, gap): | |
else: | ||
return index + gap | ||
|
||
@staticmethod | ||
def _are_datasets_separated_by_gap(train_index, test_index, gap): | ||
"""Determine if the train and test datasets are separated by gap number of units. | ||
|
||
This will be true when users are predicting on unseen data but not during cross | ||
validation since the target is known. | ||
""" | ||
gap_difference = gap + 1 | ||
index_difference = test_index[0] - train_index[-1] | ||
if isinstance( | ||
train_index, (pd.DatetimeIndex, pd.PeriodIndex, pd.TimedeltaIndex) | ||
): | ||
gap_difference *= test_index.freq | ||
return index_difference == gap_difference | ||
|
||
def _validate_holdout_datasets(self, X, X_train): | ||
"""Validate the holdout datasets match out expectations. | ||
|
||
Args: | ||
X (pd.DataFrame): Data of shape [n_samples, n_features]. | ||
X_train (pd.DataFrame): Training data. | ||
|
||
Raises: | ||
ValueError: If holdout data does not have forecast_horizon entries or if datasets | ||
are not separated by gap. | ||
""" | ||
right_length = len(X) <= self.forecast_horizon | ||
X_separated_by_gap = self._are_datasets_separated_by_gap( | ||
X_train.index, X.index, self.gap | ||
) | ||
if not (right_length and X_separated_by_gap): | ||
raise ValueError( | ||
f"Holdout data X must have {self.forecast_horizon} rows (value of forecast horizon) " | ||
"and its index needs to " | ||
f"start {self.gap + 1} values ahead of the training index. " | ||
f"Data received - Length X: {len(X)}, " | ||
f"X index start: {X.index[0]}, X_train index end {X_train.index[-1]}." | ||
) | ||
|
||
def _add_training_data_to_X_Y(self, X, y, X_train, y_train): | ||
"""Append the training data to the holdout data. | ||
|
||
Need to do this so that we have all the data we need to compute lagged features on the holdout set. | ||
""" | ||
from evalml.pipelines.utils import ( | ||
are_datasets_separated_by_gap_time_index, | ||
) | ||
|
||
last_row_of_training = self.forecast_horizon + self.max_delay + self.gap | ||
gap_features = pd.DataFrame() | ||
gap_target = pd.Series() | ||
if ( | ||
self._are_datasets_separated_by_gap(X_train.index, X.index, self.gap) | ||
are_datasets_separated_by_gap_time_index(X_train, X, self.pipeline_params) | ||
and self.gap | ||
): | ||
# The training data does not have the gap dates so don't need to include them | ||
|
@@ -235,7 +200,6 @@ def predict(self, X, objective=None, X_train=None, y_train=None): | |
X.index = self._move_index_forward( | ||
X_train.index[-X.shape[0] :], self.gap + X.shape[0] | ||
) | ||
self._validate_holdout_datasets(X, X_train) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Based on the third point of this: Rather than raising a ValueError in predict, let's refactor that logic into a helper function that could be used before calling predict. predict should no longer raise exceptions if the data violates our constraints. |
||
y_holdout = self._create_empty_series(y_train, X.shape[0]) | ||
y_holdout = infer_feature_types(y_holdout) | ||
y_holdout.index = X.index | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
|
||
from woodwork import logical_types | ||
|
||
from ..exceptions import PartialDependenceError, PartialDependenceErrorCode | ||
from . import ( | ||
TimeSeriesBinaryClassificationPipeline, | ||
TimeSeriesMulticlassClassificationPipeline, | ||
|
@@ -815,6 +816,75 @@ def make_timeseries_baseline_pipeline(problem_type, gap, forecast_horizon, time_ | |
return baseline | ||
|
||
|
||
def are_datasets_separated_by_gap_time_index(train, test, pipeline_params): | ||
"""Determine if the train and test datasets are separated by gap number of units using the time_index. | ||
|
||
This will be true when users are predicting on unseen data but not during cross | ||
validation since the target is known. | ||
|
||
Args: | ||
train (pd.DataFrame): Training data. | ||
test (pd.DataFrame): Data of shape [n_samples, n_features]. | ||
pipeline_params (dict): Dictionary of time series parameters. | ||
|
||
Returns: | ||
bool: True if the difference in time units is equal to gap + 1. | ||
|
||
""" | ||
gap_difference = pipeline_params["gap"] + 1 | ||
|
||
train_copy = train.copy() | ||
test_copy = test.copy() | ||
train_copy.ww.init(time_index=pipeline_params["time_index"]) | ||
test_copy.ww.init(time_index=pipeline_params["time_index"]) | ||
|
||
X_frequency_dict = train_copy.ww.infer_temporal_frequencies( | ||
temporal_columns=[train_copy.ww.time_index] | ||
) | ||
freq = X_frequency_dict[test_copy.ww.time_index] | ||
if freq is None: | ||
return True | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Based on the third point of this: If the training data does not have an inferable frequency, let's assume the datasets are correctly separated by the gap for now. |
||
|
||
first_testing_date = test_copy[test_copy.ww.time_index].iloc[0] | ||
last_training_date = train_copy[train_copy.ww.time_index].iloc[-1] | ||
dt_difference = first_testing_date - last_training_date | ||
|
||
try: | ||
units_difference = dt_difference / freq | ||
except ValueError: | ||
units_difference = dt_difference / ("1" + freq) | ||
return units_difference == gap_difference | ||
|
||
|
||
def validate_holdout_datasets(X, X_train, pipeline_params): | ||
"""Validate the holdout datasets match out expectations. | ||
|
||
Args: | ||
X (pd.DataFrame): Data of shape [n_samples, n_features]. | ||
X_train (pd.DataFrame): Training data. | ||
pipeline_params (dict): Dictionary of time series parameters. | ||
|
||
Raises: | ||
PartialDependenceError: If holdout data does not have forecast_horizon entries or if datasets are not separated by gap. | ||
""" | ||
forecast_horizon = pipeline_params["forecast_horizon"] | ||
gap = pipeline_params["gap"] | ||
time_index = pipeline_params["time_index"] | ||
right_length = len(X) <= forecast_horizon | ||
X_separated_by_gap = are_datasets_separated_by_gap_time_index( | ||
X_train, X, pipeline_params | ||
) | ||
if not (right_length and X_separated_by_gap): | ||
raise PartialDependenceError( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Based on the second point of this: This helper function will return "something" if the test data violates our constraints. Users can then use this "something" to display warning messages prior to calling predict. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe it'll be better if rather than raising an exception, we return a tuple of
If we do it this way, it might be easier to communicate which of the two criteria was not met. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think that makes sense |
||
f"Holdout data X must have {forecast_horizon} rows (value of forecast horizon) " | ||
f"and the first value indicated by the column {time_index} needs to " | ||
f"start {gap + 1} units ahead of the training data. " | ||
f"Data received - Length X: {len(X)}, " | ||
f"X value start: {X[time_index].iloc[0]}, X_train value end {X_train[time_index].iloc[-1]}.", | ||
PartialDependenceErrorCode.INVALID_HOLDOUT_SET, | ||
) | ||
|
||
|
||
def rows_of_interest( | ||
pipeline, X, y=None, threshold=None, epsilon=0.1, sort_values=True, types="all" | ||
): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we move the import to top of file?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We end up running into a circular dependency issue unfortunately
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Gotcha. Let's move it to
gen_utils
then? That's whereare_ts_parameters_valid_for_split
so I think its sensible to include it there.