Skip to content

Commit

Permalink
Merge branch 'main' into 3249-invalid-target-check-ts
Browse files Browse the repository at this point in the history
  • Loading branch information
chukarsten committed Jan 18, 2022
2 parents 8f23f0d + af36015 commit 266a4dc
Show file tree
Hide file tree
Showing 12 changed files with 247 additions and 114 deletions.
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
-------------
**Future Releases**
* Enhancements
* Required the separation of training and test data by ``gap`` + 1 units to be verified by ``time_index`` for time series problems :pr:`3208`
* Added support for boolean features for ``ARIMARegressor`` :pr:`3187`
* Updated dependency bot workflow to remove outdated description and add new configuration to delete branches automatically :pr:`3212`
* Added ``n_obs`` and ``n_splits`` to ``TimeSeriesParametersDataCheck`` error details :pr:`3246`
Expand All @@ -17,6 +18,7 @@
* Raised lowest compatible numpy version to 1.21.0 to address security concerns :pr:`3207`
* Changed the default objective to ``MedianAE`` from ``R2`` for time series regression :pr:`3205`
* Removed all-nan Unknown to Double logical conversion in ``infer_feature_types`` :pr:`3196`
* Checking the validity of holdout data for time series problems can be performed by calling ``pipelines.utils.validate_holdout_datasets`` prior to calling ``predict`` :pr:`3208`
* Documentation Changes
* Testing Changes

Expand Down
17 changes: 17 additions & 0 deletions evalml/demos/weather.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
"""The Australian daily-min-temperatures weather dataset."""
import pandas as pd

import evalml
from evalml.preprocessing import load_data
from evalml.utils import infer_feature_types


def load_weather():
Expand All @@ -15,4 +18,18 @@ def load_weather():
+ evalml.__version__
)
X, y = load_data(filename, index=None, target="Temp")

missing_date_1 = pd.DataFrame([pd.to_datetime("1984-12-31")], columns=["Date"])
missing_date_2 = pd.DataFrame([pd.to_datetime("1988-12-31")], columns=["Date"])
missing_y_1 = pd.Series([14.5], name="Temp")
missing_y_2 = pd.Series([14.5], name="Temp")

X = pd.concat([X.iloc[:1460], missing_date_1, X.iloc[1460:]]).reset_index(drop=True)
X = pd.concat([X.iloc[:2921], missing_date_2, X.iloc[2921:]]).reset_index(drop=True)
y = pd.concat([y.iloc[:1460], missing_y_1, y.iloc[1460:]]).reset_index(drop=True)
y = pd.concat([y.iloc[:2921], missing_y_2, y.iloc[2921:]]).reset_index(drop=True)

X = infer_feature_types(X)
y = infer_feature_types(y)

return X, y
1 change: 1 addition & 0 deletions evalml/exceptions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@
ParameterNotUsedWarning,
PartialDependenceErrorCode,
PartialDependenceError,
ValidationErrorCode,
)
9 changes: 9 additions & 0 deletions evalml/exceptions/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,15 @@ def __init__(self, components):
super().__init__(msg)


class ValidationErrorCode(Enum):
    """Enum identifying the type of error encountered in holdout validation.

    Each member's value is the lower-case spelling of its own name, so a code
    can be reported or compared as a plain string through ``.value``.
    """

    # NOTE(review): presumably signals holdout data with an unexpected number
    # of rows -- confirm against the code that raises this code.
    INVALID_HOLDOUT_LENGTH = "invalid_holdout_length"
    """invalid_holdout_length"""
    # NOTE(review): presumably signals holdout data that is not separated from
    # the training data by the expected gap -- confirm against the raiser.
    INVALID_HOLDOUT_GAP_SEPARATION = "invalid_holdout_gap_separation"
    """invalid_holdout_gap_separation"""


class PartialDependenceErrorCode(Enum):
"""Enum identifying the type of error encountered in partial dependence."""

Expand Down
1 change: 0 additions & 1 deletion evalml/pipelines/time_series_classification_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ def predict_proba(self, X, X_train=None, y_train=None):
X.index = self._move_index_forward(
X_train.index[-X.shape[0] :], self.gap + X.shape[0]
)
self._validate_holdout_datasets(X, X_train)
y_holdout = self._create_empty_series(y_train, X.shape[0])
y_holdout = infer_feature_types(y_holdout)
y_holdout.index = X.index
Expand Down
53 changes: 7 additions & 46 deletions evalml/pipelines/time_series_pipeline_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from evalml.pipelines import PipelineBase
from evalml.pipelines.pipeline_meta import PipelineBaseMeta
from evalml.utils import drop_rows_with_nans, infer_feature_types
from evalml.utils.gen_utils import are_datasets_separated_by_gap_time_index


class TimeSeriesPipelineBase(PipelineBase, metaclass=PipelineBaseMeta):
Expand Down Expand Up @@ -36,11 +37,11 @@ def __init__(
"time_index, gap, max_delay, and forecast_horizon parameters cannot be omitted from the parameters dict. "
"Please specify them as a dictionary with the key 'pipeline'."
)
pipeline_params = parameters["pipeline"]
self.gap = pipeline_params["gap"]
self.max_delay = pipeline_params["max_delay"]
self.forecast_horizon = pipeline_params["forecast_horizon"]
self.time_index = pipeline_params["time_index"]
self.pipeline_params = parameters["pipeline"]
self.gap = self.pipeline_params["gap"]
self.max_delay = self.pipeline_params["max_delay"]
self.forecast_horizon = self.pipeline_params["forecast_horizon"]
self.time_index = self.pipeline_params["time_index"]
if self.time_index is None:
raise ValueError("Parameter time_index cannot be None!")
super().__init__(
Expand All @@ -66,45 +67,6 @@ def _move_index_forward(index, gap):
else:
return index + gap

@staticmethod
def _are_datasets_separated_by_gap(train_index, test_index, gap):
"""Determine if the train and test datasets are separated by gap number of units.
This will be true when users are predicting on unseen data but not during cross
validation since the target is known.
"""
gap_difference = gap + 1
index_difference = test_index[0] - train_index[-1]
if isinstance(
train_index, (pd.DatetimeIndex, pd.PeriodIndex, pd.TimedeltaIndex)
):
gap_difference *= test_index.freq
return index_difference == gap_difference

def _validate_holdout_datasets(self, X, X_train):
    """Validate that the holdout datasets match our expectations.

    The holdout data passes when it has no more rows than ``forecast_horizon``
    and its index is separated from the training index by ``gap``.

    Args:
        X (pd.DataFrame): Holdout data of shape [n_samples, n_features].
        X_train (pd.DataFrame): Training data.

    Raises:
        ValueError: If the holdout data has more than forecast_horizon rows or
            if the datasets are not separated by gap.
    """
    # Both checks are always evaluated, in this order, before deciding.
    length_ok = len(X) <= self.forecast_horizon
    gap_ok = self._are_datasets_separated_by_gap(X_train.index, X.index, self.gap)
    if length_ok and gap_ok:
        return

    message = (
        f"Holdout data X must have {self.forecast_horizon} rows (value of forecast horizon) "
        "and its index needs to "
        f"start {self.gap + 1} values ahead of the training index. "
        f"Data received - Length X: {len(X)}, "
        f"X index start: {X.index[0]}, X_train index end {X_train.index[-1]}."
    )
    raise ValueError(message)

def _add_training_data_to_X_Y(self, X, y, X_train, y_train):
"""Append the training data to the holdout data.
Expand All @@ -114,7 +76,7 @@ def _add_training_data_to_X_Y(self, X, y, X_train, y_train):
gap_features = pd.DataFrame()
gap_target = pd.Series()
if (
self._are_datasets_separated_by_gap(X_train.index, X.index, self.gap)
are_datasets_separated_by_gap_time_index(X_train, X, self.pipeline_params)
and self.gap
):
# The training data does not have the gap dates so don't need to include them
Expand Down Expand Up @@ -235,7 +197,6 @@ def predict(self, X, objective=None, X_train=None, y_train=None):
X.index = self._move_index_forward(
X_train.index[-X.shape[0] :], self.gap + X.shape[0]
)
self._validate_holdout_datasets(X, X_train)
y_holdout = self._create_empty_series(y_train, X.shape[0])
y_holdout = infer_feature_types(y_holdout)
y_holdout.index = X.index
Expand Down
6 changes: 3 additions & 3 deletions evalml/tests/automl_tests/test_engine_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,13 +135,13 @@ def test_train_pipeline_trains_and_tunes_threshold(

def test_train_pipeline_trains_and_tunes_threshold_ts(
ts_data,
dummy_ts_binary_linear_classifier_pipeline_class,
dummy_ts_binary_tree_classifier_pipeline_class,
):
X = pd.DataFrame([i for i in range(32)])
X = pd.DataFrame(pd.date_range("1/1/21", periods=32), columns=["date"])
y = pd.Series([0, 1, 0, 1] * 8)

params = {"gap": 1, "max_delay": 1, "forecast_horizon": 1, "time_index": "date"}
ts_binary = dummy_ts_binary_linear_classifier_pipeline_class(
ts_binary = dummy_ts_binary_tree_classifier_pipeline_class(
parameters={"pipeline": params}
)
assert ts_binary.threshold is None
Expand Down
36 changes: 30 additions & 6 deletions evalml/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
is_regression,
is_time_series,
)
from evalml.utils import infer_feature_types


def pytest_configure(config):
Expand Down Expand Up @@ -826,11 +827,11 @@ def __init__(


@pytest.fixture
def dummy_ts_binary_linear_classifier_pipeline_class():
log_reg_classifier = LogisticRegressionClassifier
def dummy_ts_binary_tree_classifier_pipeline_class():
dec_tree_classifier = DecisionTreeClassifier

class MockBinaryClassificationPipeline(TimeSeriesBinaryClassificationPipeline):
estimator = log_reg_classifier
estimator = dec_tree_classifier
component_graph = [estimator]

def __init__(
Expand Down Expand Up @@ -1620,20 +1621,43 @@ def objective_function(self, y_true, y_predicted, X=None):
def load_daily_temp_local(n_rows=None):
    """Load the local copy of the daily minimum temperatures dataset.

    Two synthetic observations (1984-12-31 and 1988-12-31, both with a target
    of 14.5) are spliced in at raw row positions 1460 and 2920.
    NOTE(review): presumably these are dates absent from the raw CSV, added so
    the date column is evenly spaced -- confirm against the data file.

    Args:
        n_rows (int, optional): Number of rows to read from the CSV. Defaults
            to None, which is passed through to ``load_data`` unchanged.

    Returns:
        tuple: ``(X, y)`` with a fresh 0..n-1 index. A synthetic row is only
            inserted when the loaded data extends past its insertion position,
            so a truncated load never receives out-of-order dates at its end.
    """
    currdir_path = os.path.dirname(os.path.abspath(__file__))
    data_folder_path = os.path.join(currdir_path, "data")
    temp_data_path = os.path.join(data_folder_path, "daily-min-temperatures.csv")
    X, y = load_data(
        path=temp_data_path,
        index=None,
        target="Temp",
        n_rows=n_rows,
    )

    # (raw row position, date to insert, target value to insert)
    missing_rows = (
        (1460, pd.to_datetime("1984-12-31"), 14.5),
        (2920, pd.to_datetime("1988-12-31"), 14.5),
    )

    X_parts, y_parts = [], []
    start = 0
    for position, date, temp in missing_rows:
        # With a small n_rows the data stops before this position; inserting
        # the row anyway would append a far-future date after the last row.
        if len(X) <= position:
            break
        X_parts += [X.iloc[start:position], pd.DataFrame([date], columns=["Date"])]
        y_parts += [y.iloc[start:position], pd.Series([temp], name="Temp")]
        start = position
    X_parts.append(X.iloc[start:])
    y_parts.append(y.iloc[start:])

    X = pd.concat(X_parts).reset_index(drop=True)
    y = pd.concat(y_parts).reset_index(drop=True)
    return X, y


@pytest.fixture
def daily_temp_local():
    """Provide the local daily temperature dataset with feature types inferred.

    Returns:
        tuple: ``(X, y)`` from ``load_daily_temp_local()``, each passed through
            ``infer_feature_types``.
    """
    X, y = load_daily_temp_local()
    # A stale ``return X, y`` used to sit here and made this line unreachable.
    return infer_feature_types(X), infer_feature_types(y)


@pytest.fixture
Expand Down
3 changes: 2 additions & 1 deletion evalml/tests/demo_tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def local_datasets(
("breast_cancer", (569, 30)),
("diabetes", (442, 10)),
("churn", (7043, 19)),
("daily_temp", (3650, 1)),
("daily_temp", (3652, 1)),
],
)
def test_datasets(dataset_name, expected_shape, local_datasets):
Expand Down Expand Up @@ -86,5 +86,6 @@ def test_datasets(dataset_name, expected_shape, local_datasets):
def test_datasets_match_local(dataset_name, demo_method, local_datasets):
X, y = demo_method
X_local, y_local = local_datasets[dataset_name]

pd.testing.assert_frame_equal(X, X_local)
pd.testing.assert_series_equal(y, y_local)
56 changes: 0 additions & 56 deletions evalml/tests/pipeline_tests/test_time_series_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,62 +25,6 @@
from evalml.utils import infer_feature_types


@pytest.mark.parametrize(
    "pipeline_class,estimator",
    [
        (TimeSeriesRegressionPipeline, "Linear Regressor"),
        (TimeSeriesBinaryClassificationPipeline, "Logistic Regression Classifier"),
        (TimeSeriesMulticlassClassificationPipeline, "Logistic Regression Classifier"),
    ],
)
@pytest.mark.parametrize("gap", [0, 1, 5])
@pytest.mark.parametrize("forecast_horizon", [1, 5, 10])
# patch decorators are applied bottom-up, so the LogisticRegressionClassifier
# mock is the first positional argument and the LinearRegressor mock the second.
@patch("evalml.pipelines.components.LinearRegressor.fit")
@patch("evalml.pipelines.components.LogisticRegressionClassifier.fit")
def test_time_series_pipeline_validates_holdout_data(
    mock_fit_lr,
    mock_fit_linear,
    forecast_horizon,
    gap,
    pipeline_class,
    estimator,
    ts_data,
    ts_data_binary,
):
    """Check that predict and predict_proba reject holdout data that is too long.

    The holdout slice starts ``gap`` rows after the end of the training slice
    and spans ``forecast_horizon + 2`` row positions, so ``predict`` (and
    ``predict_proba`` where the pipeline has it) is expected to raise a
    ValueError whose message starts with "Holdout data X must have".
    """
    pl = pipeline_class(
        component_graph=[estimator],
        parameters={
            "pipeline": {
                "time_index": "date",
                "gap": gap,
                "max_delay": 2,
                "forecast_horizon": forecast_horizon,
            }
        },
    )
    X, y = ts_data

    # Only the binary pipeline swaps in the binary fixture; the regression and
    # multiclass cases both use ts_data.
    if pipeline_class == TimeSeriesBinaryClassificationPipeline:
        X, y = ts_data_binary

    TRAIN_LENGTH = 15
    X_train, y_train = X.iloc[:TRAIN_LENGTH], y.iloc[:TRAIN_LENGTH]
    # Two more row positions than forecast_horizon allows.
    # NOTE(review): assumes the fixtures hold at least
    # TRAIN_LENGTH + gap + forecast_horizon + 2 rows -- confirm in conftest.
    X = X.iloc[TRAIN_LENGTH + gap : TRAIN_LENGTH + gap + forecast_horizon + 2]

    # The estimators' fit methods are patched above, so no model is trained.
    pl.fit(X_train, y_train)

    with pytest.raises(
        ValueError, match=f"Holdout data X must have {forecast_horizon}"
    ):
        pl.predict(X, None, X_train, y_train)

    # Only exercised for pipelines that expose predict_proba.
    if hasattr(pl, "predict_proba"):
        with pytest.raises(
            ValueError, match=f"Holdout data X must have {forecast_horizon}"
        ):
            pl.predict_proba(X, X_train, y_train)


@pytest.mark.parametrize("num_unique", [1, 2, 3])
@pytest.mark.parametrize("pipeline", ["ts_binary", "ts_multiclass"])
def test_invalid_targets_time_series_classification_pipeline(
Expand Down
Loading

0 comments on commit 266a4dc

Please sign in to comment.