Merge branch 'main' into 3249-invalid-target-check-ts

alteryx · Jan 18, 2022 · 266a4dc · 266a4dc
2 parents 8f23f0d + af36015
commit 266a4dc
Show file tree

Hide file tree

Showing 12 changed files with 247 additions and 114 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -2,6 +2,7 @@
 -------------
 **Future Releases**
     * Enhancements
+        * Required the separation of training and test data by ``gap`` + 1 units to be verified by ``time_index`` for time series problems :pr:`3208`
         * Added support for boolean features for ``ARIMARegressor`` :pr:`3187`
         * Updated dependency bot workflow to remove outdated description and add new configuration to delete branches automatically :pr:`3212`
         * Added ``n_obs`` and ``n_splits`` to ``TimeSeriesParametersDataCheck`` error details :pr:`3246`
@@ -17,6 +18,7 @@
         * Raised lowest compatible numpy version to 1.21.0 to address security concerns :pr:`3207`
         * Changed the default objective to ``MedianAE`` from ``R2`` for time series regression :pr:`3205`
         * Removed all-nan Unknown to Double logical conversion in ``infer_feature_types`` :pr:`3196`
+        * Checking the validity of holdout data for time series problems can be performed by calling ``pipelines.utils.validate_holdout_datasets`` prior to calling ``predict`` :pr:`3208`
     * Documentation Changes
     * Testing Changes
 

diff --git a/evalml/demos/weather.py b/evalml/demos/weather.py
@@ -1,6 +1,9 @@
 """The Australian daily-min-termperatures weather dataset."""
+import pandas as pd
+
 import evalml
 from evalml.preprocessing import load_data
+from evalml.utils import infer_feature_types
 
 
 def load_weather():
@@ -15,4 +18,18 @@ def load_weather():
         + evalml.__version__
     )
     X, y = load_data(filename, index=None, target="Temp")
+
+    missing_date_1 = pd.DataFrame([pd.to_datetime("1984-12-31")], columns=["Date"])
+    missing_date_2 = pd.DataFrame([pd.to_datetime("1988-12-31")], columns=["Date"])
+    missing_y_1 = pd.Series([14.5], name="Temp")
+    missing_y_2 = pd.Series([14.5], name="Temp")
+
+    X = pd.concat([X.iloc[:1460], missing_date_1, X.iloc[1460:]]).reset_index(drop=True)
+    X = pd.concat([X.iloc[:2921], missing_date_2, X.iloc[2921:]]).reset_index(drop=True)
+    y = pd.concat([y.iloc[:1460], missing_y_1, y.iloc[1460:]]).reset_index(drop=True)
+    y = pd.concat([y.iloc[:2921], missing_y_2, y.iloc[2921:]]).reset_index(drop=True)
+
+    X = infer_feature_types(X)
+    y = infer_feature_types(y)
+
     return X, y
diff --git a/evalml/exceptions/__init__.py b/evalml/exceptions/__init__.py
@@ -15,4 +15,5 @@
     ParameterNotUsedWarning,
     PartialDependenceErrorCode,
     PartialDependenceError,
+    ValidationErrorCode,
 )
diff --git a/evalml/exceptions/exceptions.py b/evalml/exceptions/exceptions.py
@@ -97,6 +97,15 @@ def __init__(self, components):
         super().__init__(msg)
 
 
+class ValidationErrorCode(Enum):
+    """Enum identifying the type of error encountered in holdout validation."""
+
+    INVALID_HOLDOUT_LENGTH = "invalid_holdout_length"
+    """invalid_holdout_length"""
+    INVALID_HOLDOUT_GAP_SEPARATION = "invalid_holdout_gap_separation"
+    """invalid_holdout_gap_separation"""
+
+
 class PartialDependenceErrorCode(Enum):
     """Enum identifying the type of error encountered in partial dependence."""
 

diff --git a/evalml/pipelines/time_series_classification_pipelines.py b/evalml/pipelines/time_series_classification_pipelines.py
@@ -119,7 +119,6 @@ def predict_proba(self, X, X_train=None, y_train=None):
         X.index = self._move_index_forward(
             X_train.index[-X.shape[0] :], self.gap + X.shape[0]
         )
-        self._validate_holdout_datasets(X, X_train)
         y_holdout = self._create_empty_series(y_train, X.shape[0])
         y_holdout = infer_feature_types(y_holdout)
         y_holdout.index = X.index

diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py
@@ -5,6 +5,7 @@
 from evalml.pipelines import PipelineBase
 from evalml.pipelines.pipeline_meta import PipelineBaseMeta
 from evalml.utils import drop_rows_with_nans, infer_feature_types
+from evalml.utils.gen_utils import are_datasets_separated_by_gap_time_index
 
 
 class TimeSeriesPipelineBase(PipelineBase, metaclass=PipelineBaseMeta):
@@ -36,11 +37,11 @@ def __init__(
                 "time_index, gap, max_delay, and forecast_horizon parameters cannot be omitted from the parameters dict. "
                 "Please specify them as a dictionary with the key 'pipeline'."
             )
-        pipeline_params = parameters["pipeline"]
-        self.gap = pipeline_params["gap"]
-        self.max_delay = pipeline_params["max_delay"]
-        self.forecast_horizon = pipeline_params["forecast_horizon"]
-        self.time_index = pipeline_params["time_index"]
+        self.pipeline_params = parameters["pipeline"]
+        self.gap = self.pipeline_params["gap"]
+        self.max_delay = self.pipeline_params["max_delay"]
+        self.forecast_horizon = self.pipeline_params["forecast_horizon"]
+        self.time_index = self.pipeline_params["time_index"]
         if self.time_index is None:
             raise ValueError("Parameter time_index cannot be None!")
         super().__init__(
@@ -66,45 +67,6 @@ def _move_index_forward(index, gap):
         else:
             return index + gap
 
-    @staticmethod
-    def _are_datasets_separated_by_gap(train_index, test_index, gap):
-        """Determine if the train and test datasets are separated by gap number of units.
-
-        This will be true when users are predicting on unseen data but not during cross
-        validation since the target is known.
-        """
-        gap_difference = gap + 1
-        index_difference = test_index[0] - train_index[-1]
-        if isinstance(
-            train_index, (pd.DatetimeIndex, pd.PeriodIndex, pd.TimedeltaIndex)
-        ):
-            gap_difference *= test_index.freq
-        return index_difference == gap_difference
-
-    def _validate_holdout_datasets(self, X, X_train):
-        """Validate the holdout datasets match out expectations.
-
-        Args:
-            X (pd.DataFrame): Data of shape [n_samples, n_features].
-            X_train (pd.DataFrame): Training data.
-
-        Raises:
-            ValueError: If holdout data does not have forecast_horizon entries or if datasets
-                are not separated by gap.
-        """
-        right_length = len(X) <= self.forecast_horizon
-        X_separated_by_gap = self._are_datasets_separated_by_gap(
-            X_train.index, X.index, self.gap
-        )
-        if not (right_length and X_separated_by_gap):
-            raise ValueError(
-                f"Holdout data X must have {self.forecast_horizon}  rows (value of forecast horizon) "
-                "and its index needs to "
-                f"start {self.gap + 1} values ahead of the training index. "
-                f"Data received - Length X: {len(X)}, "
-                f"X index start: {X.index[0]}, X_train index end {X_train.index[-1]}."
-            )
-
     def _add_training_data_to_X_Y(self, X, y, X_train, y_train):
         """Append the training data to the holdout data.
 
@@ -114,7 +76,7 @@ def _add_training_data_to_X_Y(self, X, y, X_train, y_train):
         gap_features = pd.DataFrame()
         gap_target = pd.Series()
         if (
-            self._are_datasets_separated_by_gap(X_train.index, X.index, self.gap)
+            are_datasets_separated_by_gap_time_index(X_train, X, self.pipeline_params)
             and self.gap
         ):
             # The training data does not have the gap dates so don't need to include them
@@ -235,7 +197,6 @@ def predict(self, X, objective=None, X_train=None, y_train=None):
         X.index = self._move_index_forward(
             X_train.index[-X.shape[0] :], self.gap + X.shape[0]
         )
-        self._validate_holdout_datasets(X, X_train)
         y_holdout = self._create_empty_series(y_train, X.shape[0])
         y_holdout = infer_feature_types(y_holdout)
         y_holdout.index = X.index

diff --git a/evalml/tests/automl_tests/test_engine_base.py b/evalml/tests/automl_tests/test_engine_base.py
@@ -135,13 +135,13 @@ def test_train_pipeline_trains_and_tunes_threshold(
 
 def test_train_pipeline_trains_and_tunes_threshold_ts(
     ts_data,
-    dummy_ts_binary_linear_classifier_pipeline_class,
+    dummy_ts_binary_tree_classifier_pipeline_class,
 ):
-    X = pd.DataFrame([i for i in range(32)])
+    X = pd.DataFrame(pd.date_range("1/1/21", periods=32), columns=["date"])
     y = pd.Series([0, 1, 0, 1] * 8)
 
     params = {"gap": 1, "max_delay": 1, "forecast_horizon": 1, "time_index": "date"}
-    ts_binary = dummy_ts_binary_linear_classifier_pipeline_class(
+    ts_binary = dummy_ts_binary_tree_classifier_pipeline_class(
         parameters={"pipeline": params}
     )
     assert ts_binary.threshold is None

diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py
@@ -42,6 +42,7 @@
     is_regression,
     is_time_series,
 )
+from evalml.utils import infer_feature_types
 
 
 def pytest_configure(config):
@@ -826,11 +827,11 @@ def __init__(
 
 
 @pytest.fixture
-def dummy_ts_binary_linear_classifier_pipeline_class():
-    log_reg_classifier = LogisticRegressionClassifier
+def dummy_ts_binary_tree_classifier_pipeline_class():
+    dec_tree_classifier = DecisionTreeClassifier
 
     class MockBinaryClassificationPipeline(TimeSeriesBinaryClassificationPipeline):
-        estimator = log_reg_classifier
+        estimator = dec_tree_classifier
         component_graph = [estimator]
 
         def __init__(
@@ -1620,20 +1621,43 @@ def objective_function(self, y_true, y_predicted, X=None):
 def load_daily_temp_local(n_rows=None):
     currdir_path = os.path.dirname(os.path.abspath(__file__))
     data_folder_path = os.path.join(currdir_path, "data")
-    fraud_data_path = os.path.join(data_folder_path, "daily-min-temperatures.csv")
+    temp_data_path = os.path.join(data_folder_path, "daily-min-temperatures.csv")
     X, y = load_data(
-        path=fraud_data_path,
+        path=temp_data_path,
         index=None,
         target="Temp",
         n_rows=n_rows,
     )
+    missing_date_1 = pd.DataFrame([pd.to_datetime("1984-12-31")], columns=["Date"])
+    missing_date_2 = pd.DataFrame([pd.to_datetime("1988-12-31")], columns=["Date"])
+    missing_y_1 = pd.Series([14.5], name="Temp")
+    missing_y_2 = pd.Series([14.5], name="Temp")
+
+    X = pd.concat(
+        [
+            X.iloc[:1460],
+            missing_date_1,
+            X.iloc[1460:2920],
+            missing_date_2,
+            X.iloc[2920:],
+        ]
+    ).reset_index(drop=True)
+    y = pd.concat(
+        [
+            y.iloc[:1460],
+            missing_y_1,
+            y.iloc[1460:2920],
+            missing_y_2,
+            y.iloc[2920:],
+        ]
+    ).reset_index(drop=True)
     return X, y
 
 
 @pytest.fixture
 def daily_temp_local():
     X, y = load_daily_temp_local()
-    return X, y
+    return infer_feature_types(X), infer_feature_types(y)
 
 
 @pytest.fixture

diff --git a/evalml/tests/demo_tests/test_datasets.py b/evalml/tests/demo_tests/test_datasets.py
@@ -58,7 +58,7 @@ def local_datasets(
         ("breast_cancer", (569, 30)),
         ("diabetes", (442, 10)),
         ("churn", (7043, 19)),
-        ("daily_temp", (3650, 1)),
+        ("daily_temp", (3652, 1)),
     ],
 )
 def test_datasets(dataset_name, expected_shape, local_datasets):
@@ -86,5 +86,6 @@ def test_datasets(dataset_name, expected_shape, local_datasets):
 def test_datasets_match_local(dataset_name, demo_method, local_datasets):
     X, y = demo_method
     X_local, y_local = local_datasets[dataset_name]
+
     pd.testing.assert_frame_equal(X, X_local)
     pd.testing.assert_series_equal(y, y_local)
diff --git a/evalml/tests/pipeline_tests/test_time_series_pipeline.py b/evalml/tests/pipeline_tests/test_time_series_pipeline.py
@@ -25,62 +25,6 @@
 from evalml.utils import infer_feature_types
 
 
-@pytest.mark.parametrize(
-    "pipeline_class,estimator",
-    [
-        (TimeSeriesRegressionPipeline, "Linear Regressor"),
-        (TimeSeriesBinaryClassificationPipeline, "Logistic Regression Classifier"),
-        (TimeSeriesMulticlassClassificationPipeline, "Logistic Regression Classifier"),
-    ],
-)
-@pytest.mark.parametrize("gap", [0, 1, 5])
-@pytest.mark.parametrize("forecast_horizon", [1, 5, 10])
-@patch("evalml.pipelines.components.LinearRegressor.fit")
-@patch("evalml.pipelines.components.LogisticRegressionClassifier.fit")
-def test_time_series_pipeline_validates_holdout_data(
-    mock_fit_lr,
-    mock_fit_linear,
-    forecast_horizon,
-    gap,
-    pipeline_class,
-    estimator,
-    ts_data,
-    ts_data_binary,
-):
-    pl = pipeline_class(
-        component_graph=[estimator],
-        parameters={
-            "pipeline": {
-                "time_index": "date",
-                "gap": gap,
-                "max_delay": 2,
-                "forecast_horizon": forecast_horizon,
-            }
-        },
-    )
-    X, y = ts_data
-
-    if pipeline_class == TimeSeriesBinaryClassificationPipeline:
-        X, y = ts_data_binary
-
-    TRAIN_LENGTH = 15
-    X_train, y_train = X.iloc[:TRAIN_LENGTH], y.iloc[:TRAIN_LENGTH]
-    X = X.iloc[TRAIN_LENGTH + gap : TRAIN_LENGTH + gap + forecast_horizon + 2]
-
-    pl.fit(X_train, y_train)
-
-    with pytest.raises(
-        ValueError, match=f"Holdout data X must have {forecast_horizon}"
-    ):
-        pl.predict(X, None, X_train, y_train)
-
-    if hasattr(pl, "predict_proba"):
-        with pytest.raises(
-            ValueError, match=f"Holdout data X must have {forecast_horizon}"
-        ):
-            pl.predict_proba(X, X_train, y_train)
-
-
 @pytest.mark.parametrize("num_unique", [1, 2, 3])
 @pytest.mark.parametrize("pipeline", ["ts_binary", "ts_multiclass"])
 def test_invalid_targets_time_series_classification_pipeline(