Merge branch 'main' into nullable_types_preprocessing

alteryx · Dec 7, 2021 · 168fa5c · 168fa5c
2 parents 8ea2cce + 6a82b55
commit 168fa5c
Show file tree

Hide file tree

Showing 9 changed files with 311 additions and 535 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -7,6 +7,8 @@ Release Notes
     * Fixes
         * Default parameters for ``RFRegressorSelectFromModel`` and ``RFClassifierSelectFromModel`` has been fixed to avoid selecting all features :pr:`3110`
     * Changes
+        * Removed reliance on a datetime index for ``ARIMARegressor`` and ``ProphetRegressor`` :pr:`3104`
+        * Added a target leakage check when fitting ``ARIMARegressor`` to account for the lack of ``TimeSeriesFeaturizer`` in ``ARIMARegressor``-based pipelines :pr:`3104`
         * Cleaned up and refactored ``InvalidTargetDataCheck`` implementation and docstring :pr:`3122`
         * Removed indices information from the output of ``HighlyNullDataCheck``'s ``validate()`` method :pr:`3092`
         * Added ``ReplaceNullableTypes`` component to prepare for handling pandas nullable types. :pr:`3090`
@@ -18,6 +20,7 @@ Release Notes
 
     **Breaking Changes**
         * Renamed ``DelayedFeatureTransformer`` to ``TimeSeriesFeaturizer`` :pr:`3028`
+        * ``ProphetRegressor`` now requires ``X`` to contain a datetime column whose name is given by the ``date_index`` parameter :pr:`3104`
         * Renamed module ``evalml.data_checks.invalid_target_data_check`` to ``evalml.data_checks.invalid_targets_data_check`` :pr:`3122`
 
 

diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py
@@ -5,7 +5,6 @@
 from operator import itemgetter
 
 import numpy as np
-import pandas as pd
 from skopt.space import Categorical, Integer, Real
 
 from .automl_algorithm import AutoMLAlgorithm, AutoMLAlgorithmException
@@ -15,7 +14,7 @@
 from evalml.model_family import ModelFamily
 from evalml.pipelines.components.utils import get_estimators
 from evalml.pipelines.utils import make_pipeline
-from evalml.problem_types import is_multiclass, is_time_series
+from evalml.problem_types import is_multiclass
 from evalml.utils import infer_feature_types
 from evalml.utils.logger import get_logger
 
@@ -187,21 +186,6 @@ def _create_pipelines(self):
                 self.problem_type, self.allowed_model_families
             )
             allowed_estimators = self._filter_estimators(allowed_estimators)
-            if (
-                is_time_series(self.problem_type)
-                and self._pipeline_params["pipeline"]["date_index"]
-            ):
-                if (
-                    pd.infer_freq(
-                        self.X[self._pipeline_params["pipeline"]["date_index"]]
-                    )
-                    == "MS"
-                ):
-                    allowed_estimators = [
-                        estimator
-                        for estimator in allowed_estimators
-                        if estimator.name != "ARIMA Regressor"
-                    ]
             self.logger.debug(
                 f"allowed_estimators set to {[estimator.name for estimator in allowed_estimators]}"
             )

diff --git a/evalml/pipelines/components/estimators/regressors/arima_regressor.py b/evalml/pipelines/components/estimators/regressors/arima_regressor.py
@@ -96,66 +96,32 @@ def __init__(
             parameters=parameters, component_obj=arima_model, random_seed=random_seed
         )
 
-    def _get_dates(self, X, y):
-        date_col = None
-        if y is not None:
-            y_index_type = infer_feature_types(
-                pd.Series(y.index)
-            ).ww.logical_type.type_string
-            if y_index_type == "datetime":
-                date_col = y.index
+    def _remove_datetime(self, data, features=False):
+        if data is None:
+            return None
+        data_no_dt = data.copy()
+        if isinstance(
+            data_no_dt.index, (pd.DatetimeIndex, pd.PeriodIndex, pd.IntervalIndex)
+        ):
+            data_no_dt = data_no_dt.reset_index(drop=True)
+        if features:
+            data_no_dt = data_no_dt.select_dtypes(exclude=["datetime64"])
+
+        return data_no_dt
+
+    def _match_indices(self, X, y):
         if X is not None:
-            X_index_type = infer_feature_types(
-                pd.Series(X.index)
-            ).ww.logical_type.type_string
-            if self.parameters["date_index"] in X.columns:
-                date_col = X.pop(self.parameters["date_index"])
-            elif X_index_type == "datetime":
-                date_col = X.index
-        if date_col is None:
-            msg = (
-                "ARIMA regressor requires input data X to have a datetime column specified by the 'date_index' parameter. "
-                "If not it will look for the datetime column in the index of X or y."
-            )
-            raise ValueError(msg)
-        return date_col, X
-
-    def _match_indices(self, X, y, date_col):
-        if X is not None:
-            X = X.copy()
-            X.index = date_col
-        if y is not None:
-            y = y.copy()
-            y.index = date_col
+            if X.index.equals(y.index):
+                return X, y
+            else:
+                y.index = X.index
         return X, y
 
-    def _format_dates(self, dates, X, y, predict=False):
-        if len(dates.shape) == 1:
-            dates = pd.DataFrame(dates)
-        if dates.shape[1] == 1:
-            dates.set_index(dates.columns[0], drop=True, inplace=True)
-            dates = pd.DatetimeIndex(dates.index)
-        elif dates.shape[1] > 1:
-            raise ValueError(
-                f"The dates parameter should not consist of any additional data outside of the datetime information located in the index or in a column."
-                f" Found {dates.shape[1]} columns."
-            )
-        freq = pd.infer_freq(dates)
-        dates = pd.DatetimeIndex(dates, freq=freq)
-        X, y = self._match_indices(X, y, dates)
-        if predict:
-            arima_model_msg = (
-                "sktime is not installed. Please install using `pip install sktime.`"
-            )
-            forecasting_ = import_or_raise(
-                "sktime.forecasting.base", error_msg=arima_model_msg
-            )
-            fh_ = forecasting_.ForecastingHorizon(
-                [i + 1 for i in range(len(dates))], is_relative=True
-            )
-            return X, y, fh_
-        else:
-            return X, y, None
+    def _set_forecast(self, X):
+        from sktime.forecasting.base import ForecastingHorizon
+
+        fh_ = ForecastingHorizon([i + 1 for i in range(len(X))], is_relative=True)
+        return fh_
 
     def fit(self, X, y=None):
         """Fits ARIMA regressor to data.
@@ -170,14 +136,15 @@ def fit(self, X, y=None):
         Raises:
             ValueError: If X was passed to `fit` but not passed in `predict`.
         """
+        X, y = self._manage_woodwork(X, y)
         if y is None:
             raise ValueError("ARIMA Regressor requires y as input.")
 
-        X, y = self._manage_woodwork(X, y)
-        dates, X = self._get_dates(X, y)
-        X, y, _ = self._format_dates(dates, X, y)
+        X = self._remove_datetime(X, features=True)
+        y = self._remove_datetime(y)
+        X, y = self._match_indices(X, y)
+
         if X is not None and not X.empty:
-            X = X.select_dtypes(exclude=["datetime64"])
             self._component_obj.fit(y=y, X=X)
         else:
             self._component_obj.fit(y=y)
@@ -197,23 +164,15 @@ def predict(self, X, y=None):
             ValueError: If X was passed to `fit` but not passed in `predict`.
         """
         X, y = self._manage_woodwork(X, y)
-        dates, X = self._get_dates(X, y)
-        X, y, fh_ = self._format_dates(dates, X, y, predict=True)
-        if X is not None and not X.empty:
-            X = X.select_dtypes(exclude=["datetime64"])
+        fh_ = self._set_forecast(X)
+        X = X.select_dtypes(exclude=["datetime64"])
+
+        if not X.empty:
             y_pred = self._component_obj.predict(fh=fh_, X=X)
         else:
-            try:
-                y_pred = self._component_obj.predict(fh=fh_)
-            except ValueError as ve:
-                error = str(ve)
-                if "When an ARIMA is fit with an X array" in error:
-                    raise ValueError(
-                        "If X was passed to the fit method of the ARIMARegressor, "
-                        "then it must be passed to the predict method as well."
-                    )
-                else:
-                    raise ve
+            y_pred = self._component_obj.predict(fh=fh_)
+        y_pred.index = X.index
+
         return infer_feature_types(y_pred)
 
     @property

diff --git a/evalml/pipelines/components/estimators/regressors/prophet_regressor.py b/evalml/pipelines/components/estimators/regressors/prophet_regressor.py
@@ -2,7 +2,6 @@
 import copy
 
 import numpy as np
-import pandas as pd
 from skopt.space import Real
 
 from evalml.model_family import ModelFamily
@@ -69,39 +68,30 @@ def __init__(
 
         prophet_regressor = prophet.Prophet(**parameters)
         parameters["date_index"] = date_index
+        self.date_index = date_index
+
         super().__init__(
             parameters=parameters,
             component_obj=prophet_regressor,
             random_state=random_seed,
         )
 
     @staticmethod
-    def build_prophet_df(X, y=None, date_column="ds"):
+    def build_prophet_df(X, y=None, date_index="ds"):
         """Build the Prophet data to pass fit and predict on."""
-        if X is not None:
-            X = copy.deepcopy(X)
-        if y is not None:
-            y = copy.deepcopy(y)
+        X = copy.deepcopy(X)
+        y = copy.deepcopy(y)
+        if date_index is None:
+            raise ValueError("date_index cannot be None!")
 
-        if date_column in X.columns:
-            date_column = X.pop(date_column)
+        if date_index in X.columns:
+            date_column = X.pop(date_index)
         else:
-            if isinstance(X.index, pd.DatetimeIndex):
-                X = X.reset_index()
-                date_column = X.pop("index")
-            elif isinstance(y.index, pd.DatetimeIndex):
-                y = y.reset_index()
-                date_column = y.pop("index")
-                y = pd.Series(y.values.flatten())
-            else:
-                msg = "Prophet estimator requires input data X to have a datetime column specified by the 'date_index' parameter. If it doesn't find one, it will look for the datetime column in the index of X or y."
-                raise ValueError(msg)
+            raise ValueError(f"Column {date_index} was not found in X!")
 
         prophet_df = X
-
         if y is not None:
-            if not prophet_df.empty:
-                y.index = prophet_df.index
+            y.index = prophet_df.index
             prophet_df["y"] = y
         prophet_df["ds"] = date_column
 
@@ -117,12 +107,10 @@ def fit(self, X, y=None):
         Returns:
             self
         """
-        if X is None:
-            X = pd.DataFrame()
         X, y = super()._manage_woodwork(X, y)
 
         prophet_df = ProphetRegressor.build_prophet_df(
-            X=X, y=y, date_column=self.parameters["date_index"]
+            X=X, y=y, date_index=self.date_index
         )
 
         self._component_obj.fit(prophet_df)
@@ -133,26 +121,23 @@ def predict(self, X, y=None):
 
         Args:
             X (pd.DataFrame): Data of shape [n_samples, n_features].
-            y (pd.Series): Target data.
+            y (pd.Series): Target data. Ignored.
 
         Returns:
             pd.Series: Predicted values.
         """
-        if X is None:
-            X = pd.DataFrame()
         X = infer_feature_types(X)
 
         prophet_df = ProphetRegressor.build_prophet_df(
-            X=X, y=y, date_column=self.parameters["date_index"]
+            X=X, y=y, date_index=self.date_index
         )
 
         prophet_output = self._component_obj.predict(prophet_df)
         predictions = prophet_output["yhat"]
         predictions = infer_feature_types(predictions)
         predictions = predictions.rename(None)
+        predictions.index = X.index
 
-        if not X.empty:
-            predictions.index = X.index
         return predictions
 
     def get_params(self):

diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py
@@ -37,12 +37,12 @@ def __init__(
                 "Please specify them as a dictionary with the key 'pipeline'."
             )
         pipeline_params = parameters["pipeline"]
-        date_index = pipeline_params["date_index"]
-        if date_index is None:
-            raise ValueError("Parameter date_index cannot be None!")
         self.gap = pipeline_params["gap"]
         self.max_delay = pipeline_params["max_delay"]
         self.forecast_horizon = pipeline_params["forecast_horizon"]
+        self.date_index = pipeline_params["date_index"]
+        if self.date_index is None:
+            raise ValueError("Parameter date_index cannot be None!")
         super().__init__(
             component_graph,
             custom_name=custom_name,

diff --git a/evalml/tests/automl_tests/test_automl_search_regression.py b/evalml/tests/automl_tests/test_automl_search_regression.py
@@ -445,8 +445,6 @@ def test_automl_supports_time_series_regression(freq, AutoMLTestEnv, ts_data):
 
         if result["id"] == 0:
             continue
-        if freq == "MS":
-            assert "ARIMA Regressor" not in result["parameters"]
         if "ARIMA Regressor" in result["parameters"]:
             dt_ = result["parameters"]["ARIMA Regressor"].pop("date_index")
             assert "DateTime Featurization Component" not in result["parameters"].keys()