For IterativeAlgorithm, put time series algorithms first (#3407)

* Put ts estimators first
* Add to release notes
* Fixing test
* only run test non-core deps
* Fix test for windows
* lint
* Update Makefile
* Lint
* Linting
* Change heuristic
* Lint with correct version
* Add prophet to min deps. Update release notes
* Mark test as non-core
* Update test
* Linting
alteryx · Mar 31, 2022 · 9282cc6 · 9282cc6
1 parent 0ddf705
commit 9282cc6
Show file tree

Hide file tree

Showing 10 changed files with 192 additions and 15 deletions.
diff --git a/.github/workflows/linux_unit_tests_with_latest_deps.yml b/.github/workflows/linux_unit_tests_with_latest_deps.yml
@@ -83,8 +83,7 @@ jobs:
           pip install virtualenv
           virtualenv test_python -q
           source test_python/bin/activate
-          pip install cmdstan-builder==0.0.8
-          make installdeps
+          make installdeps-prophet
           make installdeps-test
           pip freeze
       - name: Erase Coverage

diff --git a/.github/workflows/linux_unit_tests_with_minimum_deps.yml b/.github/workflows/linux_unit_tests_with_minimum_deps.yml
@@ -39,7 +39,7 @@ jobs:
         name: Install evalml with test dependencies, core dependencies, & optional requirements (Prophet)
         run: |
           source test_python/bin/activate
-          pip install cmdstan-builder==0.0.8
+          pip install prophet-prebuilt==1.0.2
           pip install -e . --no-dependencies
           pip install -r evalml/tests/dependency_update_check/minimum_test_requirements.txt
           pip install -r evalml/tests/dependency_update_check/minimum_core_requirements.txt

diff --git a/Makefile b/Makefile
@@ -94,13 +94,10 @@ installdeps-min:
 	pip install -r evalml/tests/dependency_update_check/minimum_core_requirements.txt
 	pip install -r evalml/tests/dependency_update_check/minimum_requirements.txt
 
-SITE_PACKAGES_DIR=$$(python -c 'import site; print(site.getsitepackages()[0])')
+
 .PHONY: installdeps-prophet
 installdeps-prophet:
-	pip install cmdstanpy==0.9.68
-	python ${SITE_PACKAGES_DIR}/cmdstanpy/install_cmdstan.py --dir ${SITE_PACKAGES_DIR} -v 2.28.0
-	echo "Installing Prophet with CMDSTANPY backend"
-	CMDSTAN=${SITE_PACKAGES_DIR}/cmdstan-2.28.0 STAN_BACKEND=CMDSTANPY pip install --no-cache-dir prophet==1.0.1
+	pip install -e .[prophet]
 
 .PHONY: installdeps-core
 installdeps-core:

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -3,10 +3,14 @@
 
 **Future Releases**
     * Enhancements
+        * Added ``use_covariates`` parameter to ``ARIMARegressor`` :pr:`3407`
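+        * ``AutoMLSearch`` will set ``use_covariates`` to ``False`` for ARIMA when the dataset is large (at least 10 columns or 10,000 rows), unless the user specifies ``use_covariates`` in ``custom_hyperparameters`` :pr:`3407`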
     * Fixes
     * Changes
         * Moved model understanding metrics from ``graph.py`` into a separate file :pr:`3417`
         * Unpin ``click`` dependency :pr:`3420`
+        * For ``IterativeAlgorithm``, put time series algorithms first :pr:`3407`
+        * Use ``prophet-prebuilt`` to install prophet in extras :pr:`3407`
     * Documentation Changes
     * Testing Changes
 

diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py
@@ -18,14 +18,16 @@
 from evalml.utils.logger import get_logger
 
 _ESTIMATOR_FAMILY_ORDER = [
+    ModelFamily.ARIMA,
+    ModelFamily.PROPHET,
+    ModelFamily.EXPONENTIAL_SMOOTHING,
     ModelFamily.LINEAR_MODEL,
     ModelFamily.XGBOOST,
     ModelFamily.LIGHTGBM,
     ModelFamily.CATBOOST,
     ModelFamily.RANDOM_FOREST,
     ModelFamily.DECISION_TREE,
     ModelFamily.EXTRA_TREES,
-    ModelFamily.ARIMA,
 ]
 
 

diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py
@@ -14,6 +14,7 @@
 import pandas as pd
 from dask import distributed as dd
 from sklearn.model_selection import BaseCrossValidator
+from skopt.space import Categorical
 
 from .pipeline_search_plots import PipelineSearchPlots, SearchIterationPlot
 
@@ -47,6 +48,7 @@
     MulticlassClassificationPipeline,
     RegressionPipeline,
 )
+from evalml.pipelines.components import ARIMARegressor
 from evalml.pipelines.utils import make_timeseries_baseline_pipeline
 from evalml.problem_types import (
     ProblemTypes,
@@ -631,6 +633,22 @@ def __init__(
         self.data_splitter = self.data_splitter or default_data_splitter
         self.pipeline_parameters = pipeline_parameters or {}
         self.custom_hyperparameters = custom_hyperparameters or {}
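+        # Fitting ARIMA with covariates takes a long time when the data is wide
+        # (>= 10 columns) or long (>= 10,000 rows). In that case, restrict the
+        # ARIMA ``use_covariates`` search space to False unless the user supplied
+        # their own ``use_covariates`` hyperparameter for ARIMA.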
+        if is_time_series(problem_type) and (
+            self.X_train.shape[1] >= 10 or self.X_train.shape[0] >= 10000
+        ):
+            user_arima_hyperparams = ARIMARegressor.name in self.custom_hyperparameters
+            if user_arima_hyperparams and not self.custom_hyperparameters[
+                ARIMARegressor.name
+            ].get("use_covariates"):
+                self.custom_hyperparameters[ARIMARegressor.name].update(
+                    {"use_covariates": Categorical([False])}
+                )
+            elif not user_arima_hyperparams:
+                self.custom_hyperparameters[ARIMARegressor.name] = {
+                    "use_covariates": Categorical([False])
+                }
+
         self.search_iteration_plot = None
         self._interrupted = False
 
@@ -707,7 +725,7 @@ def __init__(
                 ensembling=self.ensembling,
                 text_in_ensembling=text_in_ensembling,
                 pipeline_params=parameters,
-                custom_hyperparameters=custom_hyperparameters,
+                custom_hyperparameters=self.custom_hyperparameters,
                 allow_long_running_models=allow_long_running_models,
                 features=features,
                 verbose=self.verbose,

diff --git a/evalml/pipelines/components/estimators/regressors/arima_regressor.py b/evalml/pipelines/components/estimators/regressors/arima_regressor.py
@@ -68,6 +68,7 @@ def __init__(
         n_jobs=-1,
         random_seed=0,
         maxiter=10,
+        use_covariates=True,
         **kwargs,
     ):
         parameters = {
@@ -93,6 +94,9 @@ def __init__(
             "sktime.forecasting.arima", error_msg=arima_model_msg
         )
         arima_model = sktime_arima.AutoARIMA(**parameters)
+        parameters["use_covariates"] = use_covariates
+
+        self.use_covariates = use_covariates
 
         super().__init__(
             parameters=parameters, component_obj=arima_model, random_seed=random_seed
@@ -154,8 +158,7 @@ def fit(self, X, y=None):
             )
         y = self._remove_datetime(y)
         X, y = self._match_indices(X, y)
-
-        if X is not None and not X.empty:
+        if X is not None and not X.empty and self.use_covariates:
             self._component_obj.fit(y=y, X=X)
         else:
             self._component_obj.fit(y=y)
@@ -183,8 +186,7 @@ def predict(self, X, y=None):
                 for col in X.ww.select(["Boolean"], return_schema=True).columns
             }
         )
-
-        if not X.empty:
+        if not X.empty and self.use_covariates:
             y_pred = self._component_obj.predict(fh=fh_, X=X)
         else:
             y_pred = self._component_obj.predict(fh=fh_)

diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py
@@ -4544,3 +4544,143 @@ def test_automl_accepts_features(
         assert all(
             ["DFS Transformer" not in p for p in automl.full_rankings["parameters"][1:]]
         )
+
+
+@pytest.mark.noncore_dependency
+def test_automl_with_iterative_algorithm_puts_ts_estimators_first(
+    ts_data, AutoMLTestEnv, is_using_windows
+):
+
+    X, y = ts_data
+
+    env = AutoMLTestEnv("time series regression")
+    automl = AutoMLSearch(
+        X,
+        y,
+        "time series regression",
+        max_iterations=5,
+        problem_configuration={
+            "max_delay": 2,
+            "gap": 0,
+            "forecast_horizon": 2,
+            "time_index": "Date",
+        },
+        verbose=True,
+        automl_algorithm="iterative",
+    )
+    with env.test_context(score_return_value={automl.objective.name: 1.0}):
+        automl.search()
+
+    estimator_order = (
+        automl.full_rankings.sort_values("search_order")
+        .id.map(lambda id_: automl.get_pipeline(id_).estimator.name)
+        .tolist()
+    )
+    if is_using_windows:
+        expected_order = [
+            "Time Series Baseline Estimator",
+            "ARIMA Regressor",
+            "Exponential Smoothing Regressor",
+            "Elastic Net Regressor",
+            "XGBoost Regressor",
+        ]
+    else:
+        expected_order = [
+            "Time Series Baseline Estimator",
+            "ARIMA Regressor",
+            "Prophet Regressor",
+            "Exponential Smoothing Regressor",
+            "Elastic Net Regressor",
+        ]
+    assert estimator_order == expected_order
+
+
+@pytest.mark.noncore_dependency
+@pytest.mark.parametrize("automl_algo", ["iterative", "default"])
+@pytest.mark.parametrize(
+    "hyperparams",
+    [
+        None,
+        {"Imputer": {"numeric_impute_strategy": Categorical(["most_frequent"])}},
+        {"ARIMA Regressor": {"seasonal": Categorical([True])}},
+    ],
+)
+def test_automl_restricts_use_covariates_for_arima(
+    hyperparams, automl_algo, AutoMLTestEnv, is_using_windows, X_y_binary
+):
+
+    X, y = X_y_binary
+    X = pd.DataFrame(X)
+    X["Date"] = pd.date_range("2010-01-01", periods=X.shape[0])
+
+    env = AutoMLTestEnv("time series regression")
+    automl = AutoMLSearch(
+        X,
+        y,
+        "time series regression",
+        problem_configuration={
+            "max_delay": 2,
+            "gap": 0,
+            "forecast_horizon": 2,
+            "time_index": "Date",
+        },
+        verbose=True,
+        custom_hyperparameters=hyperparams,
+        automl_algorithm=automl_algo,
+        max_batches=6,
+    )
+    with env.test_context(score_return_value={automl.objective.name: 1.0}):
+        automl.search()
+
+    params = automl.full_rankings.parameters.map(
+        lambda p: p.get("ARIMA Regressor", {}).get("use_covariates")
+    ).tolist()
+    arima_params = [p for p in params if p is not None]
+    assert arima_params
+    assert all(not p for p in arima_params)
+
+
+@pytest.mark.noncore_dependency
+@pytest.mark.parametrize("automl_algo", ["iterative", "default"])
+@pytest.mark.parametrize(
+    "hyperparams",
+    [
+        {"ARIMA Regressor": {"use_covariates": Categorical([True])}},
+        {
+            "ARIMA Regressor": {"use_covariates": Categorical([True])},
+            "Imputer": {"numeric_impute_strategy": Categorical(["most_frequent"])},
+        },
+    ],
+)
+def test_automl_does_not_restrict_use_covariates_if_user_specified(
+    hyperparams, automl_algo, AutoMLTestEnv, is_using_windows, X_y_binary
+):
+
+    X, y = X_y_binary
+    X = pd.DataFrame(X)
+    X["Date"] = pd.date_range("2010-01-01", periods=X.shape[0])
+    env = AutoMLTestEnv("time series regression")
+    automl = AutoMLSearch(
+        X,
+        y,
+        "time series regression",
+        problem_configuration={
+            "max_delay": 2,
+            "gap": 0,
+            "forecast_horizon": 2,
+            "time_index": "Date",
+        },
+        verbose=True,
+        automl_algorithm=automl_algo,
+        custom_hyperparameters=hyperparams,
+        max_batches=6,
+    )
+    with env.test_context(score_return_value={automl.objective.name: 1.0}):
+        automl.search()
+
+    params = automl.full_rankings.parameters.map(
+        lambda p: p.get("ARIMA Regressor", {}).get("use_covariates")
+    ).tolist()
+    arima_params = [p for p in params if p is not None]
+    assert arima_params
+    assert all(p for p in arima_params)
diff --git a/evalml/tests/component_tests/test_arima_regressor.py b/evalml/tests/component_tests/test_arima_regressor.py
@@ -303,3 +303,18 @@ def test_arima_supports_boolean_features():
     ar.fit(X, y)
     preds = ar.predict(X)
     assert not preds.isna().any()
+
+
+def test_arima_regressor_respects_use_covariates(ts_data):
+    X, y = ts_data
+    X_train, y_train = X.iloc[:25], y.iloc[:25]
+    X_test, _ = X.iloc[25:], y.iloc[25:]
+    clf = ARIMARegressor(use_covariates=False)
+    with patch.object(clf, "_component_obj") as mock_obj:
+        clf.fit(X_train, y_train)
+        clf.predict(X_test)
+        mock_obj.fit.assert_called_once()
+        assert "X" not in mock_obj.fit.call_args.kwargs
+        assert "y" in mock_obj.fit.call_args.kwargs
+        mock_obj.predict.assert_called_once()
+        assert "X" not in mock_obj.predict.call_args.kwargs
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 extras_require = {
     'update_checker': ['alteryx-open-src-update-checker >= 2.0.0'],
-    'prophet': ['cmdstan-builder == 0.0.8']
+    'prophet': ['prophet-prebuilt == 1.0.2']
 }
 extras_require['complete'] = sorted(set(sum(extras_require.values(), [])))