Skip to content

Commit

Permalink
For IterativeAlgorithm, put time series algorithms first (#3407)
Browse files Browse the repository at this point in the history
* Put ts estimators first

* Add to release notes

* Fixing test

* only run test non-core deps

* Fix test for windows

* lint

* Update Makefile

* Lint

* Linting

* Change heuristic

* Lint with correct version

* Add prophet to min deps. Update release notes

* Mark test as non-core

* Update test

* Linting
  • Loading branch information
freddyaboulton committed Mar 31, 2022
1 parent 0ddf705 commit 9282cc6
Show file tree
Hide file tree
Showing 10 changed files with 192 additions and 15 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/linux_unit_tests_with_latest_deps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,7 @@ jobs:
pip install virtualenv
virtualenv test_python -q
source test_python/bin/activate
pip install cmdstan-builder==0.0.8
make installdeps
make installdeps-prophet
make installdeps-test
pip freeze
- name: Erase Coverage
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/linux_unit_tests_with_minimum_deps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:
name: Install evalml with test dependencies, core dependencies, & optional requirements (Prophet)
run: |
source test_python/bin/activate
pip install cmdstan-builder==0.0.8
pip install prophet-prebuilt==1.0.2
pip install -e . --no-dependencies
pip install -r evalml/tests/dependency_update_check/minimum_test_requirements.txt
pip install -r evalml/tests/dependency_update_check/minimum_core_requirements.txt
Expand Down
7 changes: 2 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -94,13 +94,10 @@ installdeps-min:
pip install -r evalml/tests/dependency_update_check/minimum_core_requirements.txt
pip install -r evalml/tests/dependency_update_check/minimum_requirements.txt

SITE_PACKAGES_DIR=$$(python -c 'import site; print(site.getsitepackages()[0])')

.PHONY: installdeps-prophet
installdeps-prophet:
pip install cmdstanpy==0.9.68
python ${SITE_PACKAGES_DIR}/cmdstanpy/install_cmdstan.py --dir ${SITE_PACKAGES_DIR} -v 2.28.0
echo "Installing Prophet with CMDSTANPY backend"
CMDSTAN=${SITE_PACKAGES_DIR}/cmdstan-2.28.0 STAN_BACKEND=CMDSTANPY pip install --no-cache-dir prophet==1.0.1
pip install -e .[prophet]

.PHONY: installdeps-core
installdeps-core:
Expand Down
4 changes: 4 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@

**Future Releases**
* Enhancements
* Added ``use_covariates`` parameter to ``ARIMARegressor`` :pr:`3407`
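* ``AutoMLSearch`` will set ``use_covariates`` to ``False`` for ARIMA when the dataset is large (at least 10 columns or at least 10,000 rows), unless the user sets ``use_covariates`` in ``custom_hyperparameters`` :pr:`3407`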
* Fixes
* Changes
* Moved model understanding metrics from ``graph.py`` into a separate file :pr:`3417`
* Unpin ``click`` dependency :pr:`3420`
* For ``IterativeAlgorithm``, put time series algorithms first :pr:`3407`
* Use ``prophet-prebuilt`` to install prophet in extras :pr:`3407`
* Documentation Changes
* Testing Changes

Expand Down
4 changes: 3 additions & 1 deletion evalml/automl/automl_algorithm/iterative_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,16 @@
from evalml.utils.logger import get_logger

_ESTIMATOR_FAMILY_ORDER = [
ModelFamily.ARIMA,
ModelFamily.PROPHET,
ModelFamily.EXPONENTIAL_SMOOTHING,
ModelFamily.LINEAR_MODEL,
ModelFamily.XGBOOST,
ModelFamily.LIGHTGBM,
ModelFamily.CATBOOST,
ModelFamily.RANDOM_FOREST,
ModelFamily.DECISION_TREE,
ModelFamily.EXTRA_TREES,
ModelFamily.ARIMA,
]


Expand Down
20 changes: 19 additions & 1 deletion evalml/automl/automl_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import pandas as pd
from dask import distributed as dd
from sklearn.model_selection import BaseCrossValidator
from skopt.space import Categorical

from .pipeline_search_plots import PipelineSearchPlots, SearchIterationPlot

Expand Down Expand Up @@ -47,6 +48,7 @@
MulticlassClassificationPipeline,
RegressionPipeline,
)
from evalml.pipelines.components import ARIMARegressor
from evalml.pipelines.utils import make_timeseries_baseline_pipeline
from evalml.problem_types import (
ProblemTypes,
Expand Down Expand Up @@ -631,6 +633,22 @@ def __init__(
self.data_splitter = self.data_splitter or default_data_splitter
self.pipeline_parameters = pipeline_parameters or {}
self.custom_hyperparameters = custom_hyperparameters or {}
# Fitting ARIMA with covariates takes a long time if the data is too wide or long,
# so restrict ``use_covariates`` to False unless the user explicitly set it.
if is_time_series(problem_type) and (
self.X_train.shape[1] >= 10 or self.X_train.shape[0] >= 10000
):
user_arima_hyperparams = ARIMARegressor.name in self.custom_hyperparameters
if user_arima_hyperparams and not self.custom_hyperparameters[
ARIMARegressor.name
].get("use_covariates"):
self.custom_hyperparameters[ARIMARegressor.name].update(
{"use_covariates": Categorical([False])}
)
elif not user_arima_hyperparams:
self.custom_hyperparameters[ARIMARegressor.name] = {
"use_covariates": Categorical([False])
}

self.search_iteration_plot = None
self._interrupted = False

Expand Down Expand Up @@ -707,7 +725,7 @@ def __init__(
ensembling=self.ensembling,
text_in_ensembling=text_in_ensembling,
pipeline_params=parameters,
custom_hyperparameters=custom_hyperparameters,
custom_hyperparameters=self.custom_hyperparameters,
allow_long_running_models=allow_long_running_models,
features=features,
verbose=self.verbose,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def __init__(
n_jobs=-1,
random_seed=0,
maxiter=10,
use_covariates=True,
**kwargs,
):
parameters = {
Expand All @@ -93,6 +94,9 @@ def __init__(
"sktime.forecasting.arima", error_msg=arima_model_msg
)
arima_model = sktime_arima.AutoARIMA(**parameters)
parameters["use_covariates"] = use_covariates

self.use_covariates = use_covariates

super().__init__(
parameters=parameters, component_obj=arima_model, random_seed=random_seed
Expand Down Expand Up @@ -154,8 +158,7 @@ def fit(self, X, y=None):
)
y = self._remove_datetime(y)
X, y = self._match_indices(X, y)

if X is not None and not X.empty:
if X is not None and not X.empty and self.use_covariates:
self._component_obj.fit(y=y, X=X)
else:
self._component_obj.fit(y=y)
Expand Down Expand Up @@ -183,8 +186,7 @@ def predict(self, X, y=None):
for col in X.ww.select(["Boolean"], return_schema=True).columns
}
)

if not X.empty:
if not X.empty and self.use_covariates:
y_pred = self._component_obj.predict(fh=fh_, X=X)
else:
y_pred = self._component_obj.predict(fh=fh_)
Expand Down
140 changes: 140 additions & 0 deletions evalml/tests/automl_tests/test_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -4544,3 +4544,143 @@ def test_automl_accepts_features(
assert all(
["DFS Transformer" not in p for p in automl.full_rankings["parameters"][1:]]
)


@pytest.mark.noncore_dependency
def test_automl_with_iterative_algorithm_puts_ts_estimators_first(
    ts_data, AutoMLTestEnv, is_using_windows
):
    """Check the order in which the iterative algorithm evaluates estimators.

    Runs a five-iteration time series regression search with mocked scores
    and asserts that, after the baseline, the time series estimators are
    evaluated before the general-purpose ones.
    """
    X, y = ts_data
    problem_configuration = {
        "max_delay": 2,
        "gap": 0,
        "forecast_horizon": 2,
        "time_index": "Date",
    }

    env = AutoMLTestEnv("time series regression")
    automl = AutoMLSearch(
        X,
        y,
        "time series regression",
        max_iterations=5,
        problem_configuration=problem_configuration,
        verbose=True,
        automl_algorithm="iterative",
    )
    with env.test_context(score_return_value={automl.objective.name: 1.0}):
        automl.search()

    # Recover the estimator of every evaluated pipeline, in evaluation order.
    pipeline_ids = automl.full_rankings.sort_values("search_order").id.tolist()
    estimator_order = [
        automl.get_pipeline(pipeline_id).estimator.name
        for pipeline_id in pipeline_ids
    ]

    expected_order = ["Time Series Baseline Estimator", "ARIMA Regressor"]
    if is_using_windows:
        # NOTE(review): the Windows expectation has no Prophet entry, presumably
        # because Prophet is not installed on Windows runners -- confirm.
        expected_order += [
            "Exponential Smoothing Regressor",
            "Elastic Net Regressor",
            "XGBoost Regressor",
        ]
    else:
        expected_order += [
            "Prophet Regressor",
            "Exponential Smoothing Regressor",
            "Elastic Net Regressor",
        ]
    assert estimator_order == expected_order


@pytest.mark.noncore_dependency
@pytest.mark.parametrize("automl_algo", ["iterative", "default"])
@pytest.mark.parametrize(
    "hyperparams",
    [
        None,
        {"Imputer": {"numeric_impute_strategy": Categorical(["most_frequent"])}},
        {"ARIMA Regressor": {"seasonal": Categorical([True])}},
    ],
)
def test_automl_restricts_use_covariates_for_arima(
    hyperparams, automl_algo, AutoMLTestEnv, is_using_windows, X_y_binary
):
    """Check that ARIMA pipelines are searched with ``use_covariates`` off.

    Covers both AutoML algorithms and three custom hyperparameter settings
    that never mention ``use_covariates``: none at all, hyperparameters for
    another component, and ARIMA hyperparameters for a different parameter.

    NOTE(review): assumes the ``X_y_binary`` fixture is large enough to
    trigger the restriction in ``AutoMLSearch`` -- confirm.
    """
    X, y = X_y_binary
    X = pd.DataFrame(X)
    # Time series problems need a time index column.
    X["Date"] = pd.date_range("2010-01-01", periods=len(X))
    problem_configuration = {
        "max_delay": 2,
        "gap": 0,
        "forecast_horizon": 2,
        "time_index": "Date",
    }

    env = AutoMLTestEnv("time series regression")
    automl = AutoMLSearch(
        X,
        y,
        "time series regression",
        problem_configuration=problem_configuration,
        verbose=True,
        custom_hyperparameters=hyperparams,
        automl_algorithm=automl_algo,
        max_batches=6,
    )
    with env.test_context(score_return_value={automl.objective.name: 1.0}):
        automl.search()

    # Pipelines without an ARIMA estimator yield None and are filtered out.
    use_covariates_values = (
        pipeline_params.get("ARIMA Regressor", {}).get("use_covariates")
        for pipeline_params in automl.full_rankings.parameters
    )
    arima_params = [value for value in use_covariates_values if value is not None]
    # At least one ARIMA pipeline must have been evaluated, all with it off.
    assert arima_params
    assert not any(arima_params)


@pytest.mark.noncore_dependency
@pytest.mark.parametrize("automl_algo", ["iterative", "default"])
@pytest.mark.parametrize(
    "hyperparams",
    [
        {"ARIMA Regressor": {"use_covariates": Categorical([True])}},
        {
            "ARIMA Regressor": {"use_covariates": Categorical([True])},
            "Imputer": {"numeric_impute_strategy": Categorical(["most_frequent"])},
        },
    ],
)
def test_automl_does_not_restrict_use_covariates_if_user_specified(
    hyperparams, automl_algo, AutoMLTestEnv, is_using_windows, X_y_binary
):
    """Check that a user-supplied ``use_covariates`` setting for ARIMA is kept.

    Covers both AutoML algorithms, with custom hyperparameters that set
    ``use_covariates`` to ``True`` for ARIMA, alone or alongside
    hyperparameters for another component.
    """
    X, y = X_y_binary
    X = pd.DataFrame(X)
    # Time series problems need a time index column.
    X["Date"] = pd.date_range("2010-01-01", periods=len(X))
    problem_configuration = {
        "max_delay": 2,
        "gap": 0,
        "forecast_horizon": 2,
        "time_index": "Date",
    }

    env = AutoMLTestEnv("time series regression")
    automl = AutoMLSearch(
        X,
        y,
        "time series regression",
        problem_configuration=problem_configuration,
        verbose=True,
        automl_algorithm=automl_algo,
        custom_hyperparameters=hyperparams,
        max_batches=6,
    )
    with env.test_context(score_return_value={automl.objective.name: 1.0}):
        automl.search()

    # Pipelines without an ARIMA estimator yield None and are filtered out.
    use_covariates_values = (
        pipeline_params.get("ARIMA Regressor", {}).get("use_covariates")
        for pipeline_params in automl.full_rankings.parameters
    )
    arima_params = [value for value in use_covariates_values if value is not None]
    # At least one ARIMA pipeline must have been evaluated, all with it on.
    assert arima_params
    assert all(arima_params)
15 changes: 15 additions & 0 deletions evalml/tests/component_tests/test_arima_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,3 +303,18 @@ def test_arima_supports_boolean_features():
ar.fit(X, y)
preds = ar.predict(X)
assert not preds.isna().any()


def test_arima_regressor_respects_use_covariates(ts_data):
    """Check that ``use_covariates=False`` keeps ``X`` away from the wrapped model.

    The wrapped component object is replaced with a mock so the keyword
    arguments passed to its ``fit`` and ``predict`` can be inspected.
    """
    X, y = ts_data
    split_at = 25
    X_train, y_train = X.iloc[:split_at], y.iloc[:split_at]
    X_test = X.iloc[split_at:]

    clf = ARIMARegressor(use_covariates=False)
    with patch.object(clf, "_component_obj") as mock_obj:
        clf.fit(X_train, y_train)
        clf.predict(X_test)

        mock_obj.fit.assert_called_once()
        fit_kwargs = mock_obj.fit.call_args.kwargs
        assert "X" not in fit_kwargs
        assert "y" in fit_kwargs

        mock_obj.predict.assert_called_once()
        predict_kwargs = mock_obj.predict.call_args.kwargs
        assert "X" not in predict_kwargs
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

extras_require = {
'update_checker': ['alteryx-open-src-update-checker >= 2.0.0'],
'prophet': ['cmdstan-builder == 0.0.8']
'prophet': ['prophet-prebuilt == 1.0.2']
}
extras_require['complete'] = sorted(set(sum(extras_require.values(), [])))

Expand Down

0 comments on commit 9282cc6

Please sign in to comment.