-
Notifications
You must be signed in to change notification settings - Fork 87
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add prophet regressor #1704
Add prophet regressor #1704
Changes from all commits
85cb500
a80347e
5aa3fd0
f549fa3
14cae58
709fe4b
292aa38
921f76e
284193c
5048cc8
6fc5982
81770e2
f1a8cc3
2f208ad
871de4a
9e0a618
c932cae
16755ad
dc22b2a
d0ca8b1
1e06c27
bb739e5
3d041a2
a8a13d6
e02fe13
d3a8716
e182bbb
f166e7c
c646f26
2385295
83085c0
333bbfa
841be47
70ce64f
d723488
c64b500
de3629c
eb01777
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,4 +20,5 @@ | |
BaselineRegressor, | ||
TimeSeriesBaselineEstimator, | ||
DecisionTreeRegressor, | ||
ProphetRegressor, | ||
SVMRegressor) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
import numpy as np | ||
import pandas as pd | ||
from skopt.space import Real | ||
|
||
from evalml.model_family import ModelFamily | ||
from evalml.pipelines.components.estimators import Estimator | ||
from evalml.problem_types import ProblemTypes | ||
from evalml.utils import SEED_BOUNDS, import_or_raise | ||
from evalml.utils.gen_utils import ( | ||
_convert_to_woodwork_structure, | ||
_convert_woodwork_types_wrapper, | ||
suppress_stdout_stderr | ||
) | ||
|
||
|
||
class ProphetRegressor(Estimator): | ||
""" | ||
Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. | ||
It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well. | ||
|
||
More information here: https://facebook.github.io/prophet/ | ||
|
||
""" | ||
name = "Prophet Regressor" | ||
hyperparameter_ranges = { | ||
"changepoint_prior_scale": Real(0.001, 0.5), | ||
"seasonality_prior_scale": Real(0.01, 10), | ||
"holidays_prior_scale": Real(0.01, 10), | ||
"seasonality_mode": ['additive', 'multiplicative'], | ||
} | ||
model_family = ModelFamily.PROPHET | ||
supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION] | ||
|
||
SEED_MIN = 0 | ||
SEED_MAX = SEED_BOUNDS.max_bound | ||
|
||
def __init__(self, date_column='ds', changepoint_prior_scale=0.05, seasonality_prior_scale=10, holidays_prior_scale=10, seasonality_mode="additive", | ||
random_state=0, **kwargs): | ||
self.date_column = date_column | ||
|
||
parameters = {'changepoint_prior_scale': changepoint_prior_scale, | ||
"seasonality_prior_scale": seasonality_prior_scale, | ||
"holidays_prior_scale": holidays_prior_scale, | ||
"seasonality_mode": seasonality_mode} | ||
|
||
parameters.update(kwargs) | ||
|
||
p_error_msg = "prophet is not installed. Please install using `pip install pystan` and `pip install fbprophet`." | ||
prophet = import_or_raise("fbprophet", error_msg=p_error_msg) | ||
|
||
prophet_regressor = prophet.Prophet(**parameters) | ||
super().__init__(parameters=parameters, | ||
component_obj=prophet_regressor, | ||
random_state=random_state) | ||
|
||
@staticmethod | ||
def build_prophet_df(X, y=None, date_column='ds'): | ||
if X is not None: | ||
X = X.copy(deep=True) | ||
if y is not None: | ||
y = y.copy(deep=True) | ||
|
||
if date_column in X.columns: | ||
date_col = X[date_column] | ||
elif isinstance(X.index, pd.DatetimeIndex): | ||
date_col = X.reset_index() | ||
date_col = date_col['index'] | ||
elif isinstance(y.index, pd.DatetimeIndex): | ||
date_col = y.reset_index() | ||
date_col = date_col['index'] | ||
else: | ||
msg = "Prophet estimator requires input data X to have a datetime column specified by the 'date_column' parameter. If not it will look for the datetime column in the index of X or y." | ||
raise ValueError(msg) | ||
|
||
date_col = date_col.rename('ds') | ||
prophet_df = date_col.to_frame() | ||
if y is not None: | ||
y.index = prophet_df.index | ||
prophet_df['y'] = y | ||
return prophet_df | ||
|
||
def fit(self, X, y=None): | ||
if X is None: | ||
X = pd.DataFrame() | ||
|
||
X = _convert_to_woodwork_structure(X) | ||
X = _convert_woodwork_types_wrapper(X.to_dataframe()) | ||
|
||
y = _convert_to_woodwork_structure(y) | ||
y = _convert_woodwork_types_wrapper(y.to_series()) | ||
|
||
prophet_df = ProphetRegressor.build_prophet_df(X=X, y=y, date_column=self.date_column) | ||
|
||
with suppress_stdout_stderr(): | ||
self._component_obj.fit(prophet_df) | ||
return self | ||
|
||
def predict(self, X, y=None): | ||
if X is None: | ||
X = pd.DataFrame() | ||
|
||
X = _convert_to_woodwork_structure(X) | ||
X = _convert_woodwork_types_wrapper(X.to_dataframe()) | ||
|
||
prophet_df = ProphetRegressor.build_prophet_df(X=X, y=y, date_column=self.date_column) | ||
|
||
with suppress_stdout_stderr(): | ||
y_pred = self._component_obj.predict(prophet_df)['yhat'] | ||
return y_pred | ||
|
||
@property | ||
def feature_importance(self): | ||
""" | ||
Returns array of 0's with len(1) as feature_importance is not defined for Prophet regressor. | ||
""" | ||
return np.zeros(1) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,6 +43,7 @@ | |
LogisticRegressionClassifier, | ||
OneHotEncoder, | ||
PerColumnImputer, | ||
ProphetRegressor, | ||
RandomForestClassifier, | ||
RandomForestRegressor, | ||
RFClassifierSelectFromModel, | ||
|
@@ -228,6 +229,11 @@ def test_describe_component(): | |
'min_child_samples': 20, 'n_jobs': -1, 'bagging_fraction': 0.9, 'bagging_freq': 0}} | ||
except ImportError: | ||
pass | ||
try: | ||
prophet_regressor = ProphetRegressor() | ||
assert prophet_regressor.describe(return_dict=True) == {'name': 'Prophet Regressor', 'parameters': {'changepoint_prior_scale': 0.05, 'holidays_prior_scale': 10, 'seasonality_mode': 'additive', 'seasonality_prior_scale': 10}} | ||
except ImportError: | ||
pass | ||
|
||
|
||
def test_missing_attributes(X_y_binary): | ||
|
@@ -706,14 +712,18 @@ def test_all_transformers_check_fit(X_y_binary): | |
component.transform(X) | ||
|
||
|
||
def test_all_estimators_check_fit(X_y_binary, test_estimator_needs_fitting_false, helper_functions): | ||
X, y = X_y_binary | ||
def test_all_estimators_check_fit(X_y_binary, ts_data, test_estimator_needs_fitting_false, helper_functions): | ||
estimators_to_check = [estimator for estimator in _all_estimators() if estimator not in [StackedEnsembleClassifier, StackedEnsembleRegressor, TimeSeriesBaselineEstimator]] + [test_estimator_needs_fitting_false] | ||
for component_class in estimators_to_check: | ||
if not component_class.needs_fitting: | ||
continue | ||
|
||
component = helper_functions.safe_init_component_with_njobs_1(component_class) | ||
if component.supported_problem_types == [ProblemTypes.TIME_SERIES_REGRESSION]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Prophet does not follow sklearn's estimator API (and this is our first estimator that only does time series problem types) and tests in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice! I think it's fine to have this code outside a util function since it's only used twice and only in our tests. |
||
X, y = ts_data | ||
else: | ||
X, y = X_y_binary | ||
|
||
with pytest.raises(ComponentNotYetFittedError, match=f'You must fit {component_class.__name__}'): | ||
component.predict(X) | ||
|
||
|
@@ -757,8 +767,7 @@ def test_no_fitting_required_components(X_y_binary, test_estimator_needs_fitting | |
component.transform(X, y) | ||
|
||
|
||
def test_serialization(X_y_binary, tmpdir, helper_functions): | ||
X, y = X_y_binary | ||
def test_serialization(X_y_binary, ts_data, tmpdir, helper_functions): | ||
jeremyliweishih marked this conversation as resolved.
Show resolved
Hide resolved
|
||
path = os.path.join(str(tmpdir), 'component.pkl') | ||
for component_class in all_components(): | ||
print('Testing serialization of component {}'.format(component_class.name)) | ||
|
@@ -769,6 +778,12 @@ def test_serialization(X_y_binary, tmpdir, helper_functions): | |
component = component_class(input_pipelines=[make_pipeline_from_components([RandomForestClassifier()], ProblemTypes.BINARY)], n_jobs=1) | ||
elif (component_class == StackedEnsembleRegressor): | ||
component = component_class(input_pipelines=[make_pipeline_from_components([RandomForestRegressor()], ProblemTypes.REGRESSION)], n_jobs=1) | ||
|
||
if isinstance(component, Estimator) and component.supported_problem_types == [ProblemTypes.TIME_SERIES_REGRESSION]: | ||
X, y = ts_data | ||
else: | ||
X, y = X_y_binary | ||
|
||
component.fit(X, y) | ||
|
||
for pickle_protocol in range(cloudpickle.DEFAULT_PROTOCOL + 1): | ||
|
@@ -812,7 +827,10 @@ def test_estimators_accept_all_kwargs(estimator_class, | |
if estimator_class.model_family == ModelFamily.ENSEMBLE: | ||
params = estimator.parameters | ||
else: | ||
params = estimator._component_obj.get_params() | ||
try: | ||
jeremyliweishih marked this conversation as resolved.
Show resolved
Hide resolved
|
||
params = estimator._component_obj.get_params() | ||
except AttributeError: | ||
pytest.skip('estimator does not have `get_params()` method.') | ||
if estimator_class.model_family == ModelFamily.CATBOOST: | ||
# Deleting because we call it random_state in our api | ||
del params["random_seed"] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import pytest | ||
jeremyliweishih marked this conversation as resolved.
Show resolved
Hide resolved
|
||
from pytest import importorskip | ||
|
||
from evalml.model_family import ModelFamily | ||
from evalml.pipelines.components import ProphetRegressor | ||
from evalml.problem_types import ProblemTypes | ||
from evalml.utils.gen_utils import suppress_stdout_stderr | ||
|
||
prophet = importorskip('fbprophet', reason='Skipping test because xgboost not installed') | ||
|
||
|
||
def test_model_family(): | ||
assert ProphetRegressor.model_family == ModelFamily.PROPHET | ||
|
||
|
||
def test_problem_types(): | ||
assert set(ProphetRegressor.supported_problem_types) == {ProblemTypes.TIME_SERIES_REGRESSION} | ||
|
||
|
||
def test_init_with_other_params(): | ||
clf = ProphetRegressor(daily_seasonality=True, mcmc_samples=5, interval_width=0.8, uncertainty_samples=0) | ||
assert clf.parameters == {'changepoint_prior_scale': 0.05, | ||
'daily_seasonality': True, | ||
'holidays_prior_scale': 10, | ||
'interval_width': 0.8, | ||
'mcmc_samples': 5, | ||
'seasonality_mode': 'additive', | ||
'seasonality_prior_scale': 10, | ||
'uncertainty_samples': 0} | ||
|
||
|
||
def test_feature_importance(ts_data): | ||
X, y = ts_data | ||
clf = ProphetRegressor() | ||
clf.fit(X, y) | ||
clf.feature_importance == np.zeros(1) | ||
|
||
|
||
def test_fit_predict_ts_with_X_index(ts_data): | ||
X, y = ts_data | ||
assert isinstance(X.index, pd.DatetimeIndex) | ||
|
||
p_clf = prophet.Prophet() | ||
prophet_df = ProphetRegressor.build_prophet_df(X=X, y=y, date_column='ds') | ||
|
||
with suppress_stdout_stderr(): | ||
p_clf.fit(prophet_df) | ||
y_pred_p = p_clf.predict(prophet_df)['yhat'] | ||
|
||
clf = ProphetRegressor() | ||
clf.fit(X, y) | ||
y_pred = clf.predict(X) | ||
|
||
assert (y_pred == y_pred_p).all() | ||
|
||
|
||
def test_fit_predict_ts_with_y_index(ts_data): | ||
X, y = ts_data | ||
X = X.reset_index(drop=True) | ||
assert isinstance(y.index, pd.DatetimeIndex) | ||
|
||
p_clf = prophet.Prophet() | ||
prophet_df = ProphetRegressor.build_prophet_df(X=X, y=y, date_column='ds') | ||
|
||
with suppress_stdout_stderr(): | ||
p_clf.fit(prophet_df) | ||
y_pred_p = p_clf.predict(prophet_df)['yhat'] | ||
|
||
clf = ProphetRegressor() | ||
clf.fit(X, y) | ||
y_pred = clf.predict(X, y) | ||
|
||
assert (y_pred == y_pred_p).all() | ||
|
||
|
||
def test_fit_predict_ts_no_X(ts_data): | ||
X, y = ts_data | ||
|
||
p_clf = prophet.Prophet() | ||
prophet_df = ProphetRegressor.build_prophet_df(X=pd.DataFrame(), y=y, date_column='ds') | ||
|
||
with suppress_stdout_stderr(): | ||
p_clf.fit(prophet_df) | ||
y_pred_p = p_clf.predict(prophet_df)['yhat'] | ||
|
||
clf = ProphetRegressor() | ||
clf.fit(X=None, y=y) | ||
y_pred = clf.predict(X=None, y=y) | ||
|
||
assert (y_pred == y_pred_p).all() | ||
|
||
|
||
def test_fit_predict_date_col(ts_data): | ||
X, y = ts_data | ||
|
||
p_clf = prophet.Prophet() | ||
prophet_df = ProphetRegressor.build_prophet_df(X=X, y=y, date_column='ds') | ||
|
||
with suppress_stdout_stderr(): | ||
p_clf.fit(prophet_df) | ||
y_pred_p = p_clf.predict(prophet_df)['yhat'] | ||
|
||
X = X.reset_index() | ||
X = X['index'].rename('ds').to_frame() | ||
clf = ProphetRegressor(date_column='ds') | ||
clf.fit(X, y) | ||
y_pred = clf.predict(X) | ||
|
||
assert (y_pred == y_pred_p).all() | ||
|
||
|
||
def test_fit_predict_no_date_col_or_index(X_y_binary): | ||
X, y = X_y_binary | ||
|
||
clf = ProphetRegressor() | ||
with pytest.raises(ValueError): | ||
clf.fit(X, y) | ||
jeremyliweishih marked this conversation as resolved.
Show resolved
Hide resolved
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need this for windows installation. In the PR to add prophet to automl we should also include installation instructions in our docs.