Skip to content

Commit

Permalink
Merge branch 'main' into nullable_types_preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
chukarsten committed Dec 7, 2021
2 parents 8ea2cce + 6a82b55 commit 168fa5c
Show file tree
Hide file tree
Showing 9 changed files with 311 additions and 535 deletions.
3 changes: 3 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ Release Notes
* Fixes
* Default parameters for ``RFRegressorSelectFromModel`` and ``RFClassifierSelectFromModel`` have been fixed to avoid selecting all features :pr:`3110`
* Changes
* Removed reliance on a datetime index for ``ARIMARegressor`` and ``ProphetRegressor`` :pr:`3104`
* Included target leakage check when fitting ``ARIMARegressor`` to account for the lack of ``TimeSeriesFeaturizer`` in ``ARIMARegressor`` based pipelines :pr:`3104`
* Cleaned up and refactored ``InvalidTargetDataCheck`` implementation and docstring :pr:`3122`
* Removed indices information from the output of ``HighlyNullDataCheck``'s ``validate()`` method :pr:`3092`
* Added ``ReplaceNullableTypes`` component to prepare for handling pandas nullable types. :pr:`3090`
Expand All @@ -18,6 +20,7 @@ Release Notes

**Breaking Changes**
* Renamed ``DelayedFeatureTransformer`` to ``TimeSeriesFeaturizer`` :pr:`3028`
* ``ProphetRegressor`` now requires a datetime column in ``X`` represented by the ``date_index`` parameter :pr:`3104`
* Renamed module ``evalml.data_checks.invalid_target_data_check`` to ``evalml.data_checks.invalid_targets_data_check`` :pr:`3122`


Expand Down
18 changes: 1 addition & 17 deletions evalml/automl/automl_algorithm/iterative_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from operator import itemgetter

import numpy as np
import pandas as pd
from skopt.space import Categorical, Integer, Real

from .automl_algorithm import AutoMLAlgorithm, AutoMLAlgorithmException
Expand All @@ -15,7 +14,7 @@
from evalml.model_family import ModelFamily
from evalml.pipelines.components.utils import get_estimators
from evalml.pipelines.utils import make_pipeline
from evalml.problem_types import is_multiclass, is_time_series
from evalml.problem_types import is_multiclass
from evalml.utils import infer_feature_types
from evalml.utils.logger import get_logger

Expand Down Expand Up @@ -187,21 +186,6 @@ def _create_pipelines(self):
self.problem_type, self.allowed_model_families
)
allowed_estimators = self._filter_estimators(allowed_estimators)
if (
is_time_series(self.problem_type)
and self._pipeline_params["pipeline"]["date_index"]
):
if (
pd.infer_freq(
self.X[self._pipeline_params["pipeline"]["date_index"]]
)
== "MS"
):
allowed_estimators = [
estimator
for estimator in allowed_estimators
if estimator.name != "ARIMA Regressor"
]
self.logger.debug(
f"allowed_estimators set to {[estimator.name for estimator in allowed_estimators]}"
)
Expand Down
111 changes: 35 additions & 76 deletions evalml/pipelines/components/estimators/regressors/arima_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,66 +96,32 @@ def __init__(
parameters=parameters, component_obj=arima_model, random_seed=random_seed
)

def _get_dates(self, X, y):
date_col = None
if y is not None:
y_index_type = infer_feature_types(
pd.Series(y.index)
).ww.logical_type.type_string
if y_index_type == "datetime":
date_col = y.index
def _remove_datetime(self, data, features=False):
if data is None:
return None
data_no_dt = data.copy()
if isinstance(
data_no_dt.index, (pd.DatetimeIndex, pd.PeriodIndex, pd.IntervalIndex)
):
data_no_dt = data_no_dt.reset_index(drop=True)
if features:
data_no_dt = data_no_dt.select_dtypes(exclude=["datetime64"])

return data_no_dt

def _match_indices(self, X, y):
if X is not None:
X_index_type = infer_feature_types(
pd.Series(X.index)
).ww.logical_type.type_string
if self.parameters["date_index"] in X.columns:
date_col = X.pop(self.parameters["date_index"])
elif X_index_type == "datetime":
date_col = X.index
if date_col is None:
msg = (
"ARIMA regressor requires input data X to have a datetime column specified by the 'date_index' parameter. "
"If not it will look for the datetime column in the index of X or y."
)
raise ValueError(msg)
return date_col, X

def _match_indices(self, X, y, date_col):
if X is not None:
X = X.copy()
X.index = date_col
if y is not None:
y = y.copy()
y.index = date_col
if X.index.equals(y.index):
return X, y
else:
y.index = X.index
return X, y

def _format_dates(self, dates, X, y, predict=False):
if len(dates.shape) == 1:
dates = pd.DataFrame(dates)
if dates.shape[1] == 1:
dates.set_index(dates.columns[0], drop=True, inplace=True)
dates = pd.DatetimeIndex(dates.index)
elif dates.shape[1] > 1:
raise ValueError(
f"The dates parameter should not consist of any additional data outside of the datetime information located in the index or in a column."
f" Found {dates.shape[1]} columns."
)
freq = pd.infer_freq(dates)
dates = pd.DatetimeIndex(dates, freq=freq)
X, y = self._match_indices(X, y, dates)
if predict:
arima_model_msg = (
"sktime is not installed. Please install using `pip install sktime.`"
)
forecasting_ = import_or_raise(
"sktime.forecasting.base", error_msg=arima_model_msg
)
fh_ = forecasting_.ForecastingHorizon(
[i + 1 for i in range(len(dates))], is_relative=True
)
return X, y, fh_
else:
return X, y, None
def _set_forecast(self, X):
    """Build a relative forecasting horizon with one step per row of ``X``.

    Args:
        X: Data whose length determines how many steps ahead are forecast.

    Returns:
        ForecastingHorizon: Relative horizon covering steps 1 through ``len(X)``.
    """
    from sktime.forecasting.base import ForecastingHorizon

    steps_ahead = list(range(1, len(X) + 1))
    return ForecastingHorizon(steps_ahead, is_relative=True)

def fit(self, X, y=None):
"""Fits ARIMA regressor to data.
Expand All @@ -170,14 +136,15 @@ def fit(self, X, y=None):
Raises:
ValueError: If X was passed to `fit` but not passed in `predict`.
"""
X, y = self._manage_woodwork(X, y)
if y is None:
raise ValueError("ARIMA Regressor requires y as input.")

X, y = self._manage_woodwork(X, y)
dates, X = self._get_dates(X, y)
X, y, _ = self._format_dates(dates, X, y)
X = self._remove_datetime(X, features=True)
y = self._remove_datetime(y)
X, y = self._match_indices(X, y)

if X is not None and not X.empty:
X = X.select_dtypes(exclude=["datetime64"])
self._component_obj.fit(y=y, X=X)
else:
self._component_obj.fit(y=y)
Expand All @@ -197,23 +164,15 @@ def predict(self, X, y=None):
ValueError: If X was passed to `fit` but not passed in `predict`.
"""
X, y = self._manage_woodwork(X, y)
dates, X = self._get_dates(X, y)
X, y, fh_ = self._format_dates(dates, X, y, predict=True)
if X is not None and not X.empty:
X = X.select_dtypes(exclude=["datetime64"])
fh_ = self._set_forecast(X)
X = X.select_dtypes(exclude=["datetime64"])

if not X.empty:
y_pred = self._component_obj.predict(fh=fh_, X=X)
else:
try:
y_pred = self._component_obj.predict(fh=fh_)
except ValueError as ve:
error = str(ve)
if "When an ARIMA is fit with an X array" in error:
raise ValueError(
"If X was passed to the fit method of the ARIMARegressor, "
"then it must be passed to the predict method as well."
)
else:
raise ve
y_pred = self._component_obj.predict(fh=fh_)
y_pred.index = X.index

return infer_feature_types(y_pred)

@property
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import copy

import numpy as np
import pandas as pd
from skopt.space import Real

from evalml.model_family import ModelFamily
Expand Down Expand Up @@ -69,39 +68,30 @@ def __init__(

prophet_regressor = prophet.Prophet(**parameters)
parameters["date_index"] = date_index
self.date_index = date_index

super().__init__(
parameters=parameters,
component_obj=prophet_regressor,
random_state=random_seed,
)

@staticmethod
def build_prophet_df(X, y=None, date_column="ds"):
def build_prophet_df(X, y=None, date_index="ds"):
"""Build the Prophet data to pass fit and predict on."""
if X is not None:
X = copy.deepcopy(X)
if y is not None:
y = copy.deepcopy(y)
X = copy.deepcopy(X)
y = copy.deepcopy(y)
if date_index is None:
raise ValueError("date_index cannot be None!")

if date_column in X.columns:
date_column = X.pop(date_column)
if date_index in X.columns:
date_column = X.pop(date_index)
else:
if isinstance(X.index, pd.DatetimeIndex):
X = X.reset_index()
date_column = X.pop("index")
elif isinstance(y.index, pd.DatetimeIndex):
y = y.reset_index()
date_column = y.pop("index")
y = pd.Series(y.values.flatten())
else:
msg = "Prophet estimator requires input data X to have a datetime column specified by the 'date_index' parameter. If it doesn't find one, it will look for the datetime column in the index of X or y."
raise ValueError(msg)
raise ValueError(f"Column {date_index} was not found in X!")

prophet_df = X

if y is not None:
if not prophet_df.empty:
y.index = prophet_df.index
y.index = prophet_df.index
prophet_df["y"] = y
prophet_df["ds"] = date_column

Expand All @@ -117,12 +107,10 @@ def fit(self, X, y=None):
Returns:
self
"""
if X is None:
X = pd.DataFrame()
X, y = super()._manage_woodwork(X, y)

prophet_df = ProphetRegressor.build_prophet_df(
X=X, y=y, date_column=self.parameters["date_index"]
X=X, y=y, date_index=self.date_index
)

self._component_obj.fit(prophet_df)
Expand All @@ -133,26 +121,23 @@ def predict(self, X, y=None):
Args:
X (pd.DataFrame): Data of shape [n_samples, n_features].
y (pd.Series): Target data.
y (pd.Series): Target data. Ignored.
Returns:
pd.Series: Predicted values.
"""
if X is None:
X = pd.DataFrame()
X = infer_feature_types(X)

prophet_df = ProphetRegressor.build_prophet_df(
X=X, y=y, date_column=self.parameters["date_index"]
X=X, y=y, date_index=self.date_index
)

prophet_output = self._component_obj.predict(prophet_df)
predictions = prophet_output["yhat"]
predictions = infer_feature_types(predictions)
predictions = predictions.rename(None)
predictions.index = X.index

if not X.empty:
predictions.index = X.index
return predictions

def get_params(self):
Expand Down
6 changes: 3 additions & 3 deletions evalml/pipelines/time_series_pipeline_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ def __init__(
"Please specify them as a dictionary with the key 'pipeline'."
)
pipeline_params = parameters["pipeline"]
date_index = pipeline_params["date_index"]
if date_index is None:
raise ValueError("Parameter date_index cannot be None!")
self.gap = pipeline_params["gap"]
self.max_delay = pipeline_params["max_delay"]
self.forecast_horizon = pipeline_params["forecast_horizon"]
self.date_index = pipeline_params["date_index"]
if self.date_index is None:
raise ValueError("Parameter date_index cannot be None!")
super().__init__(
component_graph,
custom_name=custom_name,
Expand Down
2 changes: 0 additions & 2 deletions evalml/tests/automl_tests/test_automl_search_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,8 +445,6 @@ def test_automl_supports_time_series_regression(freq, AutoMLTestEnv, ts_data):

if result["id"] == 0:
continue
if freq == "MS":
assert "ARIMA Regressor" not in result["parameters"]
if "ARIMA Regressor" in result["parameters"]:
dt_ = result["parameters"]["ARIMA Regressor"].pop("date_index")
assert "DateTime Featurization Component" not in result["parameters"].keys()
Expand Down
Loading

0 comments on commit 168fa5c

Please sign in to comment.