Skip to content
Merged
3 changes: 3 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@ Release Notes
* Fixes
* Changes
* Deleted ``_put_into_original_order`` helper function :pr:`2639`
* Refactored time series pipeline code using a time series pipeline base class :pr:`2649`
* Documentation Changes
* Add complete install command to README and Install section :pr:`2627`
* Testing Changes

.. warning::

**Breaking Changes**
* ``TimeSeriesRegressionPipeline`` no longer inherits from ``RegressionPipeline`` :pr:`2649`



**v0.30.2 Aug. 16, 2021**
Expand Down
76 changes: 12 additions & 64 deletions evalml/pipelines/time_series_classification_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from evalml.objectives import get_objective
from evalml.pipelines.classification_pipeline import ClassificationPipeline
from evalml.pipelines.pipeline_meta import TimeSeriesPipelineBaseMeta
from evalml.pipelines.time_series_pipeline_base import TimeSeriesPipelineBase
from evalml.problem_types import ProblemTypes
from evalml.utils import (
drop_rows_with_nans,
Expand All @@ -15,9 +15,7 @@
)


class TimeSeriesClassificationPipeline(
ClassificationPipeline, metaclass=TimeSeriesPipelineBaseMeta
):
class TimeSeriesClassificationPipeline(TimeSeriesPipelineBase, ClassificationPipeline):
"""Pipeline base class for time series classification problems.

Arguments:
Expand All @@ -33,37 +31,6 @@ class TimeSeriesClassificationPipeline(
random_seed (int): Seed for the random number generator. Defaults to 0.
"""

def __init__(
self,
component_graph,
parameters=None,
custom_name=None,
random_seed=0,
):
if "pipeline" not in parameters:
raise ValueError(
"date_index, gap, and max_delay parameters cannot be omitted from the parameters dict. "
"Please specify them as a dictionary with the key 'pipeline'."
)
pipeline_params = parameters["pipeline"]
self.date_index = pipeline_params["date_index"]
self.gap = pipeline_params["gap"]
self.max_delay = pipeline_params["max_delay"]
super().__init__(
component_graph,
custom_name=custom_name,
parameters=parameters,
random_seed=random_seed,
)

@staticmethod
def _convert_to_woodwork(X, y):
if X is None:
X = pd.DataFrame()
X = infer_feature_types(X)
y = infer_feature_types(y)
return X, y

def fit(self, X, y):
"""Fit a time series classification pipeline.

Expand All @@ -77,26 +44,9 @@ def fit(self, X, y):
X, y = self._convert_to_woodwork(X, y)
self._encoder.fit(y)
y = self._encode_targets(y)

self.input_target_name = y.name
X_t = self.component_graph.fit_features(X, y)

y_shifted = y.shift(-self.gap)
X_t, y_shifted = drop_rows_with_nans(X_t, y_shifted)
self.estimator.fit(X_t, y_shifted)
self.input_feature_names = self.component_graph.input_feature_names
self._fit(X, y)
return self

def _estimator_predict(self, features, y):
"""Get estimator predictions.

This helper passes y as an argument if needed by the estimator.
"""
y_arg = None
if self.estimator.predict_uses_y:
y_arg = y
return self.estimator.predict(features, y=y_arg)

def _estimator_predict_proba(self, features, y):
"""Get estimator predicted probabilities.

Expand All @@ -122,9 +72,9 @@ def predict(self, X, y=None, objective=None):
"""Make predictions using selected features.

Arguments:
X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
y (pd.Series, np.ndarray, None): The target training targets of length [n_samples]
objective (Object or string): The objective to use to make predictions
X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features].
y (pd.Series, np.ndarray, None): The target training targets of length [n_samples].
objective (Object or string): The objective to use to make predictions.

Returns:
pd.Series: Predicted values.
Expand All @@ -145,10 +95,10 @@ def predict_proba(self, X, y=None):
"""Make probability estimates for labels.

Arguments:
X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features].

Returns:
pd.DataFrame: Probability estimates
pd.DataFrame: Probability estimates.
"""
X, y = self._convert_to_woodwork(X, y)
y = self._encode_targets(y)
Expand All @@ -163,16 +113,15 @@ def score(self, X, y, objectives):
"""Evaluate model performance on current and additional objectives.

Arguments:
X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
y (pd.Series): True labels of length [n_samples]
objectives (list): Non-empty list of objectives to score on
X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features].
y (pd.Series): True labels of length [n_samples].
objectives (list): Non-empty list of objectives to score on.

Returns:
dict: Ordered dictionary of objective scores
dict: Ordered dictionary of objective scores.
"""
X, y = self._convert_to_woodwork(X, y)
objectives = self.create_objectives(objectives)

y_encoded = self._encode_targets(y)
y_shifted = y_encoded.shift(-self.gap)
y_predicted, y_predicted_proba = self._compute_predictions(
Expand All @@ -193,7 +142,6 @@ def score(self, X, y, objectives):
class TimeSeriesBinaryClassificationPipeline(
BinaryClassificationPipelineMixin,
TimeSeriesClassificationPipeline,
metaclass=TimeSeriesPipelineBaseMeta,
):
"""Pipeline base class for time series binary classification problems.

Expand Down
86 changes: 86 additions & 0 deletions evalml/pipelines/time_series_pipeline_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import pandas as pd

from evalml.pipelines import PipelineBase
from evalml.pipelines.pipeline_meta import TimeSeriesPipelineBaseMeta
from evalml.utils import drop_rows_with_nans, infer_feature_types


class TimeSeriesPipelineBase(PipelineBase, metaclass=TimeSeriesPipelineBaseMeta):
    """Common base class shared by the time series pipelines.

    Holds the constructor validation, woodwork conversion, fitting logic and
    estimator-predict helper that the concrete time series pipelines reuse.

    Arguments:
        component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list.
            Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
            component's index in the list. For example, the component graph
            [Imputer, One Hot Encoder, Imputer, Logistic Regression Classifier] will have names
            ["Imputer", "One Hot Encoder", "Imputer_2", "Logistic Regression Classifier"]
        parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
            Pipeline-level parameters date_index, gap, and max_delay are required and must be given under the "pipeline" key.
            For example: Pipeline(parameters={"pipeline": {"date_index": "Date", "max_delay": 4, "gap": 2}}).
        custom_name (str): Forwarded unchanged to the parent class constructor. Defaults to None.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    def __init__(
        self,
        component_graph,
        parameters=None,
        custom_name=None,
        random_seed=0,
    ):
        """Validate the pipeline-level parameters, then defer to the parent constructor.

        Raises:
            ValueError: If ``parameters`` is empty/None or has no "pipeline" key.
        """
        has_pipeline_section = bool(parameters) and "pipeline" in parameters
        if not has_pipeline_section:
            raise ValueError(
                "date_index, gap, and max_delay parameters cannot be omitted from the parameters dict. "
                "Please specify them as a dictionary with the key 'pipeline'."
            )

        # These are read before the parent constructor runs; a missing key
        # surfaces as the lookup error raised by the mapping itself.
        time_series_settings = parameters["pipeline"]
        self.date_index = time_series_settings["date_index"]
        self.gap = time_series_settings["gap"]
        self.max_delay = time_series_settings["max_delay"]

        super().__init__(
            component_graph,
            custom_name=custom_name,
            parameters=parameters,
            random_seed=random_seed,
        )

    @staticmethod
    def _convert_to_woodwork(X, y):
        """Run ``infer_feature_types`` on X and y, substituting an empty DataFrame when X is None.

        Returns:
            tuple: The converted (X, y) pair.
        """
        features = pd.DataFrame() if X is None else X
        return infer_feature_types(features), infer_feature_types(y)

    def fit(self, X, y):
        """Fit a time series pipeline.

        Arguments:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features].
            y (pd.Series, np.ndarray): The training targets of length [n_samples].

        Returns:
            self
        """
        self._fit(*self._convert_to_woodwork(X, y))
        return self

    def _fit(self, X, y):
        """Fit the component graph and the estimator on already-converted data.

        The target is shifted by ``-self.gap`` before the estimator is fit, and
        the features/target pair is passed through ``drop_rows_with_nans`` first.
        Records ``input_target_name`` and ``input_feature_names`` on the pipeline.
        """
        self.input_target_name = y.name
        fitted_features = self.component_graph.fit_features(X, y)
        target_shifted = y.shift(-self.gap)
        fitted_features, target_shifted = drop_rows_with_nans(
            fitted_features, target_shifted
        )
        self.estimator.fit(fitted_features, target_shifted)
        self.input_feature_names = self.component_graph.input_feature_names

    def _estimator_predict(self, features, y):
        """Get estimator predictions.

        y is forwarded to the estimator only when ``self.estimator.predict_uses_y``
        is truthy; otherwise the estimator receives ``y=None``.
        """
        target_for_estimator = y if self.estimator.predict_uses_y else None
        return self.estimator.predict(features, y=target_for_estimator)
95 changes: 14 additions & 81 deletions evalml/pipelines/time_series_regression_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
import pandas as pd

from evalml.pipelines.pipeline_meta import TimeSeriesPipelineBaseMeta
from evalml.pipelines.regression_pipeline import RegressionPipeline
from evalml.pipelines.time_series_pipeline_base import TimeSeriesPipelineBase
from evalml.problem_types import ProblemTypes
from evalml.utils import (
drop_rows_with_nans,
Expand All @@ -10,9 +7,7 @@
)


class TimeSeriesRegressionPipeline(
RegressionPipeline, metaclass=TimeSeriesPipelineBaseMeta
):
class TimeSeriesRegressionPipeline(TimeSeriesPipelineBase):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please mark that TimeSeriesRegressionPipeline does not inherit from RegressionPipeline as a breaking change?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure thing 😁

"""Pipeline base class for time series regression problems.

Arguments:
Expand All @@ -29,79 +24,23 @@ class TimeSeriesRegressionPipeline(
"""

problem_type = ProblemTypes.TIME_SERIES_REGRESSION
"""ProblemTypes.TIME_SERIES_REGRESSIO"""

def __init__(
self,
component_graph,
parameters=None,
custom_name=None,
random_seed=0,
):
if not parameters or "pipeline" not in parameters:
raise ValueError(
"date_index, gap, and max_delay parameters cannot be omitted from the parameters dict. "
"Please specify them as a dictionary with the key 'pipeline'."
)
pipeline_params = parameters["pipeline"]
self.date_index = pipeline_params["date_index"]
self.gap = pipeline_params["gap"]
self.max_delay = pipeline_params["max_delay"]
super().__init__(
component_graph,
custom_name=custom_name,
parameters=parameters,
random_seed=random_seed,
)

def fit(self, X, y):
"""Fit a time series regression pipeline.

Arguments:
X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
y (pd.Series, np.ndarray): The target training targets of length [n_samples]

Returns:
self
"""
if X is None:
X = pd.DataFrame()

X = infer_feature_types(X)
y = infer_feature_types(y)

self.input_target_name = y.name
X_t = self.component_graph.fit_features(X, y)

y_shifted = y.shift(-self.gap)
X_t, y_shifted = drop_rows_with_nans(X_t, y_shifted)
self.estimator.fit(X_t, y_shifted)
self.input_feature_names = self.component_graph.input_feature_names

return self
"""ProblemTypes.TIME_SERIES_REGRESSION"""

def predict(self, X, y=None, objective=None):
"""Make predictions using selected features.

Arguments:
X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
y (pd.Series, np.ndarray, None): The target training targets of length [n_samples]
objective (Object or string): The objective to use to make predictions
X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features].
y (pd.Series, np.ndarray, None): The target training targets of length [n_samples].
objective (Object or string): The objective to use to make predictions.

Returns:
pd.Series: Predicted values.
"""
if X is None:
X = pd.DataFrame()
X = infer_feature_types(X)
y = infer_feature_types(y)
X, y = self._convert_to_woodwork(X, y)
features = self.compute_estimator_features(X, y)
features_no_nan, y = drop_rows_with_nans(features, y)
y_arg = None
if self.estimator.predict_uses_y:
y_arg = y
predictions = self.estimator.predict(features_no_nan, y_arg)

predictions = self._estimator_predict(features_no_nan, y)
predictions.index = y.index
predictions = self.inverse_transform(predictions)
predictions = predictions.rename(self.input_target_name)
Expand All @@ -114,23 +53,17 @@ def score(self, X, y, objectives):
"""Evaluate model performance on current and additional objectives.

Arguments:
X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
y (pd.Series): True labels of length [n_samples]
objectives (list): Non-empty list of objectives to score on
X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features].
y (pd.Series): True labels of length [n_samples].
objectives (list): Non-empty list of objectives to score on.

Returns:
dict: Ordered dictionary of objective scores
dict: Ordered dictionary of objective scores.
"""
# Only converting X for the call to _score_all_objectives
if X is None:
X = pd.DataFrame()
X = infer_feature_types(X)
y = infer_feature_types(y)

X, y = self._convert_to_woodwork(X, y)
objectives = self.create_objectives(objectives)
y_predicted = self.predict(X, y)

y_shifted = y.shift(-self.gap)
objectives = self.create_objectives(objectives)
y_shifted, y_predicted = drop_rows_with_nans(y_shifted, y_predicted)
return self._score_all_objectives(
X, y_shifted, y_predicted, y_pred_proba=None, objectives=objectives
Expand Down