diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index a51187c7f9..33a78260b3 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -6,6 +6,7 @@ Release Notes * Fixes * Changes * Deleted ``_put_into_original_order`` helper function :pr:`2639` + * Refactored time series pipeline code using a time series pipeline base class :pr:`2649` * Documentation Changes * Add complete install command to README and Install section :pr:`2627` * Testing Changes @@ -13,6 +14,8 @@ Release Notes .. warning:: **Breaking Changes** + * ``TimeSeriesRegressionPipeline`` no longer inherits from ``RegressionPipeline`` :pr:`2649` + **v0.30.2 Aug. 16, 2021** diff --git a/evalml/pipelines/time_series_classification_pipelines.py b/evalml/pipelines/time_series_classification_pipelines.py index 070feb3743..d195f56b3c 100644 --- a/evalml/pipelines/time_series_classification_pipelines.py +++ b/evalml/pipelines/time_series_classification_pipelines.py @@ -6,7 +6,7 @@ from evalml.objectives import get_objective from evalml.pipelines.classification_pipeline import ClassificationPipeline -from evalml.pipelines.pipeline_meta import TimeSeriesPipelineBaseMeta +from evalml.pipelines.time_series_pipeline_base import TimeSeriesPipelineBase from evalml.problem_types import ProblemTypes from evalml.utils import ( drop_rows_with_nans, @@ -15,9 +15,7 @@ ) -class TimeSeriesClassificationPipeline( - ClassificationPipeline, metaclass=TimeSeriesPipelineBaseMeta -): +class TimeSeriesClassificationPipeline(TimeSeriesPipelineBase, ClassificationPipeline): """Pipeline base class for time series classification problems. Arguments: @@ -33,37 +31,6 @@ class TimeSeriesClassificationPipeline( random_seed (int): Seed for the random number generator. Defaults to 0.
""" - def __init__( - self, - component_graph, - parameters=None, - custom_name=None, - random_seed=0, - ): - if "pipeline" not in parameters: - raise ValueError( - "date_index, gap, and max_delay parameters cannot be omitted from the parameters dict. " - "Please specify them as a dictionary with the key 'pipeline'." - ) - pipeline_params = parameters["pipeline"] - self.date_index = pipeline_params["date_index"] - self.gap = pipeline_params["gap"] - self.max_delay = pipeline_params["max_delay"] - super().__init__( - component_graph, - custom_name=custom_name, - parameters=parameters, - random_seed=random_seed, - ) - - @staticmethod - def _convert_to_woodwork(X, y): - if X is None: - X = pd.DataFrame() - X = infer_feature_types(X) - y = infer_feature_types(y) - return X, y - def fit(self, X, y): """Fit a time series classification pipeline. @@ -77,26 +44,9 @@ def fit(self, X, y): X, y = self._convert_to_woodwork(X, y) self._encoder.fit(y) y = self._encode_targets(y) - - self.input_target_name = y.name - X_t = self.component_graph.fit_features(X, y) - - y_shifted = y.shift(-self.gap) - X_t, y_shifted = drop_rows_with_nans(X_t, y_shifted) - self.estimator.fit(X_t, y_shifted) - self.input_feature_names = self.component_graph.input_feature_names + self._fit(X, y) return self - def _estimator_predict(self, features, y): - """Get estimator predictions. - - This helper passes y as an argument if needed by the estimator. - """ - y_arg = None - if self.estimator.predict_uses_y: - y_arg = y - return self.estimator.predict(features, y=y_arg) - def _estimator_predict_proba(self, features, y): """Get estimator predicted probabilities. @@ -122,9 +72,9 @@ def predict(self, X, y=None, objective=None): """Make predictions using selected features. 
Arguments: - X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features] - y (pd.Series, np.ndarray, None): The target training targets of length [n_samples] - objective (Object or string): The objective to use to make predictions + X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]. + y (pd.Series, np.ndarray, None): The target training targets of length [n_samples]. + objective (Object or string): The objective to use to make predictions. Returns: pd.Series: Predicted values. @@ -145,10 +95,10 @@ def predict_proba(self, X, y=None): """Make probability estimates for labels. Arguments: - X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features] + X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]. Returns: - pd.DataFrame: Probability estimates + pd.DataFrame: Probability estimates. """ X, y = self._convert_to_woodwork(X, y) y = self._encode_targets(y) @@ -163,16 +113,15 @@ def score(self, X, y, objectives): """Evaluate model performance on current and additional objectives. Arguments: - X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features] - y (pd.Series): True labels of length [n_samples] - objectives (list): Non-empty list of objectives to score on + X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]. + y (pd.Series): True labels of length [n_samples]. + objectives (list): Non-empty list of objectives to score on. Returns: - dict: Ordered dictionary of objective scores + dict: Ordered dictionary of objective scores. 
""" X, y = self._convert_to_woodwork(X, y) objectives = self.create_objectives(objectives) - y_encoded = self._encode_targets(y) y_shifted = y_encoded.shift(-self.gap) y_predicted, y_predicted_proba = self._compute_predictions( @@ -193,7 +142,6 @@ def score(self, X, y, objectives): class TimeSeriesBinaryClassificationPipeline( BinaryClassificationPipelineMixin, TimeSeriesClassificationPipeline, - metaclass=TimeSeriesPipelineBaseMeta, ): """Pipeline base class for time series binary classification problems. diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py new file mode 100644 index 0000000000..563d63db79 --- /dev/null +++ b/evalml/pipelines/time_series_pipeline_base.py @@ -0,0 +1,86 @@ +import pandas as pd + +from evalml.pipelines import PipelineBase +from evalml.pipelines.pipeline_meta import TimeSeriesPipelineBaseMeta +from evalml.utils import drop_rows_with_nans, infer_feature_types + + +class TimeSeriesPipelineBase(PipelineBase, metaclass=TimeSeriesPipelineBaseMeta): + + """Pipeline base class for time series problems. + + Arguments: + component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list. + Note that when duplicate components are specified in a list, the duplicate component names will be modified with the + component's index in the list. For example, the component graph + [Imputer, One Hot Encoder, Imputer, Logistic Regression Classifier] will have names + ["Imputer", "One Hot Encoder", "Imputer_2", "Logistic Regression Classifier"] + parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. + An empty dictionary {} implies using all default values for component parameters. Pipeline-level + parameters such as date_index, gap, and max_delay must be specified with the "pipeline" key. For example: + Pipeline(parameters={"pipeline": {"date_index": "Date", "max_delay": 4, "gap": 2}}). 
+ random_seed (int): Seed for the random number generator. Defaults to 0. + """ + + def __init__( + self, + component_graph, + parameters=None, + custom_name=None, + random_seed=0, + ): + if not parameters or "pipeline" not in parameters: + raise ValueError( + "date_index, gap, and max_delay parameters cannot be omitted from the parameters dict. " + "Please specify them as a dictionary with the key 'pipeline'." + ) + pipeline_params = parameters["pipeline"] + self.date_index = pipeline_params["date_index"] + self.gap = pipeline_params["gap"] + self.max_delay = pipeline_params["max_delay"] + super().__init__( + component_graph, + custom_name=custom_name, + parameters=parameters, + random_seed=random_seed, + ) + + @staticmethod + def _convert_to_woodwork(X, y): + if X is None: + X = pd.DataFrame() + X = infer_feature_types(X) + y = infer_feature_types(y) + return X, y + + def fit(self, X, y): + """Fit a time series pipeline. + + Arguments: + X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. + y (pd.Series, np.ndarray): The target training targets of length [n_samples]. + + Returns: + self + """ + X, y = self._convert_to_woodwork(X, y) + self._fit(X, y) + return self + + def _fit(self, X, y): + self.input_target_name = y.name + X_t = self.component_graph.fit_features(X, y) + y_shifted = y.shift(-self.gap) + X_t, y_shifted = drop_rows_with_nans(X_t, y_shifted) + self.estimator.fit(X_t, y_shifted) + self.input_feature_names = self.component_graph.input_feature_names + + def _estimator_predict(self, features, y): + """Get estimator predictions. + + This helper passes y as an argument if needed by the estimator. 
+ """ + y_arg = None + if self.estimator.predict_uses_y: + y_arg = y + return self.estimator.predict(features, y=y_arg) diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py index 965d52da02..1699c355a8 100644 --- a/evalml/pipelines/time_series_regression_pipeline.py +++ b/evalml/pipelines/time_series_regression_pipeline.py @@ -1,7 +1,4 @@ -import pandas as pd - -from evalml.pipelines.pipeline_meta import TimeSeriesPipelineBaseMeta -from evalml.pipelines.regression_pipeline import RegressionPipeline +from evalml.pipelines.time_series_pipeline_base import TimeSeriesPipelineBase from evalml.problem_types import ProblemTypes from evalml.utils import ( drop_rows_with_nans, @@ -10,9 +7,7 @@ ) -class TimeSeriesRegressionPipeline( - RegressionPipeline, metaclass=TimeSeriesPipelineBaseMeta -): +class TimeSeriesRegressionPipeline(TimeSeriesPipelineBase): """Pipeline base class for time series regression problems. Arguments: @@ -29,79 +24,23 @@ class TimeSeriesRegressionPipeline( """ problem_type = ProblemTypes.TIME_SERIES_REGRESSION - """ProblemTypes.TIME_SERIES_REGRESSIO""" - - def __init__( - self, - component_graph, - parameters=None, - custom_name=None, - random_seed=0, - ): - if not parameters or "pipeline" not in parameters: - raise ValueError( - "date_index, gap, and max_delay parameters cannot be omitted from the parameters dict. " - "Please specify them as a dictionary with the key 'pipeline'." - ) - pipeline_params = parameters["pipeline"] - self.date_index = pipeline_params["date_index"] - self.gap = pipeline_params["gap"] - self.max_delay = pipeline_params["max_delay"] - super().__init__( - component_graph, - custom_name=custom_name, - parameters=parameters, - random_seed=random_seed, - ) - - def fit(self, X, y): - """Fit a time series regression pipeline. 
- - Arguments: - X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] - y (pd.Series, np.ndarray): The target training targets of length [n_samples] - - Returns: - self - """ - if X is None: - X = pd.DataFrame() - - X = infer_feature_types(X) - y = infer_feature_types(y) - - self.input_target_name = y.name - X_t = self.component_graph.fit_features(X, y) - - y_shifted = y.shift(-self.gap) - X_t, y_shifted = drop_rows_with_nans(X_t, y_shifted) - self.estimator.fit(X_t, y_shifted) - self.input_feature_names = self.component_graph.input_feature_names - - return self + """ProblemTypes.TIME_SERIES_REGRESSION""" def predict(self, X, y=None, objective=None): """Make predictions using selected features. Arguments: - X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features] - y (pd.Series, np.ndarray, None): The target training targets of length [n_samples] - objective (Object or string): The objective to use to make predictions + X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]. + y (pd.Series, np.ndarray, None): The target training targets of length [n_samples]. + objective (Object or string): The objective to use to make predictions. Returns: pd.Series: Predicted values. """ - if X is None: - X = pd.DataFrame() - X = infer_feature_types(X) - y = infer_feature_types(y) + X, y = self._convert_to_woodwork(X, y) features = self.compute_estimator_features(X, y) features_no_nan, y = drop_rows_with_nans(features, y) - y_arg = None - if self.estimator.predict_uses_y: - y_arg = y - predictions = self.estimator.predict(features_no_nan, y_arg) - + predictions = self._estimator_predict(features_no_nan, y) predictions.index = y.index predictions = self.inverse_transform(predictions) predictions = predictions.rename(self.input_target_name) @@ -114,23 +53,17 @@ def score(self, X, y, objectives): """Evaluate model performance on current and additional objectives. 
Arguments: - X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features] - y (pd.Series): True labels of length [n_samples] - objectives (list): Non-empty list of objectives to score on + X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]. + y (pd.Series): True labels of length [n_samples]. + objectives (list): Non-empty list of objectives to score on. Returns: - dict: Ordered dictionary of objective scores + dict: Ordered dictionary of objective scores. """ - # Only converting X for the call to _score_all_objectives - if X is None: - X = pd.DataFrame() - X = infer_feature_types(X) - y = infer_feature_types(y) - + X, y = self._convert_to_woodwork(X, y) + objectives = self.create_objectives(objectives) y_predicted = self.predict(X, y) - y_shifted = y.shift(-self.gap) - objectives = self.create_objectives(objectives) y_shifted, y_predicted = drop_rows_with_nans(y_shifted, y_predicted) return self._score_all_objectives( X, y_shifted, y_predicted, y_pred_proba=None, objectives=objectives