Integrate Time Series Classification Pipelines into AutoMLSearch #1666
Changes from all commits: 0f93176, fa6082d, b47c6ef, 7e51ecc, 93702d9, 987058c, 5f1693a, 7149adb, 1a53622, e97ce89, c636e3b, c7fcabe
The time series baseline regressor component becomes a `TimeSeriesBaselineEstimator` that also supports the time series classification problem types and gains a `predict_proba`:

```diff
@@ -6,23 +6,25 @@
 from evalml.problem_types import ProblemTypes
 from evalml.utils.gen_utils import (
     _convert_to_woodwork_structure,
-    _convert_woodwork_types_wrapper
+    _convert_woodwork_types_wrapper,
+    pad_with_nans
 )


-class TimeSeriesBaselineRegressor(Estimator):
-    """Time series regressor that predicts using the naive forecasting approach.
+class TimeSeriesBaselineEstimator(Estimator):
+    """Time series estimator that predicts using the naive forecasting approach.

-    This is useful as a simple baseline regressor for time series problems
+    This is useful as a simple baseline estimator for time series problems
     """
-    name = "Time Series Baseline Regressor"
+    name = "Time Series Baseline Estimator"
     hyperparameter_ranges = {}
     model_family = ModelFamily.BASELINE
-    supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION]
+    supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY,
+                               ProblemTypes.TIME_SERIES_MULTICLASS]
     predict_uses_y = True

     def __init__(self, gap=1, random_state=0, **kwargs):
-        """Baseline time series regressor that predicts using the naive forecasting approach.
+        """Baseline time series estimator that predicts using the naive forecasting approach.

         Arguments:
             gap (int): gap between prediction date and target date and must be a positive integer. If gap is 0, target date will be shifted ahead by 1 time period.
@@ -54,7 +56,7 @@ def fit(self, X, y=None):

     def predict(self, X, y=None):
         if y is None:
-            raise ValueError("Cannot predict Time Series Baseline Regressor if y is None")
+            raise ValueError("Cannot predict Time Series Baseline Estimator if y is None")
         y = _convert_to_woodwork_structure(y)
         y = _convert_woodwork_types_wrapper(y.to_series())

@@ -63,9 +65,21 @@ def predict(self, X, y=None):
         return y

+    def predict_proba(self, X, y=None):
+        if y is None:
+            raise ValueError("Cannot predict Time Series Baseline Estimator if y is None")
+        y = _convert_to_woodwork_structure(y)
+        y = _convert_woodwork_types_wrapper(y.to_series())
+        preds = self.predict(X, y).dropna(axis=0, how='any').astype('int')
+        proba_arr = np.zeros((len(preds), y.max() + 1))
+        proba_arr[np.arange(len(preds)), preds] = 1
+        return pad_with_nans(pd.DataFrame(proba_arr), len(y) - len(preds))
+
     @property
     def feature_importance(self):
-        """Returns importance associated with each feature. Since baseline regressors do not use input features to calculate predictions, returns an array of zeroes.
+        """Returns importance associated with each feature.
+
+        Since baseline estimators do not use input features to calculate predictions, returns an array of zeroes.

         Returns:
             np.ndarray (float): an array of zeroes
```

Contributor (on the new `predict_proba`): Nice!
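For reference, a minimal standalone sketch of the one-hot probability construction used in the new `predict_proba` above. The helper name is hypothetical, and it assumes the rows dropped because of the gap are re-added as leading NaN padding:

```python
import numpy as np
import pandas as pd


def one_hot_proba_with_nan_padding(preds, n_rows):
    # Keep only the real (non-NaN) predictions and cast them to integer class labels.
    preds = pd.Series(preds).dropna().astype('int')
    # One row per prediction, with a 1 in the predicted class column and 0 elsewhere.
    proba = np.zeros((len(preds), int(preds.max()) + 1))
    proba[np.arange(len(preds)), preds.to_numpy()] = 1
    # Restore the original length by prepending NaN rows for the dropped predictions.
    padding = pd.DataFrame(np.full((n_rows - len(preds), proba.shape[1]), np.nan))
    return pd.concat([padding, pd.DataFrame(proba)], ignore_index=True)


# Five target rows, but the first prediction is NaN because of the gap shift.
print(one_hot_proba_with_nan_padding([np.nan, 1, 0, 2, 1], 5))
```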
The import of the old time series baseline regression pipeline is removed:

```diff
@@ -1,2 +1 @@
 from .baseline_regression import BaselineRegressionPipeline, MeanBaselineRegressionPipeline
-from .time_series_baseline_regression import TimeSeriesBaselineRegressionPipeline
```
This file was deleted (the old `time_series_baseline_regression` module behind the import removed above).
A new file consolidates the baseline pipelines for all three time series problem types:

```diff
@@ -0,0 +1,23 @@
+from evalml.pipelines import (
+    TimeSeriesBinaryClassificationPipeline,
+    TimeSeriesMulticlassClassificationPipeline,
+    TimeSeriesRegressionPipeline
+)
+
+
+class TimeSeriesBaselineRegressionPipeline(TimeSeriesRegressionPipeline):
+    """Baseline Pipeline for time series regression problems."""
+    _name = "Time Series Baseline Regression Pipeline"
+    component_graph = ["Time Series Baseline Estimator"]
+
+
+class TimeSeriesBaselineBinaryPipeline(TimeSeriesBinaryClassificationPipeline):
+    """Baseline Pipeline for time series binary classification problems."""
+    _name = "Time Series Baseline Binary Pipeline"
+    component_graph = ["Time Series Baseline Estimator"]
+
+
+class TimeSeriesBaselineMulticlassPipeline(TimeSeriesMulticlassClassificationPipeline):
+    """Baseline Pipeline for time series multiclass classification problems."""
+    _name = "Time Series Baseline Multiclass Pipeline"
+    component_graph = ["Time Series Baseline Estimator"]
```

Author: Consolidating all of the baseline pipelines for ts into the same file.

Contributor: I like this
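A hedged usage sketch of the new pipelines, not taken from the diff: it assumes the classes end up importable from `evalml.pipelines`, and the parameter layout mirrors the assertion in the AutoML test change further below.

```python
from evalml.pipelines import TimeSeriesBaselineBinaryPipeline  # assumed export location

# Time series pipelines carry their gap/max_delay configuration under the
# "pipeline" key; the baseline estimator also receives the gap it should use.
baseline = TimeSeriesBaselineBinaryPipeline(parameters={
    "pipeline": {"gap": 1, "max_delay": 2},
    "Time Series Baseline Estimator": {"gap": 1},
})
print(baseline.parameters)
```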
In the time series classification pipeline's `predict`, NaN predictions are now dropped before decoding:

```diff
@@ -111,7 +111,11 @@ def predict(self, X, y=None, objective=None):
         y = _convert_woodwork_types_wrapper(y.to_series())
         n_features = max(len(y), X.shape[0])
         predictions = self._predict(X, y, objective=objective, pad=False)
-        predictions = pd.Series(self._decode_targets(predictions), name=self.input_target_name)
+
+        # In case gap is 0 and this is a baseline pipeline, we drop the nans in the
+        # predictions before decoding them
+        predictions = pd.Series(self._decode_targets(predictions.dropna()), name=self.input_target_name)
+
         return pad_with_nans(predictions, max(0, n_features - predictions.shape[0]))

     def predict_proba(self, X, y=None):
```

Contributor: Good catch!
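To see why the `dropna()` matters, here is a small sketch (not evalml's actual `_decode_targets`): an integer-to-label lookup cannot decode the NaN rows a baseline pipeline emits for the first `gap` periods, so they are dropped first and the series is re-padded to its original length afterwards.

```python
import numpy as np
import pandas as pd

mapping = {0: "no", 1: "yes"}           # hypothetical encoded -> original labels
preds = pd.Series([np.nan, 1, 0, 1])    # baseline output when the gap shifts the target

decoded = preds.dropna().astype(int).map(mapping)   # decode only the real predictions
n_missing = len(preds) - len(decoded)
padded = pd.concat([pd.Series([np.nan] * n_missing), decoded], ignore_index=True)
print(padded.tolist())                  # [nan, 'yes', 'no', 'yes']
```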
In the pipeline-building utilities, `_get_preprocessing_components` now adds the `DelayedFeatureTransformer` for every time series problem type, not just time series regression:

```diff
@@ -30,7 +30,11 @@
     TextFeaturizer
 )
 from evalml.pipelines.components.utils import all_components, get_estimators
-from evalml.problem_types import ProblemTypes, handle_problem_types
+from evalml.problem_types import (
+    ProblemTypes,
+    handle_problem_types,
+    is_time_series
+)
 from evalml.utils import get_logger
 from evalml.utils.gen_utils import _convert_to_woodwork_structure

@@ -67,7 +71,7 @@ def _get_preprocessing_components(X, y, problem_type, text_columns, estimator_cl
     if add_datetime_featurizer:
         pp_components.append(DateTimeFeaturizer)

-    if problem_type in [ProblemTypes.TIME_SERIES_REGRESSION]:
+    if is_time_series(problem_type):
         pp_components.append(DelayedFeatureTransformer)

     categorical_cols = X.select('category')
```

Author: So that AutoML can create pipelines for ts classification when […]
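For context, a rough sketch of the check `is_time_series` presumably performs (not the actual evalml source): membership in the three time series problem types the baseline estimator now supports.

```python
from evalml.problem_types import ProblemTypes, handle_problem_types


def is_time_series_sketch(problem_type):
    # Accepts either a ProblemTypes value or its string alias, like the real helpers.
    return handle_problem_types(problem_type) in [
        ProblemTypes.TIME_SERIES_REGRESSION,
        ProblemTypes.TIME_SERIES_BINARY,
        ProblemTypes.TIME_SERIES_MULTICLASS,
    ]


print(is_time_series_sketch("time series binary"))  # True, assuming this string alias
```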
In the AutoML tests, imports that were only needed by the moved time series regression test are removed:

```diff
@@ -48,15 +48,11 @@
     BinaryClassificationPipeline,
     Estimator,
     MulticlassClassificationPipeline,
-    RegressionPipeline,
-    TimeSeriesRegressionPipeline
+    RegressionPipeline
 )
 from evalml.pipelines.components.utils import get_estimators
 from evalml.pipelines.utils import make_pipeline
-from evalml.preprocessing.data_splitters import (
-    TimeSeriesSplit,
-    TrainingValidationSplit
-)
+from evalml.preprocessing.data_splitters import TrainingValidationSplit
 from evalml.problem_types import ProblemTypes, handle_problem_types
 from evalml.tuners import NoParamsException, RandomSearchTuner
 from evalml.utils.gen_utils import (
```
```diff
@@ -1976,32 +1972,6 @@ def test_automl_validates_problem_configuration(X_y_binary):
     assert problem_config == {"max_delay": 2, "gap": 3}


-@patch('evalml.pipelines.TimeSeriesRegressionPipeline.score', return_value={"R2": 0.3})
-@patch('evalml.pipelines.TimeSeriesRegressionPipeline.fit')
-def test_automl_time_series_regression(mock_fit, mock_score, X_y_regression):
-    X, y = X_y_regression
-
-    configuration = {"gap": 0, "max_delay": 0, 'delay_target': False, 'delay_features': True}
-
-    class Pipeline1(TimeSeriesRegressionPipeline):
-        name = "Pipeline 1"
-        component_graph = ["Delayed Feature Transformer", "Random Forest Regressor"]
-
-    class Pipeline2(TimeSeriesRegressionPipeline):
-        name = "Pipeline 2"
-        component_graph = ["Delayed Feature Transformer", "Elastic Net Regressor"]
-
-    automl = AutoMLSearch(X_train=X, y_train=y, problem_type="time series regression", problem_configuration=configuration,
-                          allowed_pipelines=[Pipeline1, Pipeline2], max_batches=2)
-    automl.search()
-    assert isinstance(automl.data_splitter, TimeSeriesSplit)
-    for result in automl.results['pipeline_results'].values():
-        if result["id"] == 0:
-            continue
-        assert result['parameters']['Delayed Feature Transformer'] == configuration
-        assert result['parameters']['pipeline'] == configuration
-
-
 @patch('evalml.objectives.BinaryClassificationObjective.optimize_threshold')
 def test_automl_best_pipeline(mock_optimize, X_y_binary):
     X, y = X_y_binary
```

Author: Moving this test to […]
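A hedged sketch of the kind of search this PR enables, adapted from the regression test removed above. The `"time series binary"` problem-type string, the toy data, and `max_batches=1` are assumptions rather than values from the diff.

```python
import numpy as np
import pandas as pd
from evalml.automl import AutoMLSearch

# Toy data: one numeric feature and an alternating binary target.
X = pd.DataFrame({"feature": np.arange(100)})
y = pd.Series(np.tile([0, 1], 50))

automl = AutoMLSearch(
    X_train=X,
    y_train=y,
    problem_type="time series binary",
    problem_configuration={"gap": 0, "max_delay": 2},
    max_batches=1,
)
automl.search()
print(automl.best_pipeline.parameters)
```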
```diff
@@ -2085,7 +2055,7 @@ def test_timeseries_baseline_init_with_correct_gap_max_delay(mock_fit, mock_scor

     # Best pipeline is baseline pipeline because we only run one iteration
     assert automl.best_pipeline.parameters == {"pipeline": {"gap": 6, "max_delay": 3},
-                                               "Time Series Baseline Regressor": {"gap": 6, "max_delay": 3}}
+                                               "Time Series Baseline Estimator": {"gap": 6, "max_delay": 3}}


 @pytest.mark.parametrize('problem_type', [ProblemTypes.BINARY, ProblemTypes.MULTICLASS,
```