2 changes: 1 addition & 1 deletion docs/source/api_reference.rst
@@ -247,7 +247,7 @@ Regressors are components that output a predicted target value.
RandomForestRegressor
XGBoostRegressor
BaselineRegressor
TimeSeriesBaselineRegressor
TimeSeriesBaselineEstimator
StackedEnsembleRegressor
DecisionTreeRegressor
LightGBMRegressor
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -11,6 +11,7 @@ Release Notes
* Support graphviz 0.16 :pr:`1657`
* Enhanced time series pipelines to accept empty features :pr:`1651`
* Added support for list inputs for objectives :pr:`1663`
* Added support for ``AutoMLSearch`` to handle time series classification pipelines :pr:`1666`
* Fixes
* Fixed thresholding for pipelines in ``AutoMLSearch`` to only threshold binary classification pipelines :pr:`1622` :pr:`1626`
* Updated ``load_data`` to return Woodwork structures and update default parameter value for ``index`` to ``None`` :pr:`1610`
10 changes: 8 additions & 2 deletions evalml/automl/automl_search.py
@@ -41,6 +41,8 @@
ModeBaselineBinaryPipeline,
ModeBaselineMulticlassPipeline,
PipelineBase,
TimeSeriesBaselineBinaryPipeline,
TimeSeriesBaselineMulticlassPipeline,
TimeSeriesBaselineRegressionPipeline
)
from evalml.pipelines.components.utils import get_estimators
@@ -634,10 +636,14 @@ def _add_baseline_pipelines(self):
elif self.problem_type == ProblemTypes.REGRESSION:
baseline = MeanBaselineRegressionPipeline(parameters={})
else:
pipeline_class = {ProblemTypes.TIME_SERIES_REGRESSION: TimeSeriesBaselineRegressionPipeline,
ProblemTypes.TIME_SERIES_MULTICLASS: TimeSeriesBaselineMulticlassPipeline,
ProblemTypes.TIME_SERIES_BINARY: TimeSeriesBaselineBinaryPipeline}[self.problem_type]
gap = self.problem_configuration['gap']
max_delay = self.problem_configuration['max_delay']
baseline = TimeSeriesBaselineRegressionPipeline(parameters={"pipeline": {"gap": gap, "max_delay": max_delay},
"Time Series Baseline Regressor": {"gap": gap, "max_delay": max_delay}})
baseline = pipeline_class(parameters={"pipeline": {"gap": gap, "max_delay": max_delay},
"Time Series Baseline Estimator": {"gap": gap, "max_delay": max_delay}})

pipelines = [baseline]
scores = self._evaluate_pipelines(pipelines, baseline=True)
if scores == []:
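The hunk above replaces the hard-coded regression baseline with a dispatch on the time series problem type. A minimal standalone sketch of the same pattern (illustrative only; it assumes an evalml build that includes this PR, and the gap/max_delay values are made up):

from evalml.pipelines import (
    TimeSeriesBaselineBinaryPipeline,
    TimeSeriesBaselineMulticlassPipeline,
    TimeSeriesBaselineRegressionPipeline
)
from evalml.problem_types import ProblemTypes


def make_time_series_baseline(problem_type, gap, max_delay):
    """Mirror the dispatch in _add_baseline_pipelines: pick the baseline pipeline
    class for the time series problem type and pass gap/max_delay to both the
    pipeline-level and estimator-level parameters."""
    pipeline_class = {ProblemTypes.TIME_SERIES_REGRESSION: TimeSeriesBaselineRegressionPipeline,
                      ProblemTypes.TIME_SERIES_MULTICLASS: TimeSeriesBaselineMulticlassPipeline,
                      ProblemTypes.TIME_SERIES_BINARY: TimeSeriesBaselineBinaryPipeline}[problem_type]
    ts_params = {"gap": gap, "max_delay": max_delay}
    return pipeline_class(parameters={"pipeline": ts_params,
                                      "Time Series Baseline Estimator": ts_params})


baseline = make_time_series_baseline(ProblemTypes.TIME_SERIES_BINARY, gap=1, max_delay=2)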
14 changes: 9 additions & 5 deletions evalml/automl/utils.py
@@ -5,7 +5,11 @@
TimeSeriesSplit,
TrainingValidationSplit
)
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.problem_types import (
ProblemTypes,
handle_problem_types,
is_time_series
)

_LARGE_DATA_ROW_THRESHOLD = int(1e5)

@@ -25,7 +29,9 @@ def get_default_primary_search_objective(problem_type):
objective_name = {'binary': 'Log Loss Binary',
'multiclass': 'Log Loss Multiclass',
'regression': 'R2',
'time series regression': 'R2'}[problem_type.value]
'time series regression': 'R2',
'time series binary': 'Log Loss Binary',
'time series multiclass': 'Log Loss Multiclass'}[problem_type.value]
return get_objective(objective_name, return_instance=True)
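
A quick usage check of the new mapping (a sketch assuming this PR; the expected names come straight from the dictionary above):

from evalml.automl.utils import get_default_primary_search_objective
from evalml.problem_types import ProblemTypes

objective = get_default_primary_search_objective(ProblemTypes.TIME_SERIES_BINARY)
print(objective.name)  # expected: "Log Loss Binary", same default as plain binary classification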


@@ -51,9 +57,7 @@ def make_data_splitter(X, y, problem_type, problem_configuration=None, n_splits=
data_splitter = KFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)
elif problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
data_splitter = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)
elif problem_type in [ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.TIME_SERIES_BINARY,
ProblemTypes.TIME_SERIES_MULTICLASS]:
elif is_time_series(problem_type):
if not problem_configuration:
raise ValueError("problem_configuration is required for time series problem types")
data_splitter = TimeSeriesSplit(n_splits=n_splits, gap=problem_configuration.get('gap'),
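Sketch of how the new is_time_series branch behaves (assumes this PR; the toy data and configuration values are made up):

import numpy as np
from evalml.automl.utils import make_data_splitter
from evalml.preprocessing import TimeSeriesSplit
from evalml.problem_types import ProblemTypes

X = np.random.rand(100, 2)
y = np.random.randint(0, 2, size=100)

# Time series problem types now require a problem_configuration...
splitter = make_data_splitter(X, y, ProblemTypes.TIME_SERIES_BINARY,
                              problem_configuration={"gap": 1, "max_delay": 2})
assert isinstance(splitter, TimeSeriesSplit)

# ...and omitting it raises:
# ValueError: problem_configuration is required for time series problem types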
2 changes: 1 addition & 1 deletion evalml/pipelines/__init__.py
@@ -52,5 +52,5 @@
from .regression import (
BaselineRegressionPipeline,
MeanBaselineRegressionPipeline,
TimeSeriesBaselineRegressionPipeline
)
from .time_series_baselines import TimeSeriesBaselineRegressionPipeline, TimeSeriesBaselineBinaryPipeline, TimeSeriesBaselineMulticlassPipeline
2 changes: 1 addition & 1 deletion evalml/pipelines/components/__init__.py
@@ -19,7 +19,7 @@
BaselineRegressor,
DecisionTreeClassifier,
DecisionTreeRegressor,
TimeSeriesBaselineRegressor
TimeSeriesBaselineEstimator
)
from .transformers import (
Transformer,
2 changes: 1 addition & 1 deletion evalml/pipelines/components/estimators/__init__.py
@@ -16,5 +16,5 @@
ElasticNetRegressor,
ExtraTreesRegressor,
BaselineRegressor,
TimeSeriesBaselineRegressor,
TimeSeriesBaselineEstimator,
DecisionTreeRegressor)
2 changes: 1 addition & 1 deletion evalml/pipelines/components/estimators/regressors/__init__.py
@@ -7,4 +7,4 @@
from .et_regressor import ExtraTreesRegressor
from .baseline_regressor import BaselineRegressor
from .decision_tree_regressor import DecisionTreeRegressor
from .time_series_baseline_regressor import TimeSeriesBaselineRegressor
from .time_series_baseline_estimator import TimeSeriesBaselineEstimator
evalml/pipelines/components/estimators/regressors/time_series_baseline_regressor.py → time_series_baseline_estimator.py (renamed)
@@ -6,23 +6,25 @@
from evalml.problem_types import ProblemTypes
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
_convert_woodwork_types_wrapper,
pad_with_nans
)


class TimeSeriesBaselineRegressor(Estimator):
"""Time series regressor that predicts using the naive forecasting approach.
class TimeSeriesBaselineEstimator(Estimator):
"""Time series estimator that predicts using the naive forecasting approach.

This is useful as a simple baseline regressor for time series problems
This is useful as a simple baseline estimator for time series problems
"""
name = "Time Series Baseline Regressor"
name = "Time Series Baseline Estimator"
hyperparameter_ranges = {}
model_family = ModelFamily.BASELINE
supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION]
supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY,
ProblemTypes.TIME_SERIES_MULTICLASS]
predict_uses_y = True

def __init__(self, gap=1, random_state=0, **kwargs):
"""Baseline time series regressor that predicts using the naive forecasting approach.
"""Baseline time series estimator that predicts using the naive forecasting approach.

Arguments:
gap (int): gap between the prediction date and the target date; must be a non-negative integer. If gap is 0, the target date will be shifted ahead by 1 time period.
@@ -54,7 +56,7 @@ def fit(self, X, y=None):

def predict(self, X, y=None):
if y is None:
raise ValueError("Cannot predict Time Series Baseline Regressor if y is None")
raise ValueError("Cannot predict Time Series Baseline Estimator if y is None")
y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())

@@ -63,9 +65,21 @@ def predict(self, X, y=None):

return y

def predict_proba(self, X, y=None):
if y is None:
raise ValueError("Cannot predict Time Series Baseline Estimator if y is None")
y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())
preds = self.predict(X, y).dropna(axis=0, how='any').astype('int')
proba_arr = np.zeros((len(preds), y.max() + 1))
proba_arr[np.arange(len(preds)), preds] = 1
Comment (Contributor): Nice!

return pad_with_nans(pd.DataFrame(proba_arr), len(y) - len(preds))

@property
def feature_importance(self):
"""Returns importance associated with each feature. Since baseline regressors do not use input features to calculate predictions, returns an array of zeroes.
"""Returns importance associated with each feature.

Since baseline estimators do not use input features to calculate predictions, returns an array of zeroes.

Returns:
np.ndarray (float): an array of zeroes
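A standalone sketch of the one-hot construction used in the new predict_proba (toy values; in the component, preds comes from predict(X, y) with NaNs dropped and the class count is y.max() + 1):

import numpy as np
import pandas as pd

preds = pd.Series([0, 1, 1, 0])                # integer class predictions with NaNs already dropped
n_classes = 2                                  # the component uses y.max() + 1
proba_arr = np.zeros((len(preds), n_classes))
proba_arr[np.arange(len(preds)), preds.to_numpy()] = 1.0  # probability 1 on the predicted class
print(pd.DataFrame(proba_arr))
# any rows that were dropped as NaN are then restored by pad_with_nans, as in the return above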
1 change: 0 additions & 1 deletion evalml/pipelines/regression/__init__.py
@@ -1,2 +1 @@
from .baseline_regression import BaselineRegressionPipeline, MeanBaselineRegressionPipeline
from .time_series_baseline_regression import TimeSeriesBaselineRegressionPipeline

evalml/pipelines/regression/time_series_baseline_regression.py — This file was deleted.

23 changes: 23 additions & 0 deletions evalml/pipelines/time_series_baselines.py
@@ -0,0 +1,23 @@
from evalml.pipelines import (
TimeSeriesBinaryClassificationPipeline,
TimeSeriesMulticlassClassificationPipeline,
TimeSeriesRegressionPipeline
)


class TimeSeriesBaselineRegressionPipeline(TimeSeriesRegressionPipeline):
Comment (Contributor Author): Consolidating all of the baseline pipelines for ts into the same file

Comment (Contributor): I like this

"""Baseline Pipeline for time series regression problems."""
_name = "Time Series Baseline Regression Pipeline"
component_graph = ["Time Series Baseline Estimator"]


class TimeSeriesBaselineBinaryPipeline(TimeSeriesBinaryClassificationPipeline):
"""Baseline Pipeline for time series binary classification problems."""
_name = "Time Series Baseline Binary Pipeline"
component_graph = ["Time Series Baseline Estimator"]


class TimeSeriesBaselineMulticlassPipeline(TimeSeriesMulticlassClassificationPipeline):
"""Baseline Pipeline for time series multiclass classification problems."""
_name = "Time Series Baseline Multiclass Pipeline"
component_graph = ["Time Series Baseline Estimator"]
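
These three pipelines differ only in their base class; they all wrap the single renamed component. A quick check tying them back to that component (assumes this PR):

from evalml.pipelines.components import TimeSeriesBaselineEstimator

print(TimeSeriesBaselineEstimator.name)                     # "Time Series Baseline Estimator"
print(TimeSeriesBaselineEstimator.supported_problem_types)  # time series regression, binary, and multiclass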
6 changes: 5 additions & 1 deletion evalml/pipelines/time_series_classification_pipelines.py
@@ -111,7 +111,11 @@ def predict(self, X, y=None, objective=None):
y = _convert_woodwork_types_wrapper(y.to_series())
n_features = max(len(y), X.shape[0])
predictions = self._predict(X, y, objective=objective, pad=False)
predictions = pd.Series(self._decode_targets(predictions), name=self.input_target_name)

# In case gap is 0 and this is a baseline pipeline, we drop the nans in the
# predictions before decoding them
predictions = pd.Series(self._decode_targets(predictions.dropna()), name=self.input_target_name)
Comment (Contributor): Good catch!


return pad_with_nans(predictions, max(0, n_features - predictions.shape[0]))

def predict_proba(self, X, y=None):
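Sketch of the dropna-then-pad behavior above: when gap is 0 the baseline's predictions start with a NaN, so they are dropped before label decoding and pad_with_nans restores the original length afterwards (toy values; assumes this PR):

import pandas as pd
from evalml.utils.gen_utils import pad_with_nans

decoded = pd.Series(["a", "b", "a"], name="target")   # predictions after dropping the leading NaN
n_features = 4                                         # number of rows in the original input
padded = pad_with_nans(decoded, max(0, n_features - decoded.shape[0]))
print(padded)  # length 4: the three decoded predictions padded back out with one NaN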
8 changes: 6 additions & 2 deletions evalml/pipelines/utils.py
@@ -30,7 +30,11 @@
TextFeaturizer
)
from evalml.pipelines.components.utils import all_components, get_estimators
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.problem_types import (
ProblemTypes,
handle_problem_types,
is_time_series
)
from evalml.utils import get_logger
from evalml.utils.gen_utils import _convert_to_woodwork_structure

@@ -67,7 +71,7 @@ def _get_preprocessing_components(X, y, problem_type, text_columns, estimator_cl
if add_datetime_featurizer:
pp_components.append(DateTimeFeaturizer)

if problem_type in [ProblemTypes.TIME_SERIES_REGRESSION]:
if is_time_series(problem_type):
Comment (Contributor Author): So that AutoML can create pipelines for ts classification when allowed_pipelines=None

pp_components.append(DelayedFeatureTransformer)

categorical_cols = X.select('category')
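The is_time_series helper (imported above) is what lets make_pipeline add the DelayedFeatureTransformer for all three time series problem types instead of only regression. A quick check, assuming the helper is importable as shown in the diff:

from evalml.problem_types import ProblemTypes, is_time_series

print(is_time_series(ProblemTypes.TIME_SERIES_MULTICLASS))  # True
print(is_time_series(ProblemTypes.REGRESSION))              # False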
36 changes: 3 additions & 33 deletions evalml/tests/automl_tests/test_automl.py
@@ -48,15 +48,11 @@
BinaryClassificationPipeline,
Estimator,
MulticlassClassificationPipeline,
RegressionPipeline,
TimeSeriesRegressionPipeline
RegressionPipeline
)
from evalml.pipelines.components.utils import get_estimators
from evalml.pipelines.utils import make_pipeline
from evalml.preprocessing.data_splitters import (
TimeSeriesSplit,
TrainingValidationSplit
)
from evalml.preprocessing.data_splitters import TrainingValidationSplit
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.tuners import NoParamsException, RandomSearchTuner
from evalml.utils.gen_utils import (
@@ -1976,32 +1972,6 @@ def test_automl_validates_problem_configuration(X_y_binary):
assert problem_config == {"max_delay": 2, "gap": 3}


@patch('evalml.pipelines.TimeSeriesRegressionPipeline.score', return_value={"R2": 0.3})
Comment (Contributor Author): Moving this test to test_automl_search_regression.py

@patch('evalml.pipelines.TimeSeriesRegressionPipeline.fit')
def test_automl_time_series_regression(mock_fit, mock_score, X_y_regression):
X, y = X_y_regression

configuration = {"gap": 0, "max_delay": 0, 'delay_target': False, 'delay_features': True}

class Pipeline1(TimeSeriesRegressionPipeline):
name = "Pipeline 1"
component_graph = ["Delayed Feature Transformer", "Random Forest Regressor"]

class Pipeline2(TimeSeriesRegressionPipeline):
name = "Pipeline 2"
component_graph = ["Delayed Feature Transformer", "Elastic Net Regressor"]

automl = AutoMLSearch(X_train=X, y_train=y, problem_type="time series regression", problem_configuration=configuration,
allowed_pipelines=[Pipeline1, Pipeline2], max_batches=2)
automl.search()
assert isinstance(automl.data_splitter, TimeSeriesSplit)
for result in automl.results['pipeline_results'].values():
if result["id"] == 0:
continue
assert result['parameters']['Delayed Feature Transformer'] == configuration
assert result['parameters']['pipeline'] == configuration


@patch('evalml.objectives.BinaryClassificationObjective.optimize_threshold')
def test_automl_best_pipeline(mock_optimize, X_y_binary):
X, y = X_y_binary
@@ -2085,7 +2055,7 @@ def test_timeseries_baseline_init_with_correct_gap_max_delay(mock_fit, mock_scor

# Best pipeline is baseline pipeline because we only run one iteration
assert automl.best_pipeline.parameters == {"pipeline": {"gap": 6, "max_delay": 3},
"Time Series Baseline Regressor": {"gap": 6, "max_delay": 3}}
"Time Series Baseline Estimator": {"gap": 6, "max_delay": 3}}


@pytest.mark.parametrize('problem_type', [ProblemTypes.BINARY, ProblemTypes.MULTICLASS,
45 changes: 41 additions & 4 deletions evalml/tests/automl_tests/test_automl_search_classification.py
@@ -4,7 +4,7 @@
import numpy as np
import pandas as pd
import pytest
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit
from sklearn.model_selection import StratifiedKFold
from skopt.space import Categorical

from evalml import AutoMLSearch
@@ -22,10 +22,13 @@
ModeBaselineBinaryPipeline,
ModeBaselineMulticlassPipeline,
MulticlassClassificationPipeline,
PipelineBase
PipelineBase,
TimeSeriesBaselineBinaryPipeline,
TimeSeriesBaselineMulticlassPipeline
)
from evalml.pipelines.components.utils import get_estimators
from evalml.pipelines.utils import make_pipeline
from evalml.preprocessing import TimeSeriesSplit
from evalml.problem_types import ProblemTypes


@@ -77,8 +80,8 @@ def test_data_splitter(X_y_binary):
assert isinstance(automl.rankings, pd.DataFrame)
assert len(automl.results['pipeline_results'][0]["cv_data"]) == cv_folds

automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', data_splitter=TimeSeriesSplit(cv_folds), max_iterations=1,
n_jobs=1)
automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', data_splitter=TimeSeriesSplit(n_splits=cv_folds),
max_iterations=1, n_jobs=1)
automl.search()

assert isinstance(automl.rankings, pd.DataFrame)
@@ -677,3 +680,37 @@ def test_automl_multiclass_nonlinear_pipeline_search_more_iterations(nonlinear_m
assert start_iteration_callback.call_args_list[0][0][0] == ModeBaselineMulticlassPipeline
assert start_iteration_callback.call_args_list[1][0][0] == nonlinear_multiclass_pipeline_class
assert start_iteration_callback.call_args_list[4][0][0] == nonlinear_multiclass_pipeline_class


@pytest.mark.parametrize('problem_type', [ProblemTypes.TIME_SERIES_MULTICLASS, ProblemTypes.TIME_SERIES_BINARY])
@patch('evalml.pipelines.TimeSeriesMulticlassClassificationPipeline.score')
@patch('evalml.pipelines.TimeSeriesBinaryClassificationPipeline.score')
@patch('evalml.pipelines.TimeSeriesMulticlassClassificationPipeline.fit')
@patch('evalml.pipelines.TimeSeriesBinaryClassificationPipeline.fit')
def test_automl_supports_time_series_classification(mock_binary_fit, mock_multi_fit, mock_binary_score, mock_multiclass_score,
problem_type, X_y_binary, X_y_multi):
if problem_type == ProblemTypes.TIME_SERIES_BINARY:
X, y = X_y_binary
baseline = TimeSeriesBaselineBinaryPipeline
mock_binary_score.return_value = {"Log Loss Binary": 0.2}
problem_type = 'time series binary'
else:
X, y = X_y_multi
baseline = TimeSeriesBaselineMulticlassPipeline
mock_multiclass_score.return_value = {"Log Loss Multiclass": 0.25}
problem_type = 'time series multiclass'

configuration = {"gap": 0, "max_delay": 0, 'delay_target': False, 'delay_features': True}

automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type,
problem_configuration=configuration,
max_batches=2)
automl.search()
assert isinstance(automl.data_splitter, TimeSeriesSplit)
for result in automl.results['pipeline_results'].values():
if result["id"] == 0:
assert result['pipeline_class'] == baseline
continue

assert result['parameters']['Delayed Feature Transformer'] == configuration
assert result['parameters']['pipeline'] == configuration
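
For reference, the user-facing flow this test exercises looks roughly like the following (a hedged sketch with made-up toy data; assumes an evalml build containing this PR and that the small dataset passes the default data checks):

import pandas as pd
from evalml import AutoMLSearch

X = pd.DataFrame({"feature": [i % 7 for i in range(60)]})
y = pd.Series([0, 1] * 30)

automl = AutoMLSearch(X_train=X, y_train=y,
                      problem_type="time series binary",
                      problem_configuration={"gap": 0, "max_delay": 2,
                                             "delay_target": False, "delay_features": True},
                      max_batches=1)
automl.search()
print(automl.rankings)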