Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rolling Mean features for time series #3028

Merged
merged 11 commits into from Nov 30, 2021
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/meta.yaml
Expand Up @@ -39,7 +39,7 @@ outputs:
- shap >=0.36.0
- texttable >=1.6.2
- woodwork >=0.8.2
- featuretools>=1.0.0
- featuretools>=1.2.0
- nlp-primitives>=2.0.0
- python >=3.7.*
- networkx >=2.5,<2.6
Expand Down
2 changes: 1 addition & 1 deletion core-requirements.txt
Expand Up @@ -15,5 +15,5 @@ texttable>=1.6.2
woodwork>=0.8.2
dask>=2021.10.0
nlp-primitives>=2.0.0
featuretools>=1.0.0
featuretools>=1.2.0
networkx>=2.5,<2.6
2 changes: 1 addition & 1 deletion docs/source/api_index.rst
Expand Up @@ -175,7 +175,7 @@ Transformers are components that take in data as input and output transformed da
evalml.pipelines.components.DropNullColumns
evalml.pipelines.components.DateTimeFeaturizer
evalml.pipelines.components.NaturalLanguageFeaturizer
evalml.pipelines.components.DelayedFeatureTransformer
evalml.pipelines.components.TimeSeriesFeaturizer
evalml.pipelines.components.DFSTransformer
evalml.pipelines.components.PolynomialDetrender
evalml.pipelines.components.Undersampler
Expand Down
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
Expand Up @@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Renamed ``DelayedFeatureTransformer`` to ``TimeSeriesFeaturizer`` and enhanced it to compute rolling features :pr:`3028`
* Fixes
* Changes
* Documentation Changes
Expand All @@ -10,6 +11,7 @@ Release Notes
.. warning::

**Breaking Changes**
* Renamed ``DelayedFeatureTransformer`` to ``TimeSeriesFeaturizer`` :pr:`3028`


**v0.38.0 Nov. 27, 2021**
Expand Down
5 changes: 3 additions & 2 deletions docs/source/user_guide/timeseries.ipynb
Expand Up @@ -193,8 +193,9 @@
"\n",
"![estimator predictions](ts_viz/estimator_viz.png)\n",
"\n",
"#### Feature engineering components for time series\n",
"For an example of a time-series feature engineering component see [DelayedFeatureTransformer](../autoapi/evalml/pipelines/components/index.rst#evalml.pipelines.components.DelayedFeatureTransformer)"
"#### Feature engineering components for time series",
"\n",
"For an example of a time-series feature engineering component see [TimeSeriesFeaturizer](../autoapi/evalml/pipelines/components/index.rst#evalml.pipelines.components.TimeSeriesFeaturizer)"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion evalml/pipelines/__init__.py
Expand Up @@ -28,7 +28,7 @@
DecisionTreeRegressor,
StackedEnsembleClassifier,
StackedEnsembleRegressor,
DelayedFeatureTransformer,
TimeSeriesFeaturizer,
DFSTransformer,
KNeighborsClassifier,
SVMClassifier,
Expand Down
2 changes: 1 addition & 1 deletion evalml/pipelines/components/__init__.py
Expand Up @@ -37,7 +37,7 @@
RFClassifierSelectFromModel,
RFRegressorSelectFromModel,
PerColumnImputer,
DelayedFeatureTransformer,
TimeSeriesFeaturizer,
SimpleImputer,
Imputer,
StandardScaler,
Expand Down
Expand Up @@ -3,7 +3,7 @@

from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.pipelines.components.transformers import DelayedFeatureTransformer
from evalml.pipelines.components.transformers import TimeSeriesFeaturizer
from evalml.problem_types import ProblemTypes
from evalml.utils import infer_feature_types

Expand Down Expand Up @@ -83,13 +83,13 @@ def predict(self, X):
ValueError: If input y is None.
"""
X = infer_feature_types(X)
feature_name = DelayedFeatureTransformer.target_colname_prefix.format(
feature_name = TimeSeriesFeaturizer.target_colname_prefix.format(
self.start_delay
)
if feature_name not in X.columns:
raise ValueError(
"Time Series Baseline Estimator is meant to be used in a pipeline with "
"a DelayedFeaturesTransformer"
"a Time Series Featurizer"
)
return X.ww[feature_name]

Expand Down
2 changes: 1 addition & 1 deletion evalml/pipelines/components/transformers/__init__.py
Expand Up @@ -23,7 +23,7 @@
DropNullColumns,
LSA,
NaturalLanguageFeaturizer,
DelayedFeatureTransformer,
TimeSeriesFeaturizer,
DFSTransformer,
PolynomialDetrender,
LogTransformer,
Expand Down
Expand Up @@ -4,7 +4,7 @@
from .text_transformer import TextTransformer
from .lsa import LSA
from .natural_language_featurizer import NaturalLanguageFeaturizer
from .delayed_feature_transformer import DelayedFeatureTransformer
from .time_series_featurizer import TimeSeriesFeaturizer
from .featuretools import DFSTransformer
from .polynomial_detrender import PolynomialDetrender
from .log_transformer import LogTransformer
Expand Down
@@ -1,6 +1,8 @@
"""Transformer that delays input features and target variable for time series problems."""
import numpy as np
import pandas as pd
import woodwork as ww
from featuretools.primitives import RollingMean
from scipy.signal import find_peaks
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from skopt.space import Real
Expand All @@ -11,7 +13,7 @@
from evalml.utils import infer_feature_types


class DelayedFeatureTransformer(Transformer):
class TimeSeriesFeaturizer(Transformer):
"""Transformer that delays input features and target variable for time series problems.

This component uses an algorithm based on the autocorrelation values of the target variable
Expand Down Expand Up @@ -45,9 +47,14 @@ class DelayedFeatureTransformer(Transformer):
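rolling_window_size (float): Size of the window used to compute the rolling features, expressed as a fraction of max_delay. The window length is int(rolling_window_size * max_delay) + 1. Defaults to 0.25.
random_seed (int): Seed for the random number generator. This transformer performs the same regardless of the random seed provided.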
"""

name = "Delayed Feature Transformer"
hyperparameter_ranges = {"conf_level": Real(0.001, 1.0)}
"""{}"""
name = "Time Series Featurizer"
hyperparameter_ranges = {
"conf_level": Real(0.001, 1.0),
"rolling_window_size": Real(0.001, 1.0),
}
"""{"conf_level": Real(0.001, 1.0),
"rolling_window_size": Real(0.001, 1.0)
}"""
needs_fitting = True
target_colname_prefix = "target_delay_{}"
"""target_delay_{}"""
Expand All @@ -59,6 +66,7 @@ def __init__(
gap=0,
forecast_horizon=1,
conf_level=0.05,
rolling_window_size=0.25,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be added to the docstring!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Absolutely.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add to docstring

delay_features=True,
delay_target=True,
random_seed=0,
Expand All @@ -70,6 +78,7 @@ def __init__(
self.delay_target = delay_target
self.forecast_horizon = forecast_horizon
self.gap = gap
self.rolling_window_size = rolling_window_size
self.statistically_significant_lags = None

if conf_level is None:
Expand All @@ -92,6 +101,7 @@ def __init__(
"forecast_horizon": forecast_horizon,
"conf_level": conf_level,
"gap": gap,
"rolling_window_size": rolling_window_size,
}
parameters.update(kwargs)
super().__init__(parameters=parameters, random_seed=random_seed)
Expand Down Expand Up @@ -160,62 +170,115 @@ def _find_significant_lags(y, conf_level, max_delay):
significant_lags = all_lags
return significant_lags

def transform(self, X, y=None):
"""Computes the delayed features for all features in X and y.
def _compute_rolling_transforms(self, X, y, original_features):
"""Compute the rolling features from the original features.

For each feature in X, it will add a column to the output dataframe for each
delay in the (inclusive) range [1, max_delay]. The values of each delayed feature are simply the original
feature shifted forward in time by the delay amount. For example, a delay of 3 units means that the feature
value at row n will be taken from the n-3rd row of that feature
Args:
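X (pd.DataFrame or None): Data to transform.
y (pd.Series, or None): Target.
original_features (list): Names of the original columns of X. Only the numeric columns in this list get a rolling mean.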

If y is not None, it will also compute the delayed values for the target variable.
Returns:
pd.DataFrame: Data with rolling features. All new features.
"""
size = int(self.rolling_window_size * self.max_delay)
rolling_mean = RollingMean(
window_length=size + 1,
gap=self.start_delay,
min_periods=size + 1,
)
rolling_mean = rolling_mean.get_function()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried computing the features with dfs but I think it's a bit awkward/confusing to have one set of features go through featuretools while the other does not: #3088

I spoke with the featuretools team, and it's probably best to wait until they release Lagged rolling primitives so we can refactor the whole component to use dfs at that point.

numerics = set(
X.ww.select(["numeric"], return_schema=True).columns
).intersection(original_features)
data = pd.DataFrame(
{f"{col}_rolling_mean": rolling_mean(X.index, X[col]) for col in numerics}
)
if y is not None and "numeric" in y.ww.semantic_tags:
data[f"target_rolling_mean"] = rolling_mean(y.index, y)
data.index = X.index
data.ww.init()
return data

def _compute_delays(self, X_ww, y, original_features):
"""Computes the delayed features for all features in X and y.

Use the autocorrelation to determine delays.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess if we're going to keep this docstring, we should update it.


Args:
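X_ww (pd.DataFrame): Data to transform.
y (pd.Series, or None): Target.
original_features (list): Names of the original columns of X_ww (every column except the date index).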

Returns:
pd.DataFrame: Transformed X.
pd.DataFrame: Data with original features and delays.
"""
# Normalize the data into pandas objects
X_ww = infer_feature_types(X)
cols_to_delay = list(
X_ww.ww.select(
["numeric", "category", "boolean"], return_schema=True
).columns
)
X_ww = X_ww.ww.copy()
categorical_columns = self._get_categorical_columns(X_ww)
cols_derived_from_categoricals = []
if self.delay_features and len(X) > 0:
lagged_features = {}
if self.delay_features and len(X_ww) > 0:
X_categorical = self._encode_X_while_preserving_index(
X_ww[categorical_columns]
)
for col_name in cols_to_delay:

col = X_ww[col_name]
if col_name in categorical_columns:
col = X_categorical[col_name]
for t in self.statistically_significant_lags:
feature_name = f"{col_name}_delay_{self.start_delay + t}"
X_ww.ww[f"{col_name}_delay_{self.start_delay + t}"] = col.shift(
self.start_delay + t
)
lagged_features[
f"{col_name}_delay_{self.start_delay + t}"
] = col.shift(self.start_delay + t)
if col_name in categorical_columns:
cols_derived_from_categoricals.append(feature_name)
# Handle cases where the target was passed in
if self.delay_target and y is not None:
y = infer_feature_types(y)
if type(y.ww.logical_type) == logical_types.Categorical:
y = self._encode_y_while_preserving_index(y)
for t in self.statistically_significant_lags:
X_ww.ww[
lagged_features[
self.target_colname_prefix.format(t + self.start_delay)
] = y.shift(self.start_delay + t)
# Features created from categorical columns should no longer be categorical
X_ww.ww.set_types({col: "Double" for col in cols_derived_from_categoricals})
return X_ww.ww.drop(cols_to_delay)
lagged_features = pd.DataFrame(lagged_features)
lagged_features.ww.init(
logical_types={col: "Double" for col in cols_derived_from_categoricals}
)
lagged_features.index = X_ww.index
return ww.concat_columns([X_ww, lagged_features])

def transform(self, X, y=None):
"""Computes the delayed values and rolling means for X and y.

The chosen delays are determined by the autocorrelation function of the target variable. See the class docstring
for more information on how they are chosen. If y is None, all possible lags are chosen.

If y is not None, it will also compute the delayed values for the target variable.

The rolling means for all numeric features in X and y, if y is numeric, are also returned.

Args:
X (pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used.
y (pd.Series, or None): Target.

Returns:
pd.DataFrame: Transformed X. No original features are returned.
"""
if y is not None:
y = infer_feature_types(y)
# Normalize the data into pandas objects
X_ww = infer_feature_types(X)
original_features = [col for col in X_ww.columns if col != self.date_index]
delayed_features = self._compute_delays(X_ww, y, original_features)
rolling_means = self._compute_rolling_transforms(X_ww, y, original_features)
features = ww.concat_columns([delayed_features, rolling_means])
return features.ww.drop(original_features)

def fit_transform(self, X, y):
def fit_transform(self, X, y=None):
"""Fit the component and transform the input data.

Args:
Expand Down
8 changes: 4 additions & 4 deletions evalml/pipelines/utils.py
Expand Up @@ -22,7 +22,6 @@
CatBoostRegressor,
ComponentBase,
DateTimeFeaturizer,
DelayedFeatureTransformer,
DropColumns,
DropNullColumns,
DropRowsTransformer,
Expand All @@ -38,6 +37,7 @@
StackedEnsembleRegressor,
StandardScaler,
TargetImputer,
TimeSeriesFeaturizer,
Undersampler,
URLFeaturizer,
)
Expand Down Expand Up @@ -191,7 +191,7 @@ def _get_time_series_featurizer(X, y, problem_type, estimator_class, sampler_nam
is_time_series(problem_type)
and estimator_class.model_family != ModelFamily.ARIMA
):
components.append(DelayedFeatureTransformer)
components.append(TimeSeriesFeaturizer)
return components


Expand Down Expand Up @@ -666,7 +666,7 @@ def make_timeseries_baseline_pipeline(problem_type, gap, forecast_horizon, date_
}[problem_type]
baseline = pipeline_class(
component_graph=[
"Delayed Feature Transformer",
"Time Series Featurizer",
"Time Series Baseline Estimator",
],
custom_name=pipeline_name,
Expand All @@ -677,7 +677,7 @@ def make_timeseries_baseline_pipeline(problem_type, gap, forecast_horizon, date_
"max_delay": 0,
"forecast_horizon": forecast_horizon,
},
"Delayed Feature Transformer": {
"Time Series Featurizer": {
"max_delay": 0,
"gap": gap,
"forecast_horizon": forecast_horizon,
Expand Down
3 changes: 2 additions & 1 deletion evalml/tests/automl_tests/test_automl.py
Expand Up @@ -3721,14 +3721,15 @@ def test_timeseries_baseline_init_with_correct_gap_max_delay(AutoMLTestEnv, ts_d
"max_delay": 0,
"forecast_horizon": 7,
},
"Delayed Feature Transformer": {
"Time Series Featurizer": {
"date_index": "date",
"delay_features": False,
"delay_target": True,
"max_delay": 0,
"gap": 6,
"forecast_horizon": 7,
"conf_level": 0.05,
"rolling_window_size": 0.25,
},
"Time Series Baseline Estimator": {"forecast_horizon": 7, "gap": 6},
}
Expand Down
Expand Up @@ -1156,6 +1156,7 @@ def test_automl_supports_time_series_classification(
"delay_target": False,
"delay_features": True,
"conf_level": 0.05,
"rolling_window_size": 0.25,
}

automl = AutoMLSearch(
Expand All @@ -1175,7 +1176,7 @@ def test_automl_supports_time_series_classification(
assert result["pipeline_class"] == baseline.__class__
continue

assert result["parameters"]["Delayed Feature Transformer"] == configuration
assert result["parameters"]["Time Series Featurizer"] == configuration
assert result["parameters"]["pipeline"] == configuration


Expand Down