New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Rolling Mean features for time series #3028
Changes from 9 commits
fec0125
8a83d63
96d53a1
fddc446
4800648
7f21143
42b927a
b6192f0
667d943
b024d0d
f5f496d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,8 @@ | ||
"""Transformer that delays input features and target variable for time series problems.""" | ||
import numpy as np | ||
import pandas as pd | ||
import woodwork as ww | ||
from featuretools.primitives import RollingMean | ||
from scipy.signal import find_peaks | ||
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder | ||
from skopt.space import Real | ||
|
@@ -11,7 +13,7 @@ | |
from evalml.utils import infer_feature_types | ||
|
||
|
||
class DelayedFeatureTransformer(Transformer): | ||
class TimeSeriesFeaturizer(Transformer): | ||
"""Transformer that delays input features and target variable for time series problems. | ||
|
||
This component uses an algorithm based on the autocorrelation values of the target variable | ||
|
@@ -45,9 +47,14 @@ class DelayedFeatureTransformer(Transformer): | |
random_seed (int): Seed for the random number generator. This transformer performs the same regardless of the random seed provided. | ||
""" | ||
|
||
name = "Delayed Feature Transformer" | ||
hyperparameter_ranges = {"conf_level": Real(0.001, 1.0)} | ||
"""{}""" | ||
name = "Time Series Featurizer" | ||
hyperparameter_ranges = { | ||
"conf_level": Real(0.001, 1.0), | ||
"rolling_window_size": Real(0.001, 1.0), | ||
} | ||
"""{"conf_level": Real(0.001, 1.0), | ||
"rolling_window_size": Real(0.001, 1.0) | ||
}""" | ||
needs_fitting = True | ||
target_colname_prefix = "target_delay_{}" | ||
"""target_delay_{}""" | ||
|
@@ -59,6 +66,7 @@ def __init__( | |
gap=0, | ||
forecast_horizon=1, | ||
conf_level=0.05, | ||
rolling_window_size=0.25, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add `rolling_window_size` to the docstring. |
||
delay_features=True, | ||
delay_target=True, | ||
random_seed=0, | ||
|
@@ -70,6 +78,7 @@ def __init__( | |
self.delay_target = delay_target | ||
self.forecast_horizon = forecast_horizon | ||
self.gap = gap | ||
self.rolling_window_size = rolling_window_size | ||
self.statistically_significant_lags = None | ||
|
||
if conf_level is None: | ||
|
@@ -92,6 +101,7 @@ def __init__( | |
"forecast_horizon": forecast_horizon, | ||
"conf_level": conf_level, | ||
"gap": gap, | ||
"rolling_window_size": rolling_window_size, | ||
} | ||
parameters.update(kwargs) | ||
super().__init__(parameters=parameters, random_seed=random_seed) | ||
|
@@ -160,62 +170,115 @@ def _find_significant_lags(y, conf_level, max_delay): | |
significant_lags = all_lags | ||
return significant_lags | ||
|
||
def transform(self, X, y=None): | ||
"""Computes the delayed features for all features in X and y. | ||
def _compute_rolling_transforms(self, X, y, original_features): | ||
"""Compute the rolling features from the original features. | ||
|
||
For each feature in X, it will add a column to the output dataframe for each | ||
delay in the (inclusive) range [1, max_delay]. The values of each delayed feature are simply the original | ||
feature shifted forward in time by the delay amount. For example, a delay of 3 units means that the feature | ||
value at row n will be taken from the n-3rd row of that feature | ||
Args: | ||
X (pd.DataFrame or None): Data to transform. | ||
y (pd.Series, or None): Target. | ||
|
||
If y is not None, it will also compute the delayed values for the target variable. | ||
Returns: | ||
pd.DataFrame: Data with rolling features. All new features. | ||
""" | ||
size = int(self.rolling_window_size * self.max_delay) | ||
rolling_mean = RollingMean( | ||
window_length=size + 1, | ||
gap=self.start_delay, | ||
min_periods=size + 1, | ||
) | ||
rolling_mean = rolling_mean.get_function() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I tried computing the features with dfs, but I think it's a bit awkward/confusing to have one set of features go through featuretools while the other does not (see #3088). I spoke with the featuretools team, and it's probably best to wait until they release lagged rolling primitives so we can refactor the whole component to use dfs at that point. |
||
numerics = set( | ||
X.ww.select(["numeric"], return_schema=True).columns | ||
).intersection(original_features) | ||
data = pd.DataFrame( | ||
{f"{col}_rolling_mean": rolling_mean(X.index, X[col]) for col in numerics} | ||
) | ||
if y is not None and "numeric" in y.ww.semantic_tags: | ||
data[f"target_rolling_mean"] = rolling_mean(y.index, y) | ||
data.index = X.index | ||
data.ww.init() | ||
return data | ||
|
||
def _compute_delays(self, X_ww, y, original_features): | ||
"""Computes the delayed features for all features in X and y. | ||
|
||
Use the autocorrelation to determine delays. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess if we're going to keep this docstring, we should update it. |
||
|
||
Args: | ||
X (pd.DataFrame): Data to transform. | ||
y (pd.Series, or None): Target. | ||
|
||
Returns: | ||
pd.DataFrame: Transformed X. | ||
pd.DataFrame: Data with original features and delays. | ||
""" | ||
# Normalize the data into pandas objects | ||
X_ww = infer_feature_types(X) | ||
cols_to_delay = list( | ||
X_ww.ww.select( | ||
["numeric", "category", "boolean"], return_schema=True | ||
).columns | ||
) | ||
X_ww = X_ww.ww.copy() | ||
categorical_columns = self._get_categorical_columns(X_ww) | ||
cols_derived_from_categoricals = [] | ||
if self.delay_features and len(X) > 0: | ||
lagged_features = {} | ||
if self.delay_features and len(X_ww) > 0: | ||
X_categorical = self._encode_X_while_preserving_index( | ||
X_ww[categorical_columns] | ||
) | ||
for col_name in cols_to_delay: | ||
|
||
col = X_ww[col_name] | ||
if col_name in categorical_columns: | ||
col = X_categorical[col_name] | ||
for t in self.statistically_significant_lags: | ||
feature_name = f"{col_name}_delay_{self.start_delay + t}" | ||
X_ww.ww[f"{col_name}_delay_{self.start_delay + t}"] = col.shift( | ||
self.start_delay + t | ||
) | ||
lagged_features[ | ||
f"{col_name}_delay_{self.start_delay + t}" | ||
] = col.shift(self.start_delay + t) | ||
if col_name in categorical_columns: | ||
cols_derived_from_categoricals.append(feature_name) | ||
# Handle cases where the target was passed in | ||
if self.delay_target and y is not None: | ||
y = infer_feature_types(y) | ||
if type(y.ww.logical_type) == logical_types.Categorical: | ||
y = self._encode_y_while_preserving_index(y) | ||
for t in self.statistically_significant_lags: | ||
X_ww.ww[ | ||
lagged_features[ | ||
self.target_colname_prefix.format(t + self.start_delay) | ||
] = y.shift(self.start_delay + t) | ||
# Features created from categorical columns should no longer be categorical | ||
X_ww.ww.set_types({col: "Double" for col in cols_derived_from_categoricals}) | ||
return X_ww.ww.drop(cols_to_delay) | ||
lagged_features = pd.DataFrame(lagged_features) | ||
lagged_features.ww.init( | ||
logical_types={col: "Double" for col in cols_derived_from_categoricals} | ||
) | ||
lagged_features.index = X_ww.index | ||
return ww.concat_columns([X_ww, lagged_features]) | ||
|
||
def transform(self, X, y=None): | ||
"""Computes the delayed values and rolling means for X and y. | ||
|
||
The chosen delays are determined by the autocorrelation function of the target variable. See the class docstring | ||
for more information on how they are chosen. If y is None, all possible lags are chosen. | ||
|
||
If y is not None, it will also compute the delayed values for the target variable. | ||
|
||
The rolling means for all numeric features in X and y, if y is numeric, are also returned. | ||
|
||
Args: | ||
X (pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used. | ||
y (pd.Series, or None): Target. | ||
|
||
Returns: | ||
pd.DataFrame: Transformed X. No original features are returned. | ||
""" | ||
if y is not None: | ||
y = infer_feature_types(y) | ||
# Normalize the data into pandas objects | ||
X_ww = infer_feature_types(X) | ||
original_features = [col for col in X_ww.columns if col != self.date_index] | ||
delayed_features = self._compute_delays(X_ww, y, original_features) | ||
rolling_means = self._compute_rolling_transforms(X_ww, y, original_features) | ||
features = ww.concat_columns([delayed_features, rolling_means]) | ||
return features.ww.drop(original_features) | ||
|
||
def fit_transform(self, X, y): | ||
def fit_transform(self, X, y=None): | ||
"""Fit the component and transform the input data. | ||
|
||
Args: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should be added to the docstring!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Absolutely.