/
dates.py
208 lines (160 loc) · 7.35 KB
/
dates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# -*- coding: utf-8 -*-
from .base import BaseExogFeaturizer
from ...compat import pmdarima as pm_compat
import numpy as np
import pandas as pd
import warnings
# Names exported by ``from <this module> import *``.
__all__ = [
"DateFeaturizer"
]
# TODO: future use cases might include with_hour_of_day
def _safe_hstack_numpy(left, right):
if left is None:
return right
return np.hstack([left, right])
class DateFeaturizer(BaseExogFeaturizer):
    """Create exogenous date features

    Given an exogenous feature of dtype TimeStamp, creates a set of dummy and
    ordinal variables indicating:

        * Day of the week
          Particular days of the week may align with quasi-seasonal trends.

        * Day of the month
          Useful for modeling things like the end-of-month effect, ie., a
          department spends the remainder of its monthly budget to avoid
          future budget cuts, and the last Friday of the month is heavy on
          spending.

    The motivation for this featurizer comes from a blog post by Rob Hyndman
    [1] on modeling quasi-seasonal patterns in time series. Note that an
    exogenous array _must_ be provided at inference.

    Parameters
    ----------
    column_name : str
        The name of the date column. This forces the exogenous array to be a
        Pandas DataFrame, and does not permit a np.ndarray as others may.

    with_day_of_week : bool, optional (default=True)
        Whether to include dummy variables for the day of the week (in {0, 1}).

    with_day_of_month : bool, optional (default=True)
        Whether to include an ordinal feature for the day of the month (1-31).

    prefix : str or None, optional (default=None)
        The feature prefix. When None, ``"DATE"`` is used.

    Examples
    --------
    >>> from pmdarima.datasets._base import load_date_example
    >>> y, X = load_date_example()
    >>> feat = DateFeaturizer(column_name='date')
    >>> _, X_prime = feat.fit_transform(y, X)
    >>> X_prime.head()
       DATE-WEEKDAY-0  DATE-WEEKDAY-1  ...  DATE-WEEKDAY-6  DATE-DAY-OF-MONTH
    0               0               1  ...               0                  1
    1               0               0  ...               0                  2
    2               0               0  ...               0                  3
    3               0               0  ...               0                  4
    4               0               0  ...               0                  5

    Notes
    -----
    * In order to use time series with holes, it is required that an X
      array be provided at prediction time. Other featurizers automatically
      create exog arrays into the future for inference, but this is not
      possible currently with the date featurizer. Your code must provide the
      dates for which you are forecasting as exog features.

    * The ``column_name`` field is dropped in the transformed exogenous array
      whenever at least one of ``with_day_of_week`` or ``with_day_of_month``
      is enabled. If both are disabled, ``transform`` leaves X as it was
      after input validation, date column included.

    References
    ----------
    .. [1] https://robjhyndman.com/hyndsight/monthly-seasonality/
    """

    def __init__(self, column_name, with_day_of_week=True,
                 with_day_of_month=True, prefix=None):

        super().__init__(prefix=prefix)

        self.column_name = column_name
        self.with_day_of_week = with_day_of_week
        self.with_day_of_month = with_day_of_month

    def _check_X(self, X):
        """Validate that X is a DataFrame holding a datetime date column.

        Raises
        ------
        TypeError
            If ``X`` is not a ``pd.DataFrame``.

        ValueError
            If ``column_name`` is not a column of ``X``, or if the name of
            that column's dtype does not contain ``'datetime64'``.
        """
        # exog must be a pd.DataFrame, and the column_name must be a timestamp
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                f"X must be a DataFrame to use the DateFeaturizer, but got "
                f"type={type(X)}"
            )

        name = self.column_name
        # The membership test runs first so that X[name] is only evaluated
        # for a column that exists.
        if name not in X.columns or 'datetime64' not in X[name].dtype.name:
            # An f-string (rather than "%s" % name) keeps this message
            # well-formed even when the column label is a tuple.
            raise ValueError(
                f"column '{name}' must exist in exog as a "
                "pd.Timestamp type"
            )

    def _get_prefix(self):
        """Return the configured feature prefix, or "DATE" if it is None."""
        return "DATE" if self.prefix is None else self.prefix

    # Overrides super abstract method
    def _get_feature_names(self, X):
        """Build the names of the generated features, in output column order.

        ``X`` is accepted to satisfy the overridden signature but is not
        used here.
        """
        pfx = self._get_prefix()
        out = []

        # Something to note is that in Python, 0 is Monday (not Sunday). See
        # comments here: https://stackoverflow.com/a/9847269/3015734
        # E.g., ['DATE-WEEKDAY-0', 'DATE-WEEKDAY-1', ...]
        if self.with_day_of_week:
            out += [f"{pfx}-WEEKDAY-{i}" for i in range(7)]

        if self.with_day_of_month:
            out += [f"{pfx}-DAY-OF-MONTH"]

        return out

    def fit(self, y, X=None, **kwargs):  # TODO: remove kwargs later
        """Fit the transformer

        No state is learned from the data; fitting only validates the inputs
        and warns if the configuration would produce no features.

        Parameters
        ----------
        y : array-like or None, shape=(n_samples,)
            The endogenous (time-series) array.

        X : array-like, shape=(n_samples, n_features)
            The exogenous array of additional covariates. Must include the
            ``column_name`` feature, which must be a pd.Timestamp dtype.

        Returns
        -------
        self : DateFeaturizer
            This instance.
        """
        # Temporary shim until we remove `exogenous` support completely
        X, _ = pm_compat.get_X(X, **kwargs)
        y, X = self._check_y_X(y, X, null_allowed=False)

        # enforce pd.DataFrame
        self._check_X(X)

        # we don't _technically_ need to do this, but it seems like a nice bit
        # of friendly validation to make sure that at least _something_ will
        # happen in this transformer.
        if not (self.with_day_of_month or self.with_day_of_week):
            warnings.warn("DateFeaturizer will have no effect given disabled "
                          "parameters")

        return self

    def transform(self, y, X=None, **kwargs):
        """Create date features

        When an ARIMA is fit with an X array, it must be forecasted
        with one also. However, unlike other exogenous featurizers, an X
        array is required at inference time for the DateFeaturizer.

        Parameters
        ----------
        y : array-like or None, shape=(n_samples,)
            The endogenous (time-series) array. It is not used to compute
            the date features; it is only passed through input validation
            and returned.

        X : array-like, shape=(n_samples, n_features)
            The exogenous array of additional covariates. The ``column_name``
            feature must be present, and of dtype pd.Timestamp

        Returns
        -------
        y : array-like or None
            The endogenous array as returned by input validation.

        X : array-like
            The exogenous array with the ``column_name`` column dropped and
            the enabled date features stacked on the right: seven weekday
            dummies (Monday first) followed by the day of the month. If both
            features are disabled, X is returned as validated and still
            contains the date column.
        """
        # Temporary shim until we remove `exogenous` support completely
        X, _ = pm_compat.get_X(X, **kwargs)
        y, X = self._check_y_X(y, X, null_allowed=True)

        # enforce pd.DataFrame
        self._check_X(X)

        date_series = X[self.column_name]  # type: pd.Series

        # the right side of the exog array out
        right_side = None
        if self.with_day_of_week:
            # we cannot use pd.get_dummies because for a test set with < 7 obs
            # we will not produce all the features we need to. create a matrix
            # of zeros and mask manually
            n_rows = X.shape[0]
            dummies = np.zeros((n_rows, 7), dtype=int)
            dummies[np.arange(n_rows), date_series.dt.weekday.values] = 1
            right_side = dummies

        if self.with_day_of_month:
            day_of_month = date_series.dt.day.values.reshape(-1, 1)
            right_side = _safe_hstack_numpy(right_side, day_of_month)

        # stack along axis 1
        if right_side is not None:
            X = self._safe_hstack(X.drop(self.column_name, axis=1),
                                  right_side)

        return y, X