/
time_since.py
338 lines (289 loc) · 14 KB
/
time_since.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
"""A transformer to compute the time elapsed since a reference time."""
from __future__ import annotations
__maintainer__ = []
import datetime
import warnings
from string import digits
import numpy as np
import pandas as pd
from pandas.api.types import is_any_real_numeric_dtype
from pandas.tseries.frequencies import get_period_alias
from aeon.transformations.base import BaseTransformer
class TimeSince(BaseTransformer):
"""Get Element-wise time elapsed between the timeindex and a reference start time.
Creates a column(s) which represents: `t` - `start`, where `start` is
a reference time and `t` is the time index. The type of `start` must be
compatible with the index of `X` used in `.fit()` and `.transform()`.
The output can be converted to an integer representing the number of periods
elapsed since the start time by setting `to_numeric=True`. The period is
determined by the frequency of the index. For example, if the `freq` of
the index is "MS" or "M" then the output is the integer number of months
between `t` and `start`.
Parameters
----------
start : a list of start times, optional, default=None (use earliest time in index)
a "start time" can be one of the following types:
* int: Start time to compute the time elapsed, use when index is integer.
* time-like: `Period` or `datetime`
Start time to compute the time elapsed.
* str: String is converted to datetime or period, depending on the index type, \
to give the start time.
to_numeric : string, default=True
Return the integer number of periods elapsed since `start`; the period
is defined by the frequency of the data. Converts datetime types to
pd.Period before calculating time differences.
freq : 'str', optional, default=None
Only used when X has a pd.DatetimeIndex without a specified frequency.
Specifies the frequency of the index of your data. The string should
match a pandas offset alias:
https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
keep_original_columns : boolean, optional, default=False
Keep original columns in X passed to `.transform()`.
positive_only : boolean, optional, default=False
Clips negative values to zero when `to_numeric` is True.
Examples
--------
>>> from aeon.datasets import load_airline
>>> from aeon.transformations.time_since import TimeSince
>>> X = load_airline()
Create a single column with time elapsed since start date of time series.
The output is in units of integer number of months, same as the index `freq`.
>>> transformer = TimeSince()
>>> Xt = transformer.fit_transform(X)
Create multiple columns with different start times. The output is in units
of integer number of months, same as the index `freq`.
>>> transformer = TimeSince(["2000-01", "2000-02"])
>>> Xt = transformer.fit_transform(X)
"""
_tags = {
# what is the abstract type of X: Series, or Panel
"input_data_type": "Series",
# what abstract type is returned: Primitives, Series, Panel
"output_data_type": "Series",
"instancewise": True, # is this an instance-wise transform?
"transform_labels": "None",
"X_inner_type": ["pd.DataFrame", "pd-multiindex", "pd_multiindex_hier"],
"y_inner_type": "None",
"univariate-only": False,
"requires_y": False,
"remember_data": False,
"fit_is_empty": False, # is fit empty and can be skipped?
"X-y-must-have-same-index": False,
"enforce_index_type": [pd.PeriodIndex, pd.DatetimeIndex],
"transform-returns-same-time-index": True,
"capability:inverse_transform": False, # can the transformer inverse transform?
"skip-inverse-transform": True, # is inverse-transform skipped when called?
"capability:unequal_length": True,
"capability:unequal_length:removes": False,
"capability:missing_values": True, # can estimator handle missing data?
"capability:missing_values:removes": False,
}
def __init__(
self,
start: list[str | datetime.datetime | pd.Period | None] | None = None,
*,
to_numeric: bool | None = True,
freq: str | None = None,
keep_original_columns: bool | None = False,
positive_only: bool | None = False,
):
self.start = start
self.to_numeric = to_numeric
self.freq = freq
self.keep_original_columns = keep_original_columns
self.positive_only = positive_only
super().__init__()
def _fit(self, X, y=None):
"""Fit transformer to X and y.
private _fit containing the core logic, called from fit
Parameters
----------
X: data structure of type X_inner_type
if X_inner_type is list, _fit must support all types in it
Data to fit transform to
y : data structure of type y_inner_type, default=None
Additional data, e.g., labels for transformation
Returns
-------
self: reference to self
"""
time_index = _get_time_index(X)
if is_any_real_numeric_dtype(time_index):
if self.freq:
warnings.warn(
"Index is integer type. `freq` will be ignored.", stacklevel=2
)
self.freq_ = None
elif isinstance(time_index, (pd.DatetimeIndex, pd.PeriodIndex)):
# Chooses first non None value
self.freq_ = time_index.freqstr or self.freq or pd.infer_freq(time_index)
if self.freq_ is None:
raise ValueError("X has no known frequency and none is supplied")
if (
(self.freq_ == time_index.freqstr)
and (self.freq_ != self.freq)
and (self.freq)
):
warnings.warn(
f"Using frequency from index: {time_index.freq}, which "
f"does not match the frequency given: {self.freq}.",
stacklevel=2,
)
else:
raise ValueError("Index must be of type int, datetime, or period.")
self.start_ = []
if self.start is None:
self.start_.append(time_index.min())
else:
for start in self.start:
if start is None:
self.start_.append(time_index.min())
elif isinstance(start, str):
if isinstance(time_index, pd.PeriodIndex):
self.start_.append(pd.Period(start))
else:
self.start_.append(pd.to_datetime(start))
else:
self.start_.append(start)
# Check `start_` is compatible with index
for start_ in self.start_:
if isinstance(time_index, pd.DatetimeIndex) and not isinstance(
start_, datetime.datetime
):
raise ValueError(
f"start_={start_} incompatible with a "
f"datetime index. Check that `start` is of type "
f"datetime or a pd.Datetime parsable string."
)
elif isinstance(time_index, pd.PeriodIndex) and not isinstance(
start_, pd.Period
):
raise ValueError(
f"start_={start_} incompatible with a "
f"Period index. Check that `start` is of type "
f"pd.Period or a pd.Period parsable string."
)
elif is_any_real_numeric_dtype(time_index) and not isinstance(
start_, (int, np.integer)
):
raise ValueError(
f"start_={start_} incompatible with a numeric index."
f"Check that `start` is an integer."
)
return self
def _transform(self, X, y=None):
"""Transform X and return a transformed version.
private _transform containing core logic, called from transform
Parameters
----------
X: data structure of type X_inner_type
if X_inner_type is list, _transform must support all types in it
Data to be transformed
y : data structure of type y_inner_type, default=None
Additional data, e.g., labels for transformation
Returns
-------
transformed version of X
"""
time_index = _get_time_index(X)
Xt = pd.DataFrame(index=X.index)
for start_ in self.start_:
if self.to_numeric:
if isinstance(time_index, pd.DatetimeIndex):
# To support calculating integer time differences when `freq`
# is not compatible with timedelta type (e.g., "M", "MS", "Y")
# X.index and `start` are first converted to pandas Period.
# Infer the freq needed to convert to period. We use the
# `get_period_alas` helper method from Pandas. This method
# maps from a frequency that is compatible with a datetime
# index to one that is compatible with a period index
# (e.g., "MS" -> "M"). We must strip the freq str of any
# integer multiplier (e.g., "15T" -> "T"). This is needed so that
# `get_period_alias` returns the correct result.
# If `get_period_alias` recieves a freq str with a multiplier
# (e.g., "15T") it returns `None` which causes errors downstream.
freq_ = _remove_digits_from_str(self.freq_)
freq_period = get_period_alias(freq_)
# Convert `start` and `time_index` to period.
# Casting `start_` to PeriodIndex using pd.period_range
# here so we can later cast it to an int using
# `astype(int)` in _get_period_diff_as_int().
start_period = pd.period_range(
start=start_, periods=1, freq=freq_period
)
time_index_period = time_index.to_period(freq=freq_period)
# Compute time differences.
time_deltas = _get_period_diff_as_int(
time_index_period, start_period
)
elif isinstance(time_index, pd.PeriodIndex):
if pd.__version__ < "1.5.0":
# Earlier versions of pandas returned incorrect result
# when taking a difference between Periods when the frequency
# is a multiple of a unit (e.g. "15T"). Solution is to
# cast to lowest frequency when taking difference (e.g., "T").
freq_ = _remove_digits_from_str(self.freq_)
# Change freq of `start` and `time_index`.
# Casting `start_` to PeriodIndex using pd.period_range
# here so we can later cast it to an int using
# `astype(int)` in _get_period_diff_as_int().
start_period = pd.period_range(
start=start_.to_timestamp(), periods=1, freq=freq_
)
time_index = time_index.to_timestamp().to_period(freq_)
else:
# Casting `start_` to PeriodIndex using pd.period_range
# here so we can later cast it to an int using
# `astype(int)` in _get_period_diff_as_int().
start_period = pd.period_range(
start=start_, periods=1, freq=self.freq_
)
# Compute time differences.
time_deltas = _get_period_diff_as_int(time_index, start_period)
elif is_any_real_numeric_dtype(time_index):
time_deltas = time_index - start_
else:
time_deltas = time_index - start_
col_name = f"time_since_{start_}"
Xt[col_name] = time_deltas
if self.to_numeric and self.positive_only:
Xt = Xt.clip(lower=0)
if self.keep_original_columns:
Xt = pd.concat([X, Xt], axis=1, copy=True)
return Xt
@classmethod
def get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.
Parameters
----------
parameter_set : str, default="default"
Name of the set of test parameters to return, for use in tests. If no
special parameters are defined for a value, will return `"default"` set.
There are currently no reserved values for transformers.
Returns
-------
params : dict or list of dict, default = {}
Parameters to create testing instances of the class
Each dict are parameters to construct an "interesting" test instance, i.e.,
`MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
`create_test_instance` uses the first (or only) dictionary in `params`
"""
return [
{"start": None, "to_numeric": True},
{
"start": ["2000-01-01", "2000-01-02"],
"to_numeric": False,
"positive_only": True,
},
]
def _get_period_diff_as_int(x: pd.PeriodIndex, y: pd.PeriodIndex) -> pd.Index:
return x.astype("int64") - y.astype("int64")
def _remove_digits_from_str(x: str) -> str:
return x.translate({ord(k): None for k in digits})
def _get_time_index(X: pd.DataFrame) -> pd.PeriodIndex | pd.DatetimeIndex:
"""Get time index from single and multi-index dataframes."""
if isinstance(X.index, pd.MultiIndex):
time_index = X.index.get_level_values(-1)
else:
time_index = X.index
return time_index