/
binning.py
150 lines (126 loc) · 5.66 KB
/
binning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""Time binning for converting time series into equally spaced series."""
__maintainer__ = []
import warnings
import numpy as np
import pandas as pd
from aeon.transformations.base import BaseTransformer
class TimeBinAggregate(BaseTransformer):
    r"""Bins time series and aggregates by bin.

    In `transform`, applies `groupby` with `aggfunc` on the temporal coordinate.

    More precisely:
    `bins` encodes bins :math:`B_1, \dots, B_k` where :math:`B_i` are intervals,
    in the reals or in a temporal (time stamp) range.

    In `transform`, the estimator `TimeBinAggregate` collects values
    at time stamps of `X` falling into :math:`B_i` as a sample :math:`S_i`,
    and then applies `aggfunc` to :math:`S_i` to obtain an aggregate value
    :math:`v_i`.
    The transformed series are values :math:`v_i` at time stamps :math:`t_i`,
    determined from :math:`B_i` per the rule in `return_index`.

    Parameters
    ----------
    bins : 1D array-like or pd.IntervalIndex
        Bins defining the intervals considered by `aggfunc`.
        If 1D array-like, is interpreted as breaks of bins and converted via
        `pd.IntervalIndex.from_breaks`.
    aggfunc : callable (1D array-like -> float), optional, default=np.mean
        Function used to aggregate the values in intervals.
        Should have signature 1D -> float and defaults
        to mean if None.
    return_index : str, one of the below; optional, default="bin_start"
        "range" = RangeIndex with bins indexed in same order as in `bins`
        "bin_start" = transformed pd.DataFrame will be indexed by bin starts
        "bin_end" = transformed pd.DataFrame will be indexed by bin ends
        "bin_mid" = transformed pd.DataFrame will be indexed by bin midpoints
        "bin" = transformed pd.DataFrame will have `bins` as `IntervalIndex`
        Any other value leaves the index produced by the groupby untouched.
    """

    _tags = {
        "fit_is_empty": True,
        "capability:multivariate": True,
        "input_data_type": "Series",
        # what is the abstract type of X: Series, or Panel
        "output_data_type": "Series",
        # what is the abstract type of y: None (not needed), Primitives, Series, Panel
        "instancewise": True,
        "X_inner_type": ["pd.DataFrame"],
        "y_inner_type": "None",  # and for y?
        "capability:missing_values": True,
        "capability:unequal_length": True,
        "capability:unequal_length:removes": True,
        "transform-returns-same-time-index": False,
        "capability:inverse_transform": False,
    }

    def __init__(self, bins, aggfunc=None, return_index="bin_start"):
        self.bins = bins
        self.aggfunc = aggfunc
        self.return_index = return_index

        # Normalise `bins` once so that _transform can always work with an
        # IntervalIndex, whichever form the user passed.
        if isinstance(bins, pd.IntervalIndex):
            self._bins = bins
        else:
            self._bins = pd.IntervalIndex.from_breaks(bins)

        if aggfunc is None:
            self._aggfunc = np.mean
        else:
            assert callable(
                aggfunc
            ), "aggfunc should be callable with signature 1D -> float"
            # Not every callable carries __name__ (e.g. functools.partial),
            # so look it up defensively.
            if getattr(aggfunc, "__name__", "") == "<lambda>":
                warnings.warn(
                    "Save and load will not work with lambda functions", stacklevel=2
                )
            self._aggfunc = aggfunc

        super().__init__()

    def _transform(self, X, y=None):
        """Transform X and return a transformed version.

        private _transform containing core logic, called from transform

        Parameters
        ----------
        X : data structure of type X_inner_type
            if X_inner_type is list, _transform must support all types in it
            Data to be transformed
        y : data structure of type y_inner_type, default=None
            Additional data, e.g., labels for transformation

        Returns
        -------
        transformed version of X
        """
        # Assign every time stamp of X to the bin (interval) it falls into.
        idx_cut = pd.cut(X.index, bins=self._bins, include_lowest=True)
        # observed=False is passed explicitly: the index assignments below are
        # positional against the bins, and the pandas default for `observed`
        # is not the same across pandas versions.
        Xt = X.groupby(idx_cut, observed=False).apply(self._aggfunc)

        index_mode = self.return_index
        if index_mode == "range":
            Xt = Xt.reset_index(drop=True)
        elif index_mode == "bin_start":
            # self._bins is always an IntervalIndex, so the same accessors work
            # for both break-style and IntervalIndex-style `bins`.
            Xt.index = self._bins.left
        elif index_mode == "bin_end":
            Xt.index = self._bins.right
        elif index_mode == "bin_mid":
            Xt.index = self._bins.mid
        elif index_mode == "bin":
            Xt.index = self._bins
        return Xt

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return `"default"` set.
            There are currently no reserved values for transformers.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class
            Each dict are parameters to construct an "interesting" test instance, i.e.,
            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
            `create_test_instance` uses the first (or only) dictionary in `params`
        """
        params1 = {"bins": [0, 1]}
        params2 = {"bins": [0, 2, 4], "aggfunc": np.sum, "return_index": "bin_start"}
        return [params1, params2]