-
Notifications
You must be signed in to change notification settings - Fork 89
/
_sax.py
283 lines (232 loc) · 9.76 KB
/
_sax.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
"""Symbolic Aggregate approXimation (SAX) transformer."""
__author__ = ["MatthewMiddlehurst", "hadifawaz1999"]
__all__ = ["SAX", "_invert_sax_symbols"]
import numpy as np
import scipy.stats
from numba import njit, prange
from aeon.transformations.collection import BaseCollectionTransformer
from aeon.transformations.collection.dictionary_based import PAA
class SAX(BaseCollectionTransformer):
"""Symbolic Aggregate approXimation (SAX) transformer.
as described in
Jessica Lin, Eamonn Keogh, Li Wei and Stefano Lonardi,
"Experiencing SAX: a novel symbolic representation of time series"
Data Mining and Knowledge Discovery, 15(2):107-144
Parameters
----------
n_segments : int, default = 8,
number of segments for the PAA, each segment is represented
by a symbol
alphabet_size : int, default = 4,
size of the alphabet to be used to create the bag of words
distribution : str, default = "Gaussian",
options={"Gaussian"}
the distribution function to use when generating the
alphabet. Currently only Gaussian is supported.
distribution_params : dict, default = None,
the parameters of the used distribution, if the used
distribution is "Gaussian" and this parameter is None
then the default setup is {"scale" : 1.0}
znormalized : bool, default = True,
this parameter is set to True when the input time series
are assume to be be z-normalized, i.e. the mean of each
time series should be 0 and the standard deviation should be
equal to 1. If this parameter is set to False, the z-normalization
is applied before the transformation.
Notes
-----
This implementation is based on the one done by tslearn [1]
References
----------
.. [1] https://github.com/tslearn-team/tslearn/blob/fa40028/tslearn/
piecewise/piecewise.py#L261-L501
Examples
--------
>>> from aeon.transformations.collection.dictionary_based import SAX
>>> from aeon.datasets import load_unit_test
>>> X_train, y_train = load_unit_test(split="train")
>>> X_test, y_test = load_unit_test(split="test")
>>> sax = SAX(n_segments=10, alphabet_size=8)
>>> X_train = sax.fit_transform(X_train)
>>> X_test = sax.fit_transform(X_test)
"""
_tags = {
"capability:multivariate": True,
"fit_is_empty": True,
"algorithm_type": "dictionary",
}
def __init__(
self,
n_segments=8,
alphabet_size=4,
distribution="Gaussian",
distribution_params=None,
znormalized=True,
):
self.n_segments = n_segments
self.alphabet_size = alphabet_size
self.distribution = distribution
self.distribution_params = distribution_params
self.znormalized = znormalized
if self.distribution == "Gaussian":
self.distribution_params_ = (
dict(scale=1.0)
if self.distribution_params is None
else self.distribution_params
)
else:
raise NotImplementedError(self.distribution, "still not added")
self.breakpoints, self.breakpoints_mid = self._generate_breakpoints(
alphabet_size=self.alphabet_size,
distribution=self.distribution,
distribution_params=self.distribution_params_,
)
super(SAX, self).__init__()
def _get_paa(self, X):
"""Transform the input time series to PAA segments.
Parameters
----------
X : np.ndarray of shape = (n_instances, n_channels, series_length)
The input time series
Returns
-------
X_paa : np.ndarray of shape = (n_instances, n_channels, n_segments)
The output of the PAA transformation
"""
if not self.znormalized:
X = scipy.stats.zscore(X, axis=-1)
paa = PAA(n_segments=self.n_segments)
X_paa = paa.fit_transform(X=X)
return X_paa
def _transform(self, X, y=None):
"""Transform the input time series to SAX symbols.
This function will transform the input time series into a bag of
symbols. These symbols are represented as integer values pointing
to the indices of the breakpoint in the alphabet for each
segment produced by PAA.
Parameters
----------
X : np.ndarray of shape = (n_instances, n_channels, series_length)
The input time series
y : np.ndarray of shape = (n_instances,), default = None
The labels are not used
Returns
-------
sax_symbols : np.ndarray of shape = (n_instances, n_channels, n_segments)
The output of the SAX transformation
"""
X_paa = self._get_paa(X=X)
sax_symbols = self._get_sax_symbols(X_paa=X_paa)
return sax_symbols
def _get_sax_symbols(self, X_paa):
"""Produce the SAX transformation.
Parameters
----------
X_paa : np.ndarray of shape = (n_instances, n_channels, n_segments)
The output of the PAA transformation
Returns
-------
sax_symbols : np.ndarray of shape = (n_instances, n_channels, n_segments)
The output of the SAX transformation using np.digitize
"""
sax_symbols = np.digitize(x=X_paa, bins=self.breakpoints)
return sax_symbols
def inverse_sax(self, X, original_length, y=None):
"""Produce the inverse SAX transformation.
Parameters
----------
X : np.ndarray of shape = (n_instances, n_channels, n_segments)
The output of the SAX transformation
y : np.ndarray of shape = (n_instances,), default = None
The labels are not used
Returns
-------
sax_inverse : np.ndarray(n_instances, n_channels, series_length)
The inverse of sax transform
"""
sax_inverse = _invert_sax_symbols(
sax_symbols=X,
series_length=original_length,
breakpoints_mid=self.breakpoints_mid,
)
return sax_inverse
def _generate_breakpoints(
self, alphabet_size, distribution="Gaussian", distribution_params=None
):
"""Generate the breakpoints following a probability distribution.
Parameters
----------
alphabet_size : int
The size of the alphabet, the number of breakpints is alphabet_size -1
distribution : str, default = "Gaussian"
The name of the distribution to follow when generating the breakpoints
distribution_params : dict, default = None,
The parameters of the chosen distribution, if the distribution is
chosen as Gaussian and this is left default to None then the params
used are "scale=1.0"
Returns
-------
breakpoints : np.ndarray of shape = (alphabet_size-1,)
The breakpoints to be used to generate the SAX symbols
breakpoints_mid : np.ndarray of shape = (alphabet_size,)
The middle breakpoints for each breakpoint interval used to produce
the inverse of SAX transformation
"""
if distribution == "Gaussian":
breakpoints = scipy.stats.norm.ppf(
np.arange(1, alphabet_size, dtype=np.float64) / alphabet_size,
scale=distribution_params["scale"],
)
breakpoints_mid = scipy.stats.norm.ppf(
np.arange(1, 2 * alphabet_size, 2, dtype=np.float64)
/ (2 * self.alphabet_size),
scale=distribution_params["scale"],
)
return breakpoints, breakpoints_mid
@classmethod
def get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.
Parameters
----------
parameter_set : str, default="default"
Name of the set of test parameters to return, for use in tests. If no
special parameters are defined for a value, will return `"default"` set.
Returns
-------
params : dict or list of dict, default = {}
Parameters to create testing instances of the class
Each dict are parameters to construct an "interesting" test instance, i.e.,
`MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
`create_test_instance` uses the first (or only) dictionary in `params`
"""
params = {"n_segments": 10, "alphabet_size": 8}
return params
@njit(parallel=True, fastmath=True)
def _invert_sax_symbols(sax_symbols, series_length, breakpoints_mid):
"""Reconstruct the original time series using a Gaussian estimation.
In other words, try to inverse the SAX transformation.
Parameters
----------
sax_symbols : np.ndarray(n_instances, n_channels, n_segments)
The sax output transformation
series_length : int
The original time series length
breakpoints_mid : np.ndarray(alphabet_size)
The Gaussian estimation of the value for each breakpoint interval
Returns
-------
sax_inverse : np.ndarray(n_instances, n_channels, series_length)
The inverse of sax transform
"""
n_samples, n_channels, sax_length = sax_symbols.shape
segment_length = int(series_length / sax_length)
sax_inverse = np.zeros((n_samples, n_channels, series_length))
for i in prange(n_samples):
for c in prange(n_channels):
for _current_sax_index in prange(sax_length):
start_index = _current_sax_index * segment_length
stop_index = start_index + segment_length
sax_inverse[i, :, start_index:stop_index] = breakpoints_mid[
sax_symbols[i, c, _current_sax_index]
]
return sax_inverse