-
Notifications
You must be signed in to change notification settings - Fork 89
/
boxcox.py
491 lines (397 loc) · 15.3 KB
/
boxcox.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
"""Implemenents Box-Cox and Log Transformations."""
__author__ = ["mloning", "aiwalter", "fkiraly"]
__all__ = ["BoxCoxTransformer", "LogTransformer"]
import numpy as np
from scipy import optimize, special, stats
from scipy.special import boxcox, inv_boxcox
from scipy.stats import boxcox_llf, distributions, variation
from aeon.transformations.base import BaseTransformer
from aeon.utils.validation import is_int
# copy-pasted from scipy 1.7.3 since it moved in 1.8.0 and broke this estimator
# find a suitable replacement
def _calc_uniform_order_statistic_medians(n):
"""Approximations of uniform order statistic medians.
Parameters
----------
n : int
Sample size.
Returns
-------
v : 1d float array
Approximations of the order statistic medians.
References
----------
.. [1] James J. Filliben, "The Probability Plot Correlation Coefficient
Test for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
"""
v = np.empty(n, dtype=np.float64)
v[-1] = 0.5 ** (1.0 / n)
v[0] = 1 - v[-1]
i = np.arange(2, n)
v[1:-1] = (i - 0.3175) / (n + 0.365)
return v
class BoxCoxTransformer(BaseTransformer):
r"""Box-Cox power transform.
Box-Cox transformation is a power transformation that is used to
make data more normally distributed and stabilize its variance based
on the hyperparameter lambda. [1]_
The BoxCoxTransformer solves for the lambda parameter used in the Box-Cox
transformation given `method`, the optimization approach, and input
data provided to `fit`. The use of Guerrero's method for solving for lambda
requires the seasonal periodicity, `sp` be provided. [2]_
Parameters
----------
bounds : tuple
Lower and upper bounds used to restrict the feasible range
when solving for the value of lambda.
method : {"pearsonr", "mle", "all", "guerrero"}, default="mle"
The optimization approach used to determine the lambda value used
in the Box-Cox transformation.
sp : int
Seasonal periodicity of the data in integer form. Only used if
method="guerrero" is chosen. Must be an integer >= 2.
Attributes
----------
bounds : tuple
Lower and upper bounds used to restrict the feasible range when
solving for lambda.
method : str
Optimization approach used to solve for lambda. One of "personr",
"mle", "all", "guerrero".
sp : int
Seasonal periodicity of the data in integer form.
lambda_ : float
The Box-Cox lambda parameter that was solved for based on the supplied
`method` and data provided in `fit`.
See Also
--------
LogTransformer :
Transformer input data using natural log. Can help normalize data and
compress variance of the series.
aeon.transformations.exponent.ExponentTransformer :
Transform input data by raising it to an exponent. Can help compress
variance of series if a fractional exponent is supplied.
aeon.transformations.exponent.SqrtTransformer :
Transform input data by taking its square root. Can help compress
variance of input series.
Notes
-----
The Box-Cox transformation is defined as :math:`\frac{y^{\lambda}-1}{\lambda},
\lambda \ne 0 \text{ or } ln(y), \lambda = 0`.
Therefore, the input data must be positive. In some implementations, a positive
constant is added to the series prior to applying the transformation. But
that is not the case here.
References
----------
.. [1] Box, G. E. P. & Cox, D. R. (1964) An analysis of transformations,
Journal ofthe Royal Statistical Society, Series B, 26, 211-252.
.. [2] V.M. Guerrero, "Time-series analysis supported by Power
Transformations ", Journal of Forecasting, vol. 12, pp. 37-48, 1993.
Examples
--------
>>> from aeon.transformations.boxcox import BoxCoxTransformer
>>> from aeon.datasets import load_airline
>>> y = load_airline()
>>> transformer = BoxCoxTransformer()
>>> y_hat = transformer.fit_transform(y)
"""
_tags = {
"input_data_type": "Series",
# what is the abstract type of X: Series, or Panel
"output_data_type": "Series",
# what abstract type is returned: Primitives, Series, Panel
"instancewise": True, # is this an instance-wise transform?
"X_inner_type": "np.ndarray",
"y_inner_type": "None",
"transform-returns-same-time-index": True,
"fit_is_empty": False,
"univariate-only": True,
"capability:inverse_transform": True,
}
def __init__(self, bounds=None, method="mle", sp=None):
self.bounds = bounds
self.method = method
self.lambda_ = None
self.sp = sp
super().__init__()
def _fit(self, X, y=None):
"""
Fit transformer to X and y.
private _fit containing the core logic, called from fit
Parameters
----------
X : 2D np.ndarray (n x 1)
Data to be transformed
y : ignored argument for interface compatibility
Additional data, e.g., labels for transformation
Returns
-------
self: a fitted instance of the estimator
"""
X = X.flatten()
if self.method != "guerrero":
self.lambda_ = _boxcox_normmax(X, bounds=self.bounds, method=self.method)
else:
self.lambda_ = _guerrero(X, self.sp, self.bounds)
return self
def _transform(self, X, y=None):
"""Transform X and return a transformed version.
private _transform containing the core logic, called from transform
Parameters
----------
X : 2D np.ndarray (n x 1)
Data to be transformed
y : ignored argument for interface compatibility
Additional data, e.g., labels for transformation
Returns
-------
Xt : 2D np.ndarray
transformed version of X
"""
X_shape = X.shape
Xt = boxcox(X.flatten(), self.lambda_)
Xt = Xt.reshape(X_shape)
return Xt
def _inverse_transform(self, X, y=None):
"""Inverse transform X and return an inverse transformed version.
core logic
Parameters
----------
X : 2D np.ndarray (n x 1)
Data to be transformed
y : ignored argument for interface compatibility
Additional data, e.g., labels for transformation
Returns
-------
Xt : 2D np.ndarray
inverse transformed version of X
"""
X_shape = X.shape
Xt = inv_boxcox(X.flatten(), self.lambda_)
Xt = Xt.reshape(X_shape)
return Xt
class LogTransformer(BaseTransformer):
"""Natural logarithm transformation.
The Natural logarithm transformation can be used to make the data more normally
distributed and stabilize its variance.
Transforms each data point x to log(scale *(x+offset))
Parameters
----------
offset : float , default = 0
Additive constant applied to all the data.
scale : float , default = 1
Multiplicative scaling constant applied to all the data.
See Also
--------
BoxCoxTransformer :
Applies Box-Cox power transformation. Can help normalize data and
compress variance of the series.
aeon.transformations.exponent.ExponentTransformer :
Transform input data by raising it to an exponent. Can help compress
variance of series if a fractional exponent is supplied.
aeon.transformations.exponent.SqrtTransformer :
Transform input data by taking its square root. Can help compress
variance of input series.
Notes
-----
The log transformation is applied as :math:`ln(y)`.
Examples
--------
>>> from aeon.transformations.boxcox import LogTransformer
>>> from aeon.datasets import load_airline
>>> y = load_airline()
>>> transformer = LogTransformer()
>>> y_hat = transformer.fit_transform(y)
"""
_tags = {
"input_data_type": "Series",
# what is the abstract type of X: Series, or Panel
"output_data_type": "Series",
# what abstract type is returned: Primitives, Series, Panel
"instancewise": True,
"X_inner_type": "np.ndarray",
"y_inner_type": "None",
"transform-returns-same-time-index": True,
"fit_is_empty": True,
"univariate-only": False,
"capability:inverse_transform": True,
}
def __init__(self, offset=0, scale=1):
self.offset = offset
self.scale = scale
super().__init__()
def _transform(self, X, y=None):
"""Transform X and return a transformed version.
private _transform containing the core logic, called from transform
Parameters
----------
X : 2D np.ndarray
Data to be transformed
y : ignored argument for interface compatibility
Additional data, e.g., labels for transformation
Returns
-------
Xt : 2D np.ndarray
transformed version of X
"""
offset = self.offset
scale = self.scale
Xt = np.log(scale * (X + offset))
return Xt
def _inverse_transform(self, X, y=None):
"""Inverse transform X and return an inverse transformed version.
core logic
Parameters
----------
X : 2D np.ndarray
Data to be transformed
y : ignored argument for interface compatibility
Additional data, e.g., labels for transformation
Returns
-------
Xt : 2D np.ndarray
inverse transformed version of X
"""
offset = self.offset
scale = self.scale
Xt = (np.exp(X) / scale) - offset
return Xt
def _make_boxcox_optimizer(bounds=None, brack=(-2.0, 2.0)):
# bounds is None, use simple Brent optimisation
if bounds is None:
def optimizer(func, args):
return optimize.brent(func, brack=brack, args=args)
# otherwise use bounded Brent optimisation
else:
# input checks on bounds
if not isinstance(bounds, tuple) or len(bounds) != 2:
raise ValueError(
f"`bounds` must be a tuple of length 2, but found: {bounds}"
)
def optimizer(func, args):
return optimize.fminbound(func, bounds[0], bounds[1], args=args)
return optimizer
# TODO replace with scipy version once PR for adding bounds is merged
def _boxcox_normmax(x, bounds=None, brack=(-2.0, 2.0), method="pearsonr"):
optimizer = _make_boxcox_optimizer(bounds, brack)
def _pearsonr(x):
osm_uniform = _calc_uniform_order_statistic_medians(len(x))
xvals = distributions.norm.ppf(osm_uniform)
def _eval_pearsonr(lmbda, xvals, samps):
y = _boxcox(samps, lmbda)
yvals = np.sort(y)
r, prob = stats.pearsonr(xvals, yvals)
return 1 - r
return optimizer(_eval_pearsonr, args=(xvals, x))
def _mle(x):
def _eval_mle(lmb, data):
# function to minimize
return -boxcox_llf(lmb, data)
return optimizer(_eval_mle, args=(x,))
def _all(x):
maxlog = np.zeros(2, dtype=float)
maxlog[0] = _pearsonr(x)
maxlog[1] = _mle(x)
return maxlog
methods = {"pearsonr": _pearsonr, "mle": _mle, "all": _all}
if method not in methods.keys():
raise ValueError("Method %s not recognized." % method)
optimfunc = methods[method]
return optimfunc(x)
def _guerrero(x, sp, bounds=None):
"""Estimate lambda using the Guerrero method as described in [1]_.
Parameters
----------
x : ndarray
Input array. Must be 1-dimensional.
sp : integer
Seasonal periodicity value. Must be an integer >= 2.
bounds : {None, (float, float)}, optional
Bounds on lambda to be used in minimization.
Returns
-------
lambda : float
Lambda value that minimizes the coefficient of variation of
variances of the time series in different periods after
Box-Cox transformation [Guerrero].
References
----------
.. [1] V.M. Guerrero, "Time-series analysis supported by Power
Transformations ", Journal of Forecasting, vol. 12, pp. 37-48, 1993.
"""
if sp is None or not is_int(sp) or sp < 2:
raise ValueError(
"Guerrero method requires an integer seasonal periodicity (sp) value >= 2."
)
x = np.asarray(x)
if x.ndim != 1:
raise ValueError("Data must be 1-dimensional.")
num_obs = len(x)
len_prefix = num_obs % sp
x_trimmed = x[len_prefix:]
x_mat = x_trimmed.reshape((-1, sp))
x_mean = np.mean(x_mat, axis=1)
# [Guerrero, Eq.(5)] uses an unbiased estimation for
# the standard deviation
x_std = np.std(x_mat, axis=1, ddof=1)
def _eval_guerrero(lmb, x_std, x_mean):
x_ratio = x_std / x_mean ** (1 - lmb)
x_ratio_cv = variation(x_ratio)
return x_ratio_cv
optimizer = _make_boxcox_optimizer(bounds)
return optimizer(_eval_guerrero, args=(x_std, x_mean))
def _boxcox(x, lmbda=None, bounds=None):
r"""Return a dataset transformed by a Box-Cox power transformation.
Parameters
----------
x : ndarray
Input array. Must be positive 1-dimensional. Must not be constant.
lmbda : {None, scalar}, optional
If `lmbda` is not None, do the transformation for that value.
If `lmbda` is None, find the lambda that maximizes the log-likelihood
function and return it as the second output argument.
Returns
-------
boxcox : ndarray
Box-Cox power transformed array.
maxlog : float, optional
If the `lmbda` parameter is None, the second returned argument is
the lambda that maximizes the log-likelihood function.
See Also
--------
probplot, boxcox_normplot, boxcox_normmax, boxcox_llf
Notes
-----
The Box-Cox transform is given by::
y = (x**lmbda - 1) / lmbda, for lmbda > 0
log(x), for lmbda = 0
`boxcox` requires the input data to be positive. Sometimes a Box-Cox
transformation provides a shift parameter to achieve this; `boxcox` does
not. Such a shift parameter is equivalent to adding a positive constant to
`x` before calling `boxcox`.
The confidence limits returned when ``alpha`` is provided give the interval
where:
.. math::
llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),
with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared
function.
References
----------
G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
Royal Statistical Society B, 26, 211-252 (1964).
"""
x = np.asarray(x)
if x.ndim != 1:
raise ValueError("Data must be 1-dimensional.")
if x.size == 0:
return x
if np.all(x == x[0]):
raise ValueError("Data must not be constant.")
if any(x <= 0):
raise ValueError("Data must be positive.")
if lmbda is not None: # single transformation
return special.boxcox(x, lmbda)
# If lmbda=None, find the lmbda that maximizes the log-likelihood function.
lmax = _boxcox_normmax(x, bounds=bounds, method="mle")
y = _boxcox(x, lmax)
return y, lmax