/
base.py
135 lines (103 loc) · 4.51 KB
/
base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# -*- coding: utf-8 -*-
#
# Base ARIMA pre-processing classes. Don't import this in __init__, or we'll
# potentially get circular imports in sub-classes
from sklearn.base import BaseEstimator, TransformerMixin
import abc
from ..compat.numpy import DTYPE
from ..utils import check_exog, check_endog
# Names exported by a star-import of this module. ``UpdatableMixin`` is also
# defined in this module but is not listed here.
__all__ = [
"BaseTransformer"
]
class BaseTransformer(BaseEstimator, TransformerMixin, metaclass=abc.ABCMeta):
    """Abstract base class for pre-processing transformers

    Building on the scikit-learn ``TransformerMixin``, a ``BaseTransformer``
    learns characteristics from the training set and then applies them as a
    transformation to the test set. As an example, a transformer that
    normalizes the features of an exogenous array would compute the means and
    standard deviations of the training features in ``fit``, and would then
    center and scale the features in ``transform``.

    To avoid data leakage, ``fit`` should only ever see the *training* set;
    ``transform`` may be applied to any dataset sharing the same schema.
    """

    @staticmethod
    def _check_y_X(y, X):
        """Run the input checks on whichever of ``y`` and ``X`` is provided

        Parameters
        ----------
        y : array-like or None
            The endogenous array. When not None it is passed through
            ``check_endog`` with ``dtype=DTYPE``.

        X : array-like or None
            The exogenous array. When not None it is passed through
            ``check_exog`` with ``dtype=None``.

        Returns
        -------
        y : array-like or None
            The checked ``y``, or None if None was given.

        X : array-like or None
            The checked ``X``, or None if None was given.
        """
        # Finiteness is deliberately not enforced: imputing missing values
        # may be the very purpose of the transformer.
        checked_y = (
            None if y is None
            else check_endog(y, dtype=DTYPE, copy=True, force_all_finite=False)
        )
        checked_X = (
            None if X is None
            else check_exog(X, dtype=None, copy=True, force_all_finite=False)
        )
        return checked_y, checked_X

    def fit_transform(self, y, X=None, **kwargs):
        """Fit the transformer, then transform the same arrays

        Calls ``fit`` followed by ``transform``, handing ``y``, ``X`` and
        the keyword arguments to both.

        Parameters
        ----------
        y : array-like or None, shape=(n_samples,)
            The endogenous (time-series) array.

        X : array-like or None, shape=(n_samples, n_features), optional
            The exogenous array of additional covariates.

        **kwargs : keyword args
            Keyword arguments required by the transform function.

        Returns
        -------
        The value returned by ``transform(y, X, **kwargs)``.
        """
        # TODO: eventually do not pass kwargs to fit
        self.fit(y, X, **kwargs)
        transformed = self.transform(y, X, **kwargs)
        return transformed

    @abc.abstractmethod
    def fit(self, y, X, **kwargs):  # TODO: eventually remove kwargs from fit
        """Fit the transformer

        ``fit`` exists to learn a set of statistics or characteristics from
        the training set and keep them on the instance as "fit attributes".
        A transformer *must* be fit before ``transform`` can apply the
        transformation to a dataset.

        Parameters
        ----------
        y : array-like or None, shape=(n_samples,)
            The endogenous (time-series) array.

        X : array-like or None, shape=(n_samples, n_features)
            The exogenous array of additional covariates.

        Returns
        -------
        self : BaseTransformer
            By scikit-learn convention ``fit`` returns the transformer
            instance itself, ``self``, so that ``fit(...).transform(...)``
            calls can be chained.
        """

    @abc.abstractmethod
    def transform(self, y, X, **kwargs):
        """Transform the new array

        Applies the transformation to the arrays, using the training set
        characteristics learned in ``fit``.

        Parameters
        ----------
        y : array-like or None, shape=(n_samples,)
            The endogenous (time-series) array.

        X : array-like or None, shape=(n_samples, n_features)
            The exogenous array of additional covariates.

        **kwargs : keyword args
            Keyword arguments required by the transform function.

        Returns
        -------
        y : array-like or None
            The transformed y array

        X : array-like or None
            The transformed X array
        """
class UpdatableMixin:
    """Mixin for transformers whose params may be updated, like ARIMAs"""

    def _check_endog(self, y):
        """Raise a ``ValueError`` if the endogenous array ``y`` is None"""
        if y is not None:
            return
        raise ValueError("endog array cannot be None when updating")

    # TODO: remove default None value for X when we remove kwargs
    def update_and_transform(self, y, X=None, **kwargs):
        """Update the params and return the transformed arrays

        This base implementation consists of documentation only and returns
        ``None``; concrete transformers presumably override it.

        Parameters
        ----------
        y : array-like or None, shape=(n_samples,)
            The endogenous (time-series) array.

        X : array-like or None, shape=(n_samples, n_features)
            The exogenous array of additional covariates.

        **kwargs : keyword args
            Keyword arguments required by the transform function.
        """