-
Notifications
You must be signed in to change notification settings - Fork 42
/
prep.py
209 lines (174 loc) · 7.2 KB
/
prep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
"""
.. Copyright (c) 2014- Marshall Farrier
license http://opensource.org/licenses/MIT
Data - preprocessing functions (:mod:`pynance.data.prep`)
=========================================================
.. currentmodule:: pynance.data.prep
"""
import numpy as np
import pandas as pd
def center(dataset, out=None):
    """
    Returns a centered data set.

    Each column of the returned data will have mean 0.
    The row vector subtracted from each row to achieve this
    transformation is also returned.

    Parameters
    ----------
    dataset : DataFrame or ndarray
        Data whose columns are to be centered.
    out : DataFrame or ndarray, optional
        Alternate output array in which to place the result.
        If provided, it must have the same shape and type
        (DataFrame or ndarray) as the expected output.

    Returns
    -------
    out : tuple of DataFrame or ndarray
        ``(centered_data, means)``.
        The output data is of the same type as the input.

    Notes
    -----
    To exclude a column (such as a constant feature, which is
    usually the first or last column of data) simply don't
    include it in the input. For example:

    >>> centered_data, means = pn.center(mydata.iloc[:, 1:])

    To perform this operation in place:

    >>> _, means = pn.center(mydata.iloc[:, 1:], out=mydata.iloc[:, 1:])

    NOTE(review): whether a write into an ``.iloc`` slice actually
    reaches ``mydata`` depends on pandas' view/copy rules -- confirm
    for the pandas version in use.
    """
    # All type handling (ndarray vs. DataFrame, optional `out`) lives in
    # _preprocess; this wrapper only selects the centering function.
    return _preprocess(_center_fn, dataset, out)
def _preprocess(func, dataset, out):
    """
    Generic preprocessing function used in center() and normalize().

    Parameters
    ----------
    func : callable
        Called with the raw values of `dataset`; must return a tuple
        ``(processed_data, adjustment)``.
    dataset : DataFrame or ndarray
        Input data.
    out : DataFrame or ndarray or None
        Optional destination for the processed data. When given, it is
        filled and returned instead of a newly created object.

    Returns
    -------
    tuple
        ``(processed, adjustment)``. For ndarray input both are whatever
        `func` returned (or `out` for the first item). For DataFrame
        input both are DataFrames; the adjustment frame has a single row
        labelled ``'Mean'`` and the columns of `dataset`.
    """
    is_df = isinstance(dataset, pd.DataFrame)
    _data = dataset.values if is_df else dataset
    processed_data, adjustment = func(_data)

    if not is_df:
        if out is None:
            return processed_data, adjustment
        out[:, :] = processed_data
        return out, adjustment

    # The row label stays 'Mean' even when called through normalize(), so
    # that existing lookups on the returned frame keep working.
    adj_df = pd.DataFrame(data=adjustment, index=['Mean'], columns=dataset.columns,
                          dtype='float64')
    if out is not None:
        # Assign through the DataFrame itself rather than `out.values`:
        # `.values` is a copy for mixed-dtype frames (the write would be
        # lost) and is read-only under pandas Copy-on-Write.
        out.iloc[:, :] = processed_data
        return out, adj_df
    processed_df = pd.DataFrame(data=processed_data, index=dataset.index,
                                columns=dataset.columns, dtype='float64')
    return processed_df, adj_df
def _center_fn(_data):
    """
    Subtract the column means from `_data`.

    Returns the centered data together with the means, the latter as a
    float64 array of shape ``(1, _data.shape[1])``.
    """
    col_means = np.mean(_data, axis=0, dtype=np.float64)
    n_cols = _data.shape[1]
    # Shaped as a single row so it broadcasts against every row of the data.
    offset = np.reshape(col_means, (1, n_cols))
    return _data - offset, offset
def _normalize_fn(_data):
    """
    Divide each column of `_data` by its standard deviation.

    The standard deviation is taken with ``np.std`` (default ``ddof``).
    Returns the scaled data together with the divisors, the latter as a
    float64 array of shape ``(1, _data.shape[1])``.

    No guard is applied for columns whose standard deviation is 0; the
    division is carried out as-is.
    """
    col_std = np.std(_data, axis=0, dtype=np.float64)
    n_cols = _data.shape[1]
    # Shaped as a single row so it broadcasts against every row of the data.
    scale = np.reshape(col_std, (1, n_cols))
    return _data / scale, scale
def normalize(centered_data, out=None):
    """
    Returns a data set with standard deviation of 1.

    The input data must be centered for the operation to
    yield valid results: The mean of each column must be 0.
    Each column of the returned data set will have standard
    deviation 1.
    The row vector by which each row of data is divided is
    also returned.

    Parameters
    ----------
    centered_data : DataFrame or ndarray
        Data whose columns are to be scaled.
    out : DataFrame or ndarray, optional
        Alternate output array in which to place the result.
        If provided, it must have the same shape and type
        (DataFrame or ndarray) as the expected output.

    Returns
    -------
    out : tuple of DataFrame or ndarray
        ``(normalized_data, sd_adj)``.
        The output data is of the same type as the input.

    Notes
    -----
    To exclude a column (such as a constant feature, which is
    usually the first or last column of data) simply don't
    include it in the input. For example:

    >>> normalized_data, sd_adj = pn.normalize(mydata.iloc[:, 1:])

    To perform this operation in place:

    >>> _, sd_adj = pn.normalize(mydata.iloc[:, 1:], out=mydata.iloc[:, 1:])

    NOTE(review): whether a write into an ``.iloc`` slice actually
    reaches ``mydata`` depends on pandas' view/copy rules -- confirm
    for the pandas version in use.
    """
    # All type handling (ndarray vs. DataFrame, optional `out`) lives in
    # _preprocess; this wrapper only selects the scaling function.
    return _preprocess(_normalize_fn, centered_data, out)
def transform(data_frame, **kwargs):
    """
    Return a transformed DataFrame.

    Transform data_frame along the given axis. By default, each row
    will be normalized (axis=0).

    Parameters
    ----------
    data_frame : DataFrame
        Data to be normalized.
    axis : int, optional
        0 (default) to normalize each row. Any other value normalizes
        each column (1 is the conventional choice).
    method : str, optional
        Valid methods are:

        - "vector" : Default for normalization by row (axis=0).
          Normalize along axis as a vector with norm `norm`
        - "last" : Linear normalization setting last value along the
          axis to `norm`
        - "first" : Default for normalization of columns (axis=1).
          Linear normalization setting first value along the given
          axis to `norm`
        - "mean" : Normalize so that the mean of each vector along the
          given axis is `norm`
    norm : float, optional
        Target value of normalization, defaults to 1.0.
    labels : DataFrame, optional
        Labels may be passed as keyword argument, in which
        case the label values will also be normalized and returned.
        Passing ``labels=None`` is the same as omitting the argument.

    Returns
    -------
    df : DataFrame
        Normalized data.
    labels : DataFrame, optional
        Normalized labels, if provided as input. In that case the
        return value is the tuple ``(df, labels)``.

    Raises
    ------
    ValueError
        If `method` is not one of the valid methods, or if labels are
        given together with normalization by column.

    Notes
    -----
    If labels are real-valued, they should also be normalized.

    ..
        The norms are held in a numpy array and applied with
        DataFrame.mul / DataFrame.div. Background on alternatives:
        http://stackoverflow.com/questions/12525722/normalize-data-in-pandas
        The original author noted this was not a bottleneck on a
        feature set of about 13k rows ('ge' from 1962 onward).
    """
    norm = kwargs.get('norm', 1.0)
    axis = kwargs.get('axis', 0)
    # `None` is treated as "no labels" instead of failing on None.apply.
    labels = kwargs.get('labels')

    if axis == 0:
        norm_vector = _get_norms_of_rows(data_frame, kwargs.get('method', 'vector'))
    else:
        norm_vector = _get_norms_of_cols(data_frame, kwargs.get('method', 'first'))

    if axis != 0:
        if labels is not None:
            raise ValueError("label normalization incompatible with normalization by column")
        # One norm per column: match norm_vector against the column labels.
        return data_frame.mul(norm).div(norm_vector, axis=1)

    # One norm per row: match norm_vector against the row index. The
    # multiply-then-divide order mirrors `col * norm / norm_vector`.
    scaled = data_frame.mul(norm).div(norm_vector, axis=0)
    if labels is None:
        return scaled
    return scaled, labels.mul(norm).div(norm_vector, axis=0)
def _get_norms_of_rows(data_frame, method):
    """
    Return an array holding one norm per row of `data_frame`.

    `method` selects how the norm is computed: 'vector' (Euclidean norm
    of the row), 'last' (last column's value), 'mean' (row mean) or
    'first' (first column's value). Any other value raises ValueError.
    """
    if method == 'vector':
        return np.linalg.norm(data_frame.values, axis=1)
    if method == 'last':
        return data_frame.iloc[:, -1].values
    if method == 'mean':
        return np.mean(data_frame.values, axis=1)
    if method == 'first':
        return data_frame.iloc[:, 0].values
    raise ValueError("no normalization method '{0}'".format(method))
def _get_norms_of_cols(data_frame, method):
    """
    Return an array holding one norm per column of `data_frame`.

    `method` selects how the norm is computed: 'first' (first row's
    value), 'mean' (column mean), 'last' (last row's value) or 'vector'
    (Euclidean norm of the column). Any other value raises ValueError.
    """
    if method == 'first':
        return data_frame.iloc[0, :].values
    if method == 'mean':
        return np.mean(data_frame.values, axis=0)
    if method == 'last':
        return data_frame.iloc[-1, :].values
    if method == 'vector':
        return np.linalg.norm(data_frame.values, axis=0)
    raise ValueError("no normalization method '{0}'".format(method))