/
feat.py
482 lines (430 loc) · 17.7 KB
/
feat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
# -*- coding: utf-8 -*-
"""
Copyright 2016 William La Cava
license: GNU/GPLv3
"""
import argparse
from .versionstr import __version__
from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin
import numpy as np
import pandas as pd
from _feat import cppFeat
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import log_loss
from sklearn.utils import check_X_y, check_array
from sklearn.preprocessing import LabelEncoder
import json
class Feat(BaseEstimator):
    """Feature Engineering Automation Tool

    Python wrapper around the C++ ``cppFeat`` estimator.

    Parameters
    ----------
    pop_size: int, optional (default: 100)
        Size of the population of models
    gens: int, optional (default: 100)
        Number of iterations to train for
    ml: str, optional (default: "LinearRidgeRegression")
        ML pairing. Choices: LinearRidgeRegression, Lasso, L1_LR, L2_LR
        FeatRegressor sets to "LinearRidgeRegression";
        FeatClassifier sets to L2 penalized LR ("LR")
    classification: boolean, optional (default: False)
        Whether to do classification or regression. Set explicitly in
        FeatRegressor and FeatClassifier accordingly.
    verbosity: int, optional (default: 0)
        How much to print out (0, 1, 2)
    max_stall: int, optional (default: 0)
        How many generations to continue after the validation loss has
        stalled. If 0, not used.
    sel: str, optional (default: "lexicase")
        Selection algorithm to use.
    surv: str, optional (default: "nsga2")
        Survival algorithm to use.
    cross_rate: float, optional (default: 0.5)
        How often to do crossover for variation versus mutation.
    root_xo_rate: float, optional (default: 0.5)
        When performing crossover, how often to choose from the roots of
        the trees, rather than within the tree. Root crossover essentially
        swaps features in models.
    otype: string, optional (default: 'a')
        Feature output types:
        'a': all
        'b': boolean only
        'f': floating point only
    functions: list[string], optional (default: all operators)
        Operators to use to build features.
        If functions=[], all the available functions are used.
        Options: +, -, *, /, ^2, ^3, sqrt, sin, cos, exp, log, ^, logit, tanh,
        gauss, relu, split, split_c, b2f, c2f, and, or, not, xor, =, <, <=, >,
        >=, if, ite
    max_depth: int, optional (default: 3)
        Maximum depth of a feature's tree representation.
    max_dim: int, optional (default: 10)
        Maximum dimension of a model. The dimension of a model is how many
        independent features it has. Controls the number of trees in each
        individual.
    random_state: int, optional (default: 0)
        Random seed. If -1, will choose a random random_state.
    erc: boolean, optional (default: False)
        If true, ephemeral random constants are included as nodes in trees.
    objectives: list[str], optional (default: ["fitness", "complexity"])
        Objectives to use for multi-objective optimization.
    shuffle: boolean, optional (default: True)
        Whether to shuffle the training data before beginning training.
    split: float, optional (default: 0.75)
        The internal fraction of training data to use. The validation fold
        is then 1-split fraction of the data.
    fb: float, optional (default: 0.5)
        Controls the amount of feedback from the ML weights used during
        variation. Higher values make variation less random.
    scorer: str, optional (default: '')
        Scoring function to use internally.
    feature_names: str, optional (default: '')
        Optionally provide comma-separated feature names. Should be equal
        to the number of features in your data. This will be set
        automatically if a Pandas dataframe is passed to fit().
    backprop: boolean, optional (default: False)
        Perform gradient descent on feature weights using backpropagation.
    iters: int, optional (default: 10)
        Controls the number of iterations of backprop as well as
        hillclimbing for learning weights.
    lr: float, optional (default: 0.1)
        Learning rate used for gradient descent. This the initial rate,
        and is scheduled to decrease exponentially with generations.
    batch_size: int, optional (default: 0)
        Number of samples to train on each generation. 0 means train on
        all the samples.
    n_jobs: int, optional (default: 1)
        Number of parallel threads to use. If 0, this will be
        automatically determined by OMP.
    hillclimb: boolean, optional (default: False)
        Applies stochastic hillclimbing to feature weights.
    logfile: str, optional (default: "")
        If specified, spits statistics into a logfile. "" means don't log.
    max_time: int, optional (default: -1)
        Maximum time terminational criterion in seconds. If -1, not used.
    residual_xo: boolean, optional (default: False)
        Use residual crossover.
    stagewise_xo: boolean, optional (default: False)
        Use stagewise crossover.
    stagewise_xo_tol: boolean, optional (default: False)
        Terminates stagewise crossover based on an error value rather than
        dimensionality.
    softmax_norm: boolean, optional (default: False)
        Uses softmax normalization of probabilities of variation across
        the features.
    save_pop: int, optional (default: 0)
        Saves the population of models. 0: don't save; 1: save final
        population; 2: save every generation.
    normalize: boolean, optional (default: True)
        Normalizes the floating point input variables using z-scores.
    val_from_arch: boolean, optional (default: True)
        Validates the final model using the archive rather than the whole
        population.
    corr_delete_mutate: boolean, optional (default: False)
        Replaces root deletion mutation with a deterministic deletion
        operator that deletes the feature with highest collinearity.
    simplify: float, optional (default: 0)
        Runs post-run simplification to try to shrink the final model
        without changing its output more than the simplify tolerance.
        This tolerance is the norm of the difference in outputs, divided
        by the norm of the output. If simplify=0, it is ignored.
    protected_groups: str, optional (default: "")
        Defines protected attributes in the data. Used for adding
        fairness constraints.
    tune_initial: boolean, optional (default: False)
        Tune the initial linear model's penalization parameter.
    tune_final: boolean, optional (default: True)
        Tune the final linear model's penalization parameter.
    starting_pop: str, optional (default: "")
        Provide a starting pop in json format.
    """
    __version__ = __version__
    # NOTE: the list defaults below (functions, objectives) are shared across
    # instances per Python semantics; they are never mutated anywhere in this
    # class, and keeping them literal preserves sklearn's get_params() output.
    def __init__(self,
                 pop_size=100,
                 gens=100,
                 ml= 'LinearRidgeRegression',
                 classification=False,
                 verbosity=0,
                 max_stall=0,
                 sel="lexicase",
                 surv="nsga2",
                 cross_rate=0.5,
                 root_xo_rate=0.5,
                 otype='a',
                 functions= [
                     "+","-","*","/","^2","^3","sqrt","sin","cos","exp","log","^",
                     "logit","tanh","gauss","relu", "split","split_c",
                     "b2f","c2f","and","or","not","xor","=","<","<=",">",">=","if","ite"
                 ],
                 max_depth=3,
                 max_dim=10,
                 random_state=0,
                 erc= False ,
                 objectives=["fitness","complexity"],
                 shuffle=True,
                 split=0.75,
                 fb=0.5,
                 scorer='',
                 feature_names="",
                 backprop=False,
                 iters=10,
                 lr=0.1,
                 batch_size=0,
                 n_jobs=1,
                 hillclimb=False,
                 logfile="",
                 max_time=-1,
                 residual_xo=False,
                 stagewise_xo=False,
                 stagewise_xo_tol=False,
                 softmax_norm=False,
                 save_pop=0,
                 normalize=True,
                 val_from_arch=True,
                 corr_delete_mutate=False,
                 simplify=0.0,
                 protected_groups="",
                 tune_initial=False,
                 tune_final=True,
                 starting_pop="",
                 ):
        # per sklearn convention, __init__ only stores parameters verbatim;
        # all real setup happens in fit() via _set_cfeat_params()
        self.pop_size=pop_size
        self.gens=gens
        self.ml=ml
        self.classification = classification
        self.verbosity=verbosity
        self.max_stall=max_stall
        self.sel=sel
        self.surv=surv
        self.cross_rate=cross_rate
        self.root_xo_rate=root_xo_rate
        self.otype=otype
        self.functions=functions
        self.max_depth=max_depth
        self.max_dim=max_dim
        self.random_state=random_state
        self.erc=erc
        self.objectives=objectives
        self.shuffle=shuffle
        self.split=split
        self.fb=fb
        self.scorer=scorer
        self.feature_names=feature_names
        self.backprop=backprop
        self.iters=iters
        self.lr=lr
        self.batch_size=batch_size
        self.n_jobs=n_jobs
        self.hillclimb=hillclimb
        self.logfile=logfile
        self.max_time=max_time
        self.residual_xo=residual_xo
        self.stagewise_xo=stagewise_xo
        self.stagewise_xo_tol=stagewise_xo_tol
        self.softmax_norm=softmax_norm
        self.save_pop=save_pop
        self.normalize=normalize
        self.val_from_arch=val_from_arch
        self.corr_delete_mutate=corr_delete_mutate
        self.simplify=simplify
        self.protected_groups=protected_groups
        self.tune_initial=tune_initial
        self.tune_final=tune_final
        self.starting_pop=starting_pop

    def _is_fitted(self):
        """Return True if fit() has completed (or state was loaded)."""
        # getattr guard: before fit() the attribute does not exist, and the
        # caller should get the documented ValueError, not AttributeError
        return getattr(self, 'is_fitted_', False)

    def load(self, filename):
        """Load a saved Feat state from a json file written by save()."""
        with open(filename, 'r') as of:
            feat_state = json.load(of)
        # keys without a trailing underscore are constructor parameters
        init_params = {k: v for k, v in feat_state.items()
                       if not k.endswith('_')}
        self.set_params(**init_params)
        # keys with a trailing underscore are fitted attributes; cfeat_ is
        # handled separately because it must be rehydrated through the C++ API
        for k, v in feat_state.items():
            if k.endswith('_') and k != 'cfeat_':
                setattr(self, k, v)
        if 'cfeat_' in feat_state.keys():
            self.cfeat_ = cppFeat()
            self.cfeat_.load(feat_state['cfeat_'])
        else:
            self._set_cfeat_params()
        return self

    def save(self, filename):
        """Save the Feat state (parameters + fitted attributes) to a json file."""
        cfeat_state = self.cfeat_.save()
        # start from the constructor parameters, then add any python-side
        # fitted attributes not already covered
        attribs = self.get_params()
        for k, v in self.__dict__.items():
            if k not in attribs.keys():
                attribs[k] = v
        attribs['cfeat_'] = cfeat_state
        with open(filename, 'w') as of:
            json.dump(attribs, of)

    def _set_cfeat_params(self):
        """Create the cpp feat estimator and mirror our parameters onto it."""
        self.cfeat_ = cppFeat()
        params = self.get_params()
        for k, v in params.items():
            setattr(self.cfeat_, k, v)

    def fit(self, X, y, Z=None):
        """Fit a model to X, y (and optional longitudinal data Z)."""
        X, y = self._clean(X, y, set_feature_names=True)
        self._set_cfeat_params()
        if Z is not None:
            self.cfeat_.fit(X, y, Z)
        else:
            self.cfeat_.fit(X, y)
        self.is_fitted_ = True
        return self

    def predict(self, X, Z=None):
        """Predict on X.

        Raises
        ------
        ValueError
            If called before fit().
        """
        if not self._is_fitted():
            raise ValueError("Call fit before calling predict.")
        X = self._prep_X(X)
        if Z is not None:
            return self.cfeat_.predict(X, Z)
        else:
            return self.cfeat_.predict(X)

    def predict_archive(self, X, Z=None, front=False):
        """Return a list of {'id', 'y_pred'} dicts, one per archived model."""
        if not self._is_fitted():
            raise ValueError("Call fit before calling predict.")
        X = self._prep_X(X)
        if Z is not None:
            raise NotImplementedError('longitudinal not implemented')
        archive = self.cfeat_.get_archive(front)
        preds = []
        for ind in archive:
            preds.append({
                'id': ind['id'],
                'y_pred': self.cfeat_.predict_archive(ind['id'], X),
            })
        return preds

    def transform(self, X, Z=None):
        """Return the representation's transformation of X."""
        if not self._is_fitted():
            raise ValueError("Call fit before calling transform.")
        X = self._prep_X(X)
        if Z is not None:
            Phi = self.cfeat_.transform(X, Z)
        else:
            Phi = self.cfeat_.transform(X)
        return Phi

    def fit_predict(self, X, y, Z=None):
        """Convenience method that runs fit(X,y) then predict(X)."""
        return self.fit(X, y, Z).predict(X, Z)

    def fit_transform(self, X, y, Z=None):
        """Convenience method that runs fit(X,y) then transform(X)."""
        return self.fit(X, y, Z).transform(X, Z)

    def score(self, X, y, Z=None):
        """Return a score for predictions on X versus true labels y.

        Log loss for classification, MSE for regression. Note: lower is
        better for both, unlike sklearn's usual score() convention.
        """
        if self.classification:
            yhat = self.predict_proba(X, Z)
            return log_loss(y, yhat, labels=y)
        else:
            yhat = self.predict(X, Z).flatten()
            return mse(y, yhat)

    @property
    def stats_(self):
        """Training statistics gathered by the cpp estimator."""
        if not self._is_fitted():
            raise ValueError("Call fit before asking for stats_.")
        return self.cfeat_.stats_

    def _prep_array(self, x):
        """Convert x to a float32 Fortran-ordered array (cpp layout)."""
        x = np.asfortranarray(x, dtype=np.float32)
        return x

    def _prep_X(self, X):
        """Validate X and return it transposed for the cpp estimator."""
        X = check_array(X)
        self._check_shape(X)
        return self._prep_array(X).transpose()

    def _clean(self, x, y, set_feature_names=False):
        """Validate (x, y), record feature names/count, and convert to
        the cpp estimator's layout."""
        if isinstance(x, pd.DataFrame):
            if set_feature_names and len(list(x.columns)) == x.shape[1]:
                self.feature_names = ','.join(x.columns)
        x, y = check_X_y(x, y, ensure_min_samples=2, dtype=np.float32)
        self.n_features_in_ = x.shape[1]
        x = self._prep_X(x)
        y = self._prep_array(y)
        return x, y

    def _check_shape(self, X):
        """Raise if X's feature count differs from what fit() saw."""
        if X.shape[1] != self.n_features_in_:
            raise ValueError('The number of features ({}) in X do not match '
                             'the number of features used for training '
                             '({})'.format(X.shape[1], self.n_features_in_))

    # thin wrappers around the cpp estimator
    def get_representation(self): return self.cfeat_.get_representation()
    def get_model(self, sort=True): return self.cfeat_.get_model(sort)
    def get_coefs(self): return self.cfeat_.get_coefs()
    def get_n_params(self): return self.cfeat_.get_n_params()
    def get_dim(self): return self.cfeat_.get_dim()
    def get_n_nodes(self): return self.cfeat_.get_n_nodes()
class FeatRegressor(Feat):
    """Convenience subclass of Feat that pins regression settings."""
    def __init__(self, **kwargs):
        # force regression mode; default to ridge unless the caller chose
        kwargs['classification'] = False
        kwargs.setdefault('ml', 'LinearRidgeRegression')
        super().__init__(**kwargs)
    def _get_param_names(self):
        # delegate to Feat so sklearn sees the real parameter list
        # (this __init__ only exposes **kwargs)
        return Feat._get_param_names()
class FeatClassifier(Feat):
    """Convenience subclass of Feat that pins classification options.

    Also includes methods for prediction probabilities.
    """
    def __init__(self, **kwargs):
        # force classification mode; default to L2-penalized LR
        kwargs.update({'classification': True})
        if 'ml' not in kwargs:
            kwargs['ml'] = 'LR'
        Feat.__init__(self, **kwargs)

    def _get_param_names(self):
        # delegate to Feat so sklearn sees the real parameter list
        # (this __init__ only exposes **kwargs)
        return Feat._get_param_names()

    def fit(self, X, y, zfile=None, zids=None):
        """Fit a classification model.

        NOTE(review): zfile/zids are accepted but never forwarded to
        super().fit() -- longitudinal input appears unsupported on this
        path; confirm against the cpp API before relying on them.

        Raises
        ------
        ValueError
            If y's labels are not exactly the contiguous set 0..n_classes-1.
        """
        self.classes_ = [int(i) for i in np.unique(np.asarray(y))]
        # labels must be exactly 0..n_classes-1; unlike the previous pairwise
        # zip check, this also rejects negative labels such as [-1, 0]
        if self.classes_ != list(range(len(self.classes_))):
            # single string (the old comma made ValueError carry two args,
            # mangling the message)
            raise ValueError(
                'y must be a contiguous set of labels from 0 to n_classes. '
                'y contains the values {}'.format(np.unique(np.asarray(y))))
        super().fit(X, y)
        return self

    def predict(self, X, Z=None):
        """Predict class labels for X."""
        return super().predict(X, Z)

    def predict_proba(self, X, Z=None):
        """Return class probabilities for X, shape (n_samples, n_classes)."""
        if not getattr(self, 'is_fitted_', False):
            raise ValueError("Call fit before calling predict.")
        X = self._prep_X(X)
        if Z is not None:
            tmp = self.cfeat_.predict_proba(X, Z).transpose()
        else:
            tmp = self.cfeat_.predict_proba(X).transpose()
        # for binary classification, the cpp estimator returns only the
        # positive-class column; add the complement as column 0
        if len(self.classes_) == 2 and tmp.shape[1] == 1:
            tmp = tmp.ravel()
            # X is transposed, so shape[1] is n_samples
            assert X.shape[1] == len(tmp)
            tmp = np.vstack((1 - tmp, tmp)).transpose()
        return tmp

    def predict_proba_archive(self, X, Z=None, front=False):
        """Return a list of {'id', 'y_proba'} dicts, one per archived model."""
        if not getattr(self, 'is_fitted_', False):
            raise ValueError("Call fit before calling predict.")
        X = self._prep_X(X)
        archive = self.cfeat_.get_archive(front)
        probs = []
        for ind in archive:
            tmp = {}
            tmp['id'] = ind['id']
            if Z is not None:
                tmp['y_proba'] = self.cfeat_.predict_proba_archive(ind['id'], X, Z)
            else:
                tmp['y_proba'] = self.cfeat_.predict_proba_archive(ind['id'], X)
            probs.append(tmp)
        return probs