/
forrest.py
44 lines (32 loc) · 1.46 KB
/
forrest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def forrest(self):
'''Random Forrest based reduction strategy. Somewhat more
aggressive than for example 'spearman' because there are no
negative values, but instead the highest positive correlation
is minused from all the values so that max value is 0, and then
values are turned into positive. The one with the highest positive
score in the end will be dropped. This means that anything with
0 originally, is a candidate for dropping. Because there are multiple
zeroes in many cases, there is an element of randomness on which one
is dropped.
'''
import wrangle
import numpy as np
# handle conversion to multi_labels
from .reduce_utils import cols_to_multilabel
data = cols_to_multilabel(self)
# get the correlations
corr_values = wrangle.df_corr_randomforest(data, self.reduction_metric)
# drop labels where value is NaN
corr_values.dropna(inplace=True)
# handle the turning around of values (see docstring for more info)
corr_values -= corr_values[0]
corr_values = corr_values.abs()
# get the strongest correlation
corr_values = corr_values.index[-1]
# get the label, value, and dtype from the column header
label, dtype, value = corr_values.split('~')
# convert things back to their original dtype
value = np.array([value]).astype(dtype)[0]
# this is where we modify the parameter space accordingly
self.param_object.remove_is(label, value)
return self