forked from alteryx/featuretools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_encode_features.py
156 lines (115 loc) · 6.81 KB
/
test_encode_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import pandas as pd
import pytest
from featuretools import EntitySet, calculate_feature_matrix, dfs
from featuretools.feature_base import Feature, IdentityFeature
from featuretools.primitives import NMostCommon
from featuretools.synthesis import encode_features
def test_encodes_features(es):
    """Check how many feature definitions ``encode_features`` returns.

    Three identity features on the ``log`` entity (``product_id``,
    ``purchased`` and ``value``) are computed for instance ids 0-5 and then
    encoded three ways:

    * default arguments      -> 6 encoded feature definitions
    * ``top_n=2``            -> 5 encoded feature definitions
    * ``include_unknown=False`` -> 5 encoded feature definitions
    """
    log = es["log"]
    features = [
        IdentityFeature(log[column])
        for column in ("product_id", "purchased", "value")
    ]
    feature_matrix = calculate_feature_matrix(
        features, es, instance_ids=[0, 1, 2, 3, 4, 5]
    )

    # Only the returned feature definitions are inspected; the encoded
    # matrices themselves are not needed by this test.
    _, default_defs = encode_features(feature_matrix, features)
    assert len(default_defs) == 6

    _, top_two_defs = encode_features(feature_matrix, features, top_n=2)
    assert len(top_two_defs) == 5

    _, no_unknown_defs = encode_features(
        feature_matrix, features, include_unknown=False
    )
    assert len(no_unknown_defs) == 5
def test_inplace_encodes_features(es):
    """Check the effect of the ``inplace`` flag on the input feature matrix.

    Without ``inplace`` the returned matrix must have a different shape than
    the input while the input keeps its original shape.  With
    ``inplace=True`` the returned matrix and the input must end up with the
    same shape.
    """
    features = [IdentityFeature(es["log"]["product_id"])]
    feature_matrix = calculate_feature_matrix(
        features, es, instance_ids=[0, 1, 2, 3, 4, 5]
    )
    original_shape = feature_matrix.shape

    # Default call: the result differs in shape, the input is left untouched.
    encoded_copy, _ = encode_features(feature_matrix, features)
    assert encoded_copy.shape != original_shape
    assert feature_matrix.shape == original_shape

    # In-place call: the input and the returned matrix must agree in shape.
    encoded_inplace, _ = encode_features(feature_matrix, features, inplace=True)
    assert encoded_inplace.shape == feature_matrix.shape
def test_to_encode_features(es):
    """Check that ``to_encode`` changes the shape of the encoded matrix.

    The matrix produced with default arguments is the baseline.  Passing an
    empty ``to_encode`` list, or one naming only ``'value'``, must each yield
    a matrix whose shape differs from that baseline.
    """
    log = es["log"]
    features = [
        IdentityFeature(log["product_id"]),
        IdentityFeature(log["value"]),
    ]
    feature_matrix = calculate_feature_matrix(
        features, es, instance_ids=[0, 1, 2, 3, 4, 5]
    )

    baseline, _ = encode_features(feature_matrix, features)
    baseline_shape = baseline.shape

    # In both cases product_id is not listed in to_encode, so it is expected
    # to stay a single string column instead of being expanded.
    for to_encode in ([], ['value']):
        restricted, _ = encode_features(
            feature_matrix, features, to_encode=to_encode
        )
        assert baseline_shape != restricted.shape
def test_encode_features_handles_pass_columns(es):
    """Check that an extra cutoff-time column survives ``encode_features``.

    The cutoff-time frame carries a ``label`` column in addition to
    ``instance_id`` and ``time``.  The test asserts that ``label`` is present
    in the computed feature matrix, that restricting ``to_encode`` changes
    the encoded shape relative to the default encoding, and that ``label``
    is still present in the last encoded matrix.
    """
    log = es["log"]
    features = [
        IdentityFeature(log["product_id"]),
        IdentityFeature(log["value"]),
    ]

    cutoff_columns = {
        'instance_id': range(6),
        'time': es['log'].df['datetime'][0:6],
        'label': [i % 2 for i in range(6)],
    }
    cutoff_time = pd.DataFrame(
        cutoff_columns, columns=["instance_id", "time", "label"]
    )

    feature_matrix = calculate_feature_matrix(features, es, cutoff_time)
    assert 'label' in feature_matrix.columns

    baseline, _ = encode_features(feature_matrix, features)
    baseline_shape = baseline.shape

    # An empty to_encode list is expected to leave product_id as a plain
    # string column, so the shape must differ from the default encoding.
    nothing_encoded, _ = encode_features(feature_matrix, features, to_encode=[])
    assert baseline_shape != nothing_encoded.shape

    value_only, _ = encode_features(
        feature_matrix, features, to_encode=['value']
    )
    assert baseline_shape != value_only.shape
    # The pass-through column must still be there after encoding.
    assert 'label' in value_only.columns
def test_encode_features_catches_features_mismatch(es):
    """Check that encoding a feature absent from the matrix raises.

    The feature matrix is computed from ``product_id`` and ``value`` only.
    Calling ``encode_features`` with a list that includes ``session_id`` must
    raise ``AssertionError`` with a message naming the missing feature.
    """
    log = es["log"]
    product_feature = IdentityFeature(log["product_id"])
    value_feature = IdentityFeature(log["value"])
    session_feature = IdentityFeature(log["session_id"])  # never computed

    cutoff_columns = {
        'instance_id': range(6),
        'time': es['log'].df['datetime'][0:6],
        'label': [i % 2 for i in range(6)],
    }
    cutoff_time = pd.DataFrame(
        cutoff_columns, columns=["instance_id", "time", "label"]
    )

    feature_matrix = calculate_feature_matrix(
        [product_feature, value_feature], es, cutoff_time
    )
    assert 'label' in feature_matrix.columns

    expected_message = 'Feature session_id not found in feature matrix'
    with pytest.raises(AssertionError, match=expected_message):
        encode_features(feature_matrix, [product_feature, session_feature])
def test_encode_unknown_features():
    """Check encoding when a category value is literally ``'unknown'``.

    With ``include_unknown=True`` the encoded matrix must contain a
    ``'category = unknown'`` column for the literal value alongside the
    separate ``'category is unknown'`` column, in the exact order asserted
    below.
    """
    # Categorical column whose first value is the string "unknown".
    frame = pd.DataFrame({'category': ['unknown', 'b', 'c', 'd', 'e']})
    entityset = EntitySet('test')
    entityset.entity_from_dataframe(
        entity_id='a', dataframe=frame, index='index', make_index=True
    )
    feature_matrix, feature_defs = dfs(entityset=entityset, target_entity='a')

    # Request the extra "is unknown" indicator column.
    encoded_matrix, _ = encode_features(
        feature_matrix, feature_defs, include_unknown=True
    )

    expected_columns = [
        'category = unknown',
        'category = e',
        'category = d',
        'category = c',
        'category = b',
        'category is unknown',
    ]
    assert list(encoded_matrix.columns) == expected_columns
def test_encode_features_topn(es):
    """Check that an ``NMostCommon`` feature passes through encoding intact.

    After ``encode_features`` the feature's unique name must still be among
    the returned feature definitions, and each of its feature names must
    appear exactly once among the encoded matrix columns.
    """
    most_common = Feature(
        es['log']['product_id'],
        parent_entity=es['customers'],
        primitive=NMostCommon(n=3),
    )
    feature_matrix, feature_defs = dfs(
        entityset=es,
        instance_ids=[0, 1, 2],
        target_entity="customers",
        agg_primitives=[NMostCommon(n=3)],
    )
    encoded_matrix, encoded_defs = encode_features(
        feature_matrix, feature_defs, include_unknown=True
    )

    encoded_names = [definition.unique_name() for definition in encoded_defs]
    assert most_common.unique_name() in encoded_names

    # Each output column of the multi-output feature must be present and
    # must not be duplicated.
    column_names = encoded_matrix.columns.tolist()
    for name in most_common.get_feature_names():
        assert name in encoded_matrix.columns
        assert column_names.count(name) == 1
def test_encode_features_drop_first():
    """Check the encoded column count when ``drop_first=True``.

    A single categorical column with five distinct values is encoded with
    ``include_unknown=False``:

    * ``drop_first=True``              -> 4 columns
    * ``top_n=3`` and ``drop_first=True`` -> 2 columns
    """
    frame = pd.DataFrame({'category': ['ao', 'b', 'c', 'd', 'e']})
    entityset = EntitySet('test')
    entityset.entity_from_dataframe(
        entity_id='a', dataframe=frame, index='index', make_index=True
    )
    feature_matrix, feature_defs = dfs(entityset=entityset, target_entity='a')

    all_values_encoded, _ = encode_features(
        feature_matrix, feature_defs, drop_first=True, include_unknown=False
    )
    assert len(all_values_encoded.columns) == 4

    top_three_encoded, _ = encode_features(
        feature_matrix,
        feature_defs,
        top_n=3,
        drop_first=True,
        include_unknown=False,
    )
    assert len(top_three_encoded.columns) == 2