Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

drop_first attribute added in encode features #647

Merged
merged 10 commits into from Jul 14, 2019
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Expand Up @@ -4,6 +4,7 @@ Changelog
---------
**Future Release**
* Enhancements
* Added drop_first as param in encode_features (:pr:`647`)
* Fixes
* Fix performance regression in DFS (:pr:`637`)
* Fix deserialization of feature relationship path (:pr:`665`)
Expand Down
19 changes: 14 additions & 5 deletions featuretools/synthesis/encode_features.py
Expand Up @@ -9,7 +9,7 @@


def encode_features(feature_matrix, features, top_n=10, include_unknown=True,
to_encode=None, inplace=False, verbose=False):
to_encode=None, inplace=False, drop_first=False, verbose=False):
"""Encode categorical features

Args:
Expand All @@ -22,6 +22,9 @@ def encode_features(feature_matrix, features, top_n=10, include_unknown=True,
features not in this list are unencoded in the output matrix
defaults to encode all necessary features.
inplace (bool): Encode feature_matrix in place. Defaults to False.
drop_first (bool): Whether to get k-1 dummies out of k categorical
levels by removing the first level.
Defaults to False.
verbose (bool): Print progress info.

Returns:
Expand Down Expand Up @@ -61,6 +64,10 @@ def encode_features(feature_matrix, features, top_n=10, include_unknown=True,
fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
to_encode=['purchased'])
f_encoded

fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
drop_first=True)
f_encoded
"""
if inplace:
X = feature_matrix
Expand All @@ -71,9 +78,7 @@ def encode_features(feature_matrix, features, top_n=10, include_unknown=True,
feature_names = []
for feature in features:
for fname in feature.get_feature_names():
assert fname in X.columns, (
"Feature %s not found in feature matrix" % (fname)
)
assert fname in X.columns, ("Feature %s not found in feature matrix" % (fname))
feature_names.append(fname)

extra_columns = [col for col in X.columns if col not in feature_names]
Expand Down Expand Up @@ -113,7 +118,11 @@ def encode_features(feature_matrix, features, top_n=10, include_unknown=True,
val_counts = val_counts.sort_values([f.get_name(), index_name],
ascending=False)
val_counts.set_index(index_name, inplace=True)
unique = val_counts.head(top_n).index.tolist()
select_n = top_n
if drop_first:
select_n = min(len(val_counts), top_n)
select_n = max(select_n - 1, 1)
unique = val_counts.head(select_n).index.tolist()
for label in unique:
add = f == label
encoded.append(add)
Expand Down
15 changes: 15 additions & 0 deletions featuretools/tests/synthesis/test_encode_features.py
Expand Up @@ -139,3 +139,18 @@ def test_encode_features_topn(es):
for name in topn.get_feature_names():
assert name in features_enc.columns
assert features_enc.columns.tolist().count(name) == 1


def test_encode_features_drop_first():
    """Verify that ``drop_first=True`` encodes one fewer category column.

    Builds a single-entity EntitySet whose only variable has five distinct
    values, runs DFS on it, and checks the number of columns produced by
    ``encode_features`` both with the default ``top_n`` and with ``top_n=3``.
    The unknown-value column is disabled so only category columns are counted.
    """
    frame = pd.DataFrame({'category': ['ao', 'b', 'c', 'd', 'e']})

    entityset = EntitySet('test')
    entityset.entity_from_dataframe(
        entity_id='a',
        dataframe=frame,
        index='index',
        make_index=True,
    )
    feature_matrix, feature_list = dfs(entityset=entityset, target_entity='a')

    # Five distinct values with the default top_n: expect 5 - 1 = 4 columns.
    encoded_matrix, _ = encode_features(
        feature_matrix,
        feature_list,
        drop_first=True,
        include_unknown=False,
    )
    assert len(encoded_matrix.columns) == 4

    # top_n=3 caps the selection at three values: expect 3 - 1 = 2 columns.
    encoded_matrix, _ = encode_features(
        feature_matrix,
        feature_list,
        top_n=3,
        drop_first=True,
        include_unknown=False,
    )
    assert len(encoded_matrix.columns) == 2