diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 2c4dec7935..38d7ded8da 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -4,6 +4,7 @@ Changelog --------- **Future Release** * Enhancements + * Added drop_first as param in encode_features (:pr:`647`) * Fixes * Fix performance regression in DFS (:pr:`637`) * Fix deserialization of feature relationship path (:pr:`665`) diff --git a/featuretools/synthesis/encode_features.py b/featuretools/synthesis/encode_features.py index 18255ad352..0023929e90 100644 --- a/featuretools/synthesis/encode_features.py +++ b/featuretools/synthesis/encode_features.py @@ -9,7 +9,7 @@ def encode_features(feature_matrix, features, top_n=10, include_unknown=True, - to_encode=None, inplace=False, verbose=False): + to_encode=None, inplace=False, drop_first=False, verbose=False): """Encode categorical features Args: @@ -22,6 +22,9 @@ def encode_features(feature_matrix, features, top_n=10, include_unknown=True, features not in this list are unencoded in the output matrix defaults to encode all necessary features. inplace (bool): Encode feature_matrix in place. Defaults to False. + drop_first (bool): Whether to get k-1 dummies out of k categorical + levels by removing the first level. + defaults to False verbose (str): Print progress info. Returns: @@ -61,6 +64,10 @@ def encode_features(feature_matrix, features, top_n=10, include_unknown=True, fm_encoded, f_encoded = ft.encode_features(feature_matrix, features, to_encode=['purchased']) f_encoded + + fm_encoded, f_encoded = ft.encode_features(feature_matrix, features, + drop_first=True) + f_encoded """ if inplace: X = feature_matrix @@ -71,9 +78,7 @@ def encode_features(feature_matrix, features, top_n=10, include_unknown=True, feature_names = [] for feature in features: for fname in feature.get_feature_names(): - assert fname in X.columns, ( - "Feature %s not found in feature matrix" % (fname) - ) + assert fname in X.columns, ("Feature %s not found in feature matrix" % (fname)) feature_names.append(fname) extra_columns = [col for col in X.columns if col not in feature_names] @@ -113,7 +118,11 @@ def encode_features(feature_matrix, features, top_n=10, include_unknown=True, val_counts = val_counts.sort_values([f.get_name(), index_name], ascending=False) val_counts.set_index(index_name, inplace=True) - unique = val_counts.head(top_n).index.tolist() + select_n = top_n + if drop_first: + select_n = min(len(val_counts), top_n) + select_n = max(select_n - 1, 1) + unique = val_counts.head(select_n).index.tolist() for label in unique: add = f == label encoded.append(add) diff --git a/featuretools/tests/synthesis/test_encode_features.py b/featuretools/tests/synthesis/test_encode_features.py index 91665a6343..526cd17133 100644 --- a/featuretools/tests/synthesis/test_encode_features.py +++ b/featuretools/tests/synthesis/test_encode_features.py @@ -139,3 +139,18 @@ def test_encode_features_topn(es): for name in topn.get_feature_names(): assert name in features_enc.columns assert features_enc.columns.tolist().count(name) == 1 + + +def test_encode_features_drop_first(): + df = pd.DataFrame({'category': ['ao', 'b', 'c', 'd', 'e']}) + es = EntitySet('test') + es.entity_from_dataframe(entity_id='a', dataframe=df, index='index', make_index=True) + features, feature_defs = dfs(entityset=es, target_entity='a') + features_enc, feature_defs_enc = encode_features(features, feature_defs, + drop_first=True, include_unknown=False) + assert len(features_enc.columns) == 4 + + features_enc, feature_defs = encode_features(features, feature_defs, top_n=3, drop_first=True, + include_unknown=False) + + assert len(features_enc.columns) == 2