alteryx · kmax12 · Jul 14, 2019 · Jul 8, 2019 · Jul 8, 2019 · Jul 9, 2019
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -4,6 +4,7 @@ Changelog
 ---------
 **Future Release**
     * Enhancements
+        * Added drop_first as param in encode_features (:pr:`647`)
     * Fixes
         * Fix performance regression in DFS (:pr:`637`)
         * Fix deserialization of feature relationship path (:pr:`665`)

diff --git a/featuretools/synthesis/encode_features.py b/featuretools/synthesis/encode_features.py
@@ -9,7 +9,7 @@
 
 
 def encode_features(feature_matrix, features, top_n=10, include_unknown=True,
-                    to_encode=None, inplace=False, verbose=False):
+                    to_encode=None, inplace=False, drop_first=False, verbose=False):
     """Encode categorical features
 
         Args:
@@ -22,6 +22,9 @@ def encode_features(feature_matrix, features, top_n=10, include_unknown=True,
                 features not in this list are unencoded in the output matrix
                 defaults to encode all necessary features.
             inplace (bool): Encode feature_matrix in place. Defaults to False.
+            drop_first (bool): Whether to get k-1 dummies out of k categorical
+                    levels by dropping the last of the selected ``top_n`` levels
+                    (a single level is always kept). Defaults to False.
             verbose (str): Print progress info.
 
         Returns:
@@ -61,6 +64,10 @@ def encode_features(feature_matrix, features, top_n=10, include_unknown=True,
                 fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                                            to_encode=['purchased'])
                 f_encoded
+
+                fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
+                                                           drop_first=True)
+                f_encoded
     """
     if inplace:
         X = feature_matrix
@@ -71,9 +78,7 @@ def encode_features(feature_matrix, features, top_n=10, include_unknown=True,
     feature_names = []
     for feature in features:
         for fname in feature.get_feature_names():
-            assert fname in X.columns, (
-                "Feature %s not found in feature matrix" % (fname)
-            )
+            assert fname in X.columns, ("Feature %s not found in feature matrix" % (fname))
             feature_names.append(fname)
 
     extra_columns = [col for col in X.columns if col not in feature_names]
@@ -113,7 +118,11 @@ def encode_features(feature_matrix, features, top_n=10, include_unknown=True,
         val_counts = val_counts.sort_values([f.get_name(), index_name],
                                             ascending=False)
         val_counts.set_index(index_name, inplace=True)
-        unique = val_counts.head(top_n).index.tolist()
+        select_n = top_n
+        if drop_first:
+            select_n = min(len(val_counts), top_n)
+            select_n = max(select_n - 1, 1)
+        unique = val_counts.head(select_n).index.tolist()
         for label in unique:
             add = f == label
             encoded.append(add)

diff --git a/featuretools/tests/synthesis/test_encode_features.py b/featuretools/tests/synthesis/test_encode_features.py
@@ -139,3 +139,18 @@ def test_encode_features_topn(es):
     for name in topn.get_feature_names():
         assert name in features_enc.columns
         assert features_enc.columns.tolist().count(name) == 1
+
+
+def test_encode_features_drop_first():  # drop_first=True yields one fewer encoded column
+    df = pd.DataFrame({'category': ['ao', 'b', 'c', 'd', 'e']})  # five distinct levels
+    es = EntitySet('test')
+    es.entity_from_dataframe(entity_id='a', dataframe=df, index='index', make_index=True)
+    features, feature_defs = dfs(entityset=es, target_entity='a')
+    features_enc, feature_defs_enc = encode_features(features, feature_defs,
+                                                     drop_first=True, include_unknown=False)
+    assert len(features_enc.columns) == 4  # 5 levels - 1 dropped
+
+    features_enc, feature_defs = encode_features(features, feature_defs, top_n=3, drop_first=True,
+                                                 include_unknown=False)
+
+    assert len(features_enc.columns) == 2  # top_n=3 - 1 dropped