alteryx · ctduffy · May 23, 2019 · May 22, 2019 · May 22, 2019 · May 22, 2019
diff --git a/docs/source/automated_feature_engineering/primitives.rst b/docs/source/automated_feature_engineering/primitives.rst
@@ -45,11 +45,11 @@ A second advantage of primitives is that they can be used to quickly enumerate m
                                           agg_primitives=["mean", "max", "min", "std", "skew"],
                                           trans_primitives=["time_since_previous"])
 
-    feature_matrix[["MEAN(sessions.time_since_previous_by_customer_id)",
-                    "MAX(sessions.time_since_previous_by_customer_id)",
-                    "MIN(sessions.time_since_previous_by_customer_id)",
-                    "STD(sessions.time_since_previous_by_customer_id)",
-                    "SKEW(sessions.time_since_previous_by_customer_id)"]]
+    feature_matrix[["MEAN(sessions.TIME_SINCE_PREVIOUS(session_start))",
+                    "MAX(sessions.TIME_SINCE_PREVIOUS(session_start))",
+                    "MIN(sessions.TIME_SINCE_PREVIOUS(session_start))",
+                    "STD(sessions.TIME_SINCE_PREVIOUS(session_start))",
+                    "SKEW(sessions.TIME_SINCE_PREVIOUS(session_start))"]]
 
 
 

diff --git a/featuretools/primitives/standard/transform_primitive.py b/featuretools/primitives/standard/transform_primitive.py
@@ -13,7 +13,6 @@
     Boolean,
     Datetime,
     DatetimeTimeIndex,
-    Id,
     LatLong,
     Numeric,
     Ordinal,
@@ -56,42 +55,32 @@ def get_function(self):
 
 
 class TimeSincePrevious(TransformPrimitive):
-    """Compute the time in seconds since the previous instance of an entry.
+    """Compute the time in seconds since the previous entry in a list.
 
     Description:
-        Given a list of datetimes and a corresponding list of item ID values,
-        compute the time in seconds elapsed since the previous occurrence
-        of the item in the list. If an item is present only once, the result
-        for this item will be `NaN`. Similarly, the result for the first
-        occurrence of an item will always be `NaN`.
+        Given a list of datetimes, compute the time in seconds elapsed since
+        the previous item in the list. The result for the first item in the
+        list will always be `NaN`.
 
     Examples:
         >>> from datetime import datetime
         >>> time_since_previous = TimeSincePrevious()
         >>> dates = [datetime(2019, 3, 1, 0, 0, 0),
         ...          datetime(2019, 3, 1, 0, 2, 0),
-        ...          datetime(2019, 3, 10, 0, 0, 0),
+        ...          datetime(2019, 3, 1, 0, 3, 0),
         ...          datetime(2019, 3, 1, 0, 2, 30),
-        ...          datetime(2019, 3, 10, 0, 0, 50)]
-        >>> labels = ['A', 'A', 'B', 'A', 'B']
-        >>> time_since_previous(dates, labels).tolist()
-        [nan, 120.0, nan, 30.0, 50.0]
+        ...          datetime(2019, 3, 1, 0, 10, 0)]
+        >>> time_since_previous(dates).tolist()
+        [nan, 120.0, 60.0, -30.0, 450.0]
     """
     name = "time_since_previous"
-    input_types = [DatetimeTimeIndex, Id]
+    input_types = [DatetimeTimeIndex]
     return_type = Numeric
 
-    def generate_name(self, base_feature_names):
-        return u"time_since_previous_by_%s" % base_feature_names[1]
-
     def get_function(self):
-        def pd_diff(base_array, group_array):
-            bf_name = 'base_feature'
-            groupby = 'groupby'
-            grouped_df = pd.DataFrame.from_dict({bf_name: base_array,
-                                                 groupby: group_array})
-            grouped_df = grouped_df.groupby(groupby).diff()
-            return grouped_df[bf_name].apply(lambda x: x.total_seconds())
+        def pd_diff(base_array):
+            ser = pd.Series(base_array).diff()
+            return ser.apply(lambda x: x.total_seconds())
         return pd_diff
 
 
@@ -437,55 +426,37 @@ def generate_name(self, base_feature_names):
 
 class Diff(TransformPrimitive):
     """Compute the difference between the value in a list and the
-    previous value.
+    previous value in that list.
 
     Description:
-        Given a list of values and a corresponding list of item ID values,
-        compute the difference from the previous occurrence of the item in
-        the list. If an item is present only once, the result for this item
-        will be `NaN`. Similarly, the result for the first occurrence of an
-        item will always be `NaN`. If the values are datetimes, the output
-        will be a timedelta.
+        Given a list of values, compute the difference from the previous
+        item in the list. The result for the first element of the list will
+        always be `NaN` (`NaT` for datetimes). If the values are datetimes,
+        the output will be a timedelta.
 
     Examples:
         >>> diff = Diff()
         >>> values = [1, 10, 3, 4, 15]
-        >>> labels = ['A', 'A', 'B', 'A', 'B']
-        >>> diff(values, labels).tolist()
-        [nan, 9.0, nan, -6.0, 12.0]
+        >>> diff(values).tolist()
+        [nan, 9.0, -7.0, 1.0, 11.0]
 
         If values are datetimes, difference will be a timedelta
 
         >>> from datetime import datetime
         >>> diff = Diff()
         >>> values = [datetime(2019, 3, 1, 0, 0, 0),
         ...          datetime(2019, 3, 1, 0, 1, 0),
-        ...          datetime(2019, 3, 2, 0, 0, 0),
         ...          datetime(2019, 3, 1, 0, 1, 30)]
-        >>> labels = ['A', 'A', 'B', 'A']
-        >>> diff(values, labels).tolist()
-        [NaT, Timedelta('0 days 00:01:00'), NaT, Timedelta('0 days 00:00:30')]
+        >>> diff(values).tolist()
+        [NaT, Timedelta('0 days 00:01:00'), Timedelta('0 days 00:00:30')]
     """
     name = "diff"
-    input_types = [Numeric, Id]
+    input_types = [Numeric]
     return_type = Numeric
 
-    def generate_name(self, base_feature_names):
-        base_features_str = base_feature_names[0] + u" by " + \
-            base_feature_names[1]
-        return u"DIFF(%s)" % (base_features_str)
-
     def get_function(self):
-        def pd_diff(base_array, group_array):
-            bf_name = 'base_feature'
-            groupby = 'groupby'
-            grouped_df = pd.DataFrame.from_dict({bf_name: base_array,
-                                                 groupby: group_array})
-            grouped_df = grouped_df.groupby(groupby).diff()
-            try:
-                return grouped_df[bf_name]
-            except KeyError:
-                return pd.Series([np.nan] * len(base_array))
+        def pd_diff(base_array):
+            return pd.Series(base_array).diff()
         return pd_diff
 
 

diff --git a/featuretools/tests/dfs_tests/test_deep_feature_synthesis.py b/featuretools/tests/dfs_tests/test_deep_feature_synthesis.py
@@ -175,21 +175,21 @@ def test_handles_diff_entity_groupby(es):
     dfs_obj = DeepFeatureSynthesis(target_entity_id='log',
                                    entityset=es,
                                    agg_primitives=[],
-                                   trans_primitives=[Diff])
+                                   groupby_trans_primitives=[Diff])
 
     features = dfs_obj.build_features()
-    assert (feature_with_name(features, 'DIFF(value by session_id)'))
-    assert (feature_with_name(features, 'DIFF(value by product_id)'))
+    assert (feature_with_name(features, 'DIFF(value) by session_id'))
+    assert (feature_with_name(features, 'DIFF(value) by product_id'))
 
 
 def test_handles_time_since_previous_entity_groupby(es):
     dfs_obj = DeepFeatureSynthesis(target_entity_id='log',
                                    entityset=es,
                                    agg_primitives=[],
-                                   trans_primitives=[TimeSincePrevious])
+                                   groupby_trans_primitives=[TimeSincePrevious])
 
     features = dfs_obj.build_features()
-    assert (feature_with_name(features, 'time_since_previous_by_session_id'))
+    assert (feature_with_name(features, 'TIME_SINCE_PREVIOUS(datetime) by session_id'))
 
 # M TODO
 # def test_handles_cumsum_entity_groupby(es):

diff --git a/featuretools/tests/primitive_tests/test_transform_features.py b/featuretools/tests/primitive_tests/test_transform_features.py
@@ -116,35 +116,24 @@ def test_make_trans_feat(es):
 
 def test_diff(es):
     value = ft.Feature(es['log']['value'])
-    customer_id_feat = ft.Feature(es['sessions']['customer_id'], entity=es['log'])
-    diff1 = ft.Feature([value, es['log']['session_id']], primitive=Diff)
-    diff2 = ft.Feature([value, customer_id_feat], primitive=Diff)
+    diff1 = ft.Feature([value], primitive=Diff)
 
-    pandas_backend = PandasBackend(es, [diff1, diff2])
+    pandas_backend = PandasBackend(es, [diff1])
     df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                                time_last=None)
 
     val1 = df[diff1.get_name()].values.tolist()
-    val2 = df[diff2.get_name()].values.tolist()
-    correct_vals1 = [
-        np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1, np.nan, np.nan, 5, np.nan, 7, 7
-    ]
-    correct_vals2 = [np.nan, 5, 5, 5, 5, -20, 1, 1, 1, -3, np.nan, 5, -5, 7, 7]
+    correct_vals = [np.nan, 5, 5, 5, 5, -20, 1, 1, 1, -3, 0, 5, -5, 7, 7]
     for i, v in enumerate(val1):
         v1 = val1[i]
         if np.isnan(v1):
-            assert (np.isnan(correct_vals1[i]))
+            assert (np.isnan(correct_vals[i]))
         else:
-            assert v1 == correct_vals1[i]
-        v2 = val2[i]
-        if np.isnan(v2):
-            assert (np.isnan(correct_vals2[i]))
-        else:
-            assert v2 == correct_vals2[i]
+            assert v1 == correct_vals[i]
 
 
 def test_diff_single_value(es):
-    diff = ft.Feature([es['stores']['num_square_feet'], es['stores'][u'région_id']], primitive=Diff)
+    diff = ft.Feature([es['stores']['num_square_feet']], primitive=Diff)
     pandas_backend = PandasBackend(es, [diff])
     df = pandas_backend.calculate_all_features(instance_ids=[5],
                                                time_last=None)
@@ -322,8 +311,8 @@ def test_arithmetic_of_direct(es):
 
 # P TODO: rewrite this  test
 def test_arithmetic_of_transform(es):
-    diff1 = ft.Feature([es['log']['value'], es['log']['product_id']], primitive=Diff)
-    diff2 = ft.Feature([es['log']['value_2'], es['log']['product_id']], primitive=Diff)
+    diff1 = ft.Feature([es['log']['value']], primitive=Diff)
+    diff2 = ft.Feature([es['log']['value_2']], primitive=Diff)
 
     to_test = [(AddNumeric, [np.nan, 14., -7., 3.]),
                (SubtractNumeric, [np.nan, 6., -3., 1.]),
@@ -525,14 +514,14 @@ def isin_generate_name(self, base_feature_names):
 
 def test_isnull_feat(es):
     value = ft.Feature(es['log']['value'])
-    diff = ft.Feature([value, es['log']['session_id']], primitive=Diff)
+    diff = ft.Feature([value], primitive=Diff)
     isnull = ft.Feature(diff, primitive=IsNull)
     features = [isnull]
     df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15))
     # correct_vals_diff = [
     #     np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1, np.nan, np.nan, 5, np.nan, 7, 7]
-    correct_vals = [True, False, False, False, False, True, False, False,
-                    False, True, True, False, True, False, False]
+    correct_vals = [True, False, False, False, False, False, False, False,
+                    False, False, True, False, False, False, False]
     values = df[isnull.get_name()].values.tolist()
     assert correct_vals == values