alteryx · ctduffy · May 23, 2019 · May 22, 2019 · May 22, 2019 · May 22, 2019
diff --git a/docs/source/automated_feature_engineering/primitives.rst b/docs/source/automated_feature_engineering/primitives.rst
@@ -45,11 +45,11 @@ A second advantage of primitives is that they can be used to quickly enumerate m
                                           agg_primitives=["mean", "max", "min", "std", "skew"],
                                           trans_primitives=["time_since_previous"])
 
-    feature_matrix[["MEAN(sessions.time_since_previous_by_customer_id)",
-                    "MAX(sessions.time_since_previous_by_customer_id)",
-                    "MIN(sessions.time_since_previous_by_customer_id)",
-                    "STD(sessions.time_since_previous_by_customer_id)",
-                    "SKEW(sessions.time_since_previous_by_customer_id)"]]
+    feature_matrix[["MEAN(sessions.TIME_SINCE_PREVIOUS(session_start))",
+                    "MAX(sessions.TIME_SINCE_PREVIOUS(session_start))",
+                    "MIN(sessions.TIME_SINCE_PREVIOUS(session_start))",
+                    "STD(sessions.TIME_SINCE_PREVIOUS(session_start))",
+                    "SKEW(sessions.TIME_SINCE_PREVIOUS(session_start))"]]
 
 
 

diff --git a/featuretools/computational_backends/pandas_backend.py b/featuretools/computational_backends/pandas_backend.py
@@ -341,10 +341,11 @@ def _calculate_groupby_features(self, features, entity_frames):
 
         frame = entity_frames[entity_id]
 
+        for f in features:
+            set_default_column(frame, f)
+
         # handle when no data
         if frame.shape[0] == 0:
-            for f in features:
-                set_default_column(frame, f)
             return frame
 
         groupby = features[0].groupby.get_name()
@@ -369,10 +370,7 @@ def _calculate_groupby_features(self, features, entity_frames):
                     values = pd.Series(values, index=variable_data[0].index)
 
                 feature_name = f.get_name()
-                if feature_name in frame.columns:
-                    frame[feature_name].update(values)
-                else:
-                    frame[feature_name] = values
+                frame[feature_name].update(values)
 
         return frame
 

diff --git a/featuretools/primitives/standard/transform_primitive.py b/featuretools/primitives/standard/transform_primitive.py
@@ -13,7 +13,6 @@
     Boolean,
     Datetime,
     DatetimeTimeIndex,
-    Id,
     LatLong,
     Numeric,
     Ordinal,
@@ -56,42 +55,39 @@ def get_function(self):
 
 
 class TimeSincePrevious(TransformPrimitive):
-    """Compute the time in seconds since the previous instance of an entry.
+    """Compute the time since the previous entry in a list.
+
+    Args:
+        unit (str): Unit of time in which the result is expressed.
+            Defaults to seconds. Acceptable values:
+            years, months, days, hours, minutes, seconds, milliseconds, nanoseconds
 
     Description:
-        Given a list of datetimes and a corresponding list of item ID values,
-        compute the time in seconds elapsed since the previous occurrence
-        of the item in the list. If an item is present only once, the result
-        for this item will be `NaN`. Similarly, the result for the first
-        occurrence of an item will always be `NaN`.
+        Given a list of datetimes, compute the time elapsed since the previous
+        item in the list, expressed in `unit` (seconds by default). The result
+        for the first item in the list will always be `NaN`.
 
     Examples:
         >>> from datetime import datetime
         >>> time_since_previous = TimeSincePrevious()
         >>> dates = [datetime(2019, 3, 1, 0, 0, 0),
         ...          datetime(2019, 3, 1, 0, 2, 0),
-        ...          datetime(2019, 3, 10, 0, 0, 0),
+        ...          datetime(2019, 3, 1, 0, 3, 0),
         ...          datetime(2019, 3, 1, 0, 2, 30),
-        ...          datetime(2019, 3, 10, 0, 0, 50)]
-        >>> labels = ['A', 'A', 'B', 'A', 'B']
-        >>> time_since_previous(dates, labels).tolist()
-        [nan, 120.0, nan, 30.0, 50.0]
+        ...          datetime(2019, 3, 1, 0, 10, 0)]
+        >>> time_since_previous(dates).tolist()
+        [nan, 120.0, 60.0, -30.0, 450.0]
     """
     name = "time_since_previous"
-    input_types = [DatetimeTimeIndex, Id]
+    input_types = [DatetimeTimeIndex]
     return_type = Numeric
 
-    def generate_name(self, base_feature_names):
-        return u"time_since_previous_by_%s" % base_feature_names[1]
+    def __init__(self, unit="seconds"):
+        self.unit = unit.lower()
 
     def get_function(self):
-        def pd_diff(base_array, group_array):
-            bf_name = 'base_feature'
-            groupby = 'groupby'
-            grouped_df = pd.DataFrame.from_dict({bf_name: base_array,
-                                                 groupby: group_array})
-            grouped_df = grouped_df.groupby(groupby).diff()
-            return grouped_df[bf_name].apply(lambda x: x.total_seconds())
+        def pd_diff(values):
+            return convert_time_units(values.diff().apply(lambda x: x.total_seconds()), self.unit)
         return pd_diff
 
 
@@ -342,7 +338,7 @@ def word_counter(array):
 
 
 class TimeSince(TransformPrimitive):
-    """Calculates time in seconds from a value to a specified cutoff datetime.
+    """Calculates time from a value to a specified cutoff datetime.
 
     Args:
         unit (str): Defines the unit of time to count from.
@@ -437,55 +433,27 @@ def generate_name(self, base_feature_names):
 
 class Diff(TransformPrimitive):
     """Compute the difference between the value in a list and the
-    previous value.
+    previous value in that list.
 
     Description:
-        Given a list of values and a corresponding list of item ID values,
-        compute the difference from the previous occurrence of the item in
-        the list. If an item is present only once, the result for this item
-        will be `NaN`. Similarly, the result for the first occurrence of an
-        item will always be `NaN`. If the values are datetimes, the output
-        will be a timedelta.
+        Given a list of values, compute the difference from the previous
+        item in the list. The result for the first element of the list will
+        always be `NaN`. If the values are datetimes, the output will be a
+        timedelta.
 
     Examples:
         >>> diff = Diff()
         >>> values = [1, 10, 3, 4, 15]
-        >>> labels = ['A', 'A', 'B', 'A', 'B']
-        >>> diff(values, labels).tolist()
-        [nan, 9.0, nan, -6.0, 12.0]
-
-        If values are datetimes, difference will be a timedelta
-
-        >>> from datetime import datetime
-        >>> diff = Diff()
-        >>> values = [datetime(2019, 3, 1, 0, 0, 0),
-        ...          datetime(2019, 3, 1, 0, 1, 0),
-        ...          datetime(2019, 3, 2, 0, 0, 0),
-        ...          datetime(2019, 3, 1, 0, 1, 30)]
-        >>> labels = ['A', 'A', 'B', 'A']
-        >>> diff(values, labels).tolist()
-        [NaT, Timedelta('0 days 00:01:00'), NaT, Timedelta('0 days 00:00:30')]
+        >>> diff(values).tolist()
+        [nan, 9.0, -7.0, 1.0, 11.0]
     """
     name = "diff"
-    input_types = [Numeric, Id]
+    input_types = [Numeric]
     return_type = Numeric
 
-    def generate_name(self, base_feature_names):
-        base_features_str = base_feature_names[0] + u" by " + \
-            base_feature_names[1]
-        return u"DIFF(%s)" % (base_features_str)
-
     def get_function(self):
-        def pd_diff(base_array, group_array):
-            bf_name = 'base_feature'
-            groupby = 'groupby'
-            grouped_df = pd.DataFrame.from_dict({bf_name: base_array,
-                                                 groupby: group_array})
-            grouped_df = grouped_df.groupby(groupby).diff()
-            try:
-                return grouped_df[bf_name]
-            except KeyError:
-                return pd.Series([np.nan] * len(base_array))
+        def pd_diff(values):
+            return values.diff()
         return pd_diff
 
 

diff --git a/featuretools/tests/dfs_tests/test_deep_feature_synthesis.py b/featuretools/tests/dfs_tests/test_deep_feature_synthesis.py
@@ -175,21 +175,21 @@ def test_handles_diff_entity_groupby(es):
     dfs_obj = DeepFeatureSynthesis(target_entity_id='log',
                                    entityset=es,
                                    agg_primitives=[],
-                                   trans_primitives=[Diff])
+                                   groupby_trans_primitives=[Diff])
 
     features = dfs_obj.build_features()
-    assert (feature_with_name(features, 'DIFF(value by session_id)'))
-    assert (feature_with_name(features, 'DIFF(value by product_id)'))
+    assert (feature_with_name(features, 'DIFF(value) by session_id'))
+    assert (feature_with_name(features, 'DIFF(value) by product_id'))
 
 
 def test_handles_time_since_previous_entity_groupby(es):
     dfs_obj = DeepFeatureSynthesis(target_entity_id='log',
                                    entityset=es,
                                    agg_primitives=[],
-                                   trans_primitives=[TimeSincePrevious])
+                                   groupby_trans_primitives=[TimeSincePrevious])
 
     features = dfs_obj.build_features()
-    assert (feature_with_name(features, 'time_since_previous_by_session_id'))
+    assert (feature_with_name(features, 'TIME_SINCE_PREVIOUS(datetime) by session_id'))
 
 # M TODO
 # def test_handles_cumsum_entity_groupby(es):

diff --git a/featuretools/tests/primitive_tests/test_transform_features.py b/featuretools/tests/primitive_tests/test_transform_features.py
@@ -117,8 +117,8 @@ def test_make_trans_feat(es):
 def test_diff(es):
     value = ft.Feature(es['log']['value'])
     customer_id_feat = ft.Feature(es['sessions']['customer_id'], entity=es['log'])
-    diff1 = ft.Feature([value, es['log']['session_id']], primitive=Diff)
-    diff2 = ft.Feature([value, customer_id_feat], primitive=Diff)
+    diff1 = ft.Feature(value, groupby=es['log']['session_id'], primitive=Diff)
+    diff2 = ft.Feature(value, groupby=customer_id_feat, primitive=Diff)
 
     pandas_backend = PandasBackend(es, [diff1, diff2])
     df = pandas_backend.calculate_all_features(instance_ids=range(15),
@@ -144,7 +144,7 @@ def test_diff(es):
 
 
 def test_diff_single_value(es):
-    diff = ft.Feature([es['stores']['num_square_feet'], es['stores'][u'région_id']], primitive=Diff)
+    diff = ft.Feature(es['stores']['num_square_feet'], groupby=es['stores'][u'région_id'], primitive=Diff)
     pandas_backend = PandasBackend(es, [diff])
     df = pandas_backend.calculate_all_features(instance_ids=[5],
                                                time_last=None)
@@ -322,8 +322,8 @@ def test_arithmetic_of_direct(es):
 
 # P TODO: rewrite this  test
 def test_arithmetic_of_transform(es):
-    diff1 = ft.Feature([es['log']['value'], es['log']['product_id']], primitive=Diff)
-    diff2 = ft.Feature([es['log']['value_2'], es['log']['product_id']], primitive=Diff)
+    diff1 = ft.Feature([es['log']['value']], primitive=Diff)
+    diff2 = ft.Feature([es['log']['value_2']], primitive=Diff)
 
     to_test = [(AddNumeric, [np.nan, 14., -7., 3.]),
                (SubtractNumeric, [np.nan, 6., -3., 1.]),
@@ -525,7 +525,7 @@ def isin_generate_name(self, base_feature_names):
 
 def test_isnull_feat(es):
     value = ft.Feature(es['log']['value'])
-    diff = ft.Feature([value, es['log']['session_id']], primitive=Diff)
+    diff = ft.Feature(value, groupby=es['log']['session_id'], primitive=Diff)
     isnull = ft.Feature(diff, primitive=IsNull)
     features = [isnull]
     df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15))