Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates TimeSincePrevious and Diff Primitives #561

Merged
merged 17 commits into from
May 23, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 5 additions & 5 deletions docs/source/automated_feature_engineering/primitives.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@ A second advantage of primitives is that they can be used to quickly enumerate m
agg_primitives=["mean", "max", "min", "std", "skew"],
trans_primitives=["time_since_previous"])

feature_matrix[["MEAN(sessions.time_since_previous_by_customer_id)",
"MAX(sessions.time_since_previous_by_customer_id)",
"MIN(sessions.time_since_previous_by_customer_id)",
"STD(sessions.time_since_previous_by_customer_id)",
"SKEW(sessions.time_since_previous_by_customer_id)"]]
feature_matrix[["MEAN(sessions.TIME_SINCE_PREVIOUS(session_start))",
"MAX(sessions.TIME_SINCE_PREVIOUS(session_start))",
"MIN(sessions.TIME_SINCE_PREVIOUS(session_start))",
"STD(sessions.TIME_SINCE_PREVIOUS(session_start))",
"SKEW(sessions.TIME_SINCE_PREVIOUS(session_start))"]]



Expand Down
10 changes: 4 additions & 6 deletions featuretools/computational_backends/pandas_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,10 +341,11 @@ def _calculate_groupby_features(self, features, entity_frames):

frame = entity_frames[entity_id]

for f in features:
set_default_column(frame, f)

# handle when no data
if frame.shape[0] == 0:
for f in features:
set_default_column(frame, f)
return frame

groupby = features[0].groupby.get_name()
Expand All @@ -369,10 +370,7 @@ def _calculate_groupby_features(self, features, entity_frames):
values = pd.Series(values, index=variable_data[0].index)

feature_name = f.get_name()
if feature_name in frame.columns:
frame[feature_name].update(values)
else:
frame[feature_name] = values
frame[feature_name].update(values)

return frame

Expand Down
90 changes: 29 additions & 61 deletions featuretools/primitives/standard/transform_primitive.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
Boolean,
Datetime,
DatetimeTimeIndex,
Id,
LatLong,
Numeric,
Ordinal,
Expand Down Expand Up @@ -56,42 +55,39 @@ def get_function(self):


class TimeSincePrevious(TransformPrimitive):
"""Compute the time in seconds since the previous instance of an entry.
"""Compute the time since the previous entry in a list.

Args:
unit (str): Unit of time in which the elapsed time is reported
(case-insensitive). Defaults to seconds. Acceptable values:
years, months, days, hours, minutes, seconds, milliseconds, nanoseconds

Description:
Given a list of datetimes and a corresponding list of item ID values,
compute the time in seconds elapsed since the previous occurrence
of the item in the list. If an item is present only once, the result
for this item will be `NaN`. Similarly, the result for the first
occurrence of an item will always be `NaN`.
Given a list of datetimes, compute the time elapsed since the previous
item in the list, expressed in `unit` (seconds by default). The result
for the first item in the list will always be `NaN`.

Examples:
>>> from datetime import datetime
>>> time_since_previous = TimeSincePrevious()
>>> dates = [datetime(2019, 3, 1, 0, 0, 0),
... datetime(2019, 3, 1, 0, 2, 0),
... datetime(2019, 3, 10, 0, 0, 0),
... datetime(2019, 3, 1, 0, 3, 0),
... datetime(2019, 3, 1, 0, 2, 30),
... datetime(2019, 3, 10, 0, 0, 50)]
>>> labels = ['A', 'A', 'B', 'A', 'B']
>>> time_since_previous(dates, labels).tolist()
[nan, 120.0, nan, 30.0, 50.0]
... datetime(2019, 3, 1, 0, 10, 0)]
>>> time_since_previous(dates).tolist()
[nan, 120.0, 60.0, -30.0, 450.0]
"""
name = "time_since_previous"
input_types = [DatetimeTimeIndex, Id]
input_types = [DatetimeTimeIndex]
return_type = Numeric

def generate_name(self, base_feature_names):
return u"time_since_previous_by_%s" % base_feature_names[1]
def __init__(self, unit="seconds"):
self.unit = unit.lower()

def get_function(self):
def pd_diff(base_array, group_array):
bf_name = 'base_feature'
groupby = 'groupby'
grouped_df = pd.DataFrame.from_dict({bf_name: base_array,
groupby: group_array})
grouped_df = grouped_df.groupby(groupby).diff()
return grouped_df[bf_name].apply(lambda x: x.total_seconds())
def pd_diff(values):
return convert_time_units(values.diff().apply(lambda x: x.total_seconds()), self.unit)
return pd_diff


Expand Down Expand Up @@ -342,7 +338,7 @@ def word_counter(array):


class TimeSince(TransformPrimitive):
"""Calculates time in seconds from a value to a specified cutoff datetime.
"""Calculates time from a value to a specified cutoff datetime.

Args:
unit (str): Defines the unit of time to count from.
Expand Down Expand Up @@ -437,55 +433,27 @@ def generate_name(self, base_feature_names):

class Diff(TransformPrimitive):
"""Compute the difference between the value in a list and the
previous value.
previous value in that list.

Description:
Given a list of values and a corresponding list of item ID values,
compute the difference from the previous occurrence of the item in
the list. If an item is present only once, the result for this item
will be `NaN`. Similarly, the result for the first occurrence of an
item will always be `NaN`. If the values are datetimes, the output
will be a timedelta.
Given a list of values, compute the difference from the previous
item in the list. The result for the first element of the list will
always be `NaN`. If the values are datetimes, the output will be a
timedelta.

Examples:
>>> diff = Diff()
>>> values = [1, 10, 3, 4, 15]
>>> labels = ['A', 'A', 'B', 'A', 'B']
>>> diff(values, labels).tolist()
[nan, 9.0, nan, -6.0, 12.0]

If values are datetimes, difference will be a timedelta

>>> from datetime import datetime
>>> diff = Diff()
>>> values = [datetime(2019, 3, 1, 0, 0, 0),
... datetime(2019, 3, 1, 0, 1, 0),
... datetime(2019, 3, 2, 0, 0, 0),
... datetime(2019, 3, 1, 0, 1, 30)]
>>> labels = ['A', 'A', 'B', 'A']
>>> diff(values, labels).tolist()
[NaT, Timedelta('0 days 00:01:00'), NaT, Timedelta('0 days 00:00:30')]
>>> diff(values).tolist()
[nan, 9.0, -7.0, 1.0, 11.0]
"""
name = "diff"
input_types = [Numeric, Id]
input_types = [Numeric]
return_type = Numeric

def generate_name(self, base_feature_names):
base_features_str = base_feature_names[0] + u" by " + \
base_feature_names[1]
return u"DIFF(%s)" % (base_features_str)

def get_function(self):
def pd_diff(base_array, group_array):
bf_name = 'base_feature'
groupby = 'groupby'
grouped_df = pd.DataFrame.from_dict({bf_name: base_array,
groupby: group_array})
grouped_df = grouped_df.groupby(groupby).diff()
try:
return grouped_df[bf_name]
except KeyError:
return pd.Series([np.nan] * len(base_array))
def pd_diff(values):
return values.diff()
return pd_diff


Expand Down
10 changes: 5 additions & 5 deletions featuretools/tests/dfs_tests/test_deep_feature_synthesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,21 +175,21 @@ def test_handles_diff_entity_groupby(es):
dfs_obj = DeepFeatureSynthesis(target_entity_id='log',
entityset=es,
agg_primitives=[],
trans_primitives=[Diff])
groupby_trans_primitives=[Diff])

features = dfs_obj.build_features()
assert (feature_with_name(features, 'DIFF(value by session_id)'))
assert (feature_with_name(features, 'DIFF(value by product_id)'))
assert (feature_with_name(features, 'DIFF(value) by session_id'))
assert (feature_with_name(features, 'DIFF(value) by product_id'))


def test_handles_time_since_previous_entity_groupby(es):
dfs_obj = DeepFeatureSynthesis(target_entity_id='log',
entityset=es,
agg_primitives=[],
trans_primitives=[TimeSincePrevious])
groupby_trans_primitives=[TimeSincePrevious])

features = dfs_obj.build_features()
assert (feature_with_name(features, 'time_since_previous_by_session_id'))
assert (feature_with_name(features, 'TIME_SINCE_PREVIOUS(datetime) by session_id'))

# M TODO
# def test_handles_cumsum_entity_groupby(es):
Expand Down
12 changes: 6 additions & 6 deletions featuretools/tests/primitive_tests/test_transform_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,8 @@ def test_make_trans_feat(es):
def test_diff(es):
value = ft.Feature(es['log']['value'])
customer_id_feat = ft.Feature(es['sessions']['customer_id'], entity=es['log'])
diff1 = ft.Feature([value, es['log']['session_id']], primitive=Diff)
diff2 = ft.Feature([value, customer_id_feat], primitive=Diff)
diff1 = ft.Feature(value, groupby=es['log']['session_id'], primitive=Diff)
diff2 = ft.Feature(value, groupby=customer_id_feat, primitive=Diff)

pandas_backend = PandasBackend(es, [diff1, diff2])
df = pandas_backend.calculate_all_features(instance_ids=range(15),
Expand All @@ -144,7 +144,7 @@ def test_diff(es):


def test_diff_single_value(es):
diff = ft.Feature([es['stores']['num_square_feet'], es['stores'][u'région_id']], primitive=Diff)
diff = ft.Feature(es['stores']['num_square_feet'], groupby=es['stores'][u'région_id'], primitive=Diff)
pandas_backend = PandasBackend(es, [diff])
df = pandas_backend.calculate_all_features(instance_ids=[5],
time_last=None)
Expand Down Expand Up @@ -322,8 +322,8 @@ def test_arithmetic_of_direct(es):

# P TODO: rewrite this test
def test_arithmetic_of_transform(es):
diff1 = ft.Feature([es['log']['value'], es['log']['product_id']], primitive=Diff)
diff2 = ft.Feature([es['log']['value_2'], es['log']['product_id']], primitive=Diff)
diff1 = ft.Feature([es['log']['value']], primitive=Diff)
diff2 = ft.Feature([es['log']['value_2']], primitive=Diff)

to_test = [(AddNumeric, [np.nan, 14., -7., 3.]),
(SubtractNumeric, [np.nan, 6., -3., 1.]),
Expand Down Expand Up @@ -525,7 +525,7 @@ def isin_generate_name(self, base_feature_names):

def test_isnull_feat(es):
value = ft.Feature(es['log']['value'])
diff = ft.Feature([value, es['log']['session_id']], primitive=Diff)
diff = ft.Feature(value, groupby=es['log']['session_id'], primitive=Diff)
isnull = ft.Feature(diff, primitive=IsNull)
features = [isnull]
df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15))
Expand Down