Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates TimeSincePrevious and Diff Primitives #561

Merged
Merged 17 commits on May 23, 2019 (the author and the source/target branch names were not captured in this page extract).
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 5 additions & 5 deletions docs/source/automated_feature_engineering/primitives.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@ A second advantage of primitives is that they can be used to quickly enumerate m
agg_primitives=["mean", "max", "min", "std", "skew"],
trans_primitives=["time_since_previous"])

feature_matrix[["MEAN(sessions.time_since_previous_by_customer_id)",
"MAX(sessions.time_since_previous_by_customer_id)",
"MIN(sessions.time_since_previous_by_customer_id)",
"STD(sessions.time_since_previous_by_customer_id)",
"SKEW(sessions.time_since_previous_by_customer_id)"]]
feature_matrix[["MEAN(sessions.TIME_SINCE_PREVIOUS(session_start))",
"MAX(sessions.TIME_SINCE_PREVIOUS(session_start))",
"MIN(sessions.TIME_SINCE_PREVIOUS(session_start))",
"STD(sessions.TIME_SINCE_PREVIOUS(session_start))",
"SKEW(sessions.TIME_SINCE_PREVIOUS(session_start))"]]



Expand Down
77 changes: 24 additions & 53 deletions featuretools/primitives/standard/transform_primitive.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
Boolean,
Datetime,
DatetimeTimeIndex,
Id,
LatLong,
Numeric,
Ordinal,
Expand Down Expand Up @@ -56,42 +55,32 @@ def get_function(self):


class TimeSincePrevious(TransformPrimitive):
"""Compute the time in seconds since the previous instance of an entry.
"""Compute the time in seconds since the previous entry in a list.

Description:
Given a list of datetimes and a corresponding list of item ID values,
compute the time in seconds elapsed since the previous occurrence
of the item in the list. If an item is present only once, the result
for this item will be `NaN`. Similarly, the result for the first
occurrence of an item will always be `NaN`.
Given a list of datetimes, compute the time in seconds elapsed since
the previous item in the list. The result for the first item in the
list will always be `NaN`.

Examples:
>>> from datetime import datetime
>>> time_since_previous = TimeSincePrevious()
>>> dates = [datetime(2019, 3, 1, 0, 0, 0),
... datetime(2019, 3, 1, 0, 2, 0),
... datetime(2019, 3, 10, 0, 0, 0),
... datetime(2019, 3, 1, 0, 3, 0),
... datetime(2019, 3, 1, 0, 2, 30),
... datetime(2019, 3, 10, 0, 0, 50)]
>>> labels = ['A', 'A', 'B', 'A', 'B']
>>> time_since_previous(dates, labels).tolist()
[nan, 120.0, nan, 30.0, 50.0]
... datetime(2019, 3, 1, 0, 10, 0)]
>>> time_since_previous(dates).tolist()
[nan, 120.0, 60.0, -30.0, 450.0]
"""
name = "time_since_previous"
input_types = [DatetimeTimeIndex, Id]
input_types = [DatetimeTimeIndex]
return_type = Numeric

def generate_name(self, base_feature_names):
return u"time_since_previous_by_%s" % base_feature_names[1]

def get_function(self):
def pd_diff(base_array, group_array):
bf_name = 'base_feature'
groupby = 'groupby'
grouped_df = pd.DataFrame.from_dict({bf_name: base_array,
groupby: group_array})
grouped_df = grouped_df.groupby(groupby).diff()
return grouped_df[bf_name].apply(lambda x: x.total_seconds())
def pd_diff(base_array):
ser = pd.Series(base_array).diff()
return ser.apply(lambda x: x.total_seconds())
return pd_diff


Expand Down Expand Up @@ -437,55 +426,37 @@ def generate_name(self, base_feature_names):

class Diff(TransformPrimitive):
"""Compute the difference between the value in a list and the
previous value.
previous value in that list.

Description:
Given a list of values and a corresponding list of item ID values,
compute the difference from the previous occurrence of the item in
the list. If an item is present only once, the result for this item
will be `NaN`. Similarly, the result for the first occurrence of an
item will always be `NaN`. If the values are datetimes, the output
will be a timedelta.
Given a list of values, compute the difference from the previous
item in the list. The result for the first element of the list will
always be `NaN`. If the values are datetimes, the output will be a
timedelta.

Examples:
>>> diff = Diff()
>>> values = [1, 10, 3, 4, 15]
>>> labels = ['A', 'A', 'B', 'A', 'B']
>>> diff(values, labels).tolist()
[nan, 9.0, nan, -6.0, 12.0]
>>> diff(values).tolist()
[nan, 9.0, -7.0, 1.0, 11.0]

If values are datetimes, difference will be a timedelta

>>> from datetime import datetime
>>> diff = Diff()
>>> values = [datetime(2019, 3, 1, 0, 0, 0),
... datetime(2019, 3, 1, 0, 1, 0),
... datetime(2019, 3, 2, 0, 0, 0),
... datetime(2019, 3, 1, 0, 1, 30)]
>>> labels = ['A', 'A', 'B', 'A']
>>> diff(values, labels).tolist()
[NaT, Timedelta('0 days 00:01:00'), NaT, Timedelta('0 days 00:00:30')]
>>> diff(values).tolist()
[NaT, Timedelta('0 days 00:01:00'), Timedelta('0 days 00:00:30')]
"""
name = "diff"
input_types = [Numeric, Id]
input_types = [Numeric]
return_type = Numeric

def generate_name(self, base_feature_names):
base_features_str = base_feature_names[0] + u" by " + \
base_feature_names[1]
return u"DIFF(%s)" % (base_features_str)

def get_function(self):
def pd_diff(base_array, group_array):
bf_name = 'base_feature'
groupby = 'groupby'
grouped_df = pd.DataFrame.from_dict({bf_name: base_array,
groupby: group_array})
grouped_df = grouped_df.groupby(groupby).diff()
try:
return grouped_df[bf_name]
except KeyError:
return pd.Series([np.nan] * len(base_array))
def pd_diff(base_array):
return pd.Series(base_array).diff()
return pd_diff


Expand Down
10 changes: 5 additions & 5 deletions featuretools/tests/dfs_tests/test_deep_feature_synthesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,21 +175,21 @@ def test_handles_diff_entity_groupby(es):
dfs_obj = DeepFeatureSynthesis(target_entity_id='log',
entityset=es,
agg_primitives=[],
trans_primitives=[Diff])
groupby_trans_primitives=[Diff])

features = dfs_obj.build_features()
assert (feature_with_name(features, 'DIFF(value by session_id)'))
assert (feature_with_name(features, 'DIFF(value by product_id)'))
assert (feature_with_name(features, 'DIFF(value) by session_id'))
assert (feature_with_name(features, 'DIFF(value) by product_id'))


def test_handles_time_since_previous_entity_groupby(es):
dfs_obj = DeepFeatureSynthesis(target_entity_id='log',
entityset=es,
agg_primitives=[],
trans_primitives=[TimeSincePrevious])
groupby_trans_primitives=[TimeSincePrevious])

features = dfs_obj.build_features()
assert (feature_with_name(features, 'time_since_previous_by_session_id'))
assert (feature_with_name(features, 'TIME_SINCE_PREVIOUS(datetime) by session_id'))

# M TODO
# def test_handles_cumsum_entity_groupby(es):
Expand Down
33 changes: 11 additions & 22 deletions featuretools/tests/primitive_tests/test_transform_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,35 +116,24 @@ def test_make_trans_feat(es):

def test_diff(es):
value = ft.Feature(es['log']['value'])
customer_id_feat = ft.Feature(es['sessions']['customer_id'], entity=es['log'])
diff1 = ft.Feature([value, es['log']['session_id']], primitive=Diff)
diff2 = ft.Feature([value, customer_id_feat], primitive=Diff)
diff1 = ft.Feature([value], primitive=Diff)
ctduffy marked this conversation as resolved.
Show resolved Hide resolved

pandas_backend = PandasBackend(es, [diff1, diff2])
pandas_backend = PandasBackend(es, [diff1])
df = pandas_backend.calculate_all_features(instance_ids=range(15),
time_last=None)

val1 = df[diff1.get_name()].values.tolist()
val2 = df[diff2.get_name()].values.tolist()
correct_vals1 = [
np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1, np.nan, np.nan, 5, np.nan, 7, 7
]
correct_vals2 = [np.nan, 5, 5, 5, 5, -20, 1, 1, 1, -3, np.nan, 5, -5, 7, 7]
correct_vals = [np.nan, 5, 5, 5, 5, -20, 1, 1, 1, -3, 0, 5, -5, 7, 7]
for i, v in enumerate(val1):
v1 = val1[i]
if np.isnan(v1):
assert (np.isnan(correct_vals1[i]))
assert (np.isnan(correct_vals[i]))
else:
assert v1 == correct_vals1[i]
v2 = val2[i]
if np.isnan(v2):
assert (np.isnan(correct_vals2[i]))
else:
assert v2 == correct_vals2[i]
assert v1 == correct_vals[i]


def test_diff_single_value(es):
diff = ft.Feature([es['stores']['num_square_feet'], es['stores'][u'région_id']], primitive=Diff)
diff = ft.Feature([es['stores']['num_square_feet']], primitive=Diff)
ctduffy marked this conversation as resolved.
Show resolved Hide resolved
pandas_backend = PandasBackend(es, [diff])
df = pandas_backend.calculate_all_features(instance_ids=[5],
time_last=None)
Expand Down Expand Up @@ -322,8 +311,8 @@ def test_arithmetic_of_direct(es):

# P TODO: rewrite this test
def test_arithmetic_of_transform(es):
diff1 = ft.Feature([es['log']['value'], es['log']['product_id']], primitive=Diff)
diff2 = ft.Feature([es['log']['value_2'], es['log']['product_id']], primitive=Diff)
diff1 = ft.Feature([es['log']['value']], primitive=Diff)
diff2 = ft.Feature([es['log']['value_2']], primitive=Diff)

to_test = [(AddNumeric, [np.nan, 14., -7., 3.]),
(SubtractNumeric, [np.nan, 6., -3., 1.]),
Expand Down Expand Up @@ -525,14 +514,14 @@ def isin_generate_name(self, base_feature_names):

def test_isnull_feat(es):
value = ft.Feature(es['log']['value'])
diff = ft.Feature([value, es['log']['session_id']], primitive=Diff)
diff = ft.Feature([value], primitive=Diff)
ctduffy marked this conversation as resolved.
Show resolved Hide resolved
isnull = ft.Feature(diff, primitive=IsNull)
features = [isnull]
df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15))
# correct_vals_diff = [
# np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1, np.nan, np.nan, 5, np.nan, 7, 7]
correct_vals = [True, False, False, False, False, True, False, False,
False, True, True, False, True, False, False]
correct_vals = [True, False, False, False, False, False, False, False,
False, False, True, False, False, False, False]
values = df[isnull.get_name()].values.tolist()
assert correct_vals == values

Expand Down