Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate transform features of direct features #623

Merged
merged 27 commits into from Jul 25, 2019
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
387c0da
move transform to last step of dfs without creating duplicate features
zhxt95 Mar 28, 2018
0091f53
build IdentityFeature at the beginning of dfs
zhxt95 Mar 29, 2018
85c59f7
modify test cases to fit new changes
zhxt95 Mar 29, 2018
27e4f91
Add seed features after aggregations
CJStadler Jun 24, 2019
168825d
Add tests
CJStadler Jun 24, 2019
0d8326d
Refactor commutative test
CJStadler Jun 24, 2019
753c62a
Allow trans of directs with different paths
CJStadler Jun 24, 2019
e5b5707
Fix imports
CJStadler Jun 24, 2019
6fa37e8
Merge branch 'master' into dfs-trans-after-direct
CJStadler Jun 24, 2019
be391de
Update changelog
CJStadler Jun 24, 2019
29ab964
Implement "!=" for RelationshipPath
CJStadler Jun 25, 2019
213e83a
Merge branch 'master' into dfs-trans-after-direct
kmax12 Jun 25, 2019
7c1f1b0
Merge branch 'master' into dfs-trans-after-direct
CJStadler Jun 26, 2019
701c214
Merge branch 'master' into dfs-trans-after-direct
CJStadler Jun 27, 2019
1df9bc6
Merge branch 'master' into dfs-trans-after-direct
CJStadler Jun 27, 2019
4a62052
Merge branch 'master' into dfs-trans-after-direct
CJStadler Jul 8, 2019
5e34f6c
Don't make aggregations of direct features to target
CJStadler Jul 9, 2019
c91e21b
Merge branch 'master' into dfs-trans-after-direct
CJStadler Jul 9, 2019
cdaaf48
Merge branch 'master' into dfs-trans-after-direct
CJStadler Jul 17, 2019
f531f16
Merge branch 'master' into dfs-trans-after-direct
CJStadler Jul 18, 2019
6efc875
Merge branch 'master' into dfs-trans-after-direct
CJStadler Jul 19, 2019
919c3aa
Build transforms in two steps
CJStadler Jul 23, 2019
a278870
Merge branch 'master' into dfs-trans-after-direct
CJStadler Jul 23, 2019
9eb2833
Require direct features in match instead of after
CJStadler Jul 24, 2019
2c2489e
Use _get_matching_inputs in _build_agg_features
CJStadler Jul 24, 2019
2a81465
Rename to require_direct_input
CJStadler Jul 24, 2019
7dba0a8
Merge branch 'master' into dfs-trans-after-direct
CJStadler Jul 24, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/source/changelog.rst
Expand Up @@ -5,6 +5,7 @@ Changelog
**Future Release**
* Enhancements
* Added drop_first as param in encode_features (:pr:`647`)
* Generate transform features of direct features (:pr:`623`)
* Fixes
* Fix performance regression in DFS (:pr:`637`)
* Fix deserialization of feature relationship path (:pr:`665`)
Expand All @@ -26,8 +27,7 @@ Changelog

Thanks to the following people for contributing to this release:
:user:`ayushpatidar`, :user:`CJStadler`, :user:`gsheni`,
:user:`jeff-hernandez`, :user:`kmax12`, :user:`rwedge`

:user:`jeff-hernandez`, :user:`kmax12`, :user:`rwedge`, :user:`zhxt95`

**v0.9.1 July 3, 2019**
* Enhancements
Expand Down
3 changes: 3 additions & 0 deletions featuretools/entityset/relationship.py
Expand Up @@ -161,6 +161,9 @@ def __eq__(self, other):
return isinstance(other, RelationshipPath) and \
self._relationships_with_direction == other._relationships_with_direction

def __ne__(self, other):
    """Return the negation of ``self == other``.

    Defined explicitly so that ``!=`` always agrees with the ``==``
    comparison implemented by ``__eq__``.
    """
    is_equal = self == other
    return not is_equal

def __repr__(self):
if self._relationships_with_direction:
path = '%s.%s' % (next(self.entities()), self.name)
Expand Down
129 changes: 92 additions & 37 deletions featuretools/synthesis/deep_feature_synthesis.py
Expand Up @@ -286,7 +286,12 @@ def _run_dfs(self, entity, relationship_path, all_features, max_depth):
all_features[entity.id] = {}

"""
Step 1 - Recursively build features for each entity in a backward relationship
Step 1 - Create identity features
"""
self._add_identity_features(all_features, entity)

"""
Step 2 - Recursively build features for each entity in a backward relationship
"""

backward_entities = self.es.get_backward_entities(entity.id)
Expand All @@ -311,7 +316,7 @@ def _run_dfs(self, entity, relationship_path, all_features, max_depth):
max_depth=new_max_depth)

"""
Step 2 - Create agg_feat features for all deep backward relationships
Step 3 - Create aggregation features for all deep backward relationships
"""
backward_entities = self.es.get_backward_entities(entity.id, deep=True)
for b_entity_id, sub_relationship_path in backward_entities:
Expand All @@ -329,13 +334,12 @@ def _run_dfs(self, entity, relationship_path, all_features, max_depth):
relationship_path=sub_relationship_path)

"""
Step 3 - Create Transform features
Step 4 - Create transform features of identity and aggregation features
"""
self._build_transform_features(
all_features, entity, max_depth=max_depth)
self._build_transform_features(all_features, entity, max_depth=max_depth)

"""
Step 4 - Recursively build features for each entity in a forward relationship
Step 5 - Recursively build features for each entity in a forward relationship
"""
forward_entities = self.es.get_forward_entities(entity.id)
for f_entity_id, sub_relationship_path in forward_entities:
Expand All @@ -359,7 +363,7 @@ def _run_dfs(self, entity, relationship_path, all_features, max_depth):
max_depth=new_max_depth)

"""
Step 5 - Create dfeat features for forward relationships
Step 6 - Create direct features for forward relationships
"""
forward_entities = self.es.get_forward_entities(entity.id)
for f_entity_id, sub_relationship_path in forward_entities:
Expand All @@ -375,6 +379,11 @@ def _run_dfs(self, entity, relationship_path, all_features, max_depth):
relationship_path=sub_relationship_path,
max_depth=max_depth)

"""
Step 7 - Create transform features of direct features
"""
self._build_transform_features(all_features, entity, max_depth=max_depth, direct_only=True)

# now that all features are added, build where clauses
self._build_where_clauses(all_features, entity)

Expand Down Expand Up @@ -457,7 +466,7 @@ def _build_where_clauses(self, all_features, entity):
for val in variable.interesting_values:
self.where_clauses[entity.id].add(feat == val)

def _build_transform_features(self, all_features, entity, max_depth=0):
def _build_transform_features(self, all_features, entity, max_depth=0, direct_only=False):
"""Creates trans_features for all the variables in an entity

Args:
Expand All @@ -471,21 +480,18 @@ def _build_transform_features(self, all_features, entity, max_depth=0):
if max_depth is not None:
new_max_depth = max_depth - 1

self._add_identity_features(all_features, entity)

for trans_prim in self.trans_primitives:
# if multiple input_types, only use first one for DFS
input_types = trans_prim.input_types
if type(input_types[0]) == list:
input_types = input_types[0]

features = self._features_by_type(all_features=all_features,
entity=entity,
max_depth=new_max_depth,
variable_type=set(input_types))

matching_inputs = match(input_types, features,
commutative=trans_prim.commutative)
matching_inputs = self._get_matching_inputs(all_features,
entity,
new_max_depth,
input_types,
trans_prim,
direct_only=direct_only)

for matching_input in matching_inputs:
if all(bf.number_output_features == 1 for bf in matching_input):
Expand All @@ -505,12 +511,13 @@ def _build_transform_features(self, all_features, entity, max_depth=0):
input_types = input_types[0]
input_types.append(Id)

features = self._features_by_type(all_features=all_features,
entity=entity,
max_depth=new_max_depth,
variable_type=set(input_types))
matching_inputs = match(input_types, features,
commutative=groupby_prim.commutative)
matching_inputs = self._get_matching_inputs(all_features,
entity,
new_max_depth,
input_types,
groupby_prim,
direct_only=direct_only)

for matching_input in matching_inputs:
if all(bf.number_output_features == 1 for bf in matching_input):
new_f = GroupByTransformFeature(list(matching_input[:-1]),
Expand Down Expand Up @@ -558,16 +565,18 @@ def _build_agg_features(self, all_features, parent_entity, child_entity,
if type(input_types[0]) == list:
input_types = input_types[0]

features = self._features_by_type(all_features=all_features,
entity=child_entity,
max_depth=new_max_depth,
variable_type=set(input_types))
def feature_filter(f):
# Remove direct features of parent entity and features in relationship path.
return (not _direct_of_entity(f, parent_entity)) \
CJStadler marked this conversation as resolved.
Show resolved Hide resolved
and not self._feature_in_relationship_path(relationship_path, f)

matching_inputs = self._get_matching_inputs(all_features,
child_entity,
new_max_depth,
input_types,
agg_prim,
feature_filter=feature_filter)

# remove features in relationship path
features = [f for f in features
if not self._feature_in_relationship_path(relationship_path, f)]
matching_inputs = match(input_types, features,
commutative=agg_prim.commutative)
wheres = list(self.where_clauses[child_entity.id])

for matching_input in matching_inputs:
Expand Down Expand Up @@ -651,6 +660,28 @@ def _feature_in_relationship_path(self, relationship_path, feature):

return False

def _get_matching_inputs(self, all_features, entity, max_depth, input_types,
                         primitive, direct_only=False, feature_filter=None):
    """Collect the input combinations on ``entity`` usable by ``primitive``.

    Args:
        all_features: Passed through to ``self._features_by_type``.
        entity: Entity whose features are candidate inputs.
        max_depth: Passed through to ``self._features_by_type``.
        input_types: Input types to match; also used (as a set) to select
            candidate features by variable type.
        primitive: Primitive whose ``commutative`` attribute is forwarded
            to ``match``.
        direct_only (bool): Forwarded to ``match`` as ``need_direct``. When
            True, combinations for which ``_all_direct_and_same_path`` is
            true are also removed.
        feature_filter (callable, optional): Predicate applied to each
            candidate feature; only features for which it is truthy are kept.

    Returns:
        The result of ``match`` when ``direct_only`` is False, otherwise a
        set of the remaining input combinations.
    """
    candidates = self._features_by_type(all_features=all_features,
                                        entity=entity,
                                        max_depth=max_depth,
                                        variable_type=set(input_types))

    if feature_filter:
        candidates = [feat for feat in candidates if feature_filter(feat)]

    matched = match(input_types, candidates,
                    commutative=primitive.commutative,
                    need_direct=direct_only)

    if not direct_only:
        return matched

    # Don't create trans features of inputs which are all direct
    # features with the same relationship_path.
    return {combo for combo in matched
            if not _all_direct_and_same_path(combo)}


def check_stacking(primitive, inputs):
"""checks if features in inputs can be used with supplied primitive
Expand Down Expand Up @@ -702,12 +733,13 @@ def match_by_type(features, t):
return matches


def match(input_types, features, replace=False, commutative=False):
def match(input_types, features, replace=False, commutative=False, need_direct=False):
to_match = input_types[0]
matches = match_by_type(features, to_match)

if len(input_types) == 1:
return [(m,) for m in matches]
return [(m,) for m in matches
if (not need_direct or isinstance(m, DirectFeature))]

matching_inputs = set([])

Expand All @@ -717,7 +749,10 @@ def match(input_types, features, replace=False, commutative=False):
if not replace:
copy = [c for c in copy if c.unique_name() != m.unique_name()]

rest = match(input_types[1:], copy, replace)
# If we need a DirectFeature and this is not a DirectFeature then one of the rest must be.
still_need_direct = need_direct and not isinstance(m, DirectFeature)
rest = match(input_types[1:], copy, replace, need_direct=still_need_direct)

for r in rest:
new_match = [m] + list(r)

Expand All @@ -730,9 +765,9 @@ def match(input_types, features, replace=False, commutative=False):
matching_inputs.add(new_match)

if commutative:
return set([tuple(sorted(s, key=lambda x: x.get_name().lower())) for s in matching_inputs])
matching_inputs = {tuple(sorted(s, key=lambda x: x.get_name().lower())) for s in matching_inputs}

return set([tuple(s) for s in matching_inputs])
return matching_inputs


def handle_primitive(primitive):
Expand All @@ -757,3 +792,23 @@ def check_trans_primitive(primitive):
"groupby_trans_primitives is not a transform "
"primitive".format(type(primitive)))
return primitive


def _all_direct_and_same_path(input_features):
    """Return True if every input is a DirectFeature and
    ``_features_have_same_path`` holds for the inputs."""
    for feature in input_features:
        if not isinstance(feature, DirectFeature):
            return False

    return _features_have_same_path(input_features)


def _features_have_same_path(input_features):
    """Return True if all features share the same ``relationship_path``.

    Args:
        input_features: Indexable sequence (list or tuple) of objects
            exposing a ``relationship_path`` attribute.

    Returns:
        bool: True when every feature's ``relationship_path`` compares equal
        to the first feature's. An empty sequence is vacuously True
        (previously this raised IndexError).
    """
    if not input_features:
        return True

    path = input_features[0].relationship_path

    # Keep the ``!=`` comparison so the path's own ``__ne__`` is what decides.
    return not any(f.relationship_path != path for f in input_features[1:])


def _direct_of_entity(feature, parent_entity):
    """Return True if ``feature`` is a DirectFeature whose ``parent_entity``
    has the same ``id`` as the given ``parent_entity``."""
    if not isinstance(feature, DirectFeature):
        return False

    return feature.parent_entity.id == parent_entity.id
89 changes: 77 additions & 12 deletions featuretools/tests/synthesis/test_deep_feature_synthesis.py
Expand Up @@ -17,6 +17,7 @@
AddNumeric,
Count,
Diff,
Equal,
Hour,
IsIn,
Last,
Expand Down Expand Up @@ -363,7 +364,7 @@ def test_seed_features(es):
assert session_agg.get_name() in [f.get_name() for f in features]


def test_seed_features_added_with_identity_features(es):
def test_does_not_make_agg_of_direct_of_target_entity(es):
count_sessions = ft.Feature(es['sessions']["id"], parent_entity=es['customers'], primitive=Count)
dfs_obj = DeepFeatureSynthesis(target_entity_id='customers',
entityset=es,
Expand All @@ -375,6 +376,7 @@ def test_seed_features_added_with_identity_features(es):
# this feature is meaningless because customers.COUNT(sessions) is already defined on
# the customers entity
assert not feature_with_name(features, 'LAST(sessions.customers.COUNT(sessions))')
assert not feature_with_name(features, 'LAST(sessions.customers.age)')


def test_dfs_builds_on_seed_features_more_than_max_depth(es):
Expand Down Expand Up @@ -582,18 +584,18 @@ def test_commutative(es):
trans_primitives=[AddNumeric],
max_depth=3)
feats = dfs_obj.build_features()
num_add_feats = 0
num_add_as_base_feat = 0

for feat in feats:
if isinstance(feat.primitive, AddNumeric):
num_add_feats += 1
for base_feat in feat.base_features:
if isinstance(base_feat.primitive, AddNumeric):
num_add_as_base_feat += 1
add_feats = [f for f in feats if isinstance(f.primitive, AddNumeric)]

assert num_add_feats == 3
assert num_add_as_base_feat == 9
# Check that there are no two AddNumeric features with the same base
# features.
unordered_args = set()
for f in add_feats:
arg1, arg2 = f.base_features
args_set = frozenset({arg1.unique_name(), arg2.unique_name()})
unordered_args.add(args_set)

assert len(add_feats) == len(unordered_args)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checking the count doesn't communicate why it should have that value. I changed this to instead test that no features are created which have the same two inputs.



def test_transform_consistency():
Expand Down Expand Up @@ -727,7 +729,7 @@ def test_makes_agg_features_along_multiple_paths(diamond_es):
assert feature_with_name(features, 'MEAN(stores.transactions.amount)')


def test_makes_direct_features_along_multiple_paths(games_es):
def test_makes_direct_features_through_multiple_relationships(games_es):
dfs_obj = DeepFeatureSynthesis(target_entity_id='games',
entityset=games_es,
agg_primitives=['mean'],
Expand All @@ -742,3 +744,66 @@ def test_makes_direct_features_along_multiple_paths(games_es):
f = 'teams[%s_team_id].MEAN(games[%s_team_id].%s_team_score)' \
% (forward, backward, var)
assert feature_with_name(features, f)


def test_makes_direct_features_along_multiple_paths(diamond_es):
    """Both 'customers.regions.name' and 'stores.regions.name' are built
    when targeting ``transactions`` with no primitives."""
    synthesizer = DeepFeatureSynthesis(target_entity_id='transactions',
                                       entityset=diamond_es,
                                       max_depth=3,
                                       agg_primitives=[],
                                       trans_primitives=[])
    built = synthesizer.build_features()

    for expected_name in ('customers.regions.name', 'stores.regions.name'):
        assert feature_with_name(built, expected_name)


def test_does_not_make_trans_of_single_direct_feature(es):
    """WEEKDAY is not applied on top of a single direct feature; the direct
    feature of the WEEKDAY transform is expected instead."""
    synthesizer = DeepFeatureSynthesis(target_entity_id='sessions',
                                       entityset=es,
                                       agg_primitives=[],
                                       trans_primitives=['weekday'],
                                       max_depth=2)
    built = synthesizer.build_features()

    # Transform of the direct feature must be absent ...
    trans_of_direct = feature_with_name(built, 'WEEKDAY(customers.signup_date)')
    assert not trans_of_direct

    # ... while the direct feature of the transform must be present.
    direct_of_trans = feature_with_name(built, 'customers.WEEKDAY(signup_date)')
    assert direct_of_trans


def test_makes_trans_of_multiple_direct_features(diamond_es):
    """Check which ``Equal`` features are and are not built on
    ``transactions`` when direct features are among the inputs."""
    synthesizer = DeepFeatureSynthesis(target_entity_id='transactions',
                                       entityset=diamond_es,
                                       agg_primitives=['mean'],
                                       trans_primitives=[Equal],
                                       max_depth=4)
    built = synthesizer.build_features()

    expected_present = [
        # Make trans of direct and non-direct
        'amount = stores.MEAN(transactions.amount)',
        # Make trans of direct features on different entities
        'customers.MEAN(transactions.amount) = stores.square_ft',
        # Make trans of direct features on same entity with different paths.
        'customers.regions.name = stores.regions.name',
        # The naming of the below is confusing but this is a direct feature
        # of a transform.
        'stores.MEAN(transactions.amount) = square_ft',
    ]
    expected_absent = [
        # Don't make trans of direct features with same path.
        'stores.square_ft = stores.MEAN(transactions.amount)',
        'stores.MEAN(transactions.amount) = stores.square_ft',
    ]

    for feature_name in expected_present:
        assert feature_with_name(built, feature_name)

    for feature_name in expected_absent:
        assert not feature_with_name(built, feature_name)


def test_makes_direct_of_agg_of_trans_on_target(es):
    """'sessions.MEAN(log.ABSOLUTE(value))' is built when targeting ``log``
    with the ``mean`` and ``Absolute`` primitives at max_depth 3."""
    synthesizer = DeepFeatureSynthesis(target_entity_id='log',
                                       entityset=es,
                                       agg_primitives=['mean'],
                                       trans_primitives=[Absolute],
                                       max_depth=3)
    built = synthesizer.build_features()

    expected_name = 'sessions.MEAN(log.ABSOLUTE(value))'
    assert feature_with_name(built, expected_name)