Skip to content

Commit

Permalink
Generate transform features of direct features (#623)
Browse files Browse the repository at this point in the history
Transform features are now built twice: first on identity and aggregation
features, then on direct features. We do not create transforms of single
direct features, or when all inputs are direct features with the same
relationship path. This would be redundant as
`WEEKDAY(customers.signup_date)` is equivalent to
`customers.WEEKDAY(signup_date)`, which should already have been
calculated.
  • Loading branch information
CJStadler committed Jul 25, 2019
1 parent eabad43 commit c583ff1
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 51 deletions.
4 changes: 2 additions & 2 deletions docs/source/changelog.rst
Expand Up @@ -5,6 +5,7 @@ Changelog
**Future Release**
* Enhancements
* Added drop_first as param in encode_features (:pr:`647`)
* Generate transform features of direct features (:pr:`623`)
* Fixes
* Fix performance regression in DFS (:pr:`637`)
* Fix deserialization of feature relationship path (:pr:`665`)
Expand All @@ -27,8 +28,7 @@ Changelog

Thanks to the following people for contributing to this release:
:user:`ayushpatidar`, :user:`CJStadler`, :user:`gsheni`,
:user:`jeff-hernandez`, :user:`kmax12`, :user:`rwedge`

:user:`jeff-hernandez`, :user:`kmax12`, :user:`rwedge`, :user:`zhxt95`

**v0.9.1 July 3, 2019**
* Enhancements
Expand Down
3 changes: 3 additions & 0 deletions featuretools/entityset/relationship.py
Expand Up @@ -161,6 +161,9 @@ def __eq__(self, other):
return isinstance(other, RelationshipPath) and \
self._relationships_with_direction == other._relationships_with_direction

def __ne__(self, other):
    """Return the logical negation of ``self == other``.

    Defined explicitly so that ``!=`` always agrees with ``__eq__``.

    Args:
        other: Any object to compare against.

    Returns:
        bool: True when ``self == other`` is falsy, False otherwise.
    """
    # Parenthesised to make the precedence of ``not`` over ``==`` explicit.
    return not (self == other)

def __repr__(self):
if self._relationships_with_direction:
path = '%s.%s' % (next(self.entities()), self.name)
Expand Down
132 changes: 95 additions & 37 deletions featuretools/synthesis/deep_feature_synthesis.py
Expand Up @@ -286,7 +286,12 @@ def _run_dfs(self, entity, relationship_path, all_features, max_depth):
all_features[entity.id] = {}

"""
Step 1 - Recursively build features for each entity in a backward relationship
Step 1 - Create identity features
"""
self._add_identity_features(all_features, entity)

"""
Step 2 - Recursively build features for each entity in a backward relationship
"""

backward_entities = self.es.get_backward_entities(entity.id)
Expand All @@ -311,7 +316,7 @@ def _run_dfs(self, entity, relationship_path, all_features, max_depth):
max_depth=new_max_depth)

"""
Step 2 - Create agg_feat features for all deep backward relationships
Step 3 - Create aggregation features for all deep backward relationships
"""
backward_entities = self.es.get_backward_entities(entity.id, deep=True)
for b_entity_id, sub_relationship_path in backward_entities:
Expand All @@ -329,13 +334,12 @@ def _run_dfs(self, entity, relationship_path, all_features, max_depth):
relationship_path=sub_relationship_path)

"""
Step 3 - Create Transform features
Step 4 - Create transform features of identity and aggregation features
"""
self._build_transform_features(
all_features, entity, max_depth=max_depth)
self._build_transform_features(all_features, entity, max_depth=max_depth)

"""
Step 4 - Recursively build features for each entity in a forward relationship
Step 5 - Recursively build features for each entity in a forward relationship
"""
forward_entities = self.es.get_forward_entities(entity.id)
for f_entity_id, sub_relationship_path in forward_entities:
Expand All @@ -359,7 +363,7 @@ def _run_dfs(self, entity, relationship_path, all_features, max_depth):
max_depth=new_max_depth)

"""
Step 5 - Create dfeat features for forward relationships
Step 6 - Create direct features for forward relationships
"""
forward_entities = self.es.get_forward_entities(entity.id)
for f_entity_id, sub_relationship_path in forward_entities:
Expand All @@ -375,6 +379,12 @@ def _run_dfs(self, entity, relationship_path, all_features, max_depth):
relationship_path=sub_relationship_path,
max_depth=max_depth)

"""
Step 7 - Create transform features of direct features
"""
self._build_transform_features(all_features, entity, max_depth=max_depth,
require_direct_input=True)

# now that all features are added, build where clauses
self._build_where_clauses(all_features, entity)

Expand Down Expand Up @@ -457,7 +467,8 @@ def _build_where_clauses(self, all_features, entity):
for val in variable.interesting_values:
self.where_clauses[entity.id].add(feat == val)

def _build_transform_features(self, all_features, entity, max_depth=0):
def _build_transform_features(self, all_features, entity, max_depth=0,
require_direct_input=False):
"""Creates trans_features for all the variables in an entity
Args:
Expand All @@ -471,21 +482,18 @@ def _build_transform_features(self, all_features, entity, max_depth=0):
if max_depth is not None:
new_max_depth = max_depth - 1

self._add_identity_features(all_features, entity)

for trans_prim in self.trans_primitives:
# if multiple input_types, only use first one for DFS
input_types = trans_prim.input_types
if type(input_types[0]) == list:
input_types = input_types[0]

features = self._features_by_type(all_features=all_features,
entity=entity,
max_depth=new_max_depth,
variable_type=set(input_types))

matching_inputs = match(input_types, features,
commutative=trans_prim.commutative)
matching_inputs = self._get_matching_inputs(all_features,
entity,
new_max_depth,
input_types,
trans_prim,
require_direct_input=require_direct_input)

for matching_input in matching_inputs:
if all(bf.number_output_features == 1 for bf in matching_input):
Expand All @@ -505,12 +513,13 @@ def _build_transform_features(self, all_features, entity, max_depth=0):
input_types = input_types[0]
input_types.append(Id)

features = self._features_by_type(all_features=all_features,
entity=entity,
max_depth=new_max_depth,
variable_type=set(input_types))
matching_inputs = match(input_types, features,
commutative=groupby_prim.commutative)
matching_inputs = self._get_matching_inputs(all_features,
entity,
new_max_depth,
input_types,
groupby_prim,
require_direct_input=require_direct_input)

for matching_input in matching_inputs:
if all(bf.number_output_features == 1 for bf in matching_input):
new_f = GroupByTransformFeature(list(matching_input[:-1]),
Expand Down Expand Up @@ -558,16 +567,18 @@ def _build_agg_features(self, all_features, parent_entity, child_entity,
if type(input_types[0]) == list:
input_types = input_types[0]

features = self._features_by_type(all_features=all_features,
entity=child_entity,
max_depth=new_max_depth,
variable_type=set(input_types))
def feature_filter(f):
# Remove direct features of parent entity and features in relationship path.
return (not _direct_of_entity(f, parent_entity)) \
and not self._feature_in_relationship_path(relationship_path, f)

matching_inputs = self._get_matching_inputs(all_features,
child_entity,
new_max_depth,
input_types,
agg_prim,
feature_filter=feature_filter)

# remove features in relationship path
features = [f for f in features
if not self._feature_in_relationship_path(relationship_path, f)]
matching_inputs = match(input_types, features,
commutative=agg_prim.commutative)
wheres = list(self.where_clauses[child_entity.id])

for matching_input in matching_inputs:
Expand Down Expand Up @@ -651,6 +662,28 @@ def _feature_in_relationship_path(self, relationship_path, feature):

return False

def _get_matching_inputs(self, all_features, entity, max_depth, input_types,
                         primitive, require_direct_input=False, feature_filter=None):
    """Collect the input combinations a primitive can be applied to.

    Candidate features are fetched with ``self._features_by_type``, optionally
    narrowed by ``feature_filter``, and then combined by ``match``.

    Args:
        all_features: Forwarded to ``self._features_by_type``.
        entity: Forwarded to ``self._features_by_type``.
        max_depth: Forwarded to ``self._features_by_type``.
        input_types: Input types of the primitive; also passed (as a set) as
            the ``variable_type`` of the candidate lookup.
        primitive: Object whose ``commutative`` attribute is passed to
            ``match``.
        require_direct_input (bool): Passed to ``match``. When true, the
            result is additionally stripped of combinations for which
            ``_all_direct_and_same_path`` is true.
        feature_filter (callable, optional): Predicate; only candidates for
            which it returns a truthy value are kept.

    Returns:
        The value returned by ``match``, or, when ``require_direct_input`` is
        true, a set built from it with the excluded combinations removed.
    """
    candidates = self._features_by_type(all_features=all_features,
                                        entity=entity,
                                        max_depth=max_depth,
                                        variable_type=set(input_types))

    if feature_filter:
        candidates = [candidate for candidate in candidates
                      if feature_filter(candidate)]

    combos = match(input_types, candidates,
                   commutative=primitive.commutative,
                   require_direct_input=require_direct_input)

    if not require_direct_input:
        return combos

    # Don't create trans features of inputs which are all direct
    # features with the same relationship_path.
    return {combo for combo in combos
            if not _all_direct_and_same_path(combo)}


def check_stacking(primitive, inputs):
"""checks if features in inputs can be used with supplied primitive
Expand Down Expand Up @@ -702,12 +735,13 @@ def match_by_type(features, t):
return matches


def match(input_types, features, replace=False, commutative=False):
def match(input_types, features, replace=False, commutative=False, require_direct_input=False):
to_match = input_types[0]
matches = match_by_type(features, to_match)

if len(input_types) == 1:
return [(m,) for m in matches]
return [(m,) for m in matches
if (not require_direct_input or isinstance(m, DirectFeature))]

matching_inputs = set([])

Expand All @@ -717,7 +751,11 @@ def match(input_types, features, replace=False, commutative=False):
if not replace:
copy = [c for c in copy if c.unique_name() != m.unique_name()]

rest = match(input_types[1:], copy, replace)
# If we need a DirectFeature and this is not a DirectFeature then one of the rest must be.
still_require_direct_input = require_direct_input and not isinstance(m, DirectFeature)
rest = match(input_types[1:], copy, replace,
require_direct_input=still_require_direct_input)

for r in rest:
new_match = [m] + list(r)

Expand All @@ -730,9 +768,9 @@ def match(input_types, features, replace=False, commutative=False):
matching_inputs.add(new_match)

if commutative:
return set([tuple(sorted(s, key=lambda x: x.get_name().lower())) for s in matching_inputs])
matching_inputs = {tuple(sorted(s, key=lambda x: x.get_name().lower())) for s in matching_inputs}

return set([tuple(s) for s in matching_inputs])
return matching_inputs


def handle_primitive(primitive):
Expand All @@ -757,3 +795,23 @@ def check_trans_primitive(primitive):
"groupby_trans_primitives is not a transform "
"primitive".format(type(primitive)))
return primitive


def _all_direct_and_same_path(input_features):
    """Check that all inputs are DirectFeatures sharing one relationship path.

    Args:
        input_features: Collection of features to inspect.

    Returns:
        bool: False as soon as one element is not a ``DirectFeature``;
            otherwise the result of ``_features_have_same_path``.
    """
    for feature in input_features:
        if not isinstance(feature, DirectFeature):
            return False

    return _features_have_same_path(input_features)


def _features_have_same_path(input_features):
    """Return True when every feature has the same ``relationship_path``.

    Args:
        input_features (sequence): Indexable collection (e.g. list or tuple)
            of objects exposing a ``relationship_path`` attribute.

    Returns:
        bool: True if every path compares equal to the first feature's path.
            An empty or single-element collection is trivially uniform.
    """
    if not input_features:
        # Nothing to compare; guards against IndexError on input_features[0].
        return True

    first_path = input_features[0].relationship_path

    # ``!=`` is used (rather than ``not ... ==``) to keep the same comparison
    # operator as before on the path objects.
    return not any(feature.relationship_path != first_path
                   for feature in input_features[1:])


def _direct_of_entity(feature, parent_entity):
    """Tell whether ``feature`` is a DirectFeature of ``parent_entity``.

    Args:
        feature: Feature to inspect.
        parent_entity: Entity whose ``id`` is compared with the ``id`` of the
            feature's ``parent_entity``.

    Returns:
        bool: True only if ``feature`` is a ``DirectFeature`` and the two
            entity ids are equal.
    """
    if not isinstance(feature, DirectFeature):
        return False

    return feature.parent_entity.id == parent_entity.id
89 changes: 77 additions & 12 deletions featuretools/tests/synthesis/test_deep_feature_synthesis.py
Expand Up @@ -17,6 +17,7 @@
AddNumeric,
Count,
Diff,
Equal,
Hour,
IsIn,
Last,
Expand Down Expand Up @@ -363,7 +364,7 @@ def test_seed_features(es):
assert session_agg.get_name() in [f.get_name() for f in features]


def test_seed_features_added_with_identity_features(es):
def test_does_not_make_agg_of_direct_of_target_entity(es):
count_sessions = ft.Feature(es['sessions']["id"], parent_entity=es['customers'], primitive=Count)
dfs_obj = DeepFeatureSynthesis(target_entity_id='customers',
entityset=es,
Expand All @@ -375,6 +376,7 @@ def test_seed_features_added_with_identity_features(es):
# this feature is meaningless because customers.COUNT(sessions) is already defined on
# the customers entity
assert not feature_with_name(features, 'LAST(sessions.customers.COUNT(sessions))')
assert not feature_with_name(features, 'LAST(sessions.customers.age)')


def test_dfs_builds_on_seed_features_more_than_max_depth(es):
Expand Down Expand Up @@ -582,18 +584,18 @@ def test_commutative(es):
trans_primitives=[AddNumeric],
max_depth=3)
feats = dfs_obj.build_features()
num_add_feats = 0
num_add_as_base_feat = 0

for feat in feats:
if isinstance(feat.primitive, AddNumeric):
num_add_feats += 1
for base_feat in feat.base_features:
if isinstance(base_feat.primitive, AddNumeric):
num_add_as_base_feat += 1
add_feats = [f for f in feats if isinstance(f.primitive, AddNumeric)]

assert num_add_feats == 3
assert num_add_as_base_feat == 9
# Check that there are no two AddNumeric features with the same base
# features.
unordered_args = set()
for f in add_feats:
arg1, arg2 = f.base_features
args_set = frozenset({arg1.unique_name(), arg2.unique_name()})
unordered_args.add(args_set)

assert len(add_feats) == len(unordered_args)


def test_transform_consistency():
Expand Down Expand Up @@ -727,7 +729,7 @@ def test_makes_agg_features_along_multiple_paths(diamond_es):
assert feature_with_name(features, 'MEAN(stores.transactions.amount)')


def test_makes_direct_features_along_multiple_paths(games_es):
def test_makes_direct_features_through_multiple_relationships(games_es):
dfs_obj = DeepFeatureSynthesis(target_entity_id='games',
entityset=games_es,
agg_primitives=['mean'],
Expand All @@ -742,3 +744,66 @@ def test_makes_direct_features_along_multiple_paths(games_es):
f = 'teams[%s_team_id].MEAN(games[%s_team_id].%s_team_score)' \
% (forward, backward, var)
assert feature_with_name(features, f)


def test_makes_direct_features_along_multiple_paths(diamond_es):
    """`regions.name` must be reachable as a direct feature through both
    `customers` and `stores` when targeting `transactions`."""
    synthesizer = DeepFeatureSynthesis(target_entity_id='transactions',
                                       entityset=diamond_es,
                                       max_depth=3,
                                       agg_primitives=[],
                                       trans_primitives=[])
    built = synthesizer.build_features()

    for expected_name in ('customers.regions.name', 'stores.regions.name'):
        assert feature_with_name(built, expected_name)


def test_does_not_make_trans_of_single_direct_feature(es):
    """A transform of a lone direct feature must not be built, while the
    direct feature of the transform must be."""
    synthesizer = DeepFeatureSynthesis(target_entity_id='sessions',
                                       entityset=es,
                                       agg_primitives=[],
                                       trans_primitives=['weekday'],
                                       max_depth=2)
    built = synthesizer.build_features()

    trans_of_direct = feature_with_name(built, 'WEEKDAY(customers.signup_date)')
    assert not trans_of_direct

    direct_of_trans = feature_with_name(built, 'customers.WEEKDAY(signup_date)')
    assert direct_of_trans


def test_makes_trans_of_multiple_direct_features(diamond_es):
    """Transforms over several inputs are built when at least one input is a
    direct feature, except when all inputs are direct with the same path."""
    synthesizer = DeepFeatureSynthesis(target_entity_id='transactions',
                                       entityset=diamond_es,
                                       agg_primitives=['mean'],
                                       trans_primitives=[Equal],
                                       max_depth=4)
    built = synthesizer.build_features()

    expected_present = (
        # Make trans of direct and non-direct
        'amount = stores.MEAN(transactions.amount)',
        # Make trans of direct features on different entities
        'customers.MEAN(transactions.amount) = stores.square_ft',
        # Make trans of direct features on same entity with different paths.
        'customers.regions.name = stores.regions.name',
    )
    for name in expected_present:
        assert feature_with_name(built, name)

    # Don't make trans of direct features with same path.
    expected_absent = (
        'stores.square_ft = stores.MEAN(transactions.amount)',
        'stores.MEAN(transactions.amount) = stores.square_ft',
    )
    for name in expected_absent:
        assert not feature_with_name(built, name)

    # The naming of the below is confusing but this is a direct feature of a transform.
    assert feature_with_name(built, 'stores.MEAN(transactions.amount) = square_ft')


def test_makes_direct_of_agg_of_trans_on_target(es):
    """A direct feature of an aggregation of a transform on the target
    entity (`log`) must be generated."""
    synthesizer = DeepFeatureSynthesis(target_entity_id='log',
                                       entityset=es,
                                       agg_primitives=['mean'],
                                       trans_primitives=[Absolute],
                                       max_depth=3)
    built = synthesizer.build_features()

    expected_name = 'sessions.MEAN(log.ABSOLUTE(value))'
    assert feature_with_name(built, expected_name)

0 comments on commit c583ff1

Please sign in to comment.