Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

English sentence feature descriptions #1201

Merged
merged 21 commits into from
Oct 30, 2020
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
0a1bfda
initial example description generators
frances-h Oct 7, 2020
f237cc6
add version that stores primitive templates on primitives
frances-h Oct 13, 2020
bf33f5a
update runon and combined methods and add tests
frances-h Oct 22, 2020
65acc99
update tests and switch to runon
frances-h Oct 26, 2020
e75ed4d
Merge branch 'main' into feature-descriptions
frances-h Oct 26, 2020
5da01a3
update tests
frances-h Oct 26, 2020
f0797b0
lint and coverage updates
frances-h Oct 26, 2020
6d75d64
release notes and update primitive templates
frances-h Oct 27, 2020
350a7e7
Merge branch 'main' into feature-descriptions
frances-h Oct 27, 2020
17251ef
template updates and description utils tests
frances-h Oct 28, 2020
9824c34
Merge branch 'main' into feature-descriptions
frances-h Oct 28, 2020
2a75760
Merge branch 'main' into feature-descriptions
frances-h Oct 28, 2020
23a0391
add feature_descriptions to docs
frances-h Oct 29, 2020
fb5443e
Merge branch 'main' into feature-descriptions
frances-h Oct 29, 2020
d1176f6
update guide and add class name as primitive description fallback
frances-h Oct 29, 2020
69150d3
use nth_slice instead of slice_num in primitive template
frances-h Oct 29, 2020
1b93f65
add feature lineage graphs and feature descriptions to docs index
frances-h Oct 29, 2020
def6a37
add no primitive name generic test
frances-h Oct 30, 2020
4134a80
update docs and test
frances-h Oct 30, 2020
2f04cdd
update docs feature reference
frances-h Oct 30, 2020
d364783
doc updates
frances-h Oct 30, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Release Notes
-------------
**Future Release**
* Enhancements
* Add ``describe_feature`` to generate an English language feature description for a given feature (:pr:`1201`)
* Fixes
* Changes
* Restrict koalas version to below 1.3.0 (:pr:`1192`)
Expand All @@ -16,7 +17,7 @@ Release Notes
* Add ``pyspark`` and ``koalas`` to automated dependency checks (:pr:`1191`)

Thanks to the following people for contributing to this release:
:user:`gsheni`, :user:`rwedge`, :user:`tamargrey`, :user:`thehomebrewnerd`, :user:`jeff-hernandez`
:user:`gsheni`, :user:`rwedge`, :user:`tamargrey`, :user:`thehomebrewnerd`, :user:`jeff-hernandez`, :user:`frances-h`

**v0.20.0 Sep 30, 2020**
.. warning::
Expand Down
1 change: 1 addition & 0 deletions featuretools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
IdentityFeature,
TransformFeature,
graph_feature,
describe_feature,
save_features,
load_features
)
Expand Down
1 change: 1 addition & 0 deletions featuretools/feature_base/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
IdentityFeature,
TransformFeature
)
from .feature_descriptions import describe_feature
from .feature_visualizer import graph_feature
from .features_deserializer import load_features
from .features_serializer import save_features
153 changes: 153 additions & 0 deletions featuretools/feature_base/feature_descriptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import json

import featuretools as ft


def describe_feature(feature, feature_descriptions=None, primitive_templates=None,
                     metadata_file=None):
    '''Produce an English sentence describing a feature.

    Args:
        feature (FeatureBase) : Feature to describe
        feature_descriptions (dict, optional) : dictionary mapping features or unique
            feature names to custom descriptions
        primitive_templates (dict, optional) : dictionary mapping primitives or
            primitive names to description templates
        metadata_file (str, optional) : path to json metadata file

    Returns:
        str : English description of the feature, with its first character
            upper-cased and a trailing period appended
    '''
    # Any falsy value (None, empty dict) is treated as "nothing supplied".
    custom_descriptions = feature_descriptions if feature_descriptions else {}
    custom_templates = primitive_templates if primitive_templates else {}

    if metadata_file:
        loaded_descriptions, loaded_templates = parse_json_metadata(metadata_file)
        # Entries passed directly by the caller take precedence over the
        # entries read from the metadata file.
        custom_descriptions = {**loaded_descriptions, **custom_descriptions}
        custom_templates = {**loaded_templates, **custom_templates}

    sentence = generate_description(feature, custom_descriptions, custom_templates)
    first_char, remainder = sentence[:1], sentence[1:]
    return first_char.upper() + remainder + '.'


def generate_description(feature, feature_descriptions, primitive_templates):
    '''Recursively build the (uncapitalized, unpunctuated) description of a feature.

    Args:
        feature : feature to describe
        feature_descriptions (dict) : custom descriptions keyed by feature or by
            the feature's unique name
        primitive_templates (dict) : description templates keyed by primitive or
            by primitive name

    Returns:
        str : description phrase for the feature
    '''
    # A custom description, keyed by feature or unique name, wins outright.
    if feature in feature_descriptions or feature.unique_name() in feature_descriptions:
        return (feature_descriptions.get(feature) or
                feature_descriptions.get(feature.unique_name()))

    # Identity features are described by their quoted name.
    if isinstance(feature, ft.IdentityFeature):
        return 'the "{}"'.format(feature.get_name())

    # Direct features: describe the underlying feature, then append the
    # relationship phrase.
    if isinstance(feature, ft.DirectFeature):
        root_feature, relationship_phrase = get_direct_description(feature)
        root_phrase = generate_description(root_feature,
                                           feature_descriptions,
                                           primitive_templates)
        return root_phrase + relationship_phrase

    is_slice = isinstance(feature, ft.feature_base.FeatureOutputSlice)

    # For an output slice the inputs are those of the sliced feature.
    source_features = feature.base_features
    if is_slice:
        source_features = feature.base_feature.base_features
    input_phrases = [
        generate_description(source, feature_descriptions, primitive_templates)
        for source in source_features
    ]

    # For groupby transforms the last input description is the groupby
    # column; it is reported in the "for each" clause instead.
    groupby_phrase = None
    if isinstance(feature, ft.GroupByTransformFeature):
        groupby_phrase = input_phrases.pop()

    # Describe the primitive, honoring any caller-supplied template.
    primitive = feature.primitive
    output_index = feature.n if is_slice else None
    custom_template = None
    if primitive in primitive_templates or primitive.name in primitive_templates:
        custom_template = (primitive_templates.get(primitive) or
                           primitive_templates.get(primitive.name))
    primitive_phrase = primitive.get_description(input_phrases,
                                                 slice_num=output_index,
                                                 template_override=custom_template)

    # The remaining clauses are derived from the sliced feature itself.
    if is_slice:
        feature = feature.base_feature
    is_aggregation = isinstance(feature, ft.AggregationFeature)

    # "for each ..." clause
    if is_aggregation:
        groupby_phrase = get_aggregation_groupby(feature, feature_descriptions)
    groupby_clause = ''
    if groupby_phrase is not None:
        if groupby_phrase.startswith('the '):
            groupby_phrase = groupby_phrase[4:]
        groupby_clause = "for each {}".format(groupby_phrase)

    # "of all instances of ..." / "of the previous ... of ..." clause
    entity_clause = ''
    if is_aggregation:
        if feature.use_previous:
            entity_clause = "of the previous {} of ".format(
                feature.use_previous.get_name().lower())
        else:
            entity_clause = "of all instances of "
        child_entity_id = feature.relationship_path[-1][1].child_entity.id
        entity_clause += '"{}"'.format(child_entity_id)

    # "where ... is ..." clause
    where_clause = ''
    if hasattr(feature, 'where') and feature.where:
        condition = feature.where
        condition_phrase = generate_description(condition.base_features[0],
                                                feature_descriptions,
                                                primitive_templates)
        where_clause = 'where {} is {}'.format(condition_phrase,
                                               condition.primitive.value)

    # Assemble the non-empty clauses in a fixed order.
    clauses = (primitive_phrase, entity_clause, where_clause, groupby_clause)
    return " ".join([clause for clause in clauses if clause != ''])


def get_direct_description(feature):
    '''Build the relationship phrase for a direct feature.

    Chains of stacked direct features are collapsed into a single phrase so
    the result is easier to read.

    Args:
        feature : direct feature to describe

    Returns:
        tuple : the first base feature that is not itself a direct feature, and
            the phrase (starting with " of") to append to that feature's
            description
    '''
    parent_id = feature.relationship_path[-1][1].parent_entity.id
    phrase = ' the instance of "{}" associated with this instance of "{}"'.format(
        parent_id, feature.entity_id)

    remaining = feature.base_features
    # Walk down through nested direct features, prepending one hop per level.
    while isinstance(remaining[0], ft.DirectFeature):
        nested = remaining[0]
        hop = ' the instance of "{}" associated with'.format(
            nested.relationship_path[-1][1].parent_entity.id)
        phrase = hop + phrase
        remaining = nested.base_features

    return remaining[0], ' of' + phrase


def get_aggregation_groupby(feature, feature_descriptions=None):
    '''Describe the index that an aggregation feature is computed "for each" of.

    Args:
        feature : feature whose entity index is described
        feature_descriptions (dict, optional) : custom descriptions keyed by
            feature or by unique feature name

    Returns:
        str : the custom description of the entity's index feature if one is
            supplied, otherwise '"<index>" in "<entity id>"'
    '''
    descriptions = {} if feature_descriptions is None else feature_descriptions

    index_name = feature.entity.index
    index_feature = ft.IdentityFeature(feature.entity[index_name])

    # No custom description under either key: use the generic phrase.
    if (index_feature not in descriptions and
            index_feature.unique_name() not in descriptions):
        return '"{}" in "{}"'.format(index_name, feature.entity.id)

    return (descriptions.get(index_feature) or
            descriptions.get(index_feature.unique_name()))


def parse_json_metadata(file):
    '''Read custom feature descriptions and primitive templates from a JSON file.

    Args:
        file (str) : path to the json metadata file

    Returns:
        tuple : the value stored under 'feature_descriptions' and the value
            stored under 'primitive_templates'; each is an empty dict when
            its key is absent
    '''
    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # so non-ASCII descriptions load the same way on every system.
    with open(file, encoding='utf-8') as f:
        json_metadata = json.load(f)

    return (json_metadata.get('feature_descriptions', {}),
            json_metadata.get('primitive_templates', {}))
36 changes: 36 additions & 0 deletions featuretools/primitives/base/primitive_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from featuretools import config
from featuretools.primitives.base.utils import signature
from featuretools.utils.description_utils import convert_to_nth
from featuretools.utils.gen_utils import Library


Expand Down Expand Up @@ -34,6 +35,12 @@ class PrimitiveBase(object):
commutative = False
#: (list): Additional compatible libraries
compatibility = [Library.PANDAS]
#: (str, list[str]): description template of the primitive. Input column
# descriptions are passed as positional arguments to the template. Slice
# number (if present) is passed to the template via the `slice_num` keyword argument.
# Multi-output primitives can use a list to differentiate between the base description
# and a slice description.
description_template = None

def __init__(self):
pass
Expand Down Expand Up @@ -98,3 +105,32 @@ def get_arguments(self):
values.append((name, value))

return values

def get_description(self, input_column_descriptions, slice_num=None, template_override=None):
    '''Describe the result of applying this primitive to its inputs.

    Args:
        input_column_descriptions (list[str]) : descriptions of the input
            columns, passed to the template as positional arguments
        slice_num (int, optional) : zero-based index of the output slice
            being described
        template_override (str or list[str], optional) : template to use
            instead of ``self.description_template``

    Returns:
        str : description phrase for the primitive

    Raises:
        IndexError : if a list template has no entry usable for the
            requested slice
    '''
    template = template_override or self.description_template
    if template:
        if isinstance(template, list):
            if slice_num is None:
                # The first entry describes the primitive as a whole.
                return template[0].format(*input_column_descriptions)

            slice_index = slice_num + 1
            if slice_index < len(template):
                # A template exists for this specific slice.
                slice_template = template[slice_index]
            else:
                # A two-entry list provides one slice template shared by
                # every slice; longer lists must cover the slice explicitly.
                if len(template) > 2:
                    raise IndexError('Slice out of range of template')
                slice_template = template[1]
            return slice_template.format(*input_column_descriptions,
                                         slice_num=convert_to_nth(slice_index))
        return template.format(*input_column_descriptions)

    # generic case: no template available. Fall back to the class name when
    # the primitive has no name, instead of failing on None.upper().
    name = self.name.upper() if self.name is not None else type(self).__name__
    joined_inputs = ', '.join(input_column_descriptions)
    if slice_num is not None:
        nth_slice = convert_to_nth(slice_num + 1)
        return "the {} output from applying {} to {}".format(nth_slice, name, joined_inputs)
    return "the result of applying {} to {}".format(name, joined_inputs)
Loading