Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature and primitive serialization and deserialization improvements #2136

Merged
merged 23 commits into from Jun 30, 2022
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
6ea5182
serialization and deserialization improvements
thehomebrewnerd Jun 23, 2022
23b4a58
add pr number
thehomebrewnerd Jun 23, 2022
e7b45c3
Merge branch 'main' into serialization-updates
thehomebrewnerd Jun 23, 2022
aab351c
Merge branch 'main' into serialization-updates
thehomebrewnerd Jun 24, 2022
97b93f0
Improve feature deserialization to use common primitive instances (#2…
thehomebrewnerd Jun 27, 2022
1e7d881
Allow users to directly set feature output column names and save duri…
thehomebrewnerd Jun 27, 2022
fb2292d
Merge branch 'main' into serialization-updates
thehomebrewnerd Jun 28, 2022
25c71fb
Refactor feature serialization to avoid storing duplicate primitive i…
thehomebrewnerd Jun 29, 2022
3bbf68f
Merge branch 'main' into serialization-updates
thehomebrewnerd Jun 29, 2022
86e5e7d
Merge branch 'main' into serialization-updates
thehomebrewnerd Jun 29, 2022
a1b7779
fix spelling error
thehomebrewnerd Jun 30, 2022
2a34569
remove instance cache
thehomebrewnerd Jun 30, 2022
0b6fa7d
Merge branch 'main' into serialization-updates
thehomebrewnerd Jun 30, 2022
06286b0
Merge branch 'serialization-updates' of https://github.com/alteryx/fe…
thehomebrewnerd Jun 30, 2022
6628dac
update save and load docstring examples
thehomebrewnerd Jun 30, 2022
c0ce3ad
lint fix
thehomebrewnerd Jun 30, 2022
1d7363f
Merge branch 'main' into serialization-updates
thehomebrewnerd Jun 30, 2022
12692e8
more docstring cleanup
thehomebrewnerd Jun 30, 2022
00eec31
Merge branch 'main' into serialization-updates
thehomebrewnerd Jun 30, 2022
a420b3a
update release notes
thehomebrewnerd Jun 30, 2022
40419b7
Merge branch 'serialization-updates' of https://github.com/alteryx/fe…
thehomebrewnerd Jun 30, 2022
1e55383
tweak serialization
thehomebrewnerd Jun 30, 2022
2ba8139
update json
thehomebrewnerd Jun 30, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/source/release_notes.rst
Expand Up @@ -12,14 +12,15 @@ Future Release
* Allow dfs kwargs to be passed into ``get_valid_primitives`` (:pr:`2157`)
* Fixes
* Changes
* Improve serialization and deserialization to reduce storage of duplicate primitive information (:pr:`2136`, :pr:`2127`, :pr:`2142`, :pr:`2144`)
* Sort core requirements and test requirements in setup cfg (:pr:`2152`)
* Documentation Changes
* Testing Changes
* Fix pandas warning and reduce dask .apply warnings (:pr:`2145`)
* Pin graphviz version used in windows tests (:pr:`2159`)

Thanks to the following people for contributing to this release:
:user:`gsheni`, :user:`ozzieD`, :user:`rwedge`, :user:`sbadithe`, :user:`tamargrey`
:user:`gsheni`, :user:`ozzieD`, :user:`rwedge`, :user:`sbadithe`, :user:`tamargrey`, :user:`thehomebrewnerd`

v1.10.0 June 23, 2022
=====================
Expand Down
102 changes: 60 additions & 42 deletions featuretools/feature_base/feature_base.py
Expand Up @@ -10,7 +10,6 @@
PrimitiveBase,
TransformPrimitive,
)
from featuretools.primitives.utils import serialize_primitive
from featuretools.utils.wrangle import _check_time_against_column, _check_timedelta

_ES_REF = {}
Expand Down Expand Up @@ -70,13 +69,12 @@ def __getitem__(self, key):
return FeatureOutputSlice(self, key)

@classmethod
def from_dictionary(
cls, arguments, entityset, dependencies, primitives_deserializer
):
def from_dictionary(cls, arguments, entityset, dependencies, primitive):
raise NotImplementedError("Must define from_dictionary on FeatureBase subclass")

def rename(self, name):
"""Rename Feature, returns copy"""
"""Rename Feature, returns copy. Will reset any custom feature column names
to their default value."""
feature_copy = self.copy()
feature_copy._name = name
feature_copy._names = None
Expand All @@ -103,6 +101,33 @@ def get_feature_names(self):
]
return self._names

def set_feature_names(self, names):
    """Set new values for the feature column names, overriding the default values.

    The number of names provided must match the number of output columns defined for
    the feature. Only works for features that have more than one output column. Use
    ``Feature.rename`` to change the column name for single output features.

    Args:
        names (list[str]): List of names to use for the output feature columns. Provided
            names must be unique.

    Raises:
        ValueError: If the feature has only one output column, if the number of
            names does not match the number of output columns, or if the
            provided names are not unique.
    """
    if self.number_output_features == 1:
        raise ValueError(
            "The set_feature_names can only be used on features that have more than one output column."
        )

    # Take a private copy so that later mutation of the caller's list cannot
    # silently change this feature's column names or invalidate the length
    # and uniqueness checks performed below.
    new_names = list(names)

    num_new_names = len(new_names)
    if self.number_output_features != num_new_names:
        raise ValueError(
            "Number of names provided must match the number of output features:"
            f" {num_new_names} name(s) provided, {self.number_output_features} expected."
        )

    if len(set(new_names)) != num_new_names:
        raise ValueError("Provided output feature names must be unique.")

    self._names = new_names

ozzieD marked this conversation as resolved.
Show resolved Hide resolved
def get_function(self, **kwargs):
    """Return whatever the feature's primitive returns from ``get_function``.

    All keyword arguments are forwarded unchanged to the primitive.
    """
    primitive = self.primitive
    return primitive.get_function(**kwargs)

Expand Down Expand Up @@ -423,9 +448,7 @@ def __init__(self, column, name=None):
)

@classmethod
def from_dictionary(
cls, arguments, entityset, dependencies, primitives_deserializer
):
def from_dictionary(cls, arguments, entityset, dependencies, primitive):
dataframe_name = arguments["dataframe_name"]
column_name = arguments["column_name"]
column = entityset[dataframe_name].ww[column_name]
Expand Down Expand Up @@ -516,9 +539,7 @@ def _handle_relationship(self, entityset, child_dataframe_name, relationship):
return relationship

@classmethod
def from_dictionary(
cls, arguments, entityset, dependencies, primitives_deserializer
):
def from_dictionary(cls, arguments, entityset, dependencies, primitive):
base_feature = dependencies[arguments["base_feature"]]
relationship = Relationship.from_dictionary(
arguments["relationship"], entityset
Expand Down Expand Up @@ -683,9 +704,7 @@ def _handle_relationship_path(
return relationship_path, path_is_unique

@classmethod
def from_dictionary(
cls, arguments, entityset, dependencies, primitives_deserializer
):
def from_dictionary(cls, arguments, entityset, dependencies, primitive):
base_features = [dependencies[name] for name in arguments["base_features"]]
relationship_path = [
Relationship.from_dictionary(r, entityset)
Expand All @@ -694,10 +713,6 @@ def from_dictionary(
parent_dataframe_name = relationship_path[0].parent_dataframe.ww.name
relationship_path = RelationshipPath([(False, r) for r in relationship_path])

primitive = primitives_deserializer.deserialize_primitive(
arguments["primitive"]
)

use_previous_data = arguments["use_previous"]
use_previous = use_previous_data and Timedelta.from_dictionary(
use_previous_data
Expand All @@ -706,7 +721,7 @@ def from_dictionary(
where_name = arguments["where"]
where = where_name and dependencies[where_name]

return cls(
feat = cls(
base_features=base_features,
parent_dataframe_name=parent_dataframe_name,
primitive=primitive,
Expand All @@ -715,6 +730,8 @@ def from_dictionary(
where=where,
name=arguments["name"],
)
feat._names = arguments.get("feature_names")
return feat

def copy(self):
return AggregationFeature(
Expand Down Expand Up @@ -759,14 +776,17 @@ def generate_names(self):
)

def get_arguments(self):
return {
arg_dict = {
"name": self._name,
"base_features": [feat.unique_name() for feat in self.base_features],
"relationship_path": [r.to_dictionary() for _, r in self.relationship_path],
"primitive": serialize_primitive(self.primitive),
"primitive": self.primitive,
"where": self.where and self.where.unique_name(),
"use_previous": self.use_previous and self.use_previous.get_arguments(),
}
if self._names:
arg_dict["feature_names"] = self._names
return arg_dict

def relationship_path_name(self):
if self._path_is_unique:
Expand All @@ -792,16 +812,13 @@ def __init__(self, base_features, primitive, name=None):
)

@classmethod
def from_dictionary(
cls, arguments, entityset, dependencies, primitives_deserializer
):
def from_dictionary(cls, arguments, entityset, dependencies, primitive):
base_features = [dependencies[name] for name in arguments["base_features"]]
primitive = primitives_deserializer.deserialize_primitive(
arguments["primitive"]
)
return cls(
feat = cls(
base_features=base_features, primitive=primitive, name=arguments["name"]
)
feat._names = arguments.get("feature_names")
return feat

def copy(self):
return TransformFeature(self.base_features, self.primitive)
Expand All @@ -817,11 +834,14 @@ def generate_names(self):
)

def get_arguments(self):
return {
arg_dict = {
"name": self._name,
"base_features": [feat.unique_name() for feat in self.base_features],
"primitive": serialize_primitive(self.primitive),
"primitive": self.primitive,
}
if self._names:
arg_dict["feature_names"] = self._names
return arg_dict


class GroupByTransformFeature(TransformFeature):
Expand All @@ -841,20 +861,17 @@ def __init__(self, base_features, primitive, groupby, name=None):
)

@classmethod
def from_dictionary(
cls, arguments, entityset, dependencies, primitives_deserializer
):
def from_dictionary(cls, arguments, entityset, dependencies, primitive):
base_features = [dependencies[name] for name in arguments["base_features"]]
primitive = primitives_deserializer.deserialize_primitive(
arguments["primitive"]
)
groupby = dependencies[arguments["groupby"]]
return cls(
feat = cls(
base_features=base_features,
primitive=primitive,
groupby=groupby,
name=arguments["name"],
)
feat._names = arguments.get("feature_names")
return feat

def copy(self):
# the groupby feature is appended to base_features in the __init__
Expand Down Expand Up @@ -883,12 +900,15 @@ def get_arguments(self):
for feat in self.base_features
if feat.unique_name() != self.groupby.unique_name()
]
return {
arg_dict = {
"name": self._name,
"base_features": feature_names,
"primitive": serialize_primitive(self.primitive),
"primitive": self.primitive,
"groupby": self.groupby.unique_name(),
}
if self._names:
arg_dict["feature_names"] = self._names
return arg_dict


class Feature(object):
Expand Down Expand Up @@ -981,9 +1001,7 @@ def get_arguments(self):
}

@classmethod
def from_dictionary(
cls, arguments, entityset, dependencies, primitives_deserializer
):
def from_dictionary(cls, arguments, entityset, dependencies, primitive):
base_feature_name = arguments["base_feature"]
base_feature = dependencies[base_feature_name]
n = arguments["n"]
Expand Down
32 changes: 23 additions & 9 deletions featuretools/feature_base/features_deserializer.py
Expand Up @@ -46,14 +46,21 @@ def load_features(features, profile_name=None):

.. code-block:: python

# Option 1
filepath = os.path.join('/Home/features/', 'list.json')
ft.load_features(filepath)
features = ft.load_features(filepath)

f = open(filepath, 'r')
ft.load_features(f)
# Option 2
filepath = os.path.join('/Home/features/', 'list.json')
with open(filepath, 'r') as f:
features = ft.load_features(f)

# Option 3
filepath = os.path.join('/Home/features/', 'list.json')
with open(filepath, 'r') as f:
feature_str = f.read()
features = ft.load_features(feature_str)

feature_str = f.read()
ft.load_features(feature_str)

.. seealso::
:func:`.save_features`
Expand All @@ -78,7 +85,12 @@ def __init__(self, features_dict):
self._check_schema_version()
self.entityset = deserialize_es(features_dict["entityset"])
self._deserialized_features = {} # name -> feature
self._primitives_deserializer = PrimitivesDeserializer()
primitive_deserializer = PrimitivesDeserializer()
primitive_definitions = features_dict["primitive_definitions"]
self._deserialized_primitives = {
k: primitive_deserializer.deserialize_primitive(v)
for k, v in primitive_definitions.items()
}

@classmethod
def load(cls, features, profile_name):
Expand Down Expand Up @@ -109,6 +121,10 @@ def _deserialize_feature(self, feature_name):

feature_dict = self.features_dict["feature_definitions"][feature_name]
dependencies_list = feature_dict["dependencies"]
primitive = None
primitive_id = feature_dict["arguments"].get("primitive")
if primitive_id is not None:
primitive = self._deserialized_primitives[primitive_id]

# Collect dependencies into a dictionary of name -> feature.
dependencies = {
Expand All @@ -122,9 +138,7 @@ def _deserialize_feature(self, feature_name):
raise RuntimeError('Unrecognized feature type "%s"' % type)

args = feature_dict["arguments"]
feature = cls.from_dictionary(
args, self.entityset, dependencies, self._primitives_deserializer
)
feature = cls.from_dictionary(args, self.entityset, dependencies, primitive)

self._deserialized_features[feature_name] = feature
return feature
Expand Down