From f71a6d2ccb0ab22bcc2218cfd43954e7fe11b9f2 Mon Sep 17 00:00:00 2001 From: Nate Parsons Date: Tue, 8 Dec 2020 11:41:23 -0600 Subject: [PATCH 1/7] move query_by_values to EntitySet --- .../feature_set_calculator.py | 19 ++-- featuretools/entityset/entity.py | 93 ------------------ featuretools/entityset/entityset.py | 96 +++++++++++++++++++ .../tests/entityset_tests/test_entity.py | 27 ------ featuretools/tests/entityset_tests/test_es.py | 43 +++++++-- 5 files changed, 143 insertions(+), 135 deletions(-) diff --git a/featuretools/computational_backends/feature_set_calculator.py b/featuretools/computational_backends/feature_set_calculator.py index df73a8d498..4e775bddbd 100644 --- a/featuretools/computational_backends/feature_set_calculator.py +++ b/featuretools/computational_backends/feature_set_calculator.py @@ -209,8 +209,7 @@ def _calculate_features_for_entity(self, entity_id, feature_trie, df_trie, need_full_entity, full_entity_features, not_full_entity_features = feature_trie.value all_features = full_entity_features | not_full_entity_features - entity = self.entityset[entity_id] - columns = self._necessary_columns(entity, all_features) + columns = self._necessary_columns(entity_id, all_features) # If we need the full entity then don't filter by filter_values. if need_full_entity: @@ -220,12 +219,13 @@ def _calculate_features_for_entity(self, entity_id, feature_trie, df_trie, query_variable = filter_variable query_values = filter_values - df = entity.query_by_values(query_values, - variable_id=query_variable, - columns=columns, - time_last=self.time_last, - training_window=self.training_window, - include_cutoff_time=include_cutoff_time) + df = self.entityset.query_by_values(entity_id=entity_id, + instance_vals=query_values, + variable_id=query_variable, + columns=columns, + time_last=self.time_last, + training_window=self.training_window, + include_cutoff_time=include_cutoff_time) # call to update timer progress_callback(0) @@ -740,9 +740,10 @@ def last_n(df): return frame - def _necessary_columns(self, entity, feature_names): + def _necessary_columns(self, entity_id, feature_names): # We have to keep all Id columns because we don't know what forward # relationships will come from this node. + entity = self.entityset[entity_id] index_columns = {v.id for v in entity.variables if isinstance(v, (variable_types.Index, variable_types.Id, diff --git a/featuretools/entityset/entity.py b/featuretools/entityset/entity.py index 73723ba9cc..5df2f314b8 100644 --- a/featuretools/entityset/entity.py +++ b/featuretools/entityset/entity.py @@ -4,7 +4,6 @@ import dask.dataframe as dd import numpy as np import pandas as pd -import pandas.api.types as pdtypes from featuretools import variable_types as vtypes from featuretools.utils.entity_utils import ( @@ -215,70 +214,6 @@ def convert_variable_type(self, variable_id, new_type, new_variable = new_type.create_from(variable) self.variables[self.variables.index(variable)] = new_variable - def query_by_values(self, instance_vals, variable_id=None, columns=None, - time_last=None, training_window=None, include_cutoff_time=True): - """Query instances that have variable with given value - - Args: - instance_vals (pd.Dataframe, pd.Series, list[str] or str) : - Instance(s) to match. - variable_id (str) : Variable to query on. If None, query on index. - columns (list[str]) : Columns to return. Return all columns if None. - time_last (pd.TimeStamp) : Query data up to and including this - time. Only applies if entity has a time index. 
- training_window (Timedelta, optional): - Window defining how much time before the cutoff time data - can be used when calculating features. If None, all data before cutoff time is used. - include_cutoff_time (bool): - If True, data at cutoff time are included in calculating features - - Returns: - pd.DataFrame : instances that match constraints with ids in order of underlying dataframe - """ - if not variable_id: - variable_id = self.index - - instance_vals = self._vals_to_series(instance_vals, variable_id) - - training_window = _check_timedelta(training_window) - - if training_window is not None: - assert training_window.has_no_observations(), "Training window cannot be in observations" - - if instance_vals is None: - df = self.df.copy() - - elif isinstance(instance_vals, pd.Series) and instance_vals.empty: - df = self.df.head(0) - - else: - if is_instance(instance_vals, (dd, ks), 'Series'): - df = self.df.merge(instance_vals.to_frame(), how="inner", on=variable_id) - elif isinstance(instance_vals, pd.Series) and is_instance(self.df, ks, 'DataFrame'): - df = self.df.merge(ks.DataFrame({variable_id: instance_vals}), how="inner", on=variable_id) - else: - df = self.df[self.df[variable_id].isin(instance_vals)] - - if isinstance(self.df, pd.DataFrame): - df = df.set_index(self.index, drop=False) - - # ensure filtered df has same categories as original - # workaround for issue below - # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538 - if pdtypes.is_categorical_dtype(self.df[variable_id]): - categories = pd.api.types.CategoricalDtype(categories=self.df[variable_id].cat.categories) - df[variable_id] = df[variable_id].astype(categories) - - df = self._handle_time(df=df, - time_last=time_last, - training_window=training_window, - include_cutoff_time=include_cutoff_time) - - if columns is not None: - df = df[columns] - - return df - def _create_variables(self, variable_types, index, time_index, secondary_time_index): """Extracts the variables from a dataframe @@ -509,34 +444,6 @@ def set_secondary_time_index(self, secondary_time_index): self.secondary_time_index = secondary_time_index - def _vals_to_series(self, instance_vals, variable_id): - """ - instance_vals may be a pd.Dataframe, a pd.Series, a list, a single - value, or None. This function always returns a Series or None. - """ - if instance_vals is None: - return None - - # If this is a single value, make it a list - if not hasattr(instance_vals, '__iter__'): - instance_vals = [instance_vals] - - # convert iterable to pd.Series - if isinstance(instance_vals, pd.DataFrame): - out_vals = instance_vals[variable_id] - elif is_instance(instance_vals, (pd, dd, ks), 'Series'): - out_vals = instance_vals.rename(variable_id) - else: - out_vals = pd.Series(instance_vals) - - # no duplicates or NaN values - out_vals = out_vals.drop_duplicates().dropna() - - # want index to have no name for the merge in query_by_values - out_vals.index.name = None - - return out_vals - def _handle_time(self, df, time_last=None, training_window=None, include_cutoff_time=True): """ Filter a dataframe for all instances before time_last. 
diff --git a/featuretools/entityset/entityset.py b/featuretools/entityset/entityset.py index 0063c66a8b..9f6d690549 100644 --- a/featuretools/entityset/entityset.py +++ b/featuretools/entityset/entityset.py @@ -6,6 +6,7 @@ import dask.dataframe as dd import numpy as np import pandas as pd +import pandas.api.types as pdtypes from pandas.api.types import is_dtype_equal, is_numeric_dtype import featuretools.variable_types.variable as vtypes @@ -18,6 +19,7 @@ get_graphviz_format, save_graph ) +from featuretools.utils.wrangle import _check_timedelta ks = import_or_none('databricks.koalas') @@ -968,3 +970,97 @@ def plot(self, to_file=None): if to_file: save_graph(graph, to_file, format_) return graph + + def query_by_values(self, entity_id, instance_vals, variable_id=None, columns=None, + time_last=None, training_window=None, include_cutoff_time=True): + """Query instances that have variable with given value + + Args: + entity_id (str): The id of the entity to query + instance_vals (pd.Dataframe, pd.Series, list[str] or str) : + Instance(s) to match. + variable_id (str) : Variable to query on. If None, query on index. + columns (list[str]) : Columns to return. Return all columns if None. + time_last (pd.TimeStamp) : Query data up to and including this + time. Only applies if entity has a time index. + training_window (Timedelta, optional): + Window defining how much time before the cutoff time data + can be used when calculating features. If None, all data before cutoff time is used. + include_cutoff_time (bool): + If True, data at cutoff time are included in calculating features + + Returns: + pd.DataFrame : instances that match constraints with ids in order of underlying dataframe + """ + entity = self[entity_id] + if not variable_id: + variable_id = entity.index + + instance_vals = self._vals_to_series(instance_vals, variable_id) + + training_window = _check_timedelta(training_window) + + if training_window is not None: + assert training_window.has_no_observations(), "Training window cannot be in observations" + + if instance_vals is None: + df = entity.df.copy() + + elif isinstance(instance_vals, pd.Series) and instance_vals.empty: + df = entity.df.head(0) + + else: + if is_instance(instance_vals, (dd, ks), 'Series'): + df = entity.df.merge(instance_vals.to_frame(), how="inner", on=variable_id) + elif isinstance(instance_vals, pd.Series) and is_instance(entity.df, ks, 'DataFrame'): + df = entity.df.merge(ks.DataFrame({variable_id: instance_vals}), how="inner", on=variable_id) + else: + df = entity.df[entity.df[variable_id].isin(instance_vals)] + + if isinstance(entity.df, pd.DataFrame): + df = df.set_index(entity.index, drop=False) + + # ensure filtered df has same categories as original + # workaround for issue below + # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538 + if pdtypes.is_categorical_dtype(entity.df[variable_id]): + categories = pd.api.types.CategoricalDtype(categories=entity.df[variable_id].cat.categories) + df[variable_id] = df[variable_id].astype(categories) + + df = entity._handle_time(df=df, + time_last=time_last, + training_window=training_window, + include_cutoff_time=include_cutoff_time) + + if columns is not None: + df = df[columns] + + return df + + def _vals_to_series(self, instance_vals, variable_id): + """ + instance_vals may be a pd.Dataframe, a pd.Series, a list, a single + value, or None. This function always returns a Series or None. 
+ """ + if instance_vals is None: + return None + + # If this is a single value, make it a list + if not hasattr(instance_vals, '__iter__'): + instance_vals = [instance_vals] + + # convert iterable to pd.Series + if isinstance(instance_vals, pd.DataFrame): + out_vals = instance_vals[variable_id] + elif is_instance(instance_vals, (pd, dd, ks), 'Series'): + out_vals = instance_vals.rename(variable_id) + else: + out_vals = pd.Series(instance_vals) + + # no duplicates or NaN values + out_vals = out_vals.drop_duplicates().dropna() + + # want index to have no name for the merge in query_by_values + out_vals.index.name = None + + return out_vals diff --git a/featuretools/tests/entityset_tests/test_entity.py b/featuretools/tests/entityset_tests/test_entity.py index 4e2df72384..e91350bb25 100644 --- a/featuretools/tests/entityset_tests/test_entity.py +++ b/featuretools/tests/entityset_tests/test_entity.py @@ -138,33 +138,6 @@ def test_update_data(es): assert es['customers'].df["id"].iloc[0] == 0 -def test_query_by_values_returns_rows_in_given_order(): - data = pd.DataFrame({ - "id": [1, 2, 3, 4, 5], - "value": ["a", "c", "b", "a", "a"], - "time": [1000, 2000, 3000, 4000, 5000] - }) - - es = ft.EntitySet() - es = es.entity_from_dataframe(entity_id="test", dataframe=data, index="id", - time_index="time", variable_types={ - "value": ft.variable_types.Categorical - }) - query = es['test'].query_by_values(['b', 'a'], variable_id='value') - assert np.array_equal(query['id'], [1, 3, 4, 5]) - - -def test_query_by_values_secondary_time_index(es): - end = np.datetime64(datetime(2011, 10, 1)) - all_instances = [0, 1, 2] - result = es['customers'].query_by_values(all_instances, time_last=end) - result = to_pandas(result, index='id') - - for col in ["cancel_date", "cancel_reason"]: - nulls = result.loc[all_instances][col].isnull() == [False, True, True] - assert nulls.all(), "Some instance has data it shouldn't for column %s" % col - - def test_delete_variables(es): entity = es['customers'] to_delete = ['age', 'cohort', 'email'] diff --git a/featuretools/tests/entityset_tests/test_es.py b/featuretools/tests/entityset_tests/test_es.py index d453fdf424..8ba810345a 100644 --- a/featuretools/tests/entityset_tests/test_es.py +++ b/featuretools/tests/entityset_tests/test_es.py @@ -176,13 +176,41 @@ def test_add_relationship_empty_child_convert_dtype(es): assert es['log'].df['session_id'].dtype == 'int64' +def test_query_by_values_returns_rows_in_given_order(): + data = pd.DataFrame({ + "id": [1, 2, 3, 4, 5], + "value": ["a", "c", "b", "a", "a"], + "time": [1000, 2000, 3000, 4000, 5000] + }) + + es = ft.EntitySet() + es = es.entity_from_dataframe(entity_id="test", dataframe=data, index="id", + time_index="time", variable_types={ + "value": ft.variable_types.Categorical + }) + query = es.query_by_values('test', ['b', 'a'], variable_id='value') + assert np.array_equal(query['id'], [1, 3, 4, 5]) + + +def test_query_by_values_secondary_time_index(es): + end = np.datetime64(datetime(2011, 10, 1)) + all_instances = [0, 1, 2] + result = es.query_by_values('customers', all_instances, time_last=end) + result = to_pandas(result, index='id') + + for col in ["cancel_date", "cancel_reason"]: + nulls = result.loc[all_instances][col].isnull() == [False, True, True] + assert nulls.all(), "Some instance has data it shouldn't for column %s" % col + + def test_query_by_id(es): - df = to_pandas(es['log'].query_by_values(instance_vals=[0])) + df = to_pandas(es.query_by_values('log', instance_vals=[0])) assert df['id'].values[0] == 0 def 
test_query_by_id_with_time(es): - df = es['log'].query_by_values( + df = es.query_by_values( + entity_id='log', instance_vals=[0, 1, 2, 3, 4], time_last=datetime(2011, 4, 9, 10, 30, 2 * 6)) df = to_pandas(df) @@ -194,7 +222,8 @@ def test_query_by_id_with_time(es): def test_query_by_variable_with_time(es): - df = es['log'].query_by_values( + df = es.query_by_values( + entity_id='log', instance_vals=[0, 1, 2], variable_id='session_id', time_last=datetime(2011, 4, 9, 10, 50, 0)) df = to_pandas(df) @@ -210,7 +239,8 @@ def test_query_by_variable_with_time(es): def test_query_by_variable_with_training_window(es): - df = es['log'].query_by_values( + df = es.query_by_values( + entity_id='log', instance_vals=[0, 1, 2], variable_id='session_id', time_last=datetime(2011, 4, 9, 10, 50, 0), training_window='15m') @@ -221,7 +251,8 @@ def test_query_by_variable_with_training_window(es): def test_query_by_indexed_variable(es): - df = es['log'].query_by_values( + df = es.query_by_values( + entity_id='log', instance_vals=['taco clock'], variable_id='product_id') df = to_pandas(df) @@ -748,7 +779,7 @@ def test_concat_entitysets(es): assert not es.__eq__(es_2, deep=True) # make sure internal indexes work before concat - regions = es_1['customers'].query_by_values(['United States'], variable_id=u'région_id') + regions = es_1.query_by_values('customers', ['United States'], variable_id=u'région_id') assert regions.index.isin(es_1['customers'].df.index).all() assert es_1.__eq__(es_2) From c481ce00ddcebce320df1ae0a74b4349ab299cc0 Mon Sep 17 00:00:00 2001 From: Nate Parsons Date: Tue, 8 Dec 2020 11:46:09 -0600 Subject: [PATCH 2/7] update release notes --- docs/source/release_notes.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index e91ee6b826..13ba35d132 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -6,12 +6,13 @@ Release Notes * Enhancements * Fixes * Changes + * Move ``query_by_values`` method from ``Entity`` to ``EntitySet`` (:pr:`1251`) * Documentation Changes * Testing Changes * Use repository-scoped token for dependency check (:pr:`1245`:) Thanks to the following people for contributing to this release: - :user:`jeff-hernandez` + :user:`jeff-hernandez`, :user:`thehomebrewnerd` **v0.22.0 Nov 30, 2020** * Enhancements From 6c0314412481149a780929796461ab0e47b38f59 Mon Sep 17 00:00:00 2001 From: Nate Parsons Date: Wed, 9 Dec 2020 10:19:37 -0600 Subject: [PATCH 3/7] make _vals_to_series a helper function instead of method --- featuretools/entityset/entityset.py | 47 +++++++++++++++-------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/featuretools/entityset/entityset.py b/featuretools/entityset/entityset.py index 9f6d690549..cea9e431d4 100644 --- a/featuretools/entityset/entityset.py +++ b/featuretools/entityset/entityset.py @@ -996,7 +996,7 @@ def query_by_values(self, entity_id, instance_vals, variable_id=None, columns=No if not variable_id: variable_id = entity.index - instance_vals = self._vals_to_series(instance_vals, variable_id) + instance_vals = _vals_to_series(instance_vals, variable_id) training_window = _check_timedelta(training_window) @@ -1037,30 +1037,31 @@ def query_by_values(self, entity_id, instance_vals, variable_id=None, columns=No return df - def _vals_to_series(self, instance_vals, variable_id): - """ - instance_vals may be a pd.Dataframe, a pd.Series, a list, a single - value, or None. This function always returns a Series or None. 
- """ - if instance_vals is None: - return None - # If this is a single value, make it a list - if not hasattr(instance_vals, '__iter__'): - instance_vals = [instance_vals] +def _vals_to_series(self, instance_vals, variable_id): + """ + instance_vals may be a pd.Dataframe, a pd.Series, a list, a single + value, or None. This function always returns a Series or None. + """ + if instance_vals is None: + return None - # convert iterable to pd.Series - if isinstance(instance_vals, pd.DataFrame): - out_vals = instance_vals[variable_id] - elif is_instance(instance_vals, (pd, dd, ks), 'Series'): - out_vals = instance_vals.rename(variable_id) - else: - out_vals = pd.Series(instance_vals) + # If this is a single value, make it a list + if not hasattr(instance_vals, '__iter__'): + instance_vals = [instance_vals] + + # convert iterable to pd.Series + if isinstance(instance_vals, pd.DataFrame): + out_vals = instance_vals[variable_id] + elif is_instance(instance_vals, (pd, dd, ks), 'Series'): + out_vals = instance_vals.rename(variable_id) + else: + out_vals = pd.Series(instance_vals) - # no duplicates or NaN values - out_vals = out_vals.drop_duplicates().dropna() + # no duplicates or NaN values + out_vals = out_vals.drop_duplicates().dropna() - # want index to have no name for the merge in query_by_values - out_vals.index.name = None + # want index to have no name for the merge in query_by_values + out_vals.index.name = None - return out_vals + return out_vals From 735aa7e97cd6f7c25ef35a6b4641bd647043f723 Mon Sep 17 00:00:00 2001 From: Nate Parsons Date: Wed, 9 Dec 2020 10:30:18 -0600 Subject: [PATCH 4/7] remove extra argument --- featuretools/entityset/entityset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/featuretools/entityset/entityset.py b/featuretools/entityset/entityset.py index cea9e431d4..83262b8aa6 100644 --- a/featuretools/entityset/entityset.py +++ b/featuretools/entityset/entityset.py @@ -1038,7 +1038,7 @@ def query_by_values(self, entity_id, instance_vals, variable_id=None, columns=No return df -def _vals_to_series(self, instance_vals, variable_id): +def _vals_to_series(instance_vals, variable_id): """ instance_vals may be a pd.Dataframe, a pd.Series, a list, a single value, or None. This function always returns a Series or None. 
From dc5b38bdcfcf74d71133cbcc6ce4938260925c3a Mon Sep 17 00:00:00 2001 From: Nate Parsons Date: Thu, 10 Dec 2020 12:04:48 -0600 Subject: [PATCH 5/7] add tests for coverage --- featuretools/tests/entityset_tests/test_es.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/featuretools/tests/entityset_tests/test_es.py b/featuretools/tests/entityset_tests/test_es.py index 8ba810345a..6df6ce087c 100644 --- a/featuretools/tests/entityset_tests/test_es.py +++ b/featuretools/tests/entityset_tests/test_es.py @@ -208,6 +208,18 @@ def test_query_by_id(es): assert df['id'].values[0] == 0 +def test_query_by_single_value(es): + df = to_pandas(es.query_by_values('log', instance_vals=0)) + assert df['id'].values[0] == 0 + + +def test_query_by_df(es): + instance_df = pd.DataFrame({'id': [1, 3], 'vals': [0, 1]}) + df = to_pandas(es.query_by_values('log', instance_vals=instance_df)) + + assert np.array_equal(df['id'], [1, 3]) + + def test_query_by_id_with_time(es): df = es.query_by_values( entity_id='log', From 164e3b2488099c9e65794b2f9bd4633299bcd45f Mon Sep 17 00:00:00 2001 From: Nate Parsons Date: Thu, 10 Dec 2020 13:13:18 -0600 Subject: [PATCH 6/7] add breaking change --- docs/source/release_notes.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 84cf365bf1..332750d140 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -15,6 +15,11 @@ Release Notes Thanks to the following people for contributing to this release: :user:`jeff-hernandez`, :user:`rwedge`, :user:`thehomebrewnerd` +**Breaking Changes** + +* ``Entity.query_by_values`` has been removed and replaced with ``EntitySet.query_by_values``, with an + added ``entity_id`` parameter to specify which entity in the entityset should be used for the query. + **v0.22.0 Nov 30, 2020** * Enhancements * Allow variable descriptions to be set directly on variable (:pr:`1207`) From 46daab207225fca55ad3fda743bbc4fc65d2038a Mon Sep 17 00:00:00 2001 From: Nate Parsons Date: Thu, 10 Dec 2020 13:14:49 -0600 Subject: [PATCH 7/7] update wording --- docs/source/release_notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 332750d140..6803680013 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -17,7 +17,7 @@ Release Notes **Breaking Changes** -* ``Entity.query_by_values`` has been removed and replaced with ``EntitySet.query_by_values``, with an +* ``Entity.query_by_values`` has been removed and replaced by ``EntitySet.query_by_values`` with an added ``entity_id`` parameter to specify which entity in the entityset should be used for the query. **v0.22.0 Nov 30, 2020**
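Taken together, the user-visible effect of this series is the relocated query entry point. A minimal sketch of the before/after call sites, using featuretools' bundled mock-customer demo purely for illustration (the sessions entity and its customer_id variable come from that demo dataset, not from this patch):

    import featuretools as ft

    es = ft.demo.load_mock_customer(return_entityset=True)

    # Before this series, the query hung off the Entity:
    #   df = es['sessions'].query_by_values([1, 2], variable_id='customer_id')

    # After it, the query goes through the EntitySet, with the target
    # entity named explicitly as the first argument:
    df = es.query_by_values(entity_id='sessions',
                            instance_vals=[1, 2],
                            variable_id='customer_id')

    # The remaining parameters (columns, time_last, training_window,
    # include_cutoff_time) keep the same meaning they had on the Entity method.
    print(df.head())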