Move query_by_values to EntitySet #1251

Merged
merged 10 commits on Dec 15, 2020
3 changes: 2 additions & 1 deletion docs/source/release_notes.rst
@@ -6,13 +6,14 @@ Release Notes
* Enhancements
* Fixes
* Changes
* Move ``query_by_values`` method from ``Entity`` to ``EntitySet`` (:pr:`1251`)

Contributor: Let's also add a "Breaking Changes" section to highlight the move.
Contributor Author: Added

* Documentation Changes
* Testing Changes
* Use repository-scoped token for dependency check (:pr:`1245`, :pr:`1248`)
* Fix install error during docs CI test (:pr:`1250`)

Thanks to the following people for contributing to this release:
:user:`jeff-hernandez`, :user:`rwedge`
:user:`jeff-hernandez`, :user:`rwedge`, :user:`thehomebrewnerd`

**v0.22.0 Nov 30, 2020**
* Enhancements
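Since this is a breaking change, a minimal before/after sketch of a call site may help; the entity name and query values below are illustrative, not taken from the PR, and `es` is assumed to be an existing EntitySet:

```python
# es: an existing EntitySet containing an entity named "transactions"
# (hypothetical names, for illustration only)

# Before this PR -- query_by_values was a method on Entity:
df = es['transactions'].query_by_values(['a', 'b'], variable_id='value')

# After this PR -- the method lives on EntitySet and takes the entity id:
df = es.query_by_values(entity_id='transactions',
                        instance_vals=['a', 'b'],
                        variable_id='value')
```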
19 changes: 10 additions & 9 deletions featuretools/computational_backends/feature_set_calculator.py
@@ -209,8 +209,7 @@ def _calculate_features_for_entity(self, entity_id, feature_trie, df_trie,
need_full_entity, full_entity_features, not_full_entity_features = feature_trie.value

all_features = full_entity_features | not_full_entity_features
entity = self.entityset[entity_id]
columns = self._necessary_columns(entity, all_features)
columns = self._necessary_columns(entity_id, all_features)

# If we need the full entity then don't filter by filter_values.
if need_full_entity:
@@ -220,12 +219,13 @@ def _calculate_features_for_entity(self, entity_id, feature_trie, df_trie,
query_variable = filter_variable
query_values = filter_values

df = entity.query_by_values(query_values,
variable_id=query_variable,
columns=columns,
time_last=self.time_last,
training_window=self.training_window,
include_cutoff_time=include_cutoff_time)
df = self.entityset.query_by_values(entity_id=entity_id,
instance_vals=query_values,
variable_id=query_variable,
columns=columns,
time_last=self.time_last,
training_window=self.training_window,
include_cutoff_time=include_cutoff_time)

# call to update timer
progress_callback(0)
@@ -740,9 +740,10 @@ def last_n(df):

return frame

def _necessary_columns(self, entity, feature_names):
def _necessary_columns(self, entity_id, feature_names):
# We have to keep all Id columns because we don't know what forward
# relationships will come from this node.
entity = self.entityset[entity_id]
index_columns = {v.id for v in entity.variables
if isinstance(v, (variable_types.Index,
variable_types.Id,
93 changes: 0 additions & 93 deletions featuretools/entityset/entity.py
@@ -4,7 +4,6 @@
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pandas.api.types as pdtypes

from featuretools import variable_types as vtypes
from featuretools.utils.entity_utils import (
@@ -215,70 +214,6 @@ def convert_variable_type(self, variable_id, new_type,
new_variable = new_type.create_from(variable)
self.variables[self.variables.index(variable)] = new_variable

def query_by_values(self, instance_vals, variable_id=None, columns=None,
time_last=None, training_window=None, include_cutoff_time=True):
"""Query instances that have variable with given value

Args:
            instance_vals (pd.DataFrame, pd.Series, list[str] or str) :
Instance(s) to match.
variable_id (str) : Variable to query on. If None, query on index.
columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.Timestamp) : Query data up to and including this
time. Only applies if entity has a time index.
training_window (Timedelta, optional):
Window defining how much time before the cutoff time data
can be used when calculating features. If None, all data before cutoff time is used.
include_cutoff_time (bool):
If True, data at cutoff time are included in calculating features

Returns:
pd.DataFrame : instances that match constraints with ids in order of underlying dataframe
"""
if not variable_id:
variable_id = self.index

instance_vals = self._vals_to_series(instance_vals, variable_id)

training_window = _check_timedelta(training_window)

if training_window is not None:
assert training_window.has_no_observations(), "Training window cannot be in observations"

if instance_vals is None:
df = self.df.copy()

elif isinstance(instance_vals, pd.Series) and instance_vals.empty:
df = self.df.head(0)

else:
if is_instance(instance_vals, (dd, ks), 'Series'):
df = self.df.merge(instance_vals.to_frame(), how="inner", on=variable_id)
elif isinstance(instance_vals, pd.Series) and is_instance(self.df, ks, 'DataFrame'):
df = self.df.merge(ks.DataFrame({variable_id: instance_vals}), how="inner", on=variable_id)
else:
df = self.df[self.df[variable_id].isin(instance_vals)]

if isinstance(self.df, pd.DataFrame):
df = df.set_index(self.index, drop=False)

# ensure filtered df has same categories as original
# workaround for issue below
# github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
if pdtypes.is_categorical_dtype(self.df[variable_id]):
categories = pd.api.types.CategoricalDtype(categories=self.df[variable_id].cat.categories)
df[variable_id] = df[variable_id].astype(categories)

df = self._handle_time(df=df,
time_last=time_last,
training_window=training_window,
include_cutoff_time=include_cutoff_time)

if columns is not None:
df = df[columns]

return df

def _create_variables(self, variable_types, index, time_index, secondary_time_index):
"""Extracts the variables from a dataframe

@@ -509,34 +444,6 @@ def set_secondary_time_index(self, secondary_time_index):

self.secondary_time_index = secondary_time_index

def _vals_to_series(self, instance_vals, variable_id):
"""
instance_vals may be a pd.Dataframe, a pd.Series, a list, a single
value, or None. This function always returns a Series or None.
"""
if instance_vals is None:
return None

# If this is a single value, make it a list
if not hasattr(instance_vals, '__iter__'):
instance_vals = [instance_vals]

# convert iterable to pd.Series
if isinstance(instance_vals, pd.DataFrame):
out_vals = instance_vals[variable_id]
elif is_instance(instance_vals, (pd, dd, ks), 'Series'):
out_vals = instance_vals.rename(variable_id)
else:
out_vals = pd.Series(instance_vals)

# no duplicates or NaN values
out_vals = out_vals.drop_duplicates().dropna()

# want index to have no name for the merge in query_by_values
out_vals.index.name = None

return out_vals

def _handle_time(self, df, time_last=None, training_window=None, include_cutoff_time=True):
"""
Filter a dataframe for all instances before time_last.
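One subtlety worth calling out as this code moves: the categorical workaround above exists because a pandas merge can change the key column's dtype and drop its original categories (see the linked pandas issue). A pandas-only sketch of the failure mode and the fix, with toy data:

```python
import pandas as pd

df = pd.DataFrame({"value": pd.Categorical(["a", "b", "c"],
                                           categories=["a", "b", "c"])})
vals = pd.Series(["a"], name="value")  # query values arrive as plain object dtype

# Merging a categorical key with an object key can silently lose the
# categorical dtype and its full category list:
merged = df.merge(vals.to_frame(), how="inner", on="value")

# The workaround: re-apply the source column's original categories
dtype = pd.api.types.CategoricalDtype(categories=df["value"].cat.categories)
merged["value"] = merged["value"].astype(dtype)
assert list(merged["value"].cat.categories) == ["a", "b", "c"]
```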
97 changes: 97 additions & 0 deletions featuretools/entityset/entityset.py
@@ -6,6 +6,7 @@
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pandas.api.types as pdtypes
from pandas.api.types import is_dtype_equal, is_numeric_dtype

import featuretools.variable_types.variable as vtypes
@@ -18,6 +19,7 @@
get_graphviz_format,
save_graph
)
from featuretools.utils.wrangle import _check_timedelta

ks = import_or_none('databricks.koalas')

@@ -968,3 +970,98 @@ def plot(self, to_file=None):
if to_file:
save_graph(graph, to_file, format_)
return graph

def query_by_values(self, entity_id, instance_vals, variable_id=None, columns=None,
time_last=None, training_window=None, include_cutoff_time=True):
"""Query instances that have variable with given value

Args:
entity_id (str): The id of the entity to query
            instance_vals (pd.DataFrame, pd.Series, list[str] or str) :
Instance(s) to match.
variable_id (str) : Variable to query on. If None, query on index.
columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.Timestamp) : Query data up to and including this
time. Only applies if entity has a time index.
training_window (Timedelta, optional):
Window defining how much time before the cutoff time data
can be used when calculating features. If None, all data before cutoff time is used.
include_cutoff_time (bool):
If True, data at cutoff time are included in calculating features

Returns:
pd.DataFrame : instances that match constraints with ids in order of underlying dataframe
"""
entity = self[entity_id]
if not variable_id:
variable_id = entity.index

instance_vals = _vals_to_series(instance_vals, variable_id)

training_window = _check_timedelta(training_window)

if training_window is not None:
assert training_window.has_no_observations(), "Training window cannot be in observations"

if instance_vals is None:
df = entity.df.copy()

elif isinstance(instance_vals, pd.Series) and instance_vals.empty:
df = entity.df.head(0)

else:
if is_instance(instance_vals, (dd, ks), 'Series'):
df = entity.df.merge(instance_vals.to_frame(), how="inner", on=variable_id)
elif isinstance(instance_vals, pd.Series) and is_instance(entity.df, ks, 'DataFrame'):
df = entity.df.merge(ks.DataFrame({variable_id: instance_vals}), how="inner", on=variable_id)
else:
df = entity.df[entity.df[variable_id].isin(instance_vals)]

if isinstance(entity.df, pd.DataFrame):
df = df.set_index(entity.index, drop=False)

# ensure filtered df has same categories as original
# workaround for issue below
# github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
if pdtypes.is_categorical_dtype(entity.df[variable_id]):
categories = pd.api.types.CategoricalDtype(categories=entity.df[variable_id].cat.categories)
df[variable_id] = df[variable_id].astype(categories)

df = entity._handle_time(df=df,
time_last=time_last,
training_window=training_window,
include_cutoff_time=include_cutoff_time)

if columns is not None:
df = df[columns]

return df


def _vals_to_series(instance_vals, variable_id):
"""
instance_vals may be a pd.Dataframe, a pd.Series, a list, a single
value, or None. This function always returns a Series or None.
"""
if instance_vals is None:
return None

# If this is a single value, make it a list
if not hasattr(instance_vals, '__iter__'):
instance_vals = [instance_vals]

# convert iterable to pd.Series
if isinstance(instance_vals, pd.DataFrame):
out_vals = instance_vals[variable_id]
elif is_instance(instance_vals, (pd, dd, ks), 'Series'):
out_vals = instance_vals.rename(variable_id)
else:
out_vals = pd.Series(instance_vals)

# no duplicates or NaN values
out_vals = out_vals.drop_duplicates().dropna()

# want index to have no name for the merge in query_by_values
out_vals.index.name = None

return out_vals
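A runnable sketch of the relocated method, adapted from the test this PR removes from test_entity.py below; it assumes a featuretools release containing this change (the pre-1.0 API with `entity_from_dataframe`):

```python
import pandas as pd
import featuretools as ft

data = pd.DataFrame({
    "id": [1, 2, 3, 4, 5],
    "value": ["a", "c", "b", "a", "a"],
    "time": [1000, 2000, 3000, 4000, 5000],
})

es = ft.EntitySet()
es = es.entity_from_dataframe(entity_id="test", dataframe=data, index="id",
                              time_index="time", variable_types={
                                  "value": ft.variable_types.Categorical
                              })

# The entity id is now the first argument, on the EntitySet itself
query = es.query_by_values("test", ["b", "a"], variable_id="value")
assert list(query["id"]) == [1, 3, 4, 5]  # rows come back in dataframe order
```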
27 changes: 0 additions & 27 deletions featuretools/tests/entityset_tests/test_entity.py
@@ -138,33 +138,6 @@ def test_update_data(es):
assert es['customers'].df["id"].iloc[0] == 0


def test_query_by_values_returns_rows_in_given_order():
data = pd.DataFrame({
"id": [1, 2, 3, 4, 5],
"value": ["a", "c", "b", "a", "a"],
"time": [1000, 2000, 3000, 4000, 5000]
})

es = ft.EntitySet()
es = es.entity_from_dataframe(entity_id="test", dataframe=data, index="id",
time_index="time", variable_types={
"value": ft.variable_types.Categorical
})
query = es['test'].query_by_values(['b', 'a'], variable_id='value')
assert np.array_equal(query['id'], [1, 3, 4, 5])


def test_query_by_values_secondary_time_index(es):
end = np.datetime64(datetime(2011, 10, 1))
all_instances = [0, 1, 2]
result = es['customers'].query_by_values(all_instances, time_last=end)
result = to_pandas(result, index='id')

for col in ["cancel_date", "cancel_reason"]:
nulls = result.loc[all_instances][col].isnull() == [False, True, True]
assert nulls.all(), "Some instance has data it shouldn't for column %s" % col


def test_delete_variables(es):
entity = es['customers']
to_delete = ['age', 'cohort', 'email']
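The two deleted tests presumably reappear in the EntitySet test module with the new call form; a sketch of how the secondary-time-index one might read there (the `es` fixture and `to_pandas` helper are the ones the existing suite already provides; the import path is assumed):

```python
from datetime import datetime

import numpy as np

from featuretools.tests.testing_utils import to_pandas  # assumed import path


def test_query_by_values_secondary_time_index(es):
    end = np.datetime64(datetime(2011, 10, 1))
    all_instances = [0, 1, 2]
    # entity id moves into the call; everything else is unchanged
    result = es.query_by_values('customers', all_instances, time_last=end)
    result = to_pandas(result, index='id')

    for col in ["cancel_date", "cancel_reason"]:
        nulls = result.loc[all_instances][col].isnull() == [False, True, True]
        assert nulls.all(), "Some instance has data it shouldn't for column %s" % col
```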