Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove reindex in Entity.query_by_values #626

Merged
merged 9 commits into from Jun 27, 2019
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Expand Up @@ -10,6 +10,7 @@ Changelog
* Select columns of dataframe using a list (:pr:`615`)
* Change type of features calculated on Index features to Categorical (:pr:`602`)
* Specify Dask version in requirements for python 2 (:pr:`627`)
* Keep dataframe sorted by time during feature calculation (:pr:`626`)
* Changes
* Remove unused variance_selection.py file (:pr:`613`)
* Remove Timedelta data param (:pr:`619`)
Expand Down
4 changes: 4 additions & 0 deletions featuretools/computational_backends/feature_set_calculator.py
Expand Up @@ -118,6 +118,10 @@ def run(self, instance_ids):

df.index.name = self.entityset[self.feature_set.target_eid].index
column_list = []

# Order by instance_ids
df = df.reindex(list(pd.unique(instance_ids)))
CJStadler marked this conversation as resolved.
Show resolved Hide resolved

for feat in self.feature_set.target_features:
column_list.extend(feat.get_feature_names())
return df[column_list]
Expand Down
7 changes: 3 additions & 4 deletions featuretools/entityset/entity.py
Expand Up @@ -235,6 +235,9 @@ def query_by_values(self, instance_vals, variable_id=None, columns=None,
Returns:
pd.DataFrame : instances that match constraints with ids in order of underlying dataframe
"""
if not variable_id:
variable_id = self.index

instance_vals = self._vals_to_series(instance_vals, variable_id)

training_window = _check_timedelta(training_window)
Expand All @@ -249,10 +252,6 @@ def query_by_values(self, instance_vals, variable_id=None, columns=None,
elif instance_vals.shape[0] == 0:
df = self.df.head(0)

elif variable_id is None or variable_id == self.index:
df = self.df.reindex(instance_vals)
df.dropna(subset=[self.index], inplace=True)

else:
df = self.df[self.df[variable_id].isin(instance_vals)]
kmax12 marked this conversation as resolved.
Show resolved Hide resolved

Expand Down
Expand Up @@ -499,12 +499,15 @@ def test_topn(es):
['taco clock', np.nan]
])
assert ([name in df.columns for name in topn.get_feature_names()])

for i in range(df.shape[0]):
true = true_results.loc[i]
actual = df.loc[i]
if i == 0:
# coke zero and toothpase have same number of occurrences
assert set(true_results.loc[i].values) == set(df.loc[i].values)
assert set(true.values) == set(actual.values)
else:
for i1, i2 in zip(true_results.loc[i], df.iloc[i]):
for i1, i2 in zip(true, actual):
assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)


Expand Down Expand Up @@ -746,3 +749,17 @@ def get_function(self):
assert all(fm[f5.get_name()].sort_index() == value_sum)
assert all(fm[f6.get_name()].sort_index() == value_sum)
assert all(fm[f7.get_name()].sort_index() == value_sum)


def test_returns_order_of_instance_ids(es):
feature_set = FeatureSet([ft.Feature(es['customers']['age'])])
calculator = FeatureSetCalculator(es,
kmax12 marked this conversation as resolved.
Show resolved Hide resolved
time_last=None,
feature_set=feature_set)

instance_ids = [0, 1, 2]
assert list(es['customers'].df['id']) != instance_ids

df = calculator.run(instance_ids)

assert list(df.index) == instance_ids
3 changes: 1 addition & 2 deletions featuretools/tests/entityset_tests/test_entity.py
Expand Up @@ -113,8 +113,7 @@ def test_query_by_values_secondary_time_index(es):
result = es['customers'].query_by_values(all_instances, time_last=end)

for col in ["cancel_date", "cancel_reason"]:
nulls = result.iloc[all_instances][col].isnull() == [
False, True, True]
nulls = result.loc[all_instances][col].isnull() == [False, True, True]
assert nulls.all(), "Some instance has data it shouldn't for column %s" % col


Expand Down