From d93ff0ddd39a4d8c7881e59577e61feabc033790 Mon Sep 17 00:00:00 2001 From: Charles Bradshaw Date: Mon, 11 Feb 2019 15:11:30 -0500 Subject: [PATCH] Cumulative primitives (#410) * Updated Cumulative Primitives --- Makefile | 2 +- featuretools/entityset/entity.py | 1 + featuretools/entityset/entityset.py | 1 + .../standard/cum_transform_feature.py | 309 +++-------- .../test_transform_features.py | 503 ++++++++---------- 5 files changed, 293 insertions(+), 523 deletions(-) diff --git a/Makefile b/Makefile index fa26d56187..43165f737a 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ lint: flake8 featuretools && isort --check-only --recursive featuretools lint-fix: - autopep8 --in-place --recursive --max-line-length=100 --exclude="*/migrations/*" --select="E225,E303,E302,E203,E128,E231,E251,E271,E127,E126" featuretools + autopep8 --in-place --recursive --max-line-length=100 --exclude="*/migrations/*" --select="E225,E303,E302,E203,E128,E231,E251,E271,E127,E126,E301,W293,E226" featuretools isort --recursive featuretools diff --git a/featuretools/entityset/entity.py b/featuretools/entityset/entity.py index 67e9896f5a..2e3f8e4305 100644 --- a/featuretools/entityset/entity.py +++ b/featuretools/entityset/entity.py @@ -34,6 +34,7 @@ class Entity(object): :class:`.Relationship`, :class:`.Variable`, :class:`.EntitySet` """ + def __init__(self, id, df, entityset, variable_types=None, index=None, time_index=None, secondary_time_index=None, last_time_index=None, already_sorted=False, make_index=False, diff --git a/featuretools/entityset/entityset.py b/featuretools/entityset/entityset.py index 3820bd31fe..f8a00caa27 100644 --- a/featuretools/entityset/entityset.py +++ b/featuretools/entityset/entityset.py @@ -34,6 +34,7 @@ class EntitySet(object): metadata """ + def __init__(self, id=None, entities=None, relationships=None): """Creates EntitySet diff --git a/featuretools/primitives/standard/cum_transform_feature.py b/featuretools/primitives/standard/cum_transform_feature.py 
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Discrete, Id, Numeric


class CumSum(TransformPrimitive):
    """Running total of a numeric feature within each group.

    The first input supplies the values, the second identifies the group.
    Rows keep their original order, so each output value is the sum of all
    values seen so far (including the current row) in that row's group.
    """
    name = "cum_sum"
    input_types = [[Numeric, Id], [Numeric, Discrete]]
    return_type = Numeric
    # Cumulative results depend on every row of the entity, not only the
    # rows being calculated, so the framework must pass the full entity.
    uses_full_entity = True

    def get_function(self):
        def cum_sum(values, groups):
            # pandas preserves row order within each group, so cumsum()
            # yields a per-group running total aligned to the input index.
            return values.groupby(groups).cumsum()

        return cum_sum

    def generate_name(self, base_feature_names):
        return "CUM_SUM(%s by %s)" % (base_feature_names[0], base_feature_names[1])


class CumCount(TransformPrimitive):
    """Running count of occurrences of each value in a discrete feature.

    The count is 1-based: the first occurrence of a value yields 1, the
    second yields 2, and so on, in row order.
    """
    name = "cum_count"
    input_types = [[Id], [Discrete]]
    return_type = Numeric
    # See CumSum: the running count needs every row of the entity.
    uses_full_entity = True

    def get_function(self):
        def cum_count(values):
            # cumcount() is 0-based, so shift by one to count the current
            # row as well (first occurrence -> 1).
            position = values.groupby(values).cumcount()
            return position + 1

        return cum_count

    def generate_name(self, base_feature_names):
        return "CUM_COUNT(%s)" % (base_feature_names[0])


class CumMean(TransformPrimitive):
    """Running mean of a numeric feature within each group.

    Each output value is the mean of all values seen so far (including the
    current row) in that row's group.
    """
    name = "cum_mean"
    input_types = [[Numeric, Id], [Numeric, Discrete]]
    return_type = Numeric
    # See CumSum: the running mean needs every row of the entity.
    uses_full_entity = True

    def get_function(self):
        def cum_mean(values, groups):
            grouped = values.groupby(groups)
            running_total = grouped.cumsum()
            # cumcount() is 0-based; adding one gives the number of rows
            # seen so far in the group, including the current one.
            running_count = grouped.cumcount() + 1
            return running_total / running_count

        return cum_mean

    def generate_name(self, base_feature_names):
        return "CUM_MEAN(%s by %s)" % (base_feature_names[0], base_feature_names[1])


class CumMin(TransformPrimitive):
    """Running minimum of a numeric feature within each group."""
    name = "cum_min"
    input_types = [[Numeric, Id], [Numeric, Discrete]]
    return_type = Numeric
    # See CumSum: the running minimum needs every row of the entity.
    uses_full_entity = True

    def get_function(self):
        def cum_min(values, groups):
            return values.groupby(groups).cummin()

        return cum_min

    def generate_name(self, base_feature_names):
        return "CUM_MIN(%s by %s)" % (base_feature_names[0], base_feature_names[1])


class CumMax(TransformPrimitive):
    """Running maximum of a numeric feature within each group."""
    name = "cum_max"
    input_types = [[Numeric, Id], [Numeric, Discrete]]
    return_type = Numeric
    # See CumSum: the running maximum needs every row of the entity.
    uses_full_entity = True

    def get_function(self):
        def cum_max(values, groups):
            return values.groupby(groups).cummax()

        return cum_max

    def generate_name(self, base_feature_names):
        return "CUM_MAX(%s by %s)" % (base_feature_names[0], base_feature_names[1])
--- a/featuretools/tests/primitive_tests/test_transform_features.py +++ b/featuretools/tests/primitive_tests/test_transform_features.py @@ -7,15 +7,16 @@ import featuretools as ft from featuretools.computational_backends import PandasBackend -from featuretools.primitives.base import make_trans_primitive -from featuretools.synthesis.deep_feature_synthesis import match -from featuretools.variable_types import Boolean, Datetime, Numeric, Variable - -from featuretools.primitives import ( # CumCount,; CumMax,; CumMean,; CumMin,; CumSum, +from featuretools.primitives import ( Absolute, AddNumeric, AddNumericScalar, Count, + CumCount, + CumMax, + CumMean, + CumMin, + CumSum, Day, Diff, DivideByFeature, @@ -38,12 +39,12 @@ Month, MultiplyNumeric, MultiplyNumericScalar, + NMostCommon, Not, NotEqual, NotEqualScalar, NumCharacters, NumWords, - NMostCommon, Percentile, ScalarSubtractNumericFeature, Second, @@ -54,6 +55,9 @@ Year, get_transform_primitives ) +from featuretools.primitives.base import make_trans_primitive +from featuretools.synthesis.deep_feature_synthesis import match +from featuretools.variable_types import Boolean, Datetime, Numeric, Variable # some tests change the entityset values, so we have to create it fresh @@ -412,280 +416,219 @@ def test_haversine(es): for i, v in enumerate(real): assert v - values[i] < .0001 -# # M TODOS -# def test_cum_sum(es): -# log_value_feat = es['log']['value'] -# cum_sum = CumSum(log_value_feat, es['log']['session_id']) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# cum_sum_values = [0, 5, 15, 30, 50, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21] -# for i, v in enumerate(cum_sum_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_min(es): -# log_value_feat = es['log']['value'] -# cum_min = CumMin(log_value_feat, es['log']['session_id']) -# features = [cum_min] -# df = 
ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_min.get_name()].values -# assert len(cvalues) == 15 -# cum_min_values = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] -# for i, v in enumerate(cum_min_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_max(es): -# log_value_feat = es['log']['value'] -# cum_max = CumMax(log_value_feat, es['log']['session_id']) -# features = [cum_max] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_max.get_name()].values -# assert len(cvalues) == 15 -# cum_max_values = [0, 5, 10, 15, 20, 0, 1, 2, 3, 0, 0, 5, 0, 7, 14] -# for i, v in enumerate(cum_max_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_sum_use_previous(es): -# log_value_feat = es['log']['value'] -# cum_sum = CumSum(log_value_feat, es['log']['session_id'], -# use_previous=Timedelta(3, 'observations', -# entity=es['log'])) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# cum_sum_values = [0, 5, 15, 30, 45, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21] -# for i, v in enumerate(cum_sum_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_sum_use_previous_integer_time(int_es): -# es = int_es - -# log_value_feat = es['log']['value'] -# with pytest.raises(AssertionError, match=''): -# CumSum(log_value_feat, es['log']['session_id'], -# use_previous=Timedelta(3, 'm')) - -# cum_sum = CumSum(log_value_feat, es['log']['session_id'], -# use_previous=Timedelta(3, 'observations', -# entity=es['log'])) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# cum_sum_values = [0, 5, 15, 30, 45, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21] -# for i, v in enumerate(cum_sum_values): -# 
assert v == cvalues[i] - -# # M TODOS -# def test_cum_sum_where(es): -# log_value_feat = es['log']['value'] -# compare_feat = GreaterThan(log_value_feat, 3) -# dfeat = ft.Feature(es['sessions']['customer_id'], es['log']) -# cum_sum = CumSum(log_value_feat, dfeat, -# where=compare_feat) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# cum_sum_values = [0, 5, 15, 30, 50, 50, 50, 50, 50, 50, -# 0, 5, 5, 12, 26] -# for i, v in enumerate(cum_sum_values): -# if not np.isnan(v): -# assert v == cvalues[i] -# else: -# assert (np.isnan(cvalues[i])) - -# # M TODOS -# def test_cum_sum_use_previous_and_where(es): -# log_value_feat = es['log']['value'] -# compare_feat = GreaterThan(log_value_feat, 3) -# # todo should this be cummean? -# dfeat = ft.Feature(es['sessions']['customer_id'], es['log']) -# cum_sum = CumSum(log_value_feat, dfeat, -# where=compare_feat, -# use_previous=Timedelta(3, 'observations', -# entity=es['log'])) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) - -# cum_sum_values = [0, 5, 15, 30, 45, 45, 45, 45, 45, 45, -# 0, 5, 5, 12, 26] -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# for i, v in enumerate(cum_sum_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_sum_group_on_nan(es): -# log_value_feat = es['log']['value'] -# es['log'].df['product_id'] = (['coke zero'] * 3 + ['car'] * 2 + -# ['toothpaste'] * 3 + ['brown bag'] * 2 + -# ['shoes'] + -# [np.nan] * 4 + -# ['coke_zero'] * 2) -# cum_sum = CumSum(log_value_feat, es['log']['product_id']) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# cum_sum_values = [0, 5, 15, -# 15, 35, -# 0, 1, 3, -# 3, 3, -# 0, -# np.nan, 
np.nan, np.nan, np.nan] -# for i, v in enumerate(cum_sum_values): -# if np.isnan(v): -# assert (np.isnan(cvalues[i])) -# else: -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_sum_use_previous_group_on_nan(es): -# # TODO: Figure out how to test where `df` -# # in pd_rolling get_function() has multiindex -# log_value_feat = es['log']['value'] -# es['log'].df['product_id'] = (['coke zero'] * 3 + ['car'] * 2 + -# ['toothpaste'] * 3 + ['brown bag'] * 2 + -# ['shoes'] + -# [np.nan] * 4 + -# ['coke_zero'] * 2) -# cum_sum = CumSum(log_value_feat, -# es['log']['product_id'], -# es["log"]["datetime"], -# use_previous=Timedelta(40, 'seconds')) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# cum_sum_values = [0, 5, 15, -# 15, 35, -# 0, 1, 3, -# 3, 0, -# 0, -# np.nan, np.nan, np.nan, np.nan] -# for i, v in enumerate(cum_sum_values): -# if np.isnan(v): -# assert (np.isnan(cvalues[i])) -# else: -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_sum_use_previous_and_where_absolute(es): -# log_value_feat = es['log']['value'] -# compare_feat = GreaterThan(log_value_feat, 3) -# dfeat = ft.Feature(es['sessions']['customer_id'], es['log']) -# cum_sum = CumSum(log_value_feat, dfeat, es["log"]["datetime"], -# where=compare_feat, -# use_previous=Timedelta(40, 'seconds')) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) - -# cum_sum_values = [0, 5, 15, 30, 50, 0, 0, 0, 0, 0, -# 0, 5, 0, 7, 21] -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# for i, v in enumerate(cum_sum_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_handles_uses_full_entity(es): -# def check(feature): -# pandas_backend = PandasBackend(es, [feature]) -# df_1 = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2], time_last=None) -# df_2 = 
pandas_backend.calculate_all_features(instance_ids=[2], time_last=None) - -# # check that the value for instance id 2 matches -# assert (df_2.loc[2] == df_1.loc[2]).all() - -# for primitive in [CumSum, CumMean, CumMax, CumMin]: -# check(primitive(es['log']['value'], es['log']['session_id'])) - -# check(Cumft.Feature(es['log']['id'], parent_entity=es['log']['session_id']), primitive=Count) - -# # M TODOS -# def test_cum_mean(es): -# log_value_feat = es['log']['value'] -# cum_mean = CumMean(log_value_feat, es['log']['session_id']) -# features = [cum_mean] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_mean.get_name()].values -# assert len(cvalues) == 15 -# cum_mean_values = [0, 2.5, 5, 7.5, 10, 0, .5, 1, 1.5, 0, 0, 2.5, 0, 3.5, 7] -# for i, v in enumerate(cum_mean_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_mean_use_previous(es): -# log_value_feat = es['log']['value'] -# cum_mean = CumMean(log_value_feat, es['log']['session_id'], -# use_previous=Timedelta(3, 'observations', -# entity=es['log'])) -# features = [cum_mean] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_mean.get_name()].values -# assert len(cvalues) == 15 -# cum_mean_values = [0, 2.5, 5, 10, 15, 0, .5, 1, 2, 0, 0, 2.5, 0, 3.5, 7] -# for i, v in enumerate(cum_mean_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_mean_where(es): -# log_value_feat = es['log']['value'] -# compare_feat = GreaterThan(log_value_feat, 3) -# dfeat = ft.Feature(es['sessions']['customer_id'], es['log']) -# cum_mean = CumMean(log_value_feat, dfeat, -# where=compare_feat) -# features = [cum_mean] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_mean.get_name()].values -# assert len(cvalues) == 15 -# cum_mean_values = [0, 5, 7.5, 10, 12.5, 12.5, 12.5, 12.5, 12.5, 12.5, -# 0, 5, 5, 6, 26. 
/ 3] - -# for i, v in enumerate(cum_mean_values): -# if not np.isnan(v): -# assert v == cvalues[i] -# else: -# assert (np.isnan(cvalues[i])) - -# # M TODOS -# def test_cum_mean_use_previous_and_where(es): -# log_value_feat = es['log']['value'] -# compare_feat = GreaterThan(log_value_feat, 3) -# # todo should this be cummean? -# dfeat = ft.Feature(es['sessions']['customer_id'], es['log']) -# cum_mean = CumMean(log_value_feat, dfeat, -# where=compare_feat, -# use_previous=Timedelta(2, 'observations', -# entity=es['log'])) -# features = [cum_mean] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) - -# cum_mean_values = [0, 5, 7.5, 12.5, 17.5, 17.5, 17.5, 17.5, 17.5, 17.5, -# 0, 5, 5, 6, 10.5] -# cvalues = df[cum_mean.get_name()].values -# assert len(cvalues) == 15 -# for i, v in enumerate(cum_mean_values): -# assert v == cvalues[i] - -# M TODOS -# def test_cum_count(es): -# log_id_feat = es['log']['id'] -# cum_count = CumCount(log_id_feat, es['log']['session_id']) -# features = [cum_count] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_count.get_name()].values -# assert len(cvalues) == 15 -# cum_count_values = [1, 2, 3, 4, 5, 1, 2, 3, 4, 1, 1, 2, 1, 2, 3] -# for i, v in enumerate(cum_count_values): -# assert v == cvalues[i] + +class TestCumCount: + + primitive = CumCount + + def test_order(self): + g = pd.Series(["a", "b", "a"]) + + answer = [1, 1, 2] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(g), answer) + + def test_regular(self): + g = pd.Series(["a", "b", "a", "c", "d", "b"]) + answer = [1, 1, 2, 1, 1, 2] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(g), answer) + + def test_discrete(self): + g = pd.Series(["a", "b", "a", "c", "d", "b"]) + answer = [1, 1, 2, 1, 1, 2] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(g), answer) + + 
+class TestCumSum: + + primitive = CumSum + + def test_order(self): + v = pd.Series([1, 2, 2]) + g = pd.Series(["a", "b", "a"]) + + answer = [1, 2, 3] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + def test_regular(self): + v = pd.Series([101, 102, 103, 104, 105, 106]) + g = pd.Series(["a", "b", "a", "c", "d", "b"]) + answer = [101, 102, 204, 104, 105, 208] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + +class TestCumMean: + primitive = CumMean + + def test_order(self): + v = pd.Series([1, 2, 2]) + g = pd.Series(["a", "b", "a"]) + + answer = [1, 2, 1.5] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + def test_regular(self): + v = pd.Series([101, 102, 103, 104, 105, 106]) + g = pd.Series(["a", "b", "a", "c", "d", "b"]) + answer = [101, 102, 102, 104, 105, 104] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + +class TestCumMax: + + primitive = CumMax + + def test_order(self): + v = pd.Series([1, 2, 2]) + g = pd.Series(["a", "b", "a"]) + + answer = [1, 2, 2] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + def test_regular(self): + v = pd.Series([101, 102, 103, 104, 105, 106]) + g = pd.Series(["a", "b", "a", "c", "d", "b"]) + answer = [101, 102, 103, 104, 105, 106] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + +class TestCumMin: + + primitive = CumMin + + def test_order(self): + v = pd.Series([1, 2, 2]) + g = pd.Series(["a", "b", "a"]) + + answer = [1, 2, 1] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + def test_regular(self): + v = pd.Series([101, 102, 103, 104, 105, 106, 100]) + g = pd.Series(["a", "b", "a", "c", "d", "b", "a"]) + answer = [101, 102, 101, 
104, 105, 102, 100] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + +def test_cum_sum(es): + log_value_feat = es['log']['value'] + + cum_sum = ft.Feature([log_value_feat, es['log']['session_id']], primitive=CumSum) + features = [cum_sum] + df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) + cvalues = df[cum_sum.get_name()].values + assert len(cvalues) == 15 + cum_sum_values = [0, 5, 15, 30, 50, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21] + for i, v in enumerate(cum_sum_values): + assert v == cvalues[i] + + +def test_cum_min(es): + log_value_feat = es['log']['value'] + cum_min = ft.Feature([log_value_feat, es['log']['session_id']], primitive=CumMin) + features = [cum_min] + df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) + cvalues = df[cum_min.get_name()].values + assert len(cvalues) == 15 + cum_min_values = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + for i, v in enumerate(cum_min_values): + assert v == cvalues[i] + + +def test_cum_max(es): + log_value_feat = es['log']['value'] + cum_max = ft.Feature([log_value_feat, es['log']['session_id']], primitive=CumMax) + features = [cum_max] + df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) + cvalues = df[cum_max.get_name()].values + assert len(cvalues) == 15 + cum_max_values = [0, 5, 10, 15, 20, 0, 1, 2, 3, 0, 0, 5, 0, 7, 14] + for i, v in enumerate(cum_max_values): + assert v == cvalues[i] + + +def test_cum_sum_group_on_nan(es): + log_value_feat = es['log']['value'] + es['log'].df['product_id'] = (['coke zero'] * 3 + ['car'] * 2 + + ['toothpaste'] * 3 + ['brown bag'] * 2 + + ['shoes'] + + [np.nan] * 4 + + ['coke_zero'] * 2) + cum_sum = ft.Feature([log_value_feat, es['log']['product_id']], primitive=CumSum) + features = [cum_sum] + df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) + cvalues = 
df[cum_sum.get_name()].values + assert len(cvalues) == 15 + cum_sum_values = [0, 5, 15, + 15, 35, + 0, 1, 3, + 3, 3, + 0, + np.nan, np.nan, np.nan, np.nan] + for i, v in enumerate(cum_sum_values): + if np.isnan(v): + assert (np.isnan(cvalues[i])) + else: + assert v == cvalues[i] + + +def test_cum_handles_uses_full_entity(es): + def check(feature): + pandas_backend = PandasBackend(es, [feature]) + df_1 = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2], time_last=None) + df_2 = pandas_backend.calculate_all_features(instance_ids=[2, 4], time_last=None) + + # check that the value for instance id 2 matches + assert (df_2.loc[2] == df_1.loc[2]).all() + + for primitive in [CumSum, CumMean, CumMax, CumMin]: + check(ft.Feature([es['log']['value'], es['log']['session_id']], primitive=primitive)) + + check(ft.Feature(es['log']['session_id'], primitive=CumCount)) + + +def test_cum_mean(es): + log_value_feat = es['log']['value'] + cum_mean = ft.Feature([log_value_feat, es['log']['session_id']], primitive=CumMean) + features = [cum_mean] + df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) + cvalues = df[cum_mean.get_name()].values + assert len(cvalues) == 15 + cum_mean_values = [0, 2.5, 5, 7.5, 10, 0, .5, 1, 1.5, 0, 0, 2.5, 0, 3.5, 7] + for i, v in enumerate(cum_mean_values): + assert v == cvalues[i] + + +def test_cum_count(es): + cum_count = ft.Feature([es['log']['session_id']], primitive=CumCount) + features = [cum_count] + df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) + cvalues = df[cum_count.get_name()].values + assert len(cvalues) == 15 + cum_count_values = [1, 2, 3, 4, 5, 1, 2, 3, 4, 1, 1, 2, 1, 2, 3] + for i, v in enumerate(cum_count_values): + assert v == cvalues[i] def test_text_primitives(es):