From d93ff0ddd39a4d8c7881e59577e61feabc033790 Mon Sep 17 00:00:00 2001 From: Charles Bradshaw Date: Mon, 11 Feb 2019 15:11:30 -0500 Subject: [PATCH] Cumulative primitives (#410) * Updated Cumulative Primitives --- Makefile | 2 +- featuretools/entityset/entity.py | 1 + featuretools/entityset/entityset.py | 1 + .../standard/cum_transform_feature.py | 309 +++-------- .../test_transform_features.py | 503 ++++++++---------- 5 files changed, 293 insertions(+), 523 deletions(-) diff --git a/Makefile b/Makefile index fa26d56187..43165f737a 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ lint: flake8 featuretools && isort --check-only --recursive featuretools lint-fix: - autopep8 --in-place --recursive --max-line-length=100 --exclude="*/migrations/*" --select="E225,E303,E302,E203,E128,E231,E251,E271,E127,E126" featuretools + autopep8 --in-place --recursive --max-line-length=100 --exclude="*/migrations/*" --select="E225,E303,E302,E203,E128,E231,E251,E271,E127,E126,E301,W293,E226" featuretools isort --recursive featuretools diff --git a/featuretools/entityset/entity.py b/featuretools/entityset/entity.py index 67e9896f5a..2e3f8e4305 100644 --- a/featuretools/entityset/entity.py +++ b/featuretools/entityset/entity.py @@ -34,6 +34,7 @@ class Entity(object): :class:`.Relationship`, :class:`.Variable`, :class:`.EntitySet` """ + def __init__(self, id, df, entityset, variable_types=None, index=None, time_index=None, secondary_time_index=None, last_time_index=None, already_sorted=False, make_index=False, diff --git a/featuretools/entityset/entityset.py b/featuretools/entityset/entityset.py index 3820bd31fe..f8a00caa27 100644 --- a/featuretools/entityset/entityset.py +++ b/featuretools/entityset/entityset.py @@ -34,6 +34,7 @@ class EntitySet(object): metadata """ + def __init__(self, id=None, entities=None, relationships=None): """Creates EntitySet diff --git a/featuretools/primitives/standard/cum_transform_feature.py b/featuretools/primitives/standard/cum_transform_feature.py 
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Discrete, Id, Numeric


class CumSum(TransformPrimitive):
    """Running total of a numeric feature within each group.

    The first input supplies the values, the second identifies the group.
    Rows keep their original order, so each output value is the sum of all
    values seen so far (including the current row) in that row's group.
    """
    name = "cum_sum"
    input_types = [[Numeric, Id], [Numeric, Discrete]]
    return_type = Numeric
    # Cumulative results depend on every row of the entity, not only the
    # rows being calculated, so the framework must pass the full entity.
    uses_full_entity = True

    def get_function(self):
        def cum_sum(values, groups):
            # pandas preserves row order within each group, so cumsum()
            # yields a per-group running total aligned to the input index.
            return values.groupby(groups).cumsum()

        return cum_sum

    def generate_name(self, base_feature_names):
        return "CUM_SUM(%s by %s)" % (base_feature_names[0], base_feature_names[1])


class CumCount(TransformPrimitive):
    """Running count of occurrences of each value in a discrete feature.

    The count is 1-based: the first occurrence of a value yields 1, the
    second yields 2, and so on, in row order.
    """
    name = "cum_count"
    input_types = [[Id], [Discrete]]
    return_type = Numeric
    # See CumSum: the running count needs every row of the entity.
    uses_full_entity = True

    def get_function(self):
        def cum_count(values):
            # cumcount() is 0-based, so shift by one to count the current
            # row as well (first occurrence -> 1).
            position = values.groupby(values).cumcount()
            return position + 1

        return cum_count

    def generate_name(self, base_feature_names):
        return "CUM_COUNT(%s)" % (base_feature_names[0])


class CumMean(TransformPrimitive):
    """Running mean of a numeric feature within each group.

    Each output value is the mean of all values seen so far (including the
    current row) in that row's group.
    """
    name = "cum_mean"
    input_types = [[Numeric, Id], [Numeric, Discrete]]
    return_type = Numeric
    # See CumSum: the running mean needs every row of the entity.
    uses_full_entity = True

    def get_function(self):
        def cum_mean(values, groups):
            grouped = values.groupby(groups)
            running_total = grouped.cumsum()
            # cumcount() is 0-based; adding one gives the number of rows
            # seen so far in the group, including the current one.
            running_count = grouped.cumcount() + 1
            return running_total / running_count

        return cum_mean

    def generate_name(self, base_feature_names):
        return "CUM_MEAN(%s by %s)" % (base_feature_names[0], base_feature_names[1])


class CumMin(TransformPrimitive):
    """Running minimum of a numeric feature within each group."""
    name = "cum_min"
    input_types = [[Numeric, Id], [Numeric, Discrete]]
    return_type = Numeric
    # See CumSum: the running minimum needs every row of the entity.
    uses_full_entity = True

    def get_function(self):
        def cum_min(values, groups):
            return values.groupby(groups).cummin()

        return cum_min

    def generate_name(self, base_feature_names):
        return "CUM_MIN(%s by %s)" % (base_feature_names[0], base_feature_names[1])


class CumMax(TransformPrimitive):
    """Running maximum of a numeric feature within each group."""
    name = "cum_max"
    input_types = [[Numeric, Id], [Numeric, Discrete]]
    return_type = Numeric
    # See CumSum: the running maximum needs every row of the entity.
    uses_full_entity = True

    def get_function(self):
        def cum_max(values, groups):
            return values.groupby(groups).cummax()

        return cum_max

    def generate_name(self, base_feature_names):
        return "CUM_MAX(%s by %s)" % (base_feature_names[0], base_feature_names[1])
--- a/featuretools/tests/primitive_tests/test_transform_features.py +++ b/featuretools/tests/primitive_tests/test_transform_features.py @@ -7,15 +7,16 @@ import featuretools as ft from featuretools.computational_backends import PandasBackend -from featuretools.primitives.base import make_trans_primitive -from featuretools.synthesis.deep_feature_synthesis import match -from featuretools.variable_types import Boolean, Datetime, Numeric, Variable - -from featuretools.primitives import ( # CumCount,; CumMax,; CumMean,; CumMin,; CumSum, +from featuretools.primitives import ( Absolute, AddNumeric, AddNumericScalar, Count, + CumCount, + CumMax, + CumMean, + CumMin, + CumSum, Day, Diff, DivideByFeature, @@ -38,12 +39,12 @@ Month, MultiplyNumeric, MultiplyNumericScalar, + NMostCommon, Not, NotEqual, NotEqualScalar, NumCharacters, NumWords, - NMostCommon, Percentile, ScalarSubtractNumericFeature, Second, @@ -54,6 +55,9 @@ Year, get_transform_primitives ) +from featuretools.primitives.base import make_trans_primitive +from featuretools.synthesis.deep_feature_synthesis import match +from featuretools.variable_types import Boolean, Datetime, Numeric, Variable # some tests change the entityset values, so we have to create it fresh @@ -412,280 +416,219 @@ def test_haversine(es): for i, v in enumerate(real): assert v - values[i] < .0001 -# # M TODOS -# def test_cum_sum(es): -# log_value_feat = es['log']['value'] -# cum_sum = CumSum(log_value_feat, es['log']['session_id']) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# cum_sum_values = [0, 5, 15, 30, 50, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21] -# for i, v in enumerate(cum_sum_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_min(es): -# log_value_feat = es['log']['value'] -# cum_min = CumMin(log_value_feat, es['log']['session_id']) -# features = [cum_min] -# df = 
ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_min.get_name()].values -# assert len(cvalues) == 15 -# cum_min_values = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] -# for i, v in enumerate(cum_min_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_max(es): -# log_value_feat = es['log']['value'] -# cum_max = CumMax(log_value_feat, es['log']['session_id']) -# features = [cum_max] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_max.get_name()].values -# assert len(cvalues) == 15 -# cum_max_values = [0, 5, 10, 15, 20, 0, 1, 2, 3, 0, 0, 5, 0, 7, 14] -# for i, v in enumerate(cum_max_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_sum_use_previous(es): -# log_value_feat = es['log']['value'] -# cum_sum = CumSum(log_value_feat, es['log']['session_id'], -# use_previous=Timedelta(3, 'observations', -# entity=es['log'])) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# cum_sum_values = [0, 5, 15, 30, 45, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21] -# for i, v in enumerate(cum_sum_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_sum_use_previous_integer_time(int_es): -# es = int_es - -# log_value_feat = es['log']['value'] -# with pytest.raises(AssertionError, match=''): -# CumSum(log_value_feat, es['log']['session_id'], -# use_previous=Timedelta(3, 'm')) - -# cum_sum = CumSum(log_value_feat, es['log']['session_id'], -# use_previous=Timedelta(3, 'observations', -# entity=es['log'])) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# cum_sum_values = [0, 5, 15, 30, 45, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21] -# for i, v in enumerate(cum_sum_values): -# 
assert v == cvalues[i] - -# # M TODOS -# def test_cum_sum_where(es): -# log_value_feat = es['log']['value'] -# compare_feat = GreaterThan(log_value_feat, 3) -# dfeat = ft.Feature(es['sessions']['customer_id'], es['log']) -# cum_sum = CumSum(log_value_feat, dfeat, -# where=compare_feat) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# cum_sum_values = [0, 5, 15, 30, 50, 50, 50, 50, 50, 50, -# 0, 5, 5, 12, 26] -# for i, v in enumerate(cum_sum_values): -# if not np.isnan(v): -# assert v == cvalues[i] -# else: -# assert (np.isnan(cvalues[i])) - -# # M TODOS -# def test_cum_sum_use_previous_and_where(es): -# log_value_feat = es['log']['value'] -# compare_feat = GreaterThan(log_value_feat, 3) -# # todo should this be cummean? -# dfeat = ft.Feature(es['sessions']['customer_id'], es['log']) -# cum_sum = CumSum(log_value_feat, dfeat, -# where=compare_feat, -# use_previous=Timedelta(3, 'observations', -# entity=es['log'])) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) - -# cum_sum_values = [0, 5, 15, 30, 45, 45, 45, 45, 45, 45, -# 0, 5, 5, 12, 26] -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# for i, v in enumerate(cum_sum_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_sum_group_on_nan(es): -# log_value_feat = es['log']['value'] -# es['log'].df['product_id'] = (['coke zero'] * 3 + ['car'] * 2 + -# ['toothpaste'] * 3 + ['brown bag'] * 2 + -# ['shoes'] + -# [np.nan] * 4 + -# ['coke_zero'] * 2) -# cum_sum = CumSum(log_value_feat, es['log']['product_id']) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# cum_sum_values = [0, 5, 15, -# 15, 35, -# 0, 1, 3, -# 3, 3, -# 0, -# np.nan, 
np.nan, np.nan, np.nan] -# for i, v in enumerate(cum_sum_values): -# if np.isnan(v): -# assert (np.isnan(cvalues[i])) -# else: -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_sum_use_previous_group_on_nan(es): -# # TODO: Figure out how to test where `df` -# # in pd_rolling get_function() has multiindex -# log_value_feat = es['log']['value'] -# es['log'].df['product_id'] = (['coke zero'] * 3 + ['car'] * 2 + -# ['toothpaste'] * 3 + ['brown bag'] * 2 + -# ['shoes'] + -# [np.nan] * 4 + -# ['coke_zero'] * 2) -# cum_sum = CumSum(log_value_feat, -# es['log']['product_id'], -# es["log"]["datetime"], -# use_previous=Timedelta(40, 'seconds')) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# cum_sum_values = [0, 5, 15, -# 15, 35, -# 0, 1, 3, -# 3, 0, -# 0, -# np.nan, np.nan, np.nan, np.nan] -# for i, v in enumerate(cum_sum_values): -# if np.isnan(v): -# assert (np.isnan(cvalues[i])) -# else: -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_sum_use_previous_and_where_absolute(es): -# log_value_feat = es['log']['value'] -# compare_feat = GreaterThan(log_value_feat, 3) -# dfeat = ft.Feature(es['sessions']['customer_id'], es['log']) -# cum_sum = CumSum(log_value_feat, dfeat, es["log"]["datetime"], -# where=compare_feat, -# use_previous=Timedelta(40, 'seconds')) -# features = [cum_sum] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) - -# cum_sum_values = [0, 5, 15, 30, 50, 0, 0, 0, 0, 0, -# 0, 5, 0, 7, 21] -# cvalues = df[cum_sum.get_name()].values -# assert len(cvalues) == 15 -# for i, v in enumerate(cum_sum_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_handles_uses_full_entity(es): -# def check(feature): -# pandas_backend = PandasBackend(es, [feature]) -# df_1 = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2], time_last=None) -# df_2 = 
pandas_backend.calculate_all_features(instance_ids=[2], time_last=None) - -# # check that the value for instance id 2 matches -# assert (df_2.loc[2] == df_1.loc[2]).all() - -# for primitive in [CumSum, CumMean, CumMax, CumMin]: -# check(primitive(es['log']['value'], es['log']['session_id'])) - -# check(Cumft.Feature(es['log']['id'], parent_entity=es['log']['session_id']), primitive=Count) - -# # M TODOS -# def test_cum_mean(es): -# log_value_feat = es['log']['value'] -# cum_mean = CumMean(log_value_feat, es['log']['session_id']) -# features = [cum_mean] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_mean.get_name()].values -# assert len(cvalues) == 15 -# cum_mean_values = [0, 2.5, 5, 7.5, 10, 0, .5, 1, 1.5, 0, 0, 2.5, 0, 3.5, 7] -# for i, v in enumerate(cum_mean_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_mean_use_previous(es): -# log_value_feat = es['log']['value'] -# cum_mean = CumMean(log_value_feat, es['log']['session_id'], -# use_previous=Timedelta(3, 'observations', -# entity=es['log'])) -# features = [cum_mean] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_mean.get_name()].values -# assert len(cvalues) == 15 -# cum_mean_values = [0, 2.5, 5, 10, 15, 0, .5, 1, 2, 0, 0, 2.5, 0, 3.5, 7] -# for i, v in enumerate(cum_mean_values): -# assert v == cvalues[i] - -# # M TODOS -# def test_cum_mean_where(es): -# log_value_feat = es['log']['value'] -# compare_feat = GreaterThan(log_value_feat, 3) -# dfeat = ft.Feature(es['sessions']['customer_id'], es['log']) -# cum_mean = CumMean(log_value_feat, dfeat, -# where=compare_feat) -# features = [cum_mean] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_mean.get_name()].values -# assert len(cvalues) == 15 -# cum_mean_values = [0, 5, 7.5, 10, 12.5, 12.5, 12.5, 12.5, 12.5, 12.5, -# 0, 5, 5, 6, 26. 
/ 3] - -# for i, v in enumerate(cum_mean_values): -# if not np.isnan(v): -# assert v == cvalues[i] -# else: -# assert (np.isnan(cvalues[i])) - -# # M TODOS -# def test_cum_mean_use_previous_and_where(es): -# log_value_feat = es['log']['value'] -# compare_feat = GreaterThan(log_value_feat, 3) -# # todo should this be cummean? -# dfeat = ft.Feature(es['sessions']['customer_id'], es['log']) -# cum_mean = CumMean(log_value_feat, dfeat, -# where=compare_feat, -# use_previous=Timedelta(2, 'observations', -# entity=es['log'])) -# features = [cum_mean] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) - -# cum_mean_values = [0, 5, 7.5, 12.5, 17.5, 17.5, 17.5, 17.5, 17.5, 17.5, -# 0, 5, 5, 6, 10.5] -# cvalues = df[cum_mean.get_name()].values -# assert len(cvalues) == 15 -# for i, v in enumerate(cum_mean_values): -# assert v == cvalues[i] - -# M TODOS -# def test_cum_count(es): -# log_id_feat = es['log']['id'] -# cum_count = CumCount(log_id_feat, es['log']['session_id']) -# features = [cum_count] -# df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) -# cvalues = df[cum_count.get_name()].values -# assert len(cvalues) == 15 -# cum_count_values = [1, 2, 3, 4, 5, 1, 2, 3, 4, 1, 1, 2, 1, 2, 3] -# for i, v in enumerate(cum_count_values): -# assert v == cvalues[i] + +class TestCumCount: + + primitive = CumCount + + def test_order(self): + g = pd.Series(["a", "b", "a"]) + + answer = [1, 1, 2] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(g), answer) + + def test_regular(self): + g = pd.Series(["a", "b", "a", "c", "d", "b"]) + answer = [1, 1, 2, 1, 1, 2] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(g), answer) + + def test_discrete(self): + g = pd.Series(["a", "b", "a", "c", "d", "b"]) + answer = [1, 1, 2, 1, 1, 2] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(g), answer) + + 
+class TestCumSum: + + primitive = CumSum + + def test_order(self): + v = pd.Series([1, 2, 2]) + g = pd.Series(["a", "b", "a"]) + + answer = [1, 2, 3] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + def test_regular(self): + v = pd.Series([101, 102, 103, 104, 105, 106]) + g = pd.Series(["a", "b", "a", "c", "d", "b"]) + answer = [101, 102, 204, 104, 105, 208] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + +class TestCumMean: + primitive = CumMean + + def test_order(self): + v = pd.Series([1, 2, 2]) + g = pd.Series(["a", "b", "a"]) + + answer = [1, 2, 1.5] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + def test_regular(self): + v = pd.Series([101, 102, 103, 104, 105, 106]) + g = pd.Series(["a", "b", "a", "c", "d", "b"]) + answer = [101, 102, 102, 104, 105, 104] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + +class TestCumMax: + + primitive = CumMax + + def test_order(self): + v = pd.Series([1, 2, 2]) + g = pd.Series(["a", "b", "a"]) + + answer = [1, 2, 2] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + def test_regular(self): + v = pd.Series([101, 102, 103, 104, 105, 106]) + g = pd.Series(["a", "b", "a", "c", "d", "b"]) + answer = [101, 102, 103, 104, 105, 106] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + +class TestCumMin: + + primitive = CumMin + + def test_order(self): + v = pd.Series([1, 2, 2]) + g = pd.Series(["a", "b", "a"]) + + answer = [1, 2, 1] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + def test_regular(self): + v = pd.Series([101, 102, 103, 104, 105, 106, 100]) + g = pd.Series(["a", "b", "a", "c", "d", "b", "a"]) + answer = [101, 102, 101, 
104, 105, 102, 100] + + function = self.primitive().get_function() + np.testing.assert_array_equal(function(v, g), answer) + + +def test_cum_sum(es): + log_value_feat = es['log']['value'] + + cum_sum = ft.Feature([log_value_feat, es['log']['session_id']], primitive=CumSum) + features = [cum_sum] + df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) + cvalues = df[cum_sum.get_name()].values + assert len(cvalues) == 15 + cum_sum_values = [0, 5, 15, 30, 50, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21] + for i, v in enumerate(cum_sum_values): + assert v == cvalues[i] + + +def test_cum_min(es): + log_value_feat = es['log']['value'] + cum_min = ft.Feature([log_value_feat, es['log']['session_id']], primitive=CumMin) + features = [cum_min] + df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) + cvalues = df[cum_min.get_name()].values + assert len(cvalues) == 15 + cum_min_values = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + for i, v in enumerate(cum_min_values): + assert v == cvalues[i] + + +def test_cum_max(es): + log_value_feat = es['log']['value'] + cum_max = ft.Feature([log_value_feat, es['log']['session_id']], primitive=CumMax) + features = [cum_max] + df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) + cvalues = df[cum_max.get_name()].values + assert len(cvalues) == 15 + cum_max_values = [0, 5, 10, 15, 20, 0, 1, 2, 3, 0, 0, 5, 0, 7, 14] + for i, v in enumerate(cum_max_values): + assert v == cvalues[i] + + +def test_cum_sum_group_on_nan(es): + log_value_feat = es['log']['value'] + es['log'].df['product_id'] = (['coke zero'] * 3 + ['car'] * 2 + + ['toothpaste'] * 3 + ['brown bag'] * 2 + + ['shoes'] + + [np.nan] * 4 + + ['coke_zero'] * 2) + cum_sum = ft.Feature([log_value_feat, es['log']['product_id']], primitive=CumSum) + features = [cum_sum] + df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) + cvalues = 
df[cum_sum.get_name()].values + assert len(cvalues) == 15 + cum_sum_values = [0, 5, 15, + 15, 35, + 0, 1, 3, + 3, 3, + 0, + np.nan, np.nan, np.nan, np.nan] + for i, v in enumerate(cum_sum_values): + if np.isnan(v): + assert (np.isnan(cvalues[i])) + else: + assert v == cvalues[i] + + +def test_cum_handles_uses_full_entity(es): + def check(feature): + pandas_backend = PandasBackend(es, [feature]) + df_1 = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2], time_last=None) + df_2 = pandas_backend.calculate_all_features(instance_ids=[2, 4], time_last=None) + + # check that the value for instance id 2 matches + assert (df_2.loc[2] == df_1.loc[2]).all() + + for primitive in [CumSum, CumMean, CumMax, CumMin]: + check(ft.Feature([es['log']['value'], es['log']['session_id']], primitive=primitive)) + + check(ft.Feature(es['log']['session_id'], primitive=CumCount)) + + +def test_cum_mean(es): + log_value_feat = es['log']['value'] + cum_mean = ft.Feature([log_value_feat, es['log']['session_id']], primitive=CumMean) + features = [cum_mean] + df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) + cvalues = df[cum_mean.get_name()].values + assert len(cvalues) == 15 + cum_mean_values = [0, 2.5, 5, 7.5, 10, 0, .5, 1, 1.5, 0, 0, 2.5, 0, 3.5, 7] + for i, v in enumerate(cum_mean_values): + assert v == cvalues[i] + + +def test_cum_count(es): + cum_count = ft.Feature([es['log']['session_id']], primitive=CumCount) + features = [cum_count] + df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=range(15)) + cvalues = df[cum_count.get_name()].values + assert len(cvalues) == 15 + cum_count_values = [1, 2, 3, 4, 5, 1, 2, 3, 4, 1, 1, 2, 1, 2, 3] + for i, v in enumerate(cum_count_values): + assert v == cvalues[i] def test_text_primitives(es):