From 136d383a398e5a1169220c73dde6feb23689dd12 Mon Sep 17 00:00:00 2001 From: Nick Gravgaard Date: Thu, 31 Jul 2014 17:41:14 +0100 Subject: [PATCH] Nested merge now groups using a list of key combos This means that when multiple group_by parameters are specified we can group by all of them and include all of them in the same group of results. --- backdrop/core/nested_merge.py | 57 ++++++++++++--------- backdrop/core/query.py | 13 ++--- backdrop/core/storage/mongo.py | 4 +- features/read_api/group.feature | 11 ++++ tests/core/test_data_set.py | 16 ++++++ tests/core/test_nested_merge.py | 89 +++++++++++++++++++++++++++++++-- 6 files changed, 155 insertions(+), 35 deletions(-) diff --git a/backdrop/core/nested_merge.py b/backdrop/core/nested_merge.py index e0ff1849..730d3e61 100644 --- a/backdrop/core/nested_merge.py +++ b/backdrop/core/nested_merge.py @@ -1,9 +1,17 @@ from .errors import InvalidOperationError -from operator import itemgetter, add +from operator import add import itertools +def _multi_itemgetter(*items): + """Like operator.itemgetter, but the callable always returns + a sequence of lookup values (regardless of items' length) + see https://docs.python.org/2/library/operator.html#operator.itemgetter + """ + return lambda obj: tuple(obj[item] for item in items) + + def nested_merge(keys, collect, data): if len(keys) > 1: data = group_by(data, keys) @@ -19,46 +27,48 @@ def group_by(data, keys): """Recursively group an array of results by a list of keys data: a list of dictionaries as returned by MongoDriver.group - keys: a list of keys to group by + keys: a list of combinations of keys to group by """ - key = keys[0] - getter = itemgetter(key) + key_combo = keys[0] + getter = _multi_itemgetter(*key_combo) data = sorted(data, key=getter) + if len(keys) > 1: - data = [ - { - key: value, - "_subgroup": group_by( - remove_key_from_all(subgroups, key), - keys[1:] - ) - } - for value, subgroups in itertools.groupby(data, getter) - ] + grouped_data = [] + for 
values, subgroups in itertools.groupby(data, getter): + # create a dict containing key value pairs and _subgroup + result = dict(zip(key_combo, values)) + result['_subgroup'] = group_by( + remove_keys_from_all(subgroups, key_combo), + keys[1:] + ) + grouped_data.append(result) + data = grouped_data + return data -def remove_key_from_all(groups, key): - """Remove a key from each group in a list of groups +def remove_keys_from_all(groups, keys): + """Remove keys from each group in a list of groups groups: a list of groups (dictionaries) key: the key to remove """ - return [remove_key(group, key) for group in groups] + return [remove_keys(group, keys) for group in groups] -def remove_key(doc, key): - """Return a new document with the key removed +def remove_keys(doc, keys): + """Return a new document with keys in keys removed >>> doc = {'a':1, 'b':2} - >>> remove_key(doc, 'a') + >>> remove_keys(doc, ['a']) {'b': 2} >>> # Show that the original document is not affected >>> doc['a'] 1 """ return dict( - (k, v) for k, v in doc.items() if k != key) + (k, v) for k, v in doc.items() if k not in keys) def apply_counts(groups): @@ -104,7 +114,7 @@ def apply_collect_to_group(group, collect): # remove left over collect keys for key, _ in collect: - group = remove_key(group, key) + group = remove_keys(group, [key]) # Hack in the old way for key, method in collect: @@ -203,7 +213,8 @@ def collect_reducer_mean(values): def sort_all(data, keys): + key_combo = keys[0] if len(keys) > 1: for i, group in enumerate(data): data[i]['_subgroup'] = sort_all(group['_subgroup'], keys[1:]) - return sorted(data, key=itemgetter(keys[0])) + return sorted(data, key=_multi_itemgetter(*key_combo)) diff --git a/backdrop/core/query.py b/backdrop/core/query.py index 62b0db47..c97a9dec 100644 --- a/backdrop/core/query.py +++ b/backdrop/core/query.py @@ -50,7 +50,8 @@ def collect_fields(self): @property def group_keys(self): - """Return a list of fields that are being grouped on + """Return a list of lists 
of combinations of fields that are being + grouped on This is kinda coupled to how we group with Mongo but these keys are in the returned results and are used in the nested merge to @@ -58,17 +59,17 @@ def group_keys(self): >>> from ..core.timeseries import WEEK >>> Query.create(group_by=['foo']).group_keys - ['foo'] + [['foo']] >>> Query.create(period=WEEK).group_keys - ['_week_start_at'] + [['_week_start_at']] >>> Query.create(group_by=['foo'], period=WEEK).group_keys - ['foo', '_week_start_at'] + [['foo'], ['_week_start_at']] """ keys = [] if self.group_by: - keys += self.group_by + keys.append(self.group_by) if self.period: - keys.append(self.period.start_at_key) + keys.append([self.period.start_at_key]) return keys @property diff --git a/backdrop/core/storage/mongo.py b/backdrop/core/storage/mongo.py index f314cf06..9e2ab0a0 100644 --- a/backdrop/core/storage/mongo.py +++ b/backdrop/core/storage/mongo.py @@ -1,6 +1,7 @@ import os import logging import datetime +import itertools import pymongo from pymongo.errors import AutoReconnect, CollectionInvalid @@ -99,7 +100,8 @@ def _execute_query(self, data_set_id, query): return self._basic_query(data_set_id, query) def _group_query(self, data_set_id, query): - keys = query.group_keys + # flatten the list of key combos to form a flat list of keys + keys = list(itertools.chain.from_iterable(query.group_keys)) spec = get_mongo_spec(query) collect_fields = query.collect_fields diff --git a/features/read_api/group.feature b/features/read_api/group.feature index e526eb48..58a47793 100644 --- a/features/read_api/group.feature +++ b/features/read_api/group.feature @@ -11,6 +11,17 @@ Feature: grouping queries for read api and the "1st" result should be "{"authority": "Camden", "_count": 2}" and the "2nd" result should be "{"authority": "Westminster", "_count": 4}" + + Scenario: grouping by multiple keys + Given "licensing_2.json" is in "foo" data_set + when I go to "/foo?group_by=authority&group_by=licence_name" + then I 
should get back a status of "200" + and the JSON should have "3" results + and the "1st" result should be "{"authority": "Camden", "licence_name": "Temporary events notice", "_count": 1}" + and the "2nd" result should be "{"authority": "Westminster", "licence_name": "Cat herding licence", "_count": 1}" + and the "3rd" result should be "{"authority": "Westminster", "licence_name": "Temporary events notice", "_count": 3}" + + Scenario: grouping and filtering by different keys Given "licensing_2.json" is in "foo" data_set when I go to "/foo?group_by=authority&filter_by=licence_name:Temporary%20events%20notice" diff --git a/tests/core/test_data_set.py b/tests/core/test_data_set.py index fad3d75d..cbd1a133 100644 --- a/tests/core/test_data_set.py +++ b/tests/core/test_data_set.py @@ -206,6 +206,22 @@ def test_month_and_group_query(self): assert_that(data, has_item(has_entries({"values": has_length(3)}))) + def test_month_and_groups_query(self): + self.mock_storage.execute_query.return_value = [ + {'some_group': 'val1', 'another_group': 'val3', '_month_start_at': d(2013, 1, 1), '_count': 1}, + {'some_group': 'val1', 'another_group': 'val3', '_month_start_at': d(2013, 2, 1), '_count': 5}, + {'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 3, 1), '_count': 2}, + {'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 4, 1), '_count': 6}, + {'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 7, 1), '_count': 6}, + ] + + data = self.data_set.execute_query(Query.create(period=MONTH, + group_by=['some_group', 'another_group'])) + assert_that(data, + has_item(has_entries({"values": has_length(2)}))) + assert_that(data, + has_item(has_entries({"values": has_length(3)}))) + def test_month_and_group_query_with_start_and_end_at(self): self.mock_storage.execute_query.return_value = [ {'some_group': 'val1', '_month_start_at': d(2013, 1, 1), '_count': 1}, diff --git a/tests/core/test_nested_merge.py 
b/tests/core/test_nested_merge.py index eef1381a..2d55ab72 100644 --- a/tests/core/test_nested_merge.py +++ b/tests/core/test_nested_merge.py @@ -4,12 +4,14 @@ from backdrop.core.timeseries import WEEK, MONTH -def datum(name=None, place=None, age=None, stamp=None, count=1): +def datum(name=None, version=None, place=None, age=None, stamp=None, count=1): result = { "_count": count } if name is not None: result['name'] = name + if version is not None: + result['version'] = version if place is not None: result['place'] = place if age is not None: @@ -29,7 +31,7 @@ def test_one_level_grouping_with_collect(self): datum(name='Jack', age=[34, 34]), datum(name='John', age=[56, 65]) ] - results = nested_merge(['name'], [('age', 'mean')], data) + results = nested_merge([['name']], [('age', 'mean')], data) assert_that(results, contains( @@ -46,7 +48,7 @@ def test_two_level_grouping_with_collect(self): datum(name='James', place='Kettering', age=[43, 87], count=2), datum(name='Jill', place='Keswick', age=[76, 32], count=2), ] - results = nested_merge(['name', 'place'], [('age', 'mean')], data) + results = nested_merge([['name'], ['place']], [('age', 'mean')], data) assert_that(results, contains( @@ -89,6 +91,83 @@ def test_two_level_grouping_with_collect(self): }), )) + def test_two_level_grouping_combination_of_keys(self): + data = [ + datum(name='IE', version='6', place='England', age=[13, 12], count=2), + datum(name='IE', version='6', place='Wales', age=[13, 14], count=2), + datum(name='IE', version='7', place='England', age=[8, 7], count=2), + datum(name='IE', version='7', place='Wales', age=[8, 9], count=2), + datum(name='IE', version='8', place='England', age=[5, 4], count=2), + datum(name='IE', version='8', place='Wales', age=[5, 6], count=2), + datum(name='Chrome', version='20', place='England', age=[2, 1], count=2), + datum(name='Chrome', version='20', place='Wales', age=[2, 3], count=2), + ] + results = nested_merge([['name', 'version'], ['place']], [('age', 'mean')], 
data) + + assert_that(results, + contains( + has_entries({ + 'name': 'Chrome', + 'version': '20', + 'age:mean': 2, + '_subgroup': contains( + has_entries({ + 'place': 'England', + 'age:mean': 1.5 + }), + has_entries({ + 'place': 'Wales', + 'age:mean': 2.5 + }) + ) + }), + has_entries({ + 'name': 'IE', + 'version': '6', + 'age:mean': 13, + '_subgroup': contains( + has_entries({ + 'place': 'England', + 'age:mean': 12.5 + }), + has_entries({ + 'place': 'Wales', + 'age:mean': 13.5 + }) + ) + }), + has_entries({ + 'name': 'IE', + 'version': '7', + 'age:mean': 8, + '_subgroup': contains( + has_entries({ + 'place': 'England', + 'age:mean': 7.5 + }), + has_entries({ + 'place': 'Wales', + 'age:mean': 8.5 + }) + ) + }), + has_entries({ + 'name': 'IE', + 'version': '8', + 'age:mean': 5, + '_subgroup': contains( + has_entries({ + 'place': 'England', + 'age:mean': 4.5 + }), + has_entries({ + 'place': 'Wales', + 'age:mean': 5.5 + }) + ) + }), + )) + class TestGroupBy(object): def test_one_level_grouping(self): @@ -97,7 +176,7 @@ def test_one_level_grouping(self): datum(name='Jack', age=[34, 34]), datum(name='John', age=[56, 65]) ] - results = group_by(data, ['name']) + results = group_by(data, [['name']]) assert_that(results, contains( @@ -112,7 +191,7 @@ def test_two_level_grouping(self): datum(name='James', place='Kettering', age=[43, 87], count=2), datum(name='Jill', place='Keswick', age=[76, 32], count=2), ] - results = group_by(data, ['name', 'place']) + results = group_by(data, [['name'], ['place']]) assert_that(results, contains(