Skip to content
This repository has been archived by the owner on Mar 24, 2021. It is now read-only.

Commit

Permalink
Nested merge now takes a list of key combos to group by
Browse files Browse the repository at this point in the history
  • Loading branch information
nick-gravgaard committed Jul 31, 2014
1 parent ea2c43e commit a870139
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 34 deletions.
54 changes: 31 additions & 23 deletions backdrop/core/nested_merge.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
from .errors import InvalidOperationError

from operator import itemgetter, add
from operator import add
import itertools


def _multi_itemgetter(*items):
"""Like operator.itemgetter, but the callable always returns
a sequence of lookup values (regardless of items' length)"""
return lambda obj: tuple(obj[item] for item in items)


def nested_merge(keys, collect, data):
if len(keys) > 1:
data = group_by(data, keys)
def group_by(data, keys):
    """Recursively group a list of result rows by successive key combinations.

    data: a list of dictionaries as returned by MongoDriver.group
    keys: a list of combinations of keys (each a list of field names) to
        group by, outermost combination first
    """
    key_combo = keys[0]

    def combo_values(doc):
        # Always a tuple, even for a single-key combo (unlike a bare
        # operator.itemgetter lookup).
        return tuple(doc[field] for field in key_combo)

    # itertools.groupby only merges adjacent rows, so sort first.
    data = sorted(data, key=combo_values)

    if len(keys) > 1:
        data = [
            dict(zip(key_combo, values),
                 _subgroup=group_by(
                     [{k: v for k, v in doc.items() if k not in key_combo}
                      for doc in rows],
                     keys[1:]))
            for values, rows in itertools.groupby(data, combo_values)
        ]

    return data


def remove_keys_from_all(groups, keys):
    """Return a copy of each group with the given keys dropped.

    groups: a list of groups (dictionaries); the originals are not modified
    keys: the keys to remove from every group
    """
    return [
        dict((field, value)
             for field, value in group.items()
             if field not in keys)
        for group in groups
    ]


def remove_keys(doc, keys):
    """Return a new document with the keys in ``keys`` removed.

    doc: the source dictionary (left unmodified)
    keys: a collection of keys to remove; a bare string is treated as a
        single key — otherwise ``k not in keys`` would do substring
        matching and silently remove unrelated keys

    >>> doc = {'a':1, 'b':2}
    >>> remove_keys(doc, ['a'])
    {'b': 2}
    >>> # Show that the original document is not affected
    >>> doc['a']
    1
    """
    if isinstance(keys, str):
        # Guard: some call sites pass a single key string (e.g.
        # apply_collect_to_group's "remove_keys(group, key)"); without
        # this, membership tests against the string match substrings.
        keys = [keys]
    return dict(
        (k, v) for k, v in doc.items() if k not in keys)


def apply_counts(groups):
Expand Down Expand Up @@ -104,7 +111,7 @@ def apply_collect_to_group(group, collect):

# remove left over collect keys
for key, _ in collect:
group = remove_key(group, key)
group = remove_keys(group, key)

# Hack in the old way
for key, method in collect:
Expand Down Expand Up @@ -203,7 +210,8 @@ def collect_reducer_mean(values):


def sort_all(data, keys):
    """Sort grouped results at every nesting level by their key combination.

    Subgroups are re-sorted in place on each result dict; the top level is
    returned as a new sorted list.
    """
    head, rest = keys[0], keys[1:]
    if rest:
        for entry in data:
            entry['_subgroup'] = sort_all(entry['_subgroup'], rest)
    return sorted(data, key=lambda doc: tuple(doc[field] for field in head))
13 changes: 7 additions & 6 deletions backdrop/core/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,25 +50,26 @@ def collect_fields(self):

@property
def group_keys(self):
    """Return a list of lists of combinations of fields that are being
    grouped on

    This is kinda coupled to how we group with Mongo but these keys
    are in the returned results and are used in the nested merge to
    create the hierarchical response.

    >>> from ..core.timeseries import WEEK
    >>> Query.create(group_by=['foo']).group_keys
    [['foo']]
    >>> Query.create(period=WEEK).group_keys
    [['_week_start_at']]
    >>> Query.create(group_by=['foo'], period=WEEK).group_keys
    [['foo'], ['_week_start_at']]
    """
    combos = []
    if self.group_by:
        # group_by is already a list of fields: one combination.
        combos.append(self.group_by)
    if self.period:
        # The period's start key forms its own single-field combination.
        combos.append([self.period.start_at_key])
    return combos

@property
Expand Down
3 changes: 2 additions & 1 deletion backdrop/core/storage/mongo.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import logging
import datetime
import itertools

import pymongo
from pymongo.errors import AutoReconnect, CollectionInvalid
Expand Down Expand Up @@ -99,7 +100,7 @@ def _execute_query(self, data_set_id, query):
return self._basic_query(data_set_id, query)

def _group_query(self, data_set_id, query):
keys = query.group_keys
keys = list(itertools.chain.from_iterable(query.group_keys))
spec = get_mongo_spec(query)
collect_fields = query.collect_fields

Expand Down
11 changes: 11 additions & 0 deletions features/read_api/group.feature
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@ Feature: grouping queries for read api
and the "1st" result should be "{"authority": "Camden", "_count": 2}"
and the "2nd" result should be "{"authority": "Westminster", "_count": 4}"


Scenario: grouping by multiple keys
Given "licensing_2.json" is in "foo" data_set
when I go to "/foo?group_by=authority&group_by=licence_name"
then I should get back a status of "200"
and the JSON should have "3" results
and the "1st" result should be "{"authority": "Camden", "licence_name": "Temporary events notice", "_count": 1}"
and the "2nd" result should be "{"authority": "Westminster", "licence_name": "Cat herding licence", "_count": 1}"
and the "3rd" result should be "{"authority": "Westminster", "licence_name": "Temporary events notice", "_count": 3}"


Scenario: grouping and filtering by different keys
Given "licensing_2.json" is in "foo" data_set
when I go to "/foo?group_by=authority&filter_by=licence_name:Temporary%20events%20notice"
Expand Down
16 changes: 16 additions & 0 deletions tests/core/test_data_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,22 @@ def test_month_and_group_query(self):
assert_that(data,
has_item(has_entries({"values": has_length(3)})))

def test_month_and_groups_query(self):
    """Grouping by two keys plus a month period nests one entry per month."""
    # Five raw rows over two (some_group, another_group) combinations:
    # ('val1', 'val3') spans 2 distinct months, ('val2', 'val3') spans 3.
    self.mock_storage.execute_query.return_value = [
        {'some_group': 'val1', 'another_group': 'val3', '_month_start_at': d(2013, 1, 1), '_count': 1},
        {'some_group': 'val1', 'another_group': 'val3', '_month_start_at': d(2013, 2, 1), '_count': 5},
        {'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 3, 1), '_count': 2},
        {'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 4, 1), '_count': 6},
        {'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 7, 1), '_count': 6},
    ]

    data = self.data_set.execute_query(Query.create(period=MONTH,
                                       group_by=['some_group', 'another_group']))
    # Each grouped result's "values" should hold one entry per month seen.
    assert_that(data,
                has_item(has_entries({"values": has_length(2)})))
    assert_that(data,
                has_item(has_entries({"values": has_length(3)})))

def test_month_and_group_query_with_start_and_end_at(self):
self.mock_storage.execute_query.return_value = [
{'some_group': 'val1', '_month_start_at': d(2013, 1, 1), '_count': 1},
Expand Down
8 changes: 4 additions & 4 deletions tests/core/test_nested_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_one_level_grouping_with_collect(self):
datum(name='Jack', age=[34, 34]),
datum(name='John', age=[56, 65])
]
results = nested_merge(['name'], [('age', 'mean')], data)
results = nested_merge([['name']], [('age', 'mean')], data)

assert_that(results,
contains(
Expand All @@ -46,7 +46,7 @@ def test_two_level_grouping_with_collect(self):
datum(name='James', place='Kettering', age=[43, 87], count=2),
datum(name='Jill', place='Keswick', age=[76, 32], count=2),
]
results = nested_merge(['name', 'place'], [('age', 'mean')], data)
results = nested_merge([['name'], ['place']], [('age', 'mean')], data)

assert_that(results,
contains(
Expand Down Expand Up @@ -97,7 +97,7 @@ def test_one_level_grouping(self):
datum(name='Jack', age=[34, 34]),
datum(name='John', age=[56, 65])
]
results = group_by(data, ['name'])
results = group_by(data, [['name']])

assert_that(results,
contains(
Expand All @@ -112,7 +112,7 @@ def test_two_level_grouping(self):
datum(name='James', place='Kettering', age=[43, 87], count=2),
datum(name='Jill', place='Keswick', age=[76, 32], count=2),
]
results = group_by(data, ['name', 'place'])
results = group_by(data, [['name'], ['place']])

assert_that(results,
contains(
Expand Down

0 comments on commit a870139

Please sign in to comment.