Skip to content
This repository has been archived by the owner on Mar 24, 2021. It is now read-only.

Commit

Permalink
Nested merge now groups using a list of key combos
Browse files Browse the repository at this point in the history
This means that when multiple group_by parameters are specified we can
group by all of them and include all of them in the same group of
results.
  • Loading branch information
nick-gravgaard committed Aug 1, 2014
1 parent ea2c43e commit 136d383
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 35 deletions.
57 changes: 34 additions & 23 deletions backdrop/core/nested_merge.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
from .errors import InvalidOperationError

from operator import itemgetter, add
from operator import add
import itertools


def _multi_itemgetter(*items):
"""Like operator.itemgetter, but the callable always returns
a sequence of lookup values (regardless of items' length)
see https://docs.python.org/2/library/operator.html#operator.itemgetter
"""
return lambda obj: tuple(obj[item] for item in items)


def nested_merge(keys, collect, data):
if len(keys) > 1:
data = group_by(data, keys)
Expand All @@ -19,46 +27,48 @@ def group_by(data, keys):
"""Recursively group an array of results by a list of keys
data: a list of dictionaries as returned by MongoDriver.group
keys: a list of keys to group by
keys: a list of combinations of keys to group by
"""
key = keys[0]
getter = itemgetter(key)
key_combo = keys[0]
getter = _multi_itemgetter(*key_combo)
data = sorted(data, key=getter)

if len(keys) > 1:
data = [
{
key: value,
"_subgroup": group_by(
remove_key_from_all(subgroups, key),
keys[1:]
)
}
for value, subgroups in itertools.groupby(data, getter)
]
grouped_data = []
for values, subgroups in itertools.groupby(data, getter):
# create a dict containing key value pairs and _subgroup
result = dict(zip(key_combo, values))
result['_subgroup'] = group_by(
remove_keys_from_all(subgroups, key_combo),
keys[1:]
)
grouped_data.append(result)
data = grouped_data

return data


def remove_key_from_all(groups, key):
"""Remove a key from each group in a list of groups
def remove_keys_from_all(groups, keys):
"""Remove keys from each group in a list of groups
groups: a list of groups (dictionaries)
key: the key to remove
"""
return [remove_key(group, key) for group in groups]
return [remove_keys(group, keys) for group in groups]


def remove_key(doc, key):
"""Return a new document with the key removed
def remove_keys(doc, keys):
"""Return a new document with keys in keys removed
>>> doc = {'a':1, 'b':2}
>>> remove_key(doc, 'a')
>>> remove_keys(doc, ['a'])
{'b': 2}
>>> # Show that the original document is not affected
>>> doc['a']
1
"""
return dict(
(k, v) for k, v in doc.items() if k != key)
(k, v) for k, v in doc.items() if k not in keys)


def apply_counts(groups):
Expand Down Expand Up @@ -104,7 +114,7 @@ def apply_collect_to_group(group, collect):

# remove left over collect keys
for key, _ in collect:
group = remove_key(group, key)
group = remove_keys(group, key)

# Hack in the old way
for key, method in collect:
Expand Down Expand Up @@ -203,7 +213,8 @@ def collect_reducer_mean(values):


def sort_all(data, keys):
key_combo = keys[0]
if len(keys) > 1:
for i, group in enumerate(data):
data[i]['_subgroup'] = sort_all(group['_subgroup'], keys[1:])
return sorted(data, key=itemgetter(keys[0]))
return sorted(data, key=_multi_itemgetter(*key_combo))
13 changes: 7 additions & 6 deletions backdrop/core/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,25 +50,26 @@ def collect_fields(self):

@property
def group_keys(self):
"""Return a list of fields that are being grouped on
"""Return a list of lists of combinations of fields that are being
grouped on
This is kinda coupled to how we group with Mongo but these keys
are in the returned results and are used in the nested merge to
create the hierarchical response.
>>> from ..core.timeseries import WEEK
>>> Query.create(group_by=['foo']).group_keys
['foo']
[['foo']]
>>> Query.create(period=WEEK).group_keys
['_week_start_at']
[['_week_start_at']]
>>> Query.create(group_by=['foo'], period=WEEK).group_keys
['foo', '_week_start_at']
[['foo'], ['_week_start_at']]
"""
keys = []
if self.group_by:
keys += self.group_by
keys.append(self.group_by)
if self.period:
keys.append(self.period.start_at_key)
keys.append([self.period.start_at_key])
return keys

@property
Expand Down
4 changes: 3 additions & 1 deletion backdrop/core/storage/mongo.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import logging
import datetime
import itertools

import pymongo
from pymongo.errors import AutoReconnect, CollectionInvalid
Expand Down Expand Up @@ -99,7 +100,8 @@ def _execute_query(self, data_set_id, query):
return self._basic_query(data_set_id, query)

def _group_query(self, data_set_id, query):
keys = query.group_keys
# flatten the list of key combos to form a flat list of keys
keys = list(itertools.chain.from_iterable(query.group_keys))
spec = get_mongo_spec(query)
collect_fields = query.collect_fields

Expand Down
11 changes: 11 additions & 0 deletions features/read_api/group.feature
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@ Feature: grouping queries for read api
and the "1st" result should be "{"authority": "Camden", "_count": 2}"
and the "2nd" result should be "{"authority": "Westminster", "_count": 4}"


Scenario: grouping by multiple keys
Given "licensing_2.json" is in "foo" data_set
when I go to "/foo?group_by=authority&group_by=licence_name"
then I should get back a status of "200"
and the JSON should have "3" results
and the "1st" result should be "{"authority": "Camden", "licence_name": "Temporary events notice", "_count": 1}"
and the "2nd" result should be "{"authority": "Westminster", "licence_name": "Cat herding licence", "_count": 1}"
and the "3rd" result should be "{"authority": "Westminster", "licence_name": "Temporary events notice", "_count": 3}"


Scenario: grouping and filtering by different keys
Given "licensing_2.json" is in "foo" data_set
when I go to "/foo?group_by=authority&filter_by=licence_name:Temporary%20events%20notice"
Expand Down
16 changes: 16 additions & 0 deletions tests/core/test_data_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,22 @@ def test_month_and_group_query(self):
assert_that(data,
has_item(has_entries({"values": has_length(3)})))

def test_month_and_groups_query(self):
self.mock_storage.execute_query.return_value = [
{'some_group': 'val1', 'another_group': 'val3', '_month_start_at': d(2013, 1, 1), '_count': 1},
{'some_group': 'val1', 'another_group': 'val3', '_month_start_at': d(2013, 2, 1), '_count': 5},
{'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 3, 1), '_count': 2},
{'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 4, 1), '_count': 6},
{'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 7, 1), '_count': 6},
]

data = self.data_set.execute_query(Query.create(period=MONTH,
group_by=['some_group', 'another_group']))
assert_that(data,
has_item(has_entries({"values": has_length(2)})))
assert_that(data,
has_item(has_entries({"values": has_length(3)})))

def test_month_and_group_query_with_start_and_end_at(self):
self.mock_storage.execute_query.return_value = [
{'some_group': 'val1', '_month_start_at': d(2013, 1, 1), '_count': 1},
Expand Down
89 changes: 84 additions & 5 deletions tests/core/test_nested_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@
from backdrop.core.timeseries import WEEK, MONTH


def datum(name=None, place=None, age=None, stamp=None, count=1):
def datum(name=None, version=None, place=None, age=None, stamp=None, count=1):
result = {
"_count": count
}
if name is not None:
result['name'] = name
if version is not None:
result['version'] = version
if place is not None:
result['place'] = place
if age is not None:
Expand All @@ -29,7 +31,7 @@ def test_one_level_grouping_with_collect(self):
datum(name='Jack', age=[34, 34]),
datum(name='John', age=[56, 65])
]
results = nested_merge(['name'], [('age', 'mean')], data)
results = nested_merge([['name']], [('age', 'mean')], data)

assert_that(results,
contains(
Expand All @@ -46,7 +48,7 @@ def test_two_level_grouping_with_collect(self):
datum(name='James', place='Kettering', age=[43, 87], count=2),
datum(name='Jill', place='Keswick', age=[76, 32], count=2),
]
results = nested_merge(['name', 'place'], [('age', 'mean')], data)
results = nested_merge([['name'], ['place']], [('age', 'mean')], data)

assert_that(results,
contains(
Expand Down Expand Up @@ -89,6 +91,83 @@ def test_two_level_grouping_with_collect(self):
}),
))

def test_two_level_grouping_combination_of_keys(self):
    # The first grouping level uses a *combination* of keys
    # (['name', 'version']); the second level groups on ['place'].
    # Each result at the first level should therefore carry both
    # 'name' and 'version' entries, with 'place' nested in _subgroup.
    data = [
        datum(name='IE', version='6', place='England', age=[13, 12], count=2),
        datum(name='IE', version='6', place='Wales', age=[13, 14], count=2),
        datum(name='IE', version='7', place='England', age=[8, 7], count=2),
        datum(name='IE', version='7', place='Wales', age=[8, 9], count=2),
        datum(name='IE', version='8', place='England', age=[5, 4], count=2),
        datum(name='IE', version='8', place='Wales', age=[5, 6], count=2),
        datum(name='Chrome', version='20', place='England', age=[2, 1], count=2),
        datum(name='Chrome', version='20', place='Wales', age=[2, 3], count=2),
    ]
    results = nested_merge([['name', 'version'], ['place']], [('age', 'mean')], data)

    # Results are expected sorted by the (name, version) combo, so
    # Chrome comes before the IE entries.  Each top-level group carries
    # the mean over all its ages; each _subgroup the mean per place.
    assert_that(results,
                contains(
                    has_entries({
                        'name': 'Chrome',
                        'version': '20',
                        'age:mean': 2,
                        '_subgroup': contains(
                            has_entries({
                                'place': 'England',
                                'age:mean': 1.5
                            }),
                            has_entries({
                                'place': 'Wales',
                                'age:mean': 2.5
                            })
                        )
                    }),
                    has_entries({
                        'name': 'IE',
                        'version': '6',
                        'age:mean': 13,
                        '_subgroup': contains(
                            has_entries({
                                'place': 'England',
                                'age:mean': 12.5
                            }),
                            has_entries({
                                'place': 'Wales',
                                'age:mean': 13.5
                            })
                        )
                    }),
                    has_entries({
                        'name': 'IE',
                        'version': '7',
                        'age:mean': 8,
                        '_subgroup': contains(
                            has_entries({
                                'place': 'England',
                                'age:mean': 7.5
                            }),
                            has_entries({
                                'place': 'Wales',
                                'age:mean': 8.5
                            })
                        )
                    }),
                    has_entries({
                        'name': 'IE',
                        'version': '8',
                        'age:mean': 5,
                        '_subgroup': contains(
                            has_entries({
                                'place': 'England',
                                'age:mean': 4.5
                            }),
                            has_entries({
                                'place': 'Wales',
                                'age:mean': 5.5
                            })
                        )
                    }),
                ))


class TestGroupBy(object):
def test_one_level_grouping(self):
Expand All @@ -97,7 +176,7 @@ def test_one_level_grouping(self):
datum(name='Jack', age=[34, 34]),
datum(name='John', age=[56, 65])
]
results = group_by(data, ['name'])
results = group_by(data, [['name']])

assert_that(results,
contains(
Expand All @@ -112,7 +191,7 @@ def test_two_level_grouping(self):
datum(name='James', place='Kettering', age=[43, 87], count=2),
datum(name='Jill', place='Keswick', age=[76, 32], count=2),
]
results = group_by(data, ['name', 'place'])
results = group_by(data, [['name'], ['place']])

assert_that(results,
contains(
Expand Down

0 comments on commit 136d383

Please sign in to comment.