From 136d383a398e5a1169220c73dde6feb23689dd12 Mon Sep 17 00:00:00 2001 From: Nick Gravgaard Date: Thu, 31 Jul 2014 17:41:14 +0100 Subject: [PATCH] Nested merge now groups using a list of key combos This means that when multiple group_by parameters are specified we can group by all of them and include all of them in the same group of results. --- backdrop/core/nested_merge.py | 57 ++++++++++++--------- backdrop/core/query.py | 13 ++--- backdrop/core/storage/mongo.py | 4 +- features/read_api/group.feature | 11 ++++ tests/core/test_data_set.py | 16 ++++++ tests/core/test_nested_merge.py | 89 +++++++++++++++++++++++++++++++-- 6 files changed, 155 insertions(+), 35 deletions(-) diff --git a/backdrop/core/nested_merge.py b/backdrop/core/nested_merge.py index e0ff1849..730d3e61 100644 --- a/backdrop/core/nested_merge.py +++ b/backdrop/core/nested_merge.py @@ -1,9 +1,17 @@ from .errors import InvalidOperationError -from operator import itemgetter, add +from operator import add import itertools +def _multi_itemgetter(*items): + """Like operator.itemgetter, but the callable always returns + a sequence of lookup values (regardless of items' length) + see https://docs.python.org/2/library/operator.html#operator.itemgetter + """ + return lambda obj: tuple(obj[item] for item in items) + + def nested_merge(keys, collect, data): if len(keys) > 1: data = group_by(data, keys) @@ -19,46 +27,48 @@ def group_by(data, keys): """Recursively group an array of results by a list of keys data: a list of dictionaries as returned by MongoDriver.group - keys: a list of keys to group by + keys: a list of combinations of keys to group by """ - key = keys[0] - getter = itemgetter(key) + key_combo = keys[0] + getter = _multi_itemgetter(*key_combo) data = sorted(data, key=getter) + if len(keys) > 1: - data = [ - { - key: value, - "_subgroup": group_by( - remove_key_from_all(subgroups, key), - keys[1:] - ) - } - for value, subgroups in itertools.groupby(data, getter) - ] + grouped_data = [] + for 
values, subgroups in itertools.groupby(data, getter): + # create a dict containing key value pairs and _subgroup + result = dict(zip(key_combo, values)) + result['_subgroup'] = group_by( + remove_keys_from_all(subgroups, key_combo), + keys[1:] + ) + grouped_data.append(result) + data = grouped_data + return data -def remove_key_from_all(groups, key): - """Remove a key from each group in a list of groups +def remove_keys_from_all(groups, keys): + """Remove keys from each group in a list of groups groups: a list of groups (dictionaries) key: the key to remove """ - return [remove_key(group, key) for group in groups] + return [remove_keys(group, keys) for group in groups] -def remove_key(doc, key): - """Return a new document with the key removed +def remove_keys(doc, keys): + """Return a new document with keys in keys removed >>> doc = {'a':1, 'b':2} - >>> remove_key(doc, 'a') + >>> remove_keys(doc, ['a']) {'b': 2} >>> # Show that the original document is not affected >>> doc['a'] 1 """ return dict( - (k, v) for k, v in doc.items() if k != key) + (k, v) for k, v in doc.items() if k not in keys) def apply_counts(groups): @@ -104,7 +114,7 @@ def apply_collect_to_group(group, collect): # remove left over collect keys for key, _ in collect: - group = remove_key(group, key) + group = remove_keys(group, [key]) # Hack in the old way for key, method in collect: @@ -203,7 +213,8 @@ def collect_reducer_mean(values): def sort_all(data, keys): + key_combo = keys[0] if len(keys) > 1: for i, group in enumerate(data): data[i]['_subgroup'] = sort_all(group['_subgroup'], keys[1:]) - return sorted(data, key=itemgetter(keys[0])) + return sorted(data, key=_multi_itemgetter(*key_combo)) diff --git a/backdrop/core/query.py b/backdrop/core/query.py index 62b0db47..c97a9dec 100644 --- a/backdrop/core/query.py +++ b/backdrop/core/query.py @@ -50,7 +50,8 @@ def collect_fields(self): @property def group_keys(self): - """Return a list of fields that are being grouped on + """Return a list of lists 
of combinations of fields that are being + grouped on This is kinda coupled to how we group with Mongo but these keys are in the returned results and are used in the nested merge to @@ -58,17 +59,17 @@ def group_keys(self): >>> from ..core.timeseries import WEEK >>> Query.create(group_by=['foo']).group_keys - ['foo'] + [['foo']] >>> Query.create(period=WEEK).group_keys - ['_week_start_at'] + [['_week_start_at']] >>> Query.create(group_by=['foo'], period=WEEK).group_keys - ['foo', '_week_start_at'] + [['foo'], ['_week_start_at']] """ keys = [] if self.group_by: - keys += self.group_by + keys.append(self.group_by) if self.period: - keys.append(self.period.start_at_key) + keys.append([self.period.start_at_key]) return keys @property diff --git a/backdrop/core/storage/mongo.py b/backdrop/core/storage/mongo.py index f314cf06..9e2ab0a0 100644 --- a/backdrop/core/storage/mongo.py +++ b/backdrop/core/storage/mongo.py @@ -1,6 +1,7 @@ import os import logging import datetime +import itertools import pymongo from pymongo.errors import AutoReconnect, CollectionInvalid @@ -99,7 +100,8 @@ def _execute_query(self, data_set_id, query): return self._basic_query(data_set_id, query) def _group_query(self, data_set_id, query): - keys = query.group_keys + # flatten the list of key combos to form a flat list of keys + keys = list(itertools.chain.from_iterable(query.group_keys)) spec = get_mongo_spec(query) collect_fields = query.collect_fields diff --git a/features/read_api/group.feature b/features/read_api/group.feature index e526eb48..58a47793 100644 --- a/features/read_api/group.feature +++ b/features/read_api/group.feature @@ -11,6 +11,17 @@ Feature: grouping queries for read api and the "1st" result should be "{"authority": "Camden", "_count": 2}" and the "2nd" result should be "{"authority": "Westminster", "_count": 4}" + + Scenario: grouping by multiple keys + Given "licensing_2.json" is in "foo" data_set + when I go to "/foo?group_by=authority&group_by=licence_name" + then I 
should get back a status of "200" + and the JSON should have "3" results + and the "1st" result should be "{"authority": "Camden", "licence_name": "Temporary events notice", "_count": 1}" + and the "2nd" result should be "{"authority": "Westminster", "licence_name": "Cat herding licence", "_count": 1}" + and the "3rd" result should be "{"authority": "Westminster", "licence_name": "Temporary events notice", "_count": 3}" + + Scenario: grouping and filtering by different keys Given "licensing_2.json" is in "foo" data_set when I go to "/foo?group_by=authority&filter_by=licence_name:Temporary%20events%20notice" diff --git a/tests/core/test_data_set.py b/tests/core/test_data_set.py index fad3d75d..cbd1a133 100644 --- a/tests/core/test_data_set.py +++ b/tests/core/test_data_set.py @@ -206,6 +206,22 @@ def test_month_and_group_query(self): assert_that(data, has_item(has_entries({"values": has_length(3)}))) + def test_month_and_groups_query(self): + self.mock_storage.execute_query.return_value = [ + {'some_group': 'val1', 'another_group': 'val3', '_month_start_at': d(2013, 1, 1), '_count': 1}, + {'some_group': 'val1', 'another_group': 'val3', '_month_start_at': d(2013, 2, 1), '_count': 5}, + {'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 3, 1), '_count': 2}, + {'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 4, 1), '_count': 6}, + {'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 7, 1), '_count': 6}, + ] + + data = self.data_set.execute_query(Query.create(period=MONTH, + group_by=['some_group', 'another_group'])) + assert_that(data, + has_item(has_entries({"values": has_length(2)}))) + assert_that(data, + has_item(has_entries({"values": has_length(3)}))) + def test_month_and_group_query_with_start_and_end_at(self): self.mock_storage.execute_query.return_value = [ {'some_group': 'val1', '_month_start_at': d(2013, 1, 1), '_count': 1}, diff --git a/tests/core/test_nested_merge.py 
b/tests/core/test_nested_merge.py index eef1381a..2d55ab72 100644 --- a/tests/core/test_nested_merge.py +++ b/tests/core/test_nested_merge.py @@ -4,12 +4,14 @@ from backdrop.core.timeseries import WEEK, MONTH -def datum(name=None, place=None, age=None, stamp=None, count=1): +def datum(name=None, version=None, place=None, age=None, stamp=None, count=1): result = { "_count": count } if name is not None: result['name'] = name + if version is not None: + result['version'] = version if place is not None: result['place'] = place if age is not None: @@ -29,7 +31,7 @@ def test_one_level_grouping_with_collect(self): datum(name='Jack', age=[34, 34]), datum(name='John', age=[56, 65]) ] - results = nested_merge(['name'], [('age', 'mean')], data) + results = nested_merge([['name']], [('age', 'mean')], data) assert_that(results, contains( @@ -46,7 +48,7 @@ def test_two_level_grouping_with_collect(self): datum(name='James', place='Kettering', age=[43, 87], count=2), datum(name='Jill', place='Keswick', age=[76, 32], count=2), ] - results = nested_merge(['name', 'place'], [('age', 'mean')], data) + results = nested_merge([['name'], ['place']], [('age', 'mean')], data) assert_that(results, contains( @@ -89,6 +91,83 @@ def test_two_level_grouping_with_collect(self): }), )) + def test_two_level_grouping_combination_of_keys(self): + data = [ + datum(name='IE', version='6', place='England', age=[13, 12], count=2), + datum(name='IE', version='6', place='Wales', age=[13, 14], count=2), + datum(name='IE', version='7', place='England', age=[8, 7], count=2), + datum(name='IE', version='7', place='Wales', age=[8, 9], count=2), + datum(name='IE', version='8', place='England', age=[5, 4], count=2), + datum(name='IE', version='8', place='Wales', age=[5, 6], count=2), + datum(name='Chrome', version='20', place='England', age=[2, 1], count=2), + datum(name='Chrome', version='20', place='Wales', age=[2, 3], count=2), + ] + results = nested_merge([['name', 'version'], ['place']], [('age', 'mean')], 
data) + + assert_that(results, + contains( + has_entries({ + 'name': 'Chrome', + 'version': '20', + 'age:mean': 2, + '_subgroup': contains( + has_entries({ + 'place': 'England', + 'age:mean': 1.5 + }), + has_entries({ + 'place': 'Wales', + 'age:mean': 2.5 + }) + ) + }), + has_entries({ + 'name': 'IE', + 'version': '6', + 'age:mean': 13, + '_subgroup': contains( + has_entries({ + 'place': 'England', + 'age:mean': 12.5 + }), + has_entries({ + 'place': 'Wales', + 'age:mean': 13.5 + }) + ) + }), + has_entries({ + 'name': 'IE', + 'version': '7', + 'age:mean': 8, + '_subgroup': contains( + has_entries({ + 'place': 'England', + 'age:mean': 7.5 + }), + has_entries({ + 'place': 'Wales', + 'age:mean': 8.5 + }) + ) + }), + has_entries({ + 'name': 'IE', + 'version': '8', + 'age:mean': 5, + '_subgroup': contains( + has_entries({ + 'place': 'England', + 'age:mean': 4.5 + }), + has_entries({ + 'place': 'Wales', + 'age:mean': 5.5 + }) + ) + }), + )) + class TestGroupBy(object): def test_one_level_grouping(self): @@ -97,7 +176,7 @@ def test_one_level_grouping(self): datum(name='Jack', age=[34, 34]), datum(name='John', age=[56, 65]) ] - results = group_by(data, ['name']) + results = group_by(data, [['name']]) assert_that(results, contains( @@ -112,7 +191,7 @@ def test_two_level_grouping(self): datum(name='James', place='Kettering', age=[43, 87], count=2), datum(name='Jill', place='Keswick', age=[76, 32], count=2), ] - results = group_by(data, ['name', 'place']) + results = group_by(data, [['name'], ['place']]) assert_that(results, contains(