Skip to content
This repository has been archived by the owner on Mar 24, 2021. It is now read-only.

Commit

Permalink
Nested merge now groups using a list of key combos
Browse files Browse the repository at this point in the history
This means that when multiple group_by parameters are specified we can
group by all of them and include all of them in the same group of
results.
  • Loading branch information
nick-gravgaard committed Aug 1, 2014
1 parent ea2c43e commit 136d383
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 35 deletions.
57 changes: 34 additions & 23 deletions backdrop/core/nested_merge.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
from .errors import InvalidOperationError

from operator import itemgetter, add
from operator import add
import itertools


def _multi_itemgetter(*items):
"""Like operator.itemgetter, but the callable always returns
a sequence of lookup values (regardless of items' length)
see https://docs.python.org/2/library/operator.html#operator.itemgetter
"""
return lambda obj: tuple(obj[item] for item in items)


def nested_merge(keys, collect, data):
if len(keys) > 1:
data = group_by(data, keys)
Expand All @@ -19,46 +27,48 @@ def group_by(data, keys):
"""Recursively group an array of results by a list of keys
data: a list of dictionaries as returned by MongoDriver.group
keys: a list of keys to group by
keys: a list of combinations of keys to group by
"""
key = keys[0]
getter = itemgetter(key)
key_combo = keys[0]
getter = _multi_itemgetter(*key_combo)
data = sorted(data, key=getter)

if len(keys) > 1:
data = [
{
key: value,
"_subgroup": group_by(
remove_key_from_all(subgroups, key),
keys[1:]
)
}
for value, subgroups in itertools.groupby(data, getter)
]
grouped_data = []
for values, subgroups in itertools.groupby(data, getter):
# create a dict containing key value pairs and _subgroup
result = dict(zip(key_combo, values))
result['_subgroup'] = group_by(
remove_keys_from_all(subgroups, key_combo),
keys[1:]
)
grouped_data.append(result)
data = grouped_data

return data


def remove_key_from_all(groups, key):
"""Remove a key from each group in a list of groups
def remove_keys_from_all(groups, keys):
"""Remove keys from each group in a list of groups
groups: a list of groups (dictionaries)
key: the key to remove
"""
return [remove_key(group, key) for group in groups]
return [remove_keys(group, keys) for group in groups]


def remove_key(doc, key):
"""Return a new document with the key removed
def remove_keys(doc, keys):
"""Return a new document with keys in keys removed
>>> doc = {'a':1, 'b':2}
>>> remove_key(doc, 'a')
>>> remove_keys(doc, ['a'])
{'b': 2}
>>> # Show that the original document is not affected
>>> doc['a']
1
"""
return dict(
(k, v) for k, v in doc.items() if k != key)
(k, v) for k, v in doc.items() if k not in keys)


def apply_counts(groups):
Expand Down Expand Up @@ -104,7 +114,7 @@ def apply_collect_to_group(group, collect):

# remove left over collect keys
for key, _ in collect:
group = remove_key(group, key)
group = remove_keys(group, key)

# Hack in the old way
for key, method in collect:
Expand Down Expand Up @@ -203,7 +213,8 @@ def collect_reducer_mean(values):


def sort_all(data, keys):
key_combo = keys[0]
if len(keys) > 1:
for i, group in enumerate(data):
data[i]['_subgroup'] = sort_all(group['_subgroup'], keys[1:])
return sorted(data, key=itemgetter(keys[0]))
return sorted(data, key=_multi_itemgetter(*key_combo))
13 changes: 7 additions & 6 deletions backdrop/core/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,25 +50,26 @@ def collect_fields(self):

@property
def group_keys(self):
"""Return a list of fields that are being grouped on
"""Return a list of lists of combinations of fields that are being
grouped on
This is kinda coupled to how we group with Mongo but these keys
are in the returned results and are used in the nested merge to
create the hierarchical response.
>>> from ..core.timeseries import WEEK
>>> Query.create(group_by=['foo']).group_keys
['foo']
[['foo']]
>>> Query.create(period=WEEK).group_keys
['_week_start_at']
[['_week_start_at']]
>>> Query.create(group_by=['foo'], period=WEEK).group_keys
['foo', '_week_start_at']
[['foo'], ['_week_start_at']]
"""
keys = []
if self.group_by:
keys += self.group_by
keys.append(self.group_by)
if self.period:
keys.append(self.period.start_at_key)
keys.append([self.period.start_at_key])
return keys

@property
Expand Down
4 changes: 3 additions & 1 deletion backdrop/core/storage/mongo.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import logging
import datetime
import itertools

import pymongo
from pymongo.errors import AutoReconnect, CollectionInvalid
Expand Down Expand Up @@ -99,7 +100,8 @@ def _execute_query(self, data_set_id, query):
return self._basic_query(data_set_id, query)

def _group_query(self, data_set_id, query):
keys = query.group_keys
# flatten the list of key combos to form a flat list of keys
keys = list(itertools.chain.from_iterable(query.group_keys))
spec = get_mongo_spec(query)
collect_fields = query.collect_fields

Expand Down
11 changes: 11 additions & 0 deletions features/read_api/group.feature
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@ Feature: grouping queries for read api
and the "1st" result should be "{"authority": "Camden", "_count": 2}"
and the "2nd" result should be "{"authority": "Westminster", "_count": 4}"


Scenario: grouping by multiple keys
Given "licensing_2.json" is in "foo" data_set
when I go to "/foo?group_by=authority&group_by=licence_name"
then I should get back a status of "200"
and the JSON should have "3" results
and the "1st" result should be "{"authority": "Camden", "licence_name": "Temporary events notice", "_count": 1}"
and the "2nd" result should be "{"authority": "Westminster", "licence_name": "Cat herding licence", "_count": 1}"
and the "3rd" result should be "{"authority": "Westminster", "licence_name": "Temporary events notice", "_count": 3}"


Scenario: grouping and filtering by different keys
Given "licensing_2.json" is in "foo" data_set
when I go to "/foo?group_by=authority&filter_by=licence_name:Temporary%20events%20notice"
Expand Down
16 changes: 16 additions & 0 deletions tests/core/test_data_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,22 @@ def test_month_and_group_query(self):
assert_that(data,
has_item(has_entries({"values": has_length(3)})))

def test_month_and_groups_query(self):
self.mock_storage.execute_query.return_value = [
{'some_group': 'val1', 'another_group': 'val3', '_month_start_at': d(2013, 1, 1), '_count': 1},
{'some_group': 'val1', 'another_group': 'val3', '_month_start_at': d(2013, 2, 1), '_count': 5},
{'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 3, 1), '_count': 2},
{'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 4, 1), '_count': 6},
{'some_group': 'val2', 'another_group': 'val3', '_month_start_at': d(2013, 7, 1), '_count': 6},
]

data = self.data_set.execute_query(Query.create(period=MONTH,
group_by=['some_group', 'another_group']))
assert_that(data,
has_item(has_entries({"values": has_length(2)})))
assert_that(data,
has_item(has_entries({"values": has_length(3)})))

def test_month_and_group_query_with_start_and_end_at(self):
self.mock_storage.execute_query.return_value = [
{'some_group': 'val1', '_month_start_at': d(2013, 1, 1), '_count': 1},
Expand Down
89 changes: 84 additions & 5 deletions tests/core/test_nested_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@
from backdrop.core.timeseries import WEEK, MONTH


def datum(name=None, place=None, age=None, stamp=None, count=1):
def datum(name=None, version=None, place=None, age=None, stamp=None, count=1):
result = {
"_count": count
}
if name is not None:
result['name'] = name
if version is not None:
result['version'] = version
if place is not None:
result['place'] = place
if age is not None:
Expand All @@ -29,7 +31,7 @@ def test_one_level_grouping_with_collect(self):
datum(name='Jack', age=[34, 34]),
datum(name='John', age=[56, 65])
]
results = nested_merge(['name'], [('age', 'mean')], data)
results = nested_merge([['name']], [('age', 'mean')], data)

assert_that(results,
contains(
Expand All @@ -46,7 +48,7 @@ def test_two_level_grouping_with_collect(self):
datum(name='James', place='Kettering', age=[43, 87], count=2),
datum(name='Jill', place='Keswick', age=[76, 32], count=2),
]
results = nested_merge(['name', 'place'], [('age', 'mean')], data)
results = nested_merge([['name'], ['place']], [('age', 'mean')], data)

assert_that(results,
contains(
Expand Down Expand Up @@ -89,6 +91,83 @@ def test_two_level_grouping_with_collect(self):
}),
))

def test_two_level_grouping_combination_of_keys(self):
    # The first grouping level uses a *combination* of keys
    # (['name', 'version']); the second level groups on ['place'].
    # Each result at the first level should therefore carry both
    # 'name' and 'version' entries, with 'place' nested in _subgroup.
    data = [
        datum(name='IE', version='6', place='England', age=[13, 12], count=2),
        datum(name='IE', version='6', place='Wales', age=[13, 14], count=2),
        datum(name='IE', version='7', place='England', age=[8, 7], count=2),
        datum(name='IE', version='7', place='Wales', age=[8, 9], count=2),
        datum(name='IE', version='8', place='England', age=[5, 4], count=2),
        datum(name='IE', version='8', place='Wales', age=[5, 6], count=2),
        datum(name='Chrome', version='20', place='England', age=[2, 1], count=2),
        datum(name='Chrome', version='20', place='Wales', age=[2, 3], count=2),
    ]
    results = nested_merge([['name', 'version'], ['place']], [('age', 'mean')], data)

    # Results are expected sorted by the (name, version) combo, so
    # Chrome comes before the IE entries.  Each top-level group carries
    # the mean over all its ages; each _subgroup the mean per place.
    assert_that(results,
                contains(
                    has_entries({
                        'name': 'Chrome',
                        'version': '20',
                        'age:mean': 2,
                        '_subgroup': contains(
                            has_entries({
                                'place': 'England',
                                'age:mean': 1.5
                            }),
                            has_entries({
                                'place': 'Wales',
                                'age:mean': 2.5
                            })
                        )
                    }),
                    has_entries({
                        'name': 'IE',
                        'version': '6',
                        'age:mean': 13,
                        '_subgroup': contains(
                            has_entries({
                                'place': 'England',
                                'age:mean': 12.5
                            }),
                            has_entries({
                                'place': 'Wales',
                                'age:mean': 13.5
                            })
                        )
                    }),
                    has_entries({
                        'name': 'IE',
                        'version': '7',
                        'age:mean': 8,
                        '_subgroup': contains(
                            has_entries({
                                'place': 'England',
                                'age:mean': 7.5
                            }),
                            has_entries({
                                'place': 'Wales',
                                'age:mean': 8.5
                            })
                        )
                    }),
                    has_entries({
                        'name': 'IE',
                        'version': '8',
                        'age:mean': 5,
                        '_subgroup': contains(
                            has_entries({
                                'place': 'England',
                                'age:mean': 4.5
                            }),
                            has_entries({
                                'place': 'Wales',
                                'age:mean': 5.5
                            })
                        )
                    }),
                ))


class TestGroupBy(object):
def test_one_level_grouping(self):
Expand All @@ -97,7 +176,7 @@ def test_one_level_grouping(self):
datum(name='Jack', age=[34, 34]),
datum(name='John', age=[56, 65])
]
results = group_by(data, ['name'])
results = group_by(data, [['name']])

assert_that(results,
contains(
Expand All @@ -112,7 +191,7 @@ def test_two_level_grouping(self):
datum(name='James', place='Kettering', age=[43, 87], count=2),
datum(name='Jill', place='Keswick', age=[76, 32], count=2),
]
results = group_by(data, ['name', 'place'])
results = group_by(data, [['name'], ['place']])

assert_that(results,
contains(
Expand Down

0 comments on commit 136d383

Please sign in to comment.