
Add ability to specify different collection methods
Methods are passed in the collect query param as field:method.
alexmuller committed Jul 2, 2013
1 parent b3a4086 commit 1dfb708
Showing 9 changed files with 156 additions and 23 deletions.
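
To make the new query syntax concrete, here is a minimal sketch of how each collect argument is turned into a (field, method) tuple, mirroring the change to backdrop/read/query.py below; the helper name and the sample values are illustrative and not part of the commit.

def parse_collect_args(collect_args):
    # Each collect argument is either "field" (which keeps the old behaviour
    # by defaulting to the "set" method) or "field:method".
    parsed = []
    for collect_arg in collect_args:
        if ':' in collect_arg:
            field, method = collect_arg.split(':')
            parsed.append((field, method))
        else:
            parsed.append((collect_arg, 'set'))
    return parsed

# e.g. /foo?group_by=type&collect=value:sum&collect=value:mean&collect=name
print(parse_collect_args(["value:sum", "value:mean", "name"]))
# [('value', 'sum'), ('value', 'mean'), ('name', 'set')]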
42 changes: 34 additions & 8 deletions backdrop/core/database.py
@@ -144,7 +144,8 @@ def _require_keys_in_query(self, keys, query):
return query

def _group(self, keys, query, sort=None, limit=None, collect=None):
results = self._mongo.group(keys, query, collect)
collect_fields = unique_collect_fields(collect)
results = self._mongo.group(keys, query, list(collect_fields))

results = nested_merge(keys, collect, results)

@@ -174,22 +175,47 @@ class InvalidSortError(ValueError):

def extract_collected_values(collect, result):
collected = {}
for collect_field in collect:
for collect_field in unique_collect_fields(collect):
collected[collect_field] = result.pop(collect_field)
return collected, result


def insert_collected_values(collected, group):
for collect_field in collected.keys():
if collect_field not in group:
group[collect_field] = set()
group[collect_field].update(collected[collect_field])
group[collect_field] = []
group[collect_field] += collected[collect_field]


def convert_collected_values_to_list(collect, groups):
def apply_collection_methods(collect, groups):
for group in groups:
for collected_field in collect:
group[collected_field] = sorted(list(group[collected_field]))
for collect_field, collect_method in collect:
collect_key = '{0}:{1}'.format(collect_field, collect_method)
group[collect_key] = apply_collection_method(
group[collect_field], collect_method)
for collect_field in unique_collect_fields(collect):
del group[collect_field]
# This is to provide backwards compatibility with earlier interface
if (collect_field, 'set') in collect:
group[collect_field] = group['{0}:set'.format(collect_field)]


def apply_collection_method(collected_data, collect_method):
if "sum" == collect_method:
return sum(collected_data)
elif "count" == collect_method:
return len(collected_data)
elif "set" == collect_method:
return sorted(list(set(collected_data)))
elif "mean" == collect_method:
return sum(collected_data) / float(len(collected_data))
else:
raise ValueError("Unknown collection method")


def unique_collect_fields(collect):
"""Return the unique set of field names to collect."""
return set([collect_field for collect_field, _ in collect])


def nested_merge(keys, collect, results):
@@ -201,7 +227,7 @@ def nested_merge(keys, collect, results):

insert_collected_values(collected, group)

convert_collected_values_to_list(collect, groups)
apply_collection_methods(collect, groups)
return groups


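As a rough end-to-end illustration of the helpers added above, the sketch below applies two collection methods to a single group; apply_collection_method is copied in (in reduced form) so the snippet runs on its own, and the sample data is invented to reproduce the 27 and 6.75 asserted in the feature test further down.

def apply_collection_method(collected_data, collect_method):
    # Reduced copy of the new helper, for illustration only.
    if collect_method == "sum":
        return sum(collected_data)
    elif collect_method == "count":
        return len(collected_data)
    elif collect_method == "set":
        return sorted(set(collected_data))
    elif collect_method == "mean":
        return sum(collected_data) / float(len(collected_data))
    raise ValueError("Unknown collection method")

group = {"type": "wild", "value": [2, 5, 8, 12]}
for field, method in [("value", "sum"), ("value", "mean")]:
    key = "{0}:{1}".format(field, method)
    group[key] = apply_collection_method(group[field], method)
del group["value"]  # the raw collected list is dropped; keyed results remain
print(group)  # e.g. {'type': 'wild', 'value:sum': 27, 'value:mean': 6.75}

Note that when the method is "set", apply_collection_methods also copies the result back under the bare field name, which is why the nested_merge test later in this commit expects both 'b:set' and 'b'.
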
7 changes: 6 additions & 1 deletion backdrop/read/query.py
@@ -37,7 +37,12 @@ def parse_request_args(request_args):

args['limit'] = if_present(int, request_args.get('limit'))

args['collect'] = request_args.getlist('collect')
args['collect'] = []
for collect_arg in request_args.getlist('collect'):
if ':' in collect_arg:
args['collect'].append(tuple(collect_arg.split(':')))
else:
args['collect'].append((collect_arg, 'set'))

return args

5 changes: 5 additions & 0 deletions backdrop/read/validation.py
@@ -176,6 +176,11 @@ def validate(self, request_args, context):
validate_field_value=self.validate_field_value)

def validate_field_value(self, value, request_args, _):
if ":" in value:
value, operator = value.split(":")
if operator not in ["sum", "count", "set", "mean"]:
self.add_error("Unknown collection method")

if not key_is_valid(value):
self.add_error('Cannot collect an invalid field name')
if value.startswith('_'):
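A reduced sketch of the new validation branch, using a plain error list in place of the validator's add_error; the field names here are hypothetical. The existing field-name checks (key_is_valid and the leading-underscore rule) still run on the remaining value, as the diff above shows.

VALID_COLLECTION_METHODS = ["sum", "count", "set", "mean"]

def collect_method_errors(value):
    # Only the new branch: strip an optional ":method" suffix and check it
    # against the known collection methods.
    errors = []
    if ":" in value:
        value, operator = value.split(":")
        if operator not in VALID_COLLECTION_METHODS:
            errors.append("Unknown collection method")
    return errors

print(collect_method_errors("downtime:mean"))      # []
print(collect_method_errors("downtime:infinity"))  # ['Unknown collection method']
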
8 changes: 8 additions & 0 deletions features/read_api/collect.feature
@@ -23,8 +23,16 @@ Feature: collect fields into grouped responses
when I go to "/foo?collect=authority"
then I should get back a status of "400"


Scenario: should be able to collect false values
Given "licensing_2.json" is in "foo" bucket
when I go to "/foo?group_by=licence_name&filter_by=isPaymentRequired:false&collect=isPaymentRequired"
then I should get back a status of "200"
and the "1st" result should have "isPaymentRequired" with item "false"

Scenario: should be able to perform maths on collect
Given "sort_and_limit.json" is in "foo" bucket
when I go to "/foo?group_by=type&filter_by=type:wild&collect=value:sum&collect=value:mean"
then I should get back a status of "200"
and the "1st" result should have "value:sum" with json "27"
and the "1st" result should have "value:mean" with json "6.75"
7 changes: 7 additions & 0 deletions features/steps/read_api.py
@@ -106,6 +106,13 @@ def step(context, nth, key, value):
assert_that(the_data[i][key], has_item(json.loads(value)))


@then('the "{nth}" result should have "{key}" with json "{expected_json}"')
def impl(context, nth, key, expected_json):
the_data = json.loads(context.response.data)['data']
i = parse_position(nth, the_data)
assert_that(the_data[i][key], is_(json.loads(expected_json)))


@then('the "{header}" header should be "{value}"')
def step(context, header, value):
assert_that(context.response.headers.get(header), is_(value))
19 changes: 10 additions & 9 deletions tests/core/integration/test_database_integration.py
@@ -295,32 +295,33 @@ def test_grouping_by_multiple_keys(self):
def test_grouping_with_collect(self):
self.setUpPeopleLocationData()

results = self.repo.group("person", Query.create(), None, None, ["place"])
results = self.repo.group("person", Query.create(), None, None, [("place", "set")])

assert_that(results, has_item(has_entries({
"person": "John",
"place": has_items("Kettering", "Kennington")
"place:set": has_items("Kettering", "Kennington")
})))

def test_another_grouping_with_collect(self):
self.setUpPeopleLocationData()

results = self.repo.group("place", Query.create(), None, None, ["person"])
results = self.repo.group("place", Query.create(), None, None, [("person", "set")])

assert_that(results, has_item(has_entries({
"place": "Kettering",
"person": has_items("Jack", "John")
"person:set": has_items("Jack", "John")
})))

def test_grouping_with_collect_two_fields(self):
self.setUpPeopleLocationData()

results = self.repo.group("place", Query.create(), None, None, ["person", "hair"])
results = self.repo.group("place", Query.create(), None, None,
[("person", "set"), ("hair", "set")])

assert_that(results, has_item(has_entries({
"place": "Kettering",
"person": ["Jack", "John"],
"hair": ["blond", "dark", "red"]
"person:set": ["Jack", "John"],
"hair:set": ["blond", "dark", "red"]
})))

def test_grouping_on_non_existent_keys(self):
@@ -426,12 +427,12 @@ def test_multi_group_with_collect(self):
"place",
"_week_start_at",
Query.create(),
collect=["person"]
collect=[("person", "set")]
)

assert_that(results, has_item(has_entries({
"place": "Kettering",
"person": ["Jack", "John"]
"person:set": ["Jack", "John"]
})))


52 changes: 51 additions & 1 deletion tests/core/test_database.py
@@ -3,7 +3,7 @@
from mock import Mock, patch
from pymongo.errors import AutoReconnect
from backdrop.core import database
from backdrop.core.database import Repository, InvalidSortError, MongoDriver
from backdrop.core.database import Repository, InvalidSortError, MongoDriver, apply_collection_method
from backdrop.read.query import Query
from tests.support.test_helpers import d_tz

@@ -71,6 +71,56 @@ def test_nested_merge_squashes_duplicates(self):
{'a': 2}
]))

def test_nested_merge_collected_values(self):
stub_dictionaries = [
{'a': 1, 'b': [2], 'c': 3},
{'a': 1, 'b': [1], 'c': 3},
{'a': 2, 'b': [1], 'c': 3}
]
output = database.nested_merge(['a'], [('b', 'set')], stub_dictionaries)
assert_that(output, is_([
{'a': 1, 'b:set': [1, 2], 'b': [1, 2]},
{'a': 2, 'b:set': [1], 'b': [1]}
]))

def test_nested_merge_collect_sum(self):
stub_dictionaries = [
{'a': 1, 'b': [2]},
{'a': 1, 'b': [1]},
{'a': 2, 'b': [1]}
]
output = database.nested_merge(['a'], [('b', 'sum')], stub_dictionaries)
assert_that(output, is_([
{'a': 1, 'b:sum': 3},
{'a': 2, 'b:sum': 1}
]))


class TestApplyCollectionMethod(unittest.TestCase):
def test_sum(self):
data = [2, 5, 8]
response = apply_collection_method(data, "sum")
assert_that(response, is_(15))

def test_count(self):
data = ['Sheep', 'Elephant', 'Wolf', 'Dog']
response = apply_collection_method(data, "count")
assert_that(response, is_(4))

def test_set(self):
data = ['Badger', 'Badger', 'Badger', 'Snake']
response = apply_collection_method(data, "set")
assert_that(response, is_(['Badger', 'Snake']))

def test_mean(self):
data = [13, 19, 15, 2]
response = apply_collection_method(data, "mean")
assert_that(response, is_(12.25))

def test_unknown_collection_method_raises_error(self):
self.assertRaises(ValueError,
apply_collection_method, ['foo'], "unknown")


class TestRepository(unittest.TestCase):
def setUp(self):
18 changes: 14 additions & 4 deletions tests/read/test_parse_request_args.py
@@ -103,21 +103,31 @@ def test_limit_is_parsed(self):

assert_that(args['limit'], is_(123))

def test_one_collect_is_parsed(self):
def test_one_collect_is_parsed_with_default_method(self):
request_args = MultiDict([
("collect", "some_key")
])

args = parse_request_args(request_args)

assert_that(args['collect'], is_(["some_key"]))
assert_that(args['collect'], is_([("some_key", "set")]))

def test_two_collects_are_parsed(self):
def test_two_collects_are_parsed_with_default_methods(self):
request_args = MultiDict([
("collect", "some_key"),
("collect", "some_other_key")
])

args = parse_request_args(request_args)

assert_that(args['collect'], is_(["some_key", "some_other_key"]))
assert_that(args['collect'], is_([("some_key", "set"),
("some_other_key", "set")]))

def test_one_collect_is_parsed_with_custom_method(self):
request_args = MultiDict([
("collect", "some_key:mean")
])

args = parse_request_args(request_args)

assert_that(args['collect'], is_([("some_key", "mean")]))
21 changes: 21 additions & 0 deletions tests/read/test_validation.py
@@ -306,6 +306,27 @@ def test_that_queries_with_invalid_timezone_are_disallowed(self):
assert_that(validation_result, is_invalid_with_message(
"start_at is not a valid datetime"))

def test_that_collect_queries_with_valid_methods_are_allowed(self):
valid_collection_methods = ["sum", "count", "set", "mean"]

for method in valid_collection_methods:
validation_result = validate_request_args({
'group_by': 'foo',
'collect': 'field:{0}'.format(method),
})

assert_that(validation_result, is_valid())

def test_that_collect_queries_with_invalid_method_are_disallowed(self):
validation_result = validate_request_args({
'group_by': 'foo',
'collect': 'field:infinity',
})

assert_that(validation_result, is_invalid_with_message((
"Unknown collection method"
)))


class TestValidationHelpers(TestCase):
def test_timestamp_is_valid_method(self):
