Merge e2a793c into 9e90c73
alexmuller committed Jul 3, 2013
2 parents 9e90c73 + e2a793c commit eea2696
Showing 12 changed files with 268 additions and 54 deletions.
19 changes: 19 additions & 0 deletions README.md
@@ -45,3 +45,22 @@ This is the oauth flow we are using to authenticate users with Signonotron2
 - **GET** (to signonotron) `/user.json` uses access token to get user data and see if they have permissions to sign in to backdrop
 4. User is now signed in
 
+## Requesting data
+
+Requests return a JSON object containing a `data` array.
+
+`GET /bucket_name` will return an array of data. Each element is an object.
+
+`GET /bucket_name?collect=score&group_by=name` will return an array. In this
+case, each element of the array is an object containing a `name` value, a
+`score` array with the scores for that name, and a `_count` value with the
+number of scores.
+
+`GET /bucket_name?filter_by=name:Foo` returns all elements with `name` equal to "Foo".
+
+Other parameters:
+
+- `start_at` (YYYY-MM-DDTHH:MM:SS+HH:MM) and `end_at` (YYYY-MM-DDTHH:MM:SS+HH:MM)
+- `period` ("week", "month")
+- `sort_by` (field)
+- `limit` (number)
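
A quick sketch of consuming the grouped form from Python 2 (matching the codebase); the host, port, bucket name and field values here are invented for illustration:

```python
import json
import urllib2

# Hypothetical local read API; "foo", "score" and "name" are made up.
url = "http://localhost:3038/foo?collect=score&group_by=name"
data = json.load(urllib2.urlopen(url))["data"]

for element in data:
    # e.g. {"name": "Alice", "score": [3, 5], "_count": 2.0}
    print element["name"], element["score"], element["_count"]
```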
92 changes: 70 additions & 22 deletions backdrop/core/database.py
@@ -49,31 +49,32 @@ def _ignore_docs_without_grouping_keys(self, keys, query):
             query[key] = {"$ne": None}
         return query
 
-    def group(self, keys, query, collect):
+    def group(self, keys, query, collect_fields):
         return self._collection.group(
             key=keys,
             condition=self._ignore_docs_without_grouping_keys(keys, query),
-            initial=self._build_accumulator_initial_state(collect),
-            reduce=self._build_reducer_function(collect)
+            initial=self._build_accumulator_initial_state(collect_fields),
+            reduce=self._build_reducer_function(collect_fields)
         )
 
-    def _build_collector_code(self, collect):
+    def _build_collector_code(self, collect_fields):
         template = "if (current.{c} !== undefined) " \
                    "{{ previous.{c}.push(current.{c}); }}"
-        code = [template.format(c=collect_me) for collect_me in collect]
+        code = [template.format(c=collect_field)
+                for collect_field in collect_fields]
         return "\n".join(code)
 
-    def _build_accumulator_initial_state(self, collect):
+    def _build_accumulator_initial_state(self, collect_fields):
         initial = {'_count': 0}
-        for collect_me in collect:
-            initial.update({collect_me: []})
+        for collect_field in collect_fields:
+            initial.update({collect_field: []})
         return initial
 
-    def _build_reducer_function(self, collect):
+    def _build_reducer_function(self, collect_fields):
         reducer_skeleton = "function (current, previous)" + \
                            "{{ previous._count++; {collectors} }}"
         reducer_code = reducer_skeleton.format(
-            collectors=self._build_collector_code(collect)
+            collectors=self._build_collector_code(collect_fields)
         )
         reducer = Code(reducer_code)
         return reducer
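
As an illustration of the three builders above, for `collect_fields = ['score']` (a made-up field) the accumulator and generated JavaScript reducer would be as follows; this is reconstructed from the templates, not captured output:

```python
# Initial accumulator from _build_accumulator_initial_state(['score']):
initial = {'_count': 0, 'score': []}

# Reducer from _build_reducer_function(['score']):
reducer_js = (
    "function (current, previous)"
    "{ previous._count++; "
    "if (current.score !== undefined) "
    "{ previous.score.push(current.score); } }"
)
# MongoDB evaluates this once per matching document, incrementing the
# group's _count and pushing each document's score onto the accumulator.
```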
@@ -143,7 +144,8 @@ def _require_keys_in_query(self, keys, query):
         return query
 
     def _group(self, keys, query, sort=None, limit=None, collect=None):
-        results = self._mongo.group(keys, query, collect)
+        collect_fields = unique_collect_fields(collect)
+        results = self._mongo.group(keys, query, list(collect_fields))
 
         results = nested_merge(keys, collect, results)
 
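For example (tuples invented), requesting two methods over the same field only needs that field fetched once from Mongo:

```python
collect = [('score', 'sum'), ('score', 'mean')]
unique_collect_fields(collect)  # => set(['score'])
```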
@@ -173,22 +175,60 @@ class InvalidSortError(ValueError):
 
 def extract_collected_values(collect, result):
     collected = {}
-    for collect_field in collect:
+    for collect_field in unique_collect_fields(collect):
         collected[collect_field] = result.pop(collect_field)
     return collected, result
 
 
 def insert_collected_values(collected, group):
     for collect_field in collected.keys():
         if collect_field not in group:
-            group[collect_field] = set()
-        group[collect_field].update(collected[collect_field])
+            group[collect_field] = []
+        group[collect_field] += collected[collect_field]
 
 
-def convert_collected_values_to_list(collect, groups):
+def apply_collection_methods(collect, groups):
     for group in groups:
-        for collected_field in collect:
-            group[collected_field] = sorted(list(group[collected_field]))
+        for collect_field, collect_method in collect:
+            if collect_method == 'default':
+                collect_keys = [collect_field, '{0}:set'.format(collect_field)]
+            else:
+                collect_keys = ['{0}:{1}'.format(collect_field,
+                                                 collect_method)]
+            for collect_key in collect_keys:
+                group[collect_key] = apply_collection_method(
+                    group[collect_field], collect_method)
+        for collect_field in unique_collect_fields(collect):
+            del group[collect_field]
+            # This is to provide backwards compatibility with earlier interface
+            if (collect_field, 'default') in collect:
+                group[collect_field] = group['{0}:set'.format(collect_field)]
+
+
+def apply_collection_method(collected_data, collect_method):
+    if "sum" == collect_method:
+        try:
+            return sum(collected_data)
+        except TypeError:
+            raise InvalidOperationError("Unable to sum that data")
+    elif "count" == collect_method:
+        return len(collected_data)
+    elif "set" == collect_method:
+        return sorted(list(set(collected_data)))
+    elif "mean" == collect_method:
+        try:
+            return sum(collected_data) / float(len(collected_data))
+        except TypeError:
+            raise InvalidOperationError("Unable to find the mean of that data")
+    elif "default" == collect_method:
+        return sorted(list(set(collected_data)))
+    else:
+        raise ValueError("Unknown collection method")
+
+
+def unique_collect_fields(collect):
+    """Return the unique set of field names to collect."""
+    return set([collect_field for collect_field, _ in collect])
 
 
 def nested_merge(keys, collect, results):
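
A worked sketch of `apply_collection_method` on made-up data:

```python
apply_collection_method([3, 9, 9, 6], "sum")    # => 27
apply_collection_method([3, 9, 9, 6], "count")  # => 4
apply_collection_method([3, 9, 9, 6], "set")    # => [3, 6, 9]
apply_collection_method([3, 9, 9, 6], "mean")   # => 6.75 (27 / 4.0)
apply_collection_method(["a", 1], "sum")        # raises InvalidOperationError
```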
@@ -200,7 +240,7 @@ def nested_merge(keys, collect, results):
 
         insert_collected_values(collected, group)
 
-    convert_collected_values_to_list(collect, groups)
+    apply_collection_methods(collect, groups)
     return groups
 
 
@@ -213,7 +253,7 @@ def _merge(groups, keys, result):
     group = _find_group(group for group in groups if group[key] == value)
     if not group:
         if is_leaf:
-            group = _new_leaf_node(key, value, result)
+            group = _new_leaf_node(key, value, result.get('_count'))
         else:
             group = _new_branch_node(key, value)
         groups.append(group)
@@ -240,10 +280,14 @@ def _new_branch_node(key, value):
     }
 
 
-def _new_leaf_node(key, value, result):
+def _new_leaf_node(key, value, count=None):
     """Create a new node that has no further sub-groups"""
-    result[key] = value
-    return result
+    r = {
+        key: value,
+    }
+    if count is not None:
+        r['_count'] = count
+    return r
 
 
 def _merge_and_sort_subgroup(group, keys, result):
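
For instance (key and values invented), the rewritten constructor builds a fresh dict instead of mutating the Mongo result, and only includes `_count` when one was supplied:

```python
_new_leaf_node('name', 'Alice', count=2.0)  # => {'name': 'Alice', '_count': 2.0}
_new_leaf_node('name', 'Bob')               # => {'name': 'Bob'}
```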
@@ -254,3 +298,7 @@ def _merge_and_sort_subgroup(group, keys, result):
 def _add_branch_node_counts(group):
     group['_count'] = sum(doc.get('_count', 0) for doc in group['_subgroup'])
     group['_group_count'] = len(group['_subgroup'])
+
+
+class InvalidOperationError(TypeError):
+    pass
15 changes: 12 additions & 3 deletions backdrop/read/api.py
@@ -12,6 +12,7 @@
 from .validation import validate_request_args
 from ..core import database, log_handler, cache_control
 from ..core.bucket import Bucket
+from ..core.database import InvalidOperationError
 
 
 def setup_logging():
@@ -69,6 +70,11 @@ def health_check():
                        message='cannot connect to database'), 500
 
 
+def log_error_and_respond(message, status_code):
+    app.logger.error(message)
+    return jsonify(status='error', message=message), status_code
+
+
 @app.route('/<bucket_name>', methods=['GET', 'OPTIONS'])
 @cache_control.set("max-age=3600, must-revalidate")
 @cache_control.etag
@@ -84,11 +90,14 @@ def query(bucket_name):
                                    raw_queries_allowed(bucket_name))
 
     if not result.is_valid:
-        app.logger.error(result.message)
-        return jsonify(status='error', message=result.message), 400
+        return log_error_and_respond(result.message, 400)
 
     bucket = Bucket(db, bucket_name)
-    result_data = bucket.query(Query.parse(request.args)).data()
 
+    try:
+        result_data = bucket.query(Query.parse(request.args)).data()
+    except InvalidOperationError:
+        return log_error_and_respond('invalid collect for that data', 400)
+
     # Taken from flask.helpers.jsonify to add JSONEncoder
     # NB. this can be removed once fix #471 works its way into a release
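A sketch of exercising the new error path from a client; the host and port are invented, while the path and message come from the feature test further down:

```python
import json
import urllib2

url = "http://localhost:3038/foo?group_by=type&collect=name:sum"
try:
    urllib2.urlopen(url)
except urllib2.HTTPError as e:
    assert e.code == 400
    # Body produced by log_error_and_respond:
    # {"status": "error", "message": "invalid collect for that data"}
    print json.load(e)["message"]
```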
7 changes: 6 additions & 1 deletion backdrop/read/query.py
@@ -37,7 +37,12 @@ def parse_request_args(request_args):
 
     args['limit'] = if_present(int, request_args.get('limit'))
 
-    args['collect'] = request_args.getlist('collect')
+    args['collect'] = []
+    for collect_arg in request_args.getlist('collect'):
+        if ':' in collect_arg:
+            args['collect'].append(tuple(collect_arg.split(':')))
+        else:
+            args['collect'].append((collect_arg, 'default'))
 
     return args
 
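Extracted as a standalone sketch (the helper name is ours, not the module's), the loop turns each `collect` argument into a `(field, method)` tuple, defaulting the method when there is no colon:

```python
def parse_collect_args(collect_args):
    # Mirrors the loop in parse_request_args, for illustration only.
    collect = []
    for collect_arg in collect_args:
        if ':' in collect_arg:
            collect.append(tuple(collect_arg.split(':')))
        else:
            collect.append((collect_arg, 'default'))
    return collect

assert parse_collect_args(['value:sum', 'name']) == [
    ('value', 'sum'), ('name', 'default')]
```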
5 changes: 5 additions & 0 deletions backdrop/read/validation.py
@@ -176,6 +176,11 @@ def validate(self, request_args, context):
             validate_field_value=self.validate_field_value)
 
     def validate_field_value(self, value, request_args, _):
+        if ":" in value:
+            value, operator = value.split(":")
+            if operator not in ["sum", "count", "set", "mean"]:
+                self.add_error("Unknown collection method")
+
         if not key_is_valid(value):
             self.add_error('Cannot collect an invalid field name')
         if value.startswith('_'):
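How the split in the validator behaves (field names invented):

```python
value = "score:mean"
if ":" in value:
    value, operator = value.split(":")
assert (value, operator) == ("score", "mean")
# A value with two or more colons, e.g. "a:b:c", would raise
# ValueError at the unpacking above.
```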
14 changes: 14 additions & 0 deletions features/read_api/collect.feature
@@ -23,8 +23,22 @@ Feature: collect fields into grouped responses
         when I go to "/foo?collect=authority"
         then I should get back a status of "400"
 
+
     Scenario: should be able to collect false values
         Given "licensing_2.json" is in "foo" bucket
         when I go to "/foo?group_by=licence_name&filter_by=isPaymentRequired:false&collect=isPaymentRequired"
         then I should get back a status of "200"
         and the "1st" result should have "isPaymentRequired" with item "false"
+
+    Scenario: should be able to perform maths on collect
+        Given "sort_and_limit.json" is in "foo" bucket
+        when I go to "/foo?group_by=type&filter_by=type:wild&collect=value:sum&collect=value:mean"
+        then I should get back a status of "200"
+        and the "1st" result should have "value:sum" with json "27"
+        and the "1st" result should have "value:mean" with json "6.75"
+
+    Scenario: should receive a nice error when performing invalid operation
+        Given "dinosaurs.json" is in "foo" bucket
+        when I go to "/foo?group_by=type&collect=name:sum"
+        then I should get back a status of "400"
+        and the error message should be "invalid collect for that data"
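
The two assertions in the maths scenario are mutually consistent; without reading the fixture they imply the filtered "wild" group holds four numeric values summing to 27:

```python
assert 27 / 4.0 == 6.75  # mean == sum / count
```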
1 change: 1 addition & 0 deletions features/read_api/group.feature
@@ -57,6 +57,7 @@ Feature: grouping queries for read api
         then I should get back a status of "200"
         and the JSON should have "1" results
         and the "1st" result should have "values" with item "{"_start_at": "2013-03-11T00:00:00+00:00", "_end_at": "2013-03-18T00:00:00+00:00", "_count": 2.0}"
+        and the "1st" result should have "values" with item "{"_start_at": "2013-03-18T00:00:00+00:00", "_end_at": "2013-03-25T00:00:00+00:00", "_count": 1.0}"
 
 
     Scenario: grouping data by time period (week) and a name that doesn't exist
13 changes: 13 additions & 0 deletions features/steps/read_api.py
@@ -106,6 +106,19 @@ def step(context, nth, key, value):
     assert_that(the_data[i][key], has_item(json.loads(value)))
 
 
+@then('the "{nth}" result should have "{key}" with json "{expected_json}"')
+def impl(context, nth, key, expected_json):
+    the_data = json.loads(context.response.data)['data']
+    i = parse_position(nth, the_data)
+    assert_that(the_data[i][key], is_(json.loads(expected_json)))
+
+
 @then('the "{header}" header should be "{value}"')
 def step(context, header, value):
     assert_that(context.response.headers.get(header), is_(value))
+
+
+@then(u'the error message should be "{expected_message}"')
+def impl(context, expected_message):
+    error_message = json.loads(context.response.data)['message']
+    assert_that(error_message, is_(expected_message))