Merge pull request #360 from bento-platform/hotfix/histogram-errs

Fix off-by-two histogram errors with binning
bento-platform · Nov 15, 2022 · 86748f8 · 86748f8
2 parents 56e0579 + 914a846
commit 86748f8
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 12 deletions.
diff --git a/chord_metadata_service/package.cfg b/chord_metadata_service/package.cfg
@@ -1,4 +1,4 @@
 [package]
 name = katsu
-version = 2.15.1
+version = 2.15.2
 authors = Ksenia Zaytseva, David Lougheed, Simon Chénard, Romain Grégoire, Paul Pillot, Son Chau
diff --git a/chord_metadata_service/restapi/tests/constants.py b/chord_metadata_service/restapi/tests/constants.py
@@ -209,8 +209,16 @@
     }
 }
 
-VALID_INDIVIDUALS = [VALID_INDIVIDUAL_1, VALID_INDIVIDUAL_2, VALID_INDIVIDUAL_3, VALID_INDIVIDUAL_4,
-                     VALID_INDIVIDUAL_5, VALID_INDIVIDUAL_6, VALID_INDIVIDUAL_7, VALID_INDIVIDUAL_8]
+VALID_INDIVIDUALS = [
+    VALID_INDIVIDUAL_1,
+    VALID_INDIVIDUAL_2,
+    VALID_INDIVIDUAL_3,
+    VALID_INDIVIDUAL_4,
+    VALID_INDIVIDUAL_5,
+    VALID_INDIVIDUAL_6,
+    VALID_INDIVIDUAL_7,
+    VALID_INDIVIDUAL_8,
+]
 
 
 extra_properties_with_list = {
@@ -260,7 +268,13 @@
             "charts": [
                 {"field": "date_of_consent", "chart_type": "bar"},
                 {"field": "smoking", "chart_type": "bar"},
-                {"field": "baseline_creatinine", "chart_type": "bar"}
+                {"field": "baseline_creatinine", "chart_type": "bar"},
+            ]
+        },
+        {
+            "section_title": "Third Section",
+            "charts": [
+                {"field": "lab_test_result_value", "chart_type": "bar"},
             ]
         }
     ],

diff --git a/chord_metadata_service/restapi/tests/test_api.py b/chord_metadata_service/restapi/tests/test_api.py
@@ -256,6 +256,21 @@ def test_overview(self):
         self.assertIsInstance(response_obj, dict)
         self.assertEqual(response_obj["counts"]["individuals"], db_count)
 
+    @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST)
+    def test_overview_bins(self):
+        # test that there is the correct number of data entries for number
+        # histograms, vs. number of bins
+        response = self.client.get('/api/public_overview')
+        response_obj = response.json()
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertIsInstance(response_obj, dict)
+        self.assertEqual(
+            # 1 more bin than intervals expected: e.g. for config.bins = [2, 3, 4],
+            # we expect data entries for ≤2, [2 3), [3 4), ≥4
+            len(response_obj["fields"]["lab_test_result_value"]["config"]["bins"]) + 1,
+            len(response_obj["fields"]["lab_test_result_value"]["data"]),
+        )
+
     @override_settings(CONFIG_PUBLIC={})
     def test_overview_no_config(self):
         response = self.client.get('/api/public_overview')

diff --git a/chord_metadata_service/restapi/utils.py b/chord_metadata_service/restapi/utils.py
@@ -2,8 +2,8 @@
 import datetime
 
 from collections import defaultdict, Counter
-from typing import Tuple, Mapping, Generator
 from calendar import month_abbr
+from typing import Dict, List, Tuple, Mapping, Generator
 
 from django.db.models import Count, F, Func, IntegerField, CharField, Case, When, Value
 from django.db.models.functions import Cast
@@ -133,9 +133,9 @@ def custom_binning_generator(field_props) -> Generator[Tuple[int, int, str], Non
     """
     Generator for custom bins. It expects an array of bin boundaries (`bins` property)
     `minimum` and `maximum` properties are optional. When absent, there is no lower/upper
-    bound and the corresponding bin limit is open ended (as in "< 5").
-    If present but equal to the closest bin boundary, there is no open ended bin.
-    If present but different from closest bin, an extra bin is added to collect
+    bound and the corresponding bin limit is open-ended (as in "< 5").
+    If present but equal to the closest bin boundary, there is no open-ended bin.
+    If present but different from the closest bin, an extra bin is added to collect
     all values down/up to the min/max value that is set (open-ended without limit)
     For example, given the following configuration:
     {
@@ -150,7 +150,7 @@ def custom_binning_generator(field_props) -> Generator[Tuple[int, int, str], Non
     c = field_props["config"]
     minimum = int(c["minimum"]) if "minimum" in c else None
     maximum = int(c["maximum"]) if "maximum" in c else None
-    bins = [int(value) for value in c["bins"]]
+    bins: List[int] = [int(value) for value in c["bins"]]
 
     # check prerequisites
     # Note: it raises an error as it reflects an error in the config file
@@ -170,11 +170,17 @@ def custom_binning_generator(field_props) -> Generator[Tuple[int, int, str], Non
     if minimum is None or minimum != bins[0]:
         yield minimum, bins[0], f"< {bins[0]}"
 
-    for i in range(1, len(bins) - 2):
+    # Generate interstitial bins for the range.
+    # range() is semi-open: [1, len(bins))
+    # – so in terms of indices, we skip the first bin (we access it via i-1 for lhs)
+    #   and generate [lhs, rhs) pairs for each pair of bins until the end.
+    # Values beyond the last bin gets handled separately.
+    for i in range(1, len(bins)):
         lhs = bins[i - 1]
         rhs = bins[i]
         yield lhs, rhs, f"{lhs}-{rhs}"
 
+    # Then, handle values beyond the value of the last bin
     if maximum is None or maximum != bins[-1]:
         yield bins[-1], maximum, f"≥ {bins[-1]}"
 
@@ -275,7 +281,7 @@ def queryset_stats_for_field(queryset, field: str, add_missing=False) -> Mapping
     annotated_queryset = queryset.values(field).annotate(total=Count("*"))
     num_missing = 0
 
-    stats: Mapping[str, int] = dict()
+    stats: Dict[str, int] = dict()
     for item in annotated_queryset:
         key = item[field]
         if key is None:
@@ -496,7 +502,7 @@ def get_range_stats(field_props):
         .values(label=Case(*whens, default=Value("missing"), output_field=CharField()))\
         .annotate(total=Count("label"))
 
-    stats: Mapping[str, int] = dict()
+    stats: Dict[str, int] = dict()
     for item in query_set:
         key = item["label"]
         stats[key] = item["total"] if item["total"] > threshold else 0