Skip to content

Commit

Permalink
Merge pull request #360 from bento-platform/hotfix/histogram-errs
Browse files Browse the repository at this point in the history
Fix off-by-two histogram errors with binning
  • Loading branch information
davidlougheed committed Nov 15, 2022
2 parents 56e0579 + 914a846 commit 86748f8
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 12 deletions.
2 changes: 1 addition & 1 deletion chord_metadata_service/package.cfg
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[package]
name = katsu
version = 2.15.1
version = 2.15.2
authors = Ksenia Zaytseva, David Lougheed, Simon Chénard, Romain Grégoire, Paul Pillot, Son Chau
20 changes: 17 additions & 3 deletions chord_metadata_service/restapi/tests/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,16 @@
}
}

VALID_INDIVIDUALS = [VALID_INDIVIDUAL_1, VALID_INDIVIDUAL_2, VALID_INDIVIDUAL_3, VALID_INDIVIDUAL_4,
VALID_INDIVIDUAL_5, VALID_INDIVIDUAL_6, VALID_INDIVIDUAL_7, VALID_INDIVIDUAL_8]
VALID_INDIVIDUALS = [
VALID_INDIVIDUAL_1,
VALID_INDIVIDUAL_2,
VALID_INDIVIDUAL_3,
VALID_INDIVIDUAL_4,
VALID_INDIVIDUAL_5,
VALID_INDIVIDUAL_6,
VALID_INDIVIDUAL_7,
VALID_INDIVIDUAL_8,
]


extra_properties_with_list = {
Expand Down Expand Up @@ -260,7 +268,13 @@
"charts": [
{"field": "date_of_consent", "chart_type": "bar"},
{"field": "smoking", "chart_type": "bar"},
{"field": "baseline_creatinine", "chart_type": "bar"}
{"field": "baseline_creatinine", "chart_type": "bar"},
]
},
{
"section_title": "Third Section",
"charts": [
{"field": "lab_test_result_value", "chart_type": "bar"},
]
}
],
Expand Down
15 changes: 15 additions & 0 deletions chord_metadata_service/restapi/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,21 @@ def test_overview(self):
self.assertIsInstance(response_obj, dict)
self.assertEqual(response_obj["counts"]["individuals"], db_count)

@override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST)
def test_overview_bins(self):
# test that there is the correct number of data entries for number
# histograms, vs. number of bins
response = self.client.get('/api/public_overview')
response_obj = response.json()
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertIsInstance(response_obj, dict)
self.assertEqual(
# 1 more bin than intervals expected: e.g. for config.bins = [2, 3, 4],
# we expect data entries for ≤2, [2 3), [3 4), ≥4
len(response_obj["fields"]["lab_test_result_value"]["config"]["bins"]) + 1,
len(response_obj["fields"]["lab_test_result_value"]["data"]),
)

@override_settings(CONFIG_PUBLIC={})
def test_overview_no_config(self):
response = self.client.get('/api/public_overview')
Expand Down
22 changes: 14 additions & 8 deletions chord_metadata_service/restapi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import datetime

from collections import defaultdict, Counter
from typing import Tuple, Mapping, Generator
from calendar import month_abbr
from typing import Dict, List, Tuple, Mapping, Generator

from django.db.models import Count, F, Func, IntegerField, CharField, Case, When, Value
from django.db.models.functions import Cast
Expand Down Expand Up @@ -133,9 +133,9 @@ def custom_binning_generator(field_props) -> Generator[Tuple[int, int, str], Non
"""
Generator for custom bins. It expects an array of bin boundaries (`bins` property)
`minimum` and `maximum` properties are optional. When absent, there is no lower/upper
bound and the corresponding bin limit is open ended (as in "< 5").
If present but equal to the closest bin boundary, there is no open ended bin.
If present but different from closest bin, an extra bin is added to collect
bound and the corresponding bin limit is open-ended (as in "< 5").
If present but equal to the closest bin boundary, there is no open-ended bin.
If present but different from the closest bin, an extra bin is added to collect
all values down/up to the min/max value that is set (open-ended without limit)
For example, given the following configuration:
{
Expand All @@ -150,7 +150,7 @@ def custom_binning_generator(field_props) -> Generator[Tuple[int, int, str], Non
c = field_props["config"]
minimum = int(c["minimum"]) if "minimum" in c else None
maximum = int(c["maximum"]) if "maximum" in c else None
bins = [int(value) for value in c["bins"]]
bins: List[int] = [int(value) for value in c["bins"]]

# check prerequisites
# Note: it raises an error as it reflects an error in the config file
Expand All @@ -170,11 +170,17 @@ def custom_binning_generator(field_props) -> Generator[Tuple[int, int, str], Non
if minimum is None or minimum != bins[0]:
yield minimum, bins[0], f"< {bins[0]}"

for i in range(1, len(bins) - 2):
# Generate interstitial bins for the range.
# range() is semi-open: [1, len(bins))
# – so in terms of indices, we skip the first bin (we access it via i-1 for lhs)
# and generate [lhs, rhs) pairs for each pair of bins until the end.
# Values beyond the last bin gets handled separately.
for i in range(1, len(bins)):
lhs = bins[i - 1]
rhs = bins[i]
yield lhs, rhs, f"{lhs}-{rhs}"

# Then, handle values beyond the value of the last bin
if maximum is None or maximum != bins[-1]:
yield bins[-1], maximum, f"≥ {bins[-1]}"

Expand Down Expand Up @@ -275,7 +281,7 @@ def queryset_stats_for_field(queryset, field: str, add_missing=False) -> Mapping
annotated_queryset = queryset.values(field).annotate(total=Count("*"))
num_missing = 0

stats: Mapping[str, int] = dict()
stats: Dict[str, int] = dict()
for item in annotated_queryset:
key = item[field]
if key is None:
Expand Down Expand Up @@ -496,7 +502,7 @@ def get_range_stats(field_props):
.values(label=Case(*whens, default=Value("missing"), output_field=CharField()))\
.annotate(total=Count("label"))

stats: Mapping[str, int] = dict()
stats: Dict[str, int] = dict()
for item in query_set:
key = item["label"]
stats[key] = item["total"] if item["total"] > threshold else 0
Expand Down

0 comments on commit 86748f8

Please sign in to comment.