Merge pull request #352 from bento-platform/bugfix/bento-public-missing

Bugfix/bento public missing
bento-platform · Nov 9, 2022 · 4538f23 · 4538f23
2 parents 545d99e + 4dbd8b1
commit 4538f23
Show file tree

Hide file tree

Showing 3 changed files with 109 additions and 38 deletions.
diff --git a/chord_metadata_service/patients/api_views.py b/chord_metadata_service/patients/api_views.py
@@ -31,7 +31,8 @@
 from chord_metadata_service.restapi.utils import (
     get_field_options,
     filter_queryset_field_value,
-    get_queryset_stats
+    biosample_tissue_stats,
+    experiment_type_stats
 )
 from chord_metadata_service.restapi.negociation import FormatInPostContentNegotiation
 from drf_spectacular.utils import extend_schema, inline_serializer
@@ -192,14 +193,9 @@ def get(self, request, *args, **kwargs):
         if filtered_qs.count() <= settings.CONFIG_PUBLIC["rules"]["count_threshold"]:
             return Response(settings.INSUFFICIENT_DATA_AVAILABLE)
 
-        tissues_count, sampled_tissues = get_queryset_stats(
-            filtered_qs,
-            "phenopackets__biosamples__sampled_tissue__label"
-        )
-        experiments_count, experiment_type = get_queryset_stats(
-            filtered_qs,
-            "phenopackets__biosamples__experiment__experiment_type"
-        )
+        tissues_count, sampled_tissues = biosample_tissue_stats(filtered_qs)
+        experiments_count, experiment_types = experiment_type_stats(filtered_qs)
+
         return Response({
             "count": filtered_qs.count(),
             "biosamples": {
@@ -208,6 +204,6 @@ def get(self, request, *args, **kwargs):
             },
             "experiments": {
                 "count": experiments_count,
-                "experiment_type": experiment_type
+                "experiment_type": experiment_types
             }
         })
diff --git a/chord_metadata_service/patients/tests/test_api.py b/chord_metadata_service/patients/tests/test_api.py
@@ -1,5 +1,6 @@
 from copy import deepcopy
 import json
+import uuid
 import csv
 import io
 from django.conf import settings
@@ -12,6 +13,12 @@
 from chord_metadata_service.restapi.utils import iso_duration_to_years
 from chord_metadata_service.phenopackets.tests import constants as ph_c
 from chord_metadata_service.phenopackets import models as ph_m
+from chord_metadata_service.phenopackets.models import Biosample, Procedure, MetaData, Phenopacket
+from chord_metadata_service.experiments.models import Experiment, ExperimentResult, Instrument
+from chord_metadata_service.experiments.tests import constants as exp_c
+from chord_metadata_service.chord.models import Project, Dataset, TableOwnership, Table
+from chord_metadata_service.chord.data_types import DATA_TYPE_EXPERIMENT
+from chord_metadata_service.chord.tests import constants as chord_c
 
 from . import constants as c
 
@@ -650,3 +657,59 @@ def test_public_filtering_age_range_min_and_max_no_config(self):
         self.assertIsInstance(response_obj, dict)
         self.assertIsInstance(response_obj, dict)
         self.assertEqual(response_obj, settings.NO_PUBLIC_DATA_AVAILABLE)
+
+
+class PublicFilteringBiosampleAndExperimentStatsTest(APITestCase):
+    """ Test for stats returned by api/public GET """
+
+    response_threshold = 0
+    num_individuals_without_phenopackets = 30
+
+    def setUp(self) -> None:
+        self.project = Project.objects.create(**chord_c.VALID_PROJECT_1)
+        self.dataset = Dataset.objects.create(**chord_c.valid_dataset_1(self.project))
+        to, tr = chord_c.valid_table_1(self.dataset.identifier, model_compatible=True)
+        TableOwnership.objects.create(**to)
+        self.table = Table.objects.create(**tr)
+
+        # plain individuals from other tests above, no phenopackets
+        self.individuals = [c.generate_valid_individual() for _ in range(self.num_individuals_without_phenopackets)]
+        for individual in self.individuals:
+            Individual.objects.create(**individual)
+
+        # ... plus one individual whose phenopacket has 2 biosamples, one of them with an experiment
+        self.individual, _ = Individual.objects.get_or_create(
+            id='patient:1', sex='FEMALE', age={"age": "P25Y3M2D"})
+        self.procedure = Procedure.objects.create(**ph_c.VALID_PROCEDURE_1)
+        self.biosample_1 = Biosample.objects.create(**ph_c.valid_biosample_1(self.individual, self.procedure))
+        self.biosample_2 = Biosample.objects.create(**ph_c.valid_biosample_2(None, self.procedure))
+        self.meta_data = MetaData.objects.create(**ph_c.VALID_META_DATA_1)
+        self.phenopacket = Phenopacket.objects.create(
+            id="phenopacket_id:1",
+            subject=self.individual,
+            meta_data=self.meta_data,
+            table=self.table
+        )
+        self.phenopacket.biosamples.set([self.biosample_1, self.biosample_2])
+
+        # table for experiments metadata
+        to_exp = TableOwnership.objects.create(table_id=uuid.uuid4(), service_id=uuid.uuid4(),
+                                               service_artifact="experiments", dataset=self.dataset)
+        self.t_exp = Table.objects.create(ownership_record=to_exp, name="Table 2", data_type=DATA_TYPE_EXPERIMENT)
+
+        # add Experiments metadata and link to self.biosample_1
+        self.instrument = Instrument.objects.create(**exp_c.valid_instrument())
+        self.experiment_result = ExperimentResult.objects.create(**exp_c.valid_experiment_result())
+        self.experiment = Experiment.objects.create(**exp_c.valid_experiment(
+            biosample=self.biosample_1, instrument=self.instrument, table=self.t_exp))
+        self.experiment.experiment_results.set([self.experiment_result])
+
+    @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST)
+    def test_public_get_stats(self):
+        response = self.client.get('/api/public?sex=FEMALE')
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        response_obj = response.json()
+        self.assertEqual(response_obj['biosamples']['count'], 2)
+        self.assertIsInstance(response_obj['biosamples']['sampled_tissue'], list)
+        self.assertEqual(response_obj['experiments']['count'], 1)
+        self.assertIsInstance(response_obj['experiments']['experiment_type'], list)
diff --git a/chord_metadata_service/restapi/utils.py b/chord_metadata_service/restapi/utils.py
@@ -226,8 +226,8 @@ def monthly_generator(start: str, end: str) -> Tuple[int, int]:
     [start_year, start_month] = [int(k) for k in start.split("-")]
     [end_year, end_month] = [int(k) for k in end.split("-")]
     last_month_nb = (end_year - start_year) * 12 + end_month
-    for month_nb in range(start_month, last_month_nb):
-        year = start_year + month_nb // 12
+    for month_nb in range(start_month, last_month_nb + 1):
+        year = start_year + (month_nb - 1) // 12
         month = month_nb % 12 or 12
         yield year, month
 
@@ -271,13 +271,15 @@ def queryset_stats_for_field(queryset, field: str, add_missing=False) -> Mapping
     """
     # values() restrict the table of results to this COLUMN
     # annotate() creates a `total` column for the aggregation
-    # Count() aggregates the results by performing a GROUP BY on the field
-    queryset = queryset.values(field).annotate(total=Count(field))
+    # Count("*") aggregates results including nulls
+    annotated_queryset = queryset.values(field).annotate(total=Count("*"))
+    num_missing = 0
 
     stats: Mapping[str, int] = dict()
-    for item in queryset:
+    for item in annotated_queryset:
         key = item[field]
         if key is None:
+            num_missing = item["total"]
             continue
 
         if not isinstance(key, str):
@@ -291,8 +293,7 @@ def queryset_stats_for_field(queryset, field: str, add_missing=False) -> Mapping
         stats[key] = item["total"]
 
     if add_missing:
-        isnull_filter = {f"{field}__isnull": True}
-        stats['missing'] = queryset.values(field).filter(**isnull_filter).count()
+        stats["missing"] = num_missing
 
     return stats
 
@@ -350,27 +351,6 @@ def get_age_numeric_binned(individual_queryset, bin_size):
     return individuals_age
 
 
-def get_queryset_stats(queryset, field):
-    """
-    Fetches public statistics for a field within a given queryset. This function
-    is used to compute statistics after filtering has been applied.
-    A cutoff is applied to all counts to avoid leaking too small sets of results.
-    """
-    stats = queryset_stats_for_field(queryset, field, add_missing=True)
-    threshold = settings.CONFIG_PUBLIC["rules"]["count_threshold"]
-    bins = []
-    total = 0
-    for key, value in stats.items():
-        bins.append({
-            "label": key,
-            "value": value if value > threshold else 0
-        })
-        total += value
-    if total <= threshold:
-        total = 0
-    return total, bins
-
-
 def get_categorical_stats(field_props):
     """
     Fetches statistics for a given categorical field and apply privacy policies
@@ -597,3 +577,35 @@ def filter_queryset_field_value(qs, field_props, value: str):
         condition = {f"{field}__startswith": val}
 
     return qs.filter(**condition)
+
+
+def experiment_type_stats(queryset):
+    """
+    Return the total count and a bento_public-format list of stats for experiment type.
+    Note: queryset_stats_for_field() miscounts "missing" when the field is reached through multiple foreign keys.
+    """
+    e_types = queryset.values(label=F("phenopackets__biosamples__experiment__experiment_type")).annotate(
+        value=Count("phenopackets__biosamples__experiment", distinct=True))
+    return bento_public_format_count_and_stats_list(e_types)
+
+
+def biosample_tissue_stats(queryset):
+    """
+    Return the total count and a bento_public-format list of stats for biosample sampled_tissue.
+    """
+    b_tissue = queryset.values(label=F("phenopackets__biosamples__sampled_tissue__label")).annotate(
+        value=Count("phenopackets__biosamples", distinct=True))
+    return bento_public_format_count_and_stats_list(b_tissue)
+
+
+def bento_public_format_count_and_stats_list(annotated_queryset):
+    stats_list = []
+    total = 0
+    for q in annotated_queryset:
+        label = q["label"]
+        value = int(q["value"])
+        total += value
+        if label is not None:
+            stats_list.append({"label": label, "value": value})
+
+    return total, stats_list