Merge pull request #339 from bento-platform/features/stats_list

Features/stats list
bento-platform · Sep 23, 2022 · 31cb9c8 · 31cb9c8
2 parents ff172dc + 827405c
commit 31cb9c8
Show file tree

Hide file tree

Showing 6 changed files with 165 additions and 32 deletions.
diff --git a/chord_metadata_service/patients/api_views.py b/chord_metadata_service/patients/api_views.py
@@ -23,7 +23,8 @@
 from chord_metadata_service.restapi.pagination import LargeResultsSetPagination, BatchResultsSetPagination
 from chord_metadata_service.restapi.utils import (
     get_field_options,
-    filter_queryset_field_value
+    filter_queryset_field_value,
+    get_queryset_stats
 )
 from chord_metadata_service.restapi.negociation import FormatInPostContentNegotiation
 
@@ -139,8 +140,25 @@ def get(self, request, *args, **kwargs):
                 *(e.error_list if hasattr(e, "error_list") else e.error_dict.items()),
             ))
 
-        if filtered_qs.count() > settings.CONFIG_PUBLIC["rules"]["count_threshold"]:
-            return Response({"count": filtered_qs.count()})
-        else:
-            # the count < threshold when there is no match in db the queryset is empty, count = 0
+        if filtered_qs.count() <= settings.CONFIG_PUBLIC["rules"]["count_threshold"]:
             return Response(settings.INSUFFICIENT_DATA_AVAILABLE)
+
+        tissues_count, sampled_tissues = get_queryset_stats(
+            filtered_qs,
+            "phenopackets__biosamples__sampled_tissue__label"
+        )
+        experiments_count, experiment_type = get_queryset_stats(
+            filtered_qs,
+            "phenopackets__biosamples__experiment__experiment_type"
+        )
+        return Response({
+            "count": filtered_qs.count(),
+            "biosamples": {
+                "count": tissues_count,
+                "sampled_tissue": sampled_tissues
+            },
+            "experiments": {
+                "count": experiments_count,
+                "experiment_type": experiment_type
+            }
+        })
diff --git a/chord_metadata_service/patients/tests/test_api.py b/chord_metadata_service/patients/tests/test_api.py
@@ -288,6 +288,10 @@ def test_public_get(self):
             self.assertEqual(response_obj, settings.INSUFFICIENT_DATA_AVAILABLE)
         else:
             self.assertEqual(Individual.objects.all().count(), response_obj['count'])
+            self.assertEqual(response_obj['biosamples']['count'], 0)
+            self.assertIsInstance(response_obj['biosamples']['sampled_tissue'], list)
+            self.assertEqual(response_obj['experiments']['count'], 0)
+            self.assertIsInstance(response_obj['experiments']['experiment_type'], list)
 
     @override_settings(CONFIG_PUBLIC={})
     def test_public_get_no_config(self):

diff --git a/chord_metadata_service/restapi/api_views.py b/chord_metadata_service/restapi/api_views.py
@@ -10,11 +10,11 @@
 from rest_framework.decorators import api_view, permission_classes
 
 from chord_metadata_service.restapi.utils import (
+    get_age_numeric_binned,
     get_field_options,
     parse_individual_age,
     stats_for_field,
-    compute_binned_ages,
-    get_field_bins,
+    queryset_stats_for_field,
     get_categorical_stats,
     get_date_stats,
     get_range_stats
@@ -71,16 +71,7 @@ def overview(_request):
     diseases_stats = stats_for_field(pheno_models.Phenopacket, "diseases__term__label")
     diseases_count = len(diseases_stats)
 
-    # age_numeric is computed at ingestion time of phenopackets. On some instances
-    # it might be unavailable and as a fallback must be computed from the age JSON field which
-    # has two alternate formats (hence more complex and slower to process)
-    individuals_age = get_field_bins(patients_models.Individual, "age_numeric", OVERVIEW_AGE_BIN_SIZE)
-    if None in individuals_age:  # fallback
-        del individuals_age[None]
-        individuals_age = Counter(individuals_age)
-        individuals_age.update(
-            compute_binned_ages(OVERVIEW_AGE_BIN_SIZE)   # single update instead of creating iterables in a loop
-        )
+    individuals_age = get_age_numeric_binned(patients_models.Individual.objects.all(), OVERVIEW_AGE_BIN_SIZE)
 
     r = {
         "phenopackets": phenopackets_count,
@@ -138,6 +129,58 @@ def overview(_request):
     return Response(r)
 
 
+@api_view(["GET", "POST"])
+@permission_classes([OverrideOrSuperUserOnly])
+def search_overview(request):
+    """
+    get+post:
+    Overview statistics of a list of patients (associated with a search result)
+    - Parameter
+        - id: a list of patient ids (a single id string is accepted as well).
+            When no id is provided, the statistics cover all the individuals.
+    """
+    # Function-scope import so that this view does not depend on which names
+    # are already imported at module level.
+    from django.db.models import Count
+
+    if request.method == "GET":
+        individual_id = request.GET.getlist("id")
+    else:
+        # `or []` also covers an explicit null value in the payload
+        individual_id = request.data.get("id") or []
+        if isinstance(individual_id, str):
+            # A bare string would otherwise be iterated character by
+            # character by the `id__in` lookup below.
+            individual_id = [individual_id]
+
+    queryset = patients_models.Individual.objects.all()
+    if individual_id:
+        queryset = queryset.filter(id__in=individual_id)
+
+    # COUNT(DISTINCT ...) ignores the NULL rows that the outer joins yield for
+    # individuals without any biosample/experiment, and counts a record only
+    # once even when it is reached through several joined rows.
+    related_counts = queryset.aggregate(
+        biosamples=Count("phenopackets__biosamples__id", distinct=True),
+        experiments=Count("phenopackets__biosamples__experiment__id", distinct=True),
+    )
+
+    # Sex related fields stats are precomputed here and post processed later
+    # to include missing values inferred from the schema
+    individuals_sex = queryset_stats_for_field(queryset, "sex")
+
+    r = {
+        "biosamples": {
+            "count": related_counts["biosamples"],
+            "sampled_tissue": queryset_stats_for_field(queryset, "phenopackets__biosamples__sampled_tissue__label"),
+            "histological_diagnosis": queryset_stats_for_field(
+                queryset,
+                "phenopackets__biosamples__histological_diagnosis__label"
+            ),
+        },
+        "diseases": {
+            "term": queryset_stats_for_field(queryset, "phenopackets__diseases__term__label"),
+        },
+        "individuals": {
+            # every sex value declared on the model gets an entry, 0 when absent
+            "sex": {k: individuals_sex.get(k, 0) for k in (s[0] for s in pheno_models.Individual.SEX)},
+            "age": get_age_numeric_binned(queryset, OVERVIEW_AGE_BIN_SIZE),
+        },
+        "phenotypic_features": {
+            "type": queryset_stats_for_field(queryset, "phenopackets__phenotypic_features__pftype__label")
+        },
+        "experiments": {
+            "count": related_counts["experiments"],
+            "experiment_type": queryset_stats_for_field(
+                queryset,
+                "phenopackets__biosamples__experiment__experiment_type"
+            ),
+        },
+    }
+    return Response(r)
+
+
 # Cache page for the requested url for 2 hours
 @cache_page(60 * 60 * 2)
 @api_view(["GET"])

diff --git a/chord_metadata_service/restapi/tests/test_api.py b/chord_metadata_service/restapi/tests/test_api.py
@@ -1,3 +1,4 @@
+import json
 import os
 from copy import deepcopy
 
@@ -103,6 +104,17 @@ def test_overview(self):
         self.assertEqual(response_obj['data_type_specific']['instruments']['platform']['Illumina'], 2)
         self.assertEqual(response_obj['data_type_specific']['instruments']['model']['Illumina HiSeq 4000'], 2)
 
+    def test_search_overview(self):
+        # POST a single individual id and check the statistics returned for it
+        body = json.dumps({'id': [ph_c.VALID_INDIVIDUAL_1['id']]})
+        url = reverse('search-overview')
+        res = self.client.post(url, body, content_type='application/json')
+        overview = res.json()
+        self.assertEqual(res.status_code, status.HTTP_200_OK)
+        self.assertIsInstance(overview, dict)
+
+        biosamples = overview['biosamples']
+        self.assertEqual(biosamples['count'], 1)
+        self.assertIn('wall of urinary bladder', biosamples['sampled_tissue'])
+
+        self.assertIn('Proptosis', overview['phenotypic_features']['type'])
+        expected_disease = ph_c.VALID_DISEASE_1['term']['label']
+        self.assertIn(expected_disease, overview['diseases']['term'])
+
 
 class McodeOverviewTest(APITestCase):
     # create 2 mcodepackets for 2 individuals

diff --git a/chord_metadata_service/restapi/urls.py b/chord_metadata_service/restapi/urls.py
@@ -18,6 +18,7 @@
      public_search_fields,
      public_overview,
      public_dataset,
+     search_overview
 )
 from chord_metadata_service.restapi.routers import BatchListRouter
 
@@ -72,30 +73,31 @@
 urlpatterns = [
     path('', include(router.urls)),
     path('', include(batch_router.urls)),
+
     # apps schemas
     path('chord_phenopacket_schema', phenopacket_views.get_chord_phenopacket_schema,
          name="chord-phenopacket-schema"),
     path('experiment_schema', experiment_views.get_experiment_schema,
          name="experiment-schema"),
     path('mcode_schema', mcode_views.get_mcode_schema,
          name="mcode-schema"),
-    # overview
+
+    # overviews (statistics)
     path('overview', overview, name="overview"),
-    # mcode overview
     path('mcode_overview', mcode_overview, name="mcode-overview"),
+    path('search_overview', search_overview, name="search-overview"),
+
     # autocomplete URLs
     path('disease_term_autocomplete', DiseaseTermAutocomplete.as_view(), name='disease-term-autocomplete',),
     path('phenotypic_feature_type_autocomplete', PhenotypicFeatureTypeAutocomplete.as_view(),
          name='phenotypic-feature-type-autocomplete',),
     path('biosample_sampled_tissue_autocomplete', BiosampleSampledTissueAutocomplete.as_view(),
          name='biosample-sampled-tissue-autocomplete',),
-    # public search results
+
+    # public endpoints (no confidential information leak)
     path('public', individual_views.PublicListIndividuals.as_view(),
          name='public',),
-    # public search fields schema
     path('public_search_fields', public_search_fields, name='public-search-fields',),
-    # public overview
     path('public_overview', public_overview, name='public-overview',),
-    # public dataset properties
     path('public_dataset', public_dataset, name='public-dataset'),
 ]
diff --git a/chord_metadata_service/restapi/utils.py b/chord_metadata_service/restapi/utils.py
@@ -1,7 +1,7 @@
 import isodate
 import datetime
 
-from collections import defaultdict
+from collections import defaultdict, Counter
 from typing import Tuple, Mapping
 from calendar import month_abbr
 
@@ -13,6 +13,9 @@
 from chord_metadata_service.experiments import models as experiments_models
 
 
+OVERVIEW_AGE_BIN_SIZE = 10
+
+
 def camel_case_field_names(string):
     """ Function to convert snake_case field names to camelCase """
     # Capitalize every part except the first
@@ -193,13 +196,21 @@ def stats_for_field(model, field: str, add_missing=False) -> Mapping[str, int]:
     Computes counts of distinct values for a given field. Mainly applicable to
     char fields representing categories
     """
+    queryset = model.objects.all()
+    return queryset_stats_for_field(queryset, field, add_missing)
+
+
+def queryset_stats_for_field(queryset, field: str, add_missing=False) -> Mapping[str, int]:
+    """
+    Computes counts of distinct values for a queryset.
+    """
     # values() restrict the table of results to this COLUMN
     # annotate() creates a `total` column for the aggregation
     # Count() aggregates the results by performing a GROUP BY on the field
-    query_set = model.objects.all().values(field).annotate(total=Count(field))
+    queryset = queryset.values(field).annotate(total=Count(field))
 
     stats: Mapping[str, int] = dict()
-    for item in query_set:
+    for item in queryset:
         key = item[field]
         if key is None:
             continue
@@ -212,17 +223,17 @@ def stats_for_field(model, field: str, add_missing=False) -> Mapping[str, int]:
 
     if add_missing:
         isnull_filter = {f"{field}__isnull": True}
-        stats['missing'] = model.objects.all().values(field).filter(**isnull_filter).count()
+        stats['missing'] = queryset.values(field).filter(**isnull_filter).count()
 
     return stats
 
 
-def get_field_bins(model, field, bin_size):
+def get_field_bins(query_set, field, bin_size):
     # computes a new column "binned" by subtracting the modulo by bin size to
     # the value which requires binning (e.g. 28 => 28 - 28 % 10 = 20)
     # cast to integer to avoid numbers such as 60.00 if that was a decimal,
     # and aggregate over this value.
-    query_set = model.objects.all().annotate(
+    query_set = query_set.annotate(
         binned=Cast(
             F(field) - Func(F(field), bin_size, function="MOD"),
             IntegerField()
@@ -232,13 +243,17 @@ def get_field_bins(model, field, bin_size):
     return stats
 
 
-def compute_binned_ages(bin_size: int):
+def compute_binned_ages(individual_queryset, bin_size: int):
     """
     When age_numeric field is not available, use this function to process
     the age field in its various formats.
-    Returns an array of values floored to the closest decade (e.g. 25 --> 20)
+    Params:
+        - individual_queryset: a queryset made on the individual model, containing
+            the age and age_numeric fields
+        - bin_size: how many years there is per bin
+    Returns a list of values floored to the closest decade (e.g. 25 --> 20)
     """
-    a = pheno_models.Individual.objects.filter(age_numeric__isnull=True).values('age')
+    a = individual_queryset.filter(age_numeric__isnull=True).values('age')
     binned_ages = []
     for r in a.iterator():  # reduce memory footprint (no caching)
         if r["age"] is None:
@@ -248,6 +263,45 @@ def compute_binned_ages(bin_size: int):
     return binned_ages
 
 
+def get_age_numeric_binned(individual_queryset, bin_size):
+    """
+    Returns the age bins counts for the individuals of a queryset.
+    The bins are computed from the age_numeric field, which is filled at
+    ingestion time of phenopackets. On some instances it might be unavailable:
+    a `None` bin then shows up in the result. In that case the `None` bin is
+    dropped and the counts are completed with the bins computed from the age
+    JSON field (two alternate formats, hence more complex and slower to process).
+    """
+    binned = get_field_bins(individual_queryset, "age_numeric", bin_size)
+    if None in binned:
+        # fallback: replace the `None` bin by values derived from the age field
+        del binned[None]
+        merged = Counter(binned)
+        # single update instead of creating iterables in a loop
+        fallback_bins = compute_binned_ages(individual_queryset, bin_size)
+        merged.update(fallback_bins)
+        return merged
+
+    return binned
+
+
+def get_queryset_stats(queryset, field):
+    """
+    Computes the public statistics of a field for an already filtered queryset.
+    Returns a tuple (total, bins) where bins is a list of
+    {"label": ..., "value": ...} items, one per entry of the field stats
+    (computed with add_missing=True).
+    The configured count threshold is used as a cutoff: any bin value, as well
+    as the total, that does not exceed it is reported as 0 to avoid leaking
+    too small sets of results.
+    """
+    counts = queryset_stats_for_field(queryset, field, add_missing=True)
+    cutoff = settings.CONFIG_PUBLIC["rules"]["count_threshold"]
+
+    masked_bins = [
+        {"label": label, "value": count if count > cutoff else 0}
+        for label, count in counts.items()
+    ]
+
+    # the total is built from the raw counts, then masked as a whole
+    overall = sum(counts.values())
+    if overall <= cutoff:
+        overall = 0
+    return overall, masked_bins
+
+
 def get_categorical_stats(field_props):
     """
     Fetches statistics for a given categorical field and apply privacy policies