Merge pull request #384 from bento-platform/public-logging

Fix null individual_id in subject-associated biosamples; add logging for individuals public query endpoint
bento-platform · Mar 8, 2023 · cc9c459 · cc9c459
2 parents 406d215 + e770d0e
commit cc9c459
Show file tree

Hide file tree

Showing 10 changed files with 54 additions and 36 deletions.
diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py
@@ -95,8 +95,7 @@ def study_export(getPath: Callable[[str], str], dataset_id: str):
 
     # Export samples
     with open(getPath(SAMPLE_DATA_FILENAME), 'w', newline='\n') as file_sample:
-        sampl = pm.Biosample.objects.filter(phenopacket__table__ownership_record__dataset_id=dataset.identifier)\
-            .annotate(phenopacket_subject_id=F("phenopacket__subject"))
+        sampl = pm.Biosample.objects.filter(phenopacket__table__ownership_record__dataset_id=dataset.identifier)
         sample_export(sampl, file_sample)
 
     with open(getPath(SAMPLE_META_FILENAME), 'w', newline='\n') as file_sample_meta:
@@ -229,16 +228,17 @@ def sample_export(results, file_handle: TextIO):
     samples = []
     for sample in results:
 
-        # sample.inidividual may be null: use Phenopacket model Subject field
-        # instead if available or skip.
-        subject_id = None
-        if sample.individual is not None:
-            subject_id = sample.individual
-        elif sample.phenopacket_subject_id is not None:
-            subject_id = sample.phenopacket_subject_id
-        else:
+        # sample.individual may be null, if the biosample is not related to an individual.
+        # skip these.
+        # prior to 2.17.3, these were associated with phenopacket subject if individual was null;
+        # however, that is not fully correct behaviour and was a workaround for ingest not properly associating
+        # biosamples with individuals.
+
+        if sample.individual is None:
             continue
 
+        subject_id = sample.individual
+
         sample_obj = {
             'individual_id': sanitize_id(subject_id),
             'id': sanitize_id(sample.id)

diff --git a/chord_metadata_service/chord/ingest/phenopackets.py b/chord_metadata_service/chord/ingest/phenopackets.py
@@ -217,8 +217,21 @@ def ingest_phenopacket(phenopacket_data: dict[str, Any], table_id: str, validate
 
     new_phenopacket_id = phenopacket_data.get("id", str(uuid.uuid4()))
 
+    subject = phenopacket_data.get("subject")
+
     phenotypic_features = phenopacket_data.get("phenotypic_features", [])
-    biosamples = phenopacket_data.get("biosamples", [])
+
+    # Pre-process biosamples; historically (<2.17.3) we were running into issues because a missing individual_id in
+    # the biosample meant it would be left as None rather than properly associated with a specified subject.
+    #   - Here, we tag the biosample with the subject's ID if a subject is specified, since the Phenopacket spec
+    #     explicitly says of the biosamples field:
+    #       "This field describes samples that have been derived from the patient who is the object of the Phenopacket"
+    biosamples = [
+        {**bs, "individual_id": subject["id"]} if subject else bs
+        for bs in phenopacket_data.get("biosamples", [])
+    ]
+
+    # Pull other fields out of the phenopacket input
     genes = phenopacket_data.get("genes", [])
     diseases = phenopacket_data.get("diseases", [])
     hts_files = phenopacket_data.get("hts_files", [])
@@ -229,7 +242,7 @@ def ingest_phenopacket(phenopacket_data: dict[str, Any], table_id: str, validate
     # - or, if it already exists, *update* the extra properties if needed.
     #   This is one of the few cases of 'updating' something that exists in Katsu.
     subject_obj: Optional[pm.Individual] = None
-    if subject := phenopacket_data.get("subject"):  # we have a dictionary of subject data in the phenopacket
+    if subject:  # we have a dictionary of subject data in the phenopacket
         subject_obj = update_or_create_subject(subject)
 
     # Get or create all phenotypic features in the phenopacket

diff --git a/chord_metadata_service/chord/tests/example_phenopacket.json b/chord_metadata_service/chord/tests/example_phenopacket.json
@@ -122,7 +122,6 @@
         },
         {
             "id": "sample2",
-            "individual_id": "patient1",
             "description": "",
             "sampled_tissue": {
                 "id": "UBERON:0002367",
@@ -182,7 +181,6 @@
         },
         {
             "id": "sample4",
-            "individual_id": "patient1",
             "description": "",
             "sampled_tissue": {
                 "id": "UBERON:0001222",
@@ -208,7 +206,6 @@
         },
         {
             "id": "sample5",
-            "individual_id": "patient1",
             "description": "",
             "sampled_tissue": {
                 "id": "UBERON:0015876",

diff --git a/chord_metadata_service/chord/tests/test_export_cbio.py b/chord_metadata_service/chord/tests/test_export_cbio.py
@@ -70,13 +70,6 @@ def setUp(self) -> None:
         )
         self.exp_res = ExperimentResult.objects.all()
 
-        # Update the last sample to remove direct reference to any individual.
-        # In that case, Sample and Individual are cross-referenced through the
-        # Phenopacket model.
-        pm.Biosample.objects.filter(
-                id=EXAMPLE_INGEST_PHENOPACKET["biosamples"][-1]["id"]
-            ).update(individual=None)
-
     @staticmethod
     def stream_to_dict(output: TextIO) -> Dict[str, str]:
         """
@@ -176,8 +169,8 @@ def test_export_cbio_patient_data(self):
                 break
 
     def test_export_cbio_sample_data(self):
-        samples = pm.Biosample.objects.filter(phenopacket=self.p)\
-            .annotate(phenopacket_subject_id=F("phenopacket__subject"))
+        samples = pm.Biosample.objects.filter(phenopacket=self.p)
+
         with io.StringIO() as output:
             exp.sample_export(samples, output)
             # Check header
@@ -211,10 +204,7 @@ def test_export_cbio_sample_data(self):
 
                 self.assertTrue(REGEXP_INVALID_FOR_ID.search(record["PATIENT_ID"]) is None)
                 self.assertTrue(REGEXP_INVALID_FOR_ID.search(record["SAMPLE_ID"]) is None)
-                self.assertEqual(
-                    record["PATIENT_ID"],
-                    exp.sanitize_id(EXAMPLE_INGEST_PHENOPACKET["biosamples"][sample_count]["individual_id"])
-                )
+                self.assertEqual(record["PATIENT_ID"], exp.sanitize_id(samples[sample_count].individual_id))
                 self.assertEqual(
                     record["SAMPLE_ID"],
                     exp.sanitize_id(EXAMPLE_INGEST_PHENOPACKET["biosamples"][sample_count]["id"])

diff --git a/chord_metadata_service/chord/tests/test_ingest.py b/chord_metadata_service/chord/tests/test_ingest.py
@@ -153,6 +153,12 @@ def test_ingesting_phenopackets_json(self):
 
         biosamples = list(p.biosamples.all().order_by("id"))
         self.assertEqual(len(biosamples), 5)
+
+        # Make sure biosamples are properly associated with phenopacket subject
+        #  - Some test biosamples exclude individual_id; these should be properly associated too
+        for bs in biosamples:
+            self.assertEqual(bs.individual_id, p.subject.id)
+
         # TODO: More
 
     def test_reingesting_updating_phenopackets_json(self):

diff --git a/chord_metadata_service/metadata/settings.py b/chord_metadata_service/metadata/settings.py
@@ -42,6 +42,8 @@
 ).lower() == "true"
 logging.info(f"DEBUG: {DEBUG}")
 
+LOG_LEVEL = os.environ.get("KATSU_LOG_LEVEL", "DEBUG" if DEBUG else "INFO").upper()
+
 
 # CHORD-specific settings
 
@@ -199,7 +201,7 @@
     },
     'loggers': {
         '': {
-            'level': 'DEBUG' if DEBUG else 'INFO',
+            'level': LOG_LEVEL,
             'handlers': ['console'],
         },
     },

diff --git a/chord_metadata_service/package.cfg b/chord_metadata_service/package.cfg
@@ -1,4 +1,4 @@
 [package]
 name = katsu
-version = 2.17.2
+version = 2.17.3
 authors = Ksenia Zaytseva, David Lougheed, Simon Chénard, Romain Grégoire, Paul Pillot, Son Chau
diff --git a/chord_metadata_service/patients/api_views.py b/chord_metadata_service/patients/api_views.py
@@ -1,6 +1,6 @@
 from datetime import datetime
 
-from rest_framework import viewsets, filters, mixins
+from rest_framework import viewsets, filters, mixins, serializers
 from rest_framework.response import Response
 from rest_framework.views import APIView
 from rest_framework.settings import api_settings
@@ -12,12 +12,14 @@
 from django.db.models import Count, F, Q
 from django.db.models.functions import Coalesce
 from django.contrib.postgres.aggregates import ArrayAgg
+from drf_spectacular.utils import extend_schema, inline_serializer
 from bento_lib.responses import errors
 from bento_lib.search import build_search_response
 
 from .serializers import IndividualSerializer
 from .models import Individual
 from .filters import IndividualFilter
+from chord_metadata_service.logger import logger
 from chord_metadata_service.phenopackets.api_views import BIOSAMPLE_PREFETCH, PHENOPACKET_PREFETCH
 from chord_metadata_service.phenopackets.models import Phenopacket
 from chord_metadata_service.restapi.api_renderers import (
@@ -35,8 +37,7 @@
     experiment_type_stats
 )
 from chord_metadata_service.restapi.negociation import FormatInPostContentNegotiation
-from drf_spectacular.utils import extend_schema, inline_serializer
-from rest_framework import serializers
+
 
 OUTPUT_FORMAT_BENTO_SEARCH_RESULT = "bento_search_result"
 
@@ -162,7 +163,7 @@ def filter_queryset(self, queryset):
             options = get_field_options(field_props)
             if value not in options \
                     and not (
-                        # case insensitive search on categories
+                        # case-insensitive search on categories
                         field_props["datatype"] == "string"
                         and value.lower() in [o.lower() for o in options]
                     ) \
@@ -190,14 +191,19 @@ def get(self, request, *args, **kwargs):
                 *(e.error_list if hasattr(e, "error_list") else e.error_dict.items()),
             ))
 
-        if filtered_qs.count() <= settings.CONFIG_PUBLIC["rules"]["count_threshold"]:
+        qct = filtered_qs.count()
+
+        if qct <= (threshold := settings.CONFIG_PUBLIC["rules"]["count_threshold"]):
+            logger.info(
+                f"Public individuals endpoint recieved query params {request.query_params} which resulted in "
+                f"sub-threshold count: {qct} <= {threshold}")
             return Response(settings.INSUFFICIENT_DATA_AVAILABLE)
 
         tissues_count, sampled_tissues = biosample_tissue_stats(filtered_qs)
         experiments_count, experiment_types = experiment_type_stats(filtered_qs)
 
         return Response({
-            "count": filtered_qs.count(),
+            "count": qct,
             "biosamples": {
                 "count": tissues_count,
                 "sampled_tissue": sampled_tissues

diff --git a/chord_metadata_service/restapi/pagination.py b/chord_metadata_service/restapi/pagination.py
@@ -5,6 +5,7 @@
 
 __all__ = [
     "LargeResultsSetPagination",
+    "BatchResultsSetPagination",
 ]
 
 

diff --git a/chord_metadata_service/restapi/utils.py b/chord_metadata_service/restapi/utils.py
@@ -14,6 +14,7 @@
 
 from chord_metadata_service.phenopackets import models as pheno_models
 from chord_metadata_service.experiments import models as experiments_models
+from chord_metadata_service.logger import logger
 
 
 LENGTH_Y_M = 4 + 1 + 2  # dates stored as yyyy-mm-dd
@@ -640,6 +641,8 @@ def filter_queryset_field_value(qs, field_props, value: str):
     else:
         raise NotImplementedError()
 
+    logger.debug(f"Filtering {model}.{field} with {condition}")
+
     return qs.filter(**condition)