Skip to content

Commit

Permalink
Merge pull request #384 from bento-platform/public-logging
Browse files Browse the repository at this point in the history
Fix null individual_id in subject-associated biosamples; add logging for individuals public query endpoint
  • Loading branch information
davidlougheed committed Mar 8, 2023
2 parents 406d215 + e770d0e commit cc9c459
Show file tree
Hide file tree
Showing 10 changed files with 54 additions and 36 deletions.
20 changes: 10 additions & 10 deletions chord_metadata_service/chord/export_cbio.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,7 @@ def study_export(getPath: Callable[[str], str], dataset_id: str):

# Export samples
with open(getPath(SAMPLE_DATA_FILENAME), 'w', newline='\n') as file_sample:
sampl = pm.Biosample.objects.filter(phenopacket__table__ownership_record__dataset_id=dataset.identifier)\
.annotate(phenopacket_subject_id=F("phenopacket__subject"))
sampl = pm.Biosample.objects.filter(phenopacket__table__ownership_record__dataset_id=dataset.identifier)
sample_export(sampl, file_sample)

with open(getPath(SAMPLE_META_FILENAME), 'w', newline='\n') as file_sample_meta:
Expand Down Expand Up @@ -229,16 +228,17 @@ def sample_export(results, file_handle: TextIO):
samples = []
for sample in results:

# sample.inidividual may be null: use Phenopacket model Subject field
# instead if available or skip.
subject_id = None
if sample.individual is not None:
subject_id = sample.individual
elif sample.phenopacket_subject_id is not None:
subject_id = sample.phenopacket_subject_id
else:
# sample.individual may be null, if the biosample is not related to an individual.
# skip these.
# prior to 2.17.3, these were associated with phenopacket subject if individual was null;
# however, that is not fully correct behaviour and was a workaround for ingest not properly associating
# biosamples with individuals.

if sample.individual is None:
continue

subject_id = sample.individual

sample_obj = {
'individual_id': sanitize_id(subject_id),
'id': sanitize_id(sample.id)
Expand Down
17 changes: 15 additions & 2 deletions chord_metadata_service/chord/ingest/phenopackets.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,21 @@ def ingest_phenopacket(phenopacket_data: dict[str, Any], table_id: str, validate

new_phenopacket_id = phenopacket_data.get("id", str(uuid.uuid4()))

subject = phenopacket_data.get("subject")

phenotypic_features = phenopacket_data.get("phenotypic_features", [])
biosamples = phenopacket_data.get("biosamples", [])

# Pre-process biosamples; historically (<2.17.3) we were running into issues because a missing individual_id in
# the biosample meant it would be left as None rather than properly associated with a specified subject.
# - Here, we tag the biosample with the subject's ID if a subject is specified, since the Phenopacket spec
# explicitly says of the biosamples field:
# "This field describes samples that have been derived from the patient who is the object of the Phenopacket"
biosamples = [
{**bs, "individual_id": subject["id"]} if subject else bs
for bs in phenopacket_data.get("biosamples", [])
]

# Pull other fields out of the phenopacket input
genes = phenopacket_data.get("genes", [])
diseases = phenopacket_data.get("diseases", [])
hts_files = phenopacket_data.get("hts_files", [])
Expand All @@ -229,7 +242,7 @@ def ingest_phenopacket(phenopacket_data: dict[str, Any], table_id: str, validate
# - or, if it already exists, *update* the extra properties if needed.
# This is one of the few cases of 'updating' something that exists in Katsu.
subject_obj: Optional[pm.Individual] = None
if subject := phenopacket_data.get("subject"): # we have a dictionary of subject data in the phenopacket
if subject: # we have a dictionary of subject data in the phenopacket
subject_obj = update_or_create_subject(subject)

# Get or create all phenotypic features in the phenopacket
Expand Down
3 changes: 0 additions & 3 deletions chord_metadata_service/chord/tests/example_phenopacket.json
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,6 @@
},
{
"id": "sample2",
"individual_id": "patient1",
"description": "",
"sampled_tissue": {
"id": "UBERON:0002367",
Expand Down Expand Up @@ -182,7 +181,6 @@
},
{
"id": "sample4",
"individual_id": "patient1",
"description": "",
"sampled_tissue": {
"id": "UBERON:0001222",
Expand All @@ -208,7 +206,6 @@
},
{
"id": "sample5",
"individual_id": "patient1",
"description": "",
"sampled_tissue": {
"id": "UBERON:0015876",
Expand Down
16 changes: 3 additions & 13 deletions chord_metadata_service/chord/tests/test_export_cbio.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,6 @@ def setUp(self) -> None:
)
self.exp_res = ExperimentResult.objects.all()

# Update the last sample to remove direct reference to any individual.
# In that case, Sample and Individual are cross-referenced through the
# Phenopacket model.
pm.Biosample.objects.filter(
id=EXAMPLE_INGEST_PHENOPACKET["biosamples"][-1]["id"]
).update(individual=None)

@staticmethod
def stream_to_dict(output: TextIO) -> Dict[str, str]:
"""
Expand Down Expand Up @@ -176,8 +169,8 @@ def test_export_cbio_patient_data(self):
break

def test_export_cbio_sample_data(self):
samples = pm.Biosample.objects.filter(phenopacket=self.p)\
.annotate(phenopacket_subject_id=F("phenopacket__subject"))
samples = pm.Biosample.objects.filter(phenopacket=self.p)

with io.StringIO() as output:
exp.sample_export(samples, output)
# Check header
Expand Down Expand Up @@ -211,10 +204,7 @@ def test_export_cbio_sample_data(self):

self.assertTrue(REGEXP_INVALID_FOR_ID.search(record["PATIENT_ID"]) is None)
self.assertTrue(REGEXP_INVALID_FOR_ID.search(record["SAMPLE_ID"]) is None)
self.assertEqual(
record["PATIENT_ID"],
exp.sanitize_id(EXAMPLE_INGEST_PHENOPACKET["biosamples"][sample_count]["individual_id"])
)
self.assertEqual(record["PATIENT_ID"], exp.sanitize_id(samples[sample_count].individual_id))
self.assertEqual(
record["SAMPLE_ID"],
exp.sanitize_id(EXAMPLE_INGEST_PHENOPACKET["biosamples"][sample_count]["id"])
Expand Down
6 changes: 6 additions & 0 deletions chord_metadata_service/chord/tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,12 @@ def test_ingesting_phenopackets_json(self):

biosamples = list(p.biosamples.all().order_by("id"))
self.assertEqual(len(biosamples), 5)

# Make sure biosamples are properly associated with phenopacket subject
# - Some test biosamples exclude individual_id; these should be properly associated too
for bs in biosamples:
self.assertEqual(bs.individual_id, p.subject.id)

# TODO: More

def test_reingesting_updating_phenopackets_json(self):
Expand Down
4 changes: 3 additions & 1 deletion chord_metadata_service/metadata/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
).lower() == "true"
logging.info(f"DEBUG: {DEBUG}")

LOG_LEVEL = os.environ.get("KATSU_LOG_LEVEL", "DEBUG" if DEBUG else "INFO").upper()


# CHORD-specific settings

Expand Down Expand Up @@ -199,7 +201,7 @@
},
'loggers': {
'': {
'level': 'DEBUG' if DEBUG else 'INFO',
'level': LOG_LEVEL,
'handlers': ['console'],
},
},
Expand Down
2 changes: 1 addition & 1 deletion chord_metadata_service/package.cfg
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[package]
name = katsu
version = 2.17.2
version = 2.17.3
authors = Ksenia Zaytseva, David Lougheed, Simon Chénard, Romain Grégoire, Paul Pillot, Son Chau
18 changes: 12 additions & 6 deletions chord_metadata_service/patients/api_views.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from datetime import datetime

from rest_framework import viewsets, filters, mixins
from rest_framework import viewsets, filters, mixins, serializers
from rest_framework.response import Response
from rest_framework.views import APIView
from rest_framework.settings import api_settings
Expand All @@ -12,12 +12,14 @@
from django.db.models import Count, F, Q
from django.db.models.functions import Coalesce
from django.contrib.postgres.aggregates import ArrayAgg
from drf_spectacular.utils import extend_schema, inline_serializer
from bento_lib.responses import errors
from bento_lib.search import build_search_response

from .serializers import IndividualSerializer
from .models import Individual
from .filters import IndividualFilter
from chord_metadata_service.logger import logger
from chord_metadata_service.phenopackets.api_views import BIOSAMPLE_PREFETCH, PHENOPACKET_PREFETCH
from chord_metadata_service.phenopackets.models import Phenopacket
from chord_metadata_service.restapi.api_renderers import (
Expand All @@ -35,8 +37,7 @@
experiment_type_stats
)
from chord_metadata_service.restapi.negociation import FormatInPostContentNegotiation
from drf_spectacular.utils import extend_schema, inline_serializer
from rest_framework import serializers


OUTPUT_FORMAT_BENTO_SEARCH_RESULT = "bento_search_result"

Expand Down Expand Up @@ -162,7 +163,7 @@ def filter_queryset(self, queryset):
options = get_field_options(field_props)
if value not in options \
and not (
# case insensitive search on categories
# case-insensitive search on categories
field_props["datatype"] == "string"
and value.lower() in [o.lower() for o in options]
) \
Expand Down Expand Up @@ -190,14 +191,19 @@ def get(self, request, *args, **kwargs):
*(e.error_list if hasattr(e, "error_list") else e.error_dict.items()),
))

if filtered_qs.count() <= settings.CONFIG_PUBLIC["rules"]["count_threshold"]:
qct = filtered_qs.count()

if qct <= (threshold := settings.CONFIG_PUBLIC["rules"]["count_threshold"]):
logger.info(
f"Public individuals endpoint recieved query params {request.query_params} which resulted in "
f"sub-threshold count: {qct} <= {threshold}")
return Response(settings.INSUFFICIENT_DATA_AVAILABLE)

tissues_count, sampled_tissues = biosample_tissue_stats(filtered_qs)
experiments_count, experiment_types = experiment_type_stats(filtered_qs)

return Response({
"count": filtered_qs.count(),
"count": qct,
"biosamples": {
"count": tissues_count,
"sampled_tissue": sampled_tissues
Expand Down
1 change: 1 addition & 0 deletions chord_metadata_service/restapi/pagination.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

__all__ = [
"LargeResultsSetPagination",
"BatchResultsSetPagination",
]


Expand Down
3 changes: 3 additions & 0 deletions chord_metadata_service/restapi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from chord_metadata_service.phenopackets import models as pheno_models
from chord_metadata_service.experiments import models as experiments_models
from chord_metadata_service.logger import logger


LENGTH_Y_M = 4 + 1 + 2 # dates stored as yyyy-mm-dd
Expand Down Expand Up @@ -640,6 +641,8 @@ def filter_queryset_field_value(qs, field_props, value: str):
else:
raise NotImplementedError()

logger.debug(f"Filtering {model}.{field} with {condition}")

return qs.filter(**condition)


Expand Down

0 comments on commit cc9c459

Please sign in to comment.