Skip to content

Commit

Permalink
Merge pull request #466 from bento-platform/fix/interpretations-ingest
Browse files Browse the repository at this point in the history
fix genomic interpretation ingestion
  • Loading branch information
v-rocheleau committed Jan 4, 2024
2 parents c4785df + 3b5a27f commit cdd9300
Show file tree
Hide file tree
Showing 9 changed files with 83 additions and 30 deletions.
67 changes: 44 additions & 23 deletions chord_metadata_service/chord/ingest/phenopackets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from chord_metadata_service.patients.values import KaryotypicSex
from chord_metadata_service.restapi.schema_utils import patch_project_schemas
from chord_metadata_service.restapi.types import ExtensionSchemaDict
from chord_metadata_service.restapi.utils import time_element_to_years
from chord_metadata_service.restapi.utils import COMPUTED_PROPERTY_PREFIX, time_element_to_years

from .exceptions import IngestError
from .resources import ingest_resource
Expand All @@ -34,6 +34,16 @@ def _get_or_create_opt(key: str, data: dict, create_func: Callable[..., T]) -> O
return obj


def _clean_extra_properties(extra_properties: dict) -> Dict:
"""
Removes computed properties from an extra_properties dictionary.
Computed extra_properties start with "__" and should never be ingested.
"""
if extra_properties:
return {k: v for k, v in extra_properties.items() if not k.startswith(COMPUTED_PROPERTY_PREFIX)}
return extra_properties


def get_or_create_phenotypic_feature(pf: dict) -> pm.PhenotypicFeature:
# Below is code for if we want to re-use phenotypic features in the future
# For now, the lack of a many-to-many relationship doesn't let us do that.
Expand Down Expand Up @@ -64,7 +74,7 @@ def get_or_create_phenotypic_feature(pf: dict) -> pm.PhenotypicFeature:
severity=pf.get("severity"),
onset=pf.get("onset"),
evidence=pf.get("evidence"), # TODO: Separate class for evidence?
extra_properties=pf.get("extra_properties", {}),
extra_properties=_clean_extra_properties(pf.get("extra_properties", {})),
)
pf_obj.save()
return pf_obj
Expand All @@ -84,7 +94,7 @@ def validate_phenopacket(phenopacket_data: dict[str, Any],


def update_or_create_subject(subject: dict) -> pm.Individual:
extra_properties: dict[str, Any] = subject.get("extra_properties", {})
extra_properties: dict[str, Any] = _clean_extra_properties(subject.get("extra_properties", {}))

# Pre-process subject data: ---------------------------------------------------------------------------------

Expand Down Expand Up @@ -145,7 +155,7 @@ def get_or_create_biosample(bs: dict) -> pm.Biosample:
pathological_stage=bs.get("pathological_stage", {}),
is_control_sample=bs.get("is_control_sample", False),
diagnostic_markers=bs.get("diagnostic_markers", []),
extra_properties=bs.get("extra_properties", {}),
extra_properties=_clean_extra_properties(bs.get("extra_properties", {})),
**bs_query
)

Expand Down Expand Up @@ -240,7 +250,7 @@ def get_or_create_genomic_interpretation(gen_interp: dict) -> pm.GenomicInterpre
interpretation_status=gen_interp["interpretation_status"],
gene_descriptor=gene_descriptor,
variant_interpretation=variant_interpretation,
extra_properties=gen_interp.get("extra_properties", {}),
extra_properties=_clean_extra_properties(gen_interp.get("extra_properties", {})),
)

if related_obj:
Expand All @@ -256,38 +266,49 @@ def get_or_create_disease(disease) -> pm.Disease:
term=disease["term"],
disease_stage=disease.get("disease_stage", []),
clinical_tnm_finding=disease.get("clinical_tnm_finding", []),
extra_properties=disease.get("extra_properties", {}),
extra_properties=_clean_extra_properties(disease.get("extra_properties", {})),
**query_and_check_nulls(disease, "onset")
)
return d_obj


def get_or_create_diagnosis(diagnosis: dict) -> pm.Diagnosis:
# Create GenomicInterpretation
genomic_interpretations_data = diagnosis.get("genomic_interpretations", [])
genomic_interpretations = [
get_or_create_genomic_interpretation(gen_interp)
for gen_interp
in genomic_interpretations_data
]
# disease = pm.Disease.objects.get_or_create(diagnosis["disease"])
diag_obj, _ = pm.Diagnosis.objects.get_or_create(
def get_or_create_interpretation_diagnosis(interpretation: dict) -> pm.Diagnosis | None:
diagnosis = interpretation.get("diagnosis")

if not diagnosis:
# No diagnosis to create
return

# One-to-one relation between Interpretation and Diagnosis.
# If an Interpretation has a Diagnosis, the created Diagnosis row uses the interpretation's ID as its PK
# This ensures unique diagnoses if more than one share the same disease/extra_properties
id = interpretation.get("id")
diag_obj, created = pm.Diagnosis.objects.get_or_create(
id=id,
disease=diagnosis.get("disease", {}),
extra_properties=diagnosis.get("extra_properties", {})
extra_properties=_clean_extra_properties(diagnosis.get("extra_properties", {})),
)
# diag_obj.disease.set(disease)
diag_obj.genomic_interpretations.set(genomic_interpretations)

if created:
# Create GenomicInterpretation
genomic_interpretations_data = diagnosis.get("genomic_interpretations", [])
genomic_interpretations = [
get_or_create_genomic_interpretation(gen_interp)
for gen_interp
in genomic_interpretations_data
]
diag_obj.genomic_interpretations.set(genomic_interpretations)
return diag_obj


def get_or_create_interpretation(interpretation: dict) -> pm.Interpretation:
diagnosis = get_or_create_diagnosis(interpretation["diagnosis"])
diagnosis = get_or_create_interpretation_diagnosis(interpretation)
interp_obj, _ = pm.Interpretation.objects.get_or_create(
id=interpretation["id"],
diagnosis=diagnosis,
progress_status=interpretation["progress_status"],
summary=interpretation.get("summary", {}),
extra_properties=interpretation.get("extra_properties", {})
summary=interpretation.get("summary", ""),
extra_properties=_clean_extra_properties(interpretation.get("extra_properties", {}))
)

return interp_obj
Expand Down Expand Up @@ -373,7 +394,7 @@ def ingest_phenopacket(phenopacket_data: dict[str, Any],
submitted_by=meta_data.get("submitted_by"),
phenopacket_schema_version=meta_data.get("phenopacket_schema_version"),
external_references=meta_data.get("external_references", []),
extra_properties=meta_data.get("extra_properties", {}),
extra_properties=_clean_extra_properties(meta_data.get("extra_properties", {})),
)
meta_data_obj.save()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
"karyotypic_sex": "UNKNOWN_KARYOTYPE",
"extra_properties": {
"cool_guy": true,
"smoker": false
"smoker": false,
"__computed": "This extra_property was computed and should not be ingested."
}
},
"phenotypic_features": [
Expand Down
3 changes: 3 additions & 0 deletions chord_metadata_service/chord/tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ def test_ingesting_phenopackets_json(self):
self.assertEqual(p.subject.sex, EXAMPLE_INGEST_PHENOPACKET["subject"]["sex"])
self.assertEqual(p.subject.karyotypic_sex, EXAMPLE_INGEST_PHENOPACKET["subject"]["karyotypic_sex"])

self.assertIn("__computed", EXAMPLE_INGEST_PHENOPACKET["subject"]["extra_properties"])
self.assertNotIn("__computed", p.subject.extra_properties)

pfs = list(p.phenotypic_features.all().order_by("pftype__id"))

self.assertEqual(len(pfs), 2)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ class Migration(migrations.Migration):
migrations.AddField(
model_name='interpretation',
name='diagnosis',
field=models.ForeignKey(blank=True, help_text='One or more diagnoses, if made.', null=True, on_delete=django.db.models.deletion.CASCADE, to='phenopackets.diagnosis'),
field=models.OneToOneField(blank=True, help_text='The diagnosis, if made.', null=True, on_delete=django.db.models.deletion.CASCADE, to='phenopackets.diagnosis'),
),
migrations.DeleteModel(
name='Gene',
Expand All @@ -491,4 +491,9 @@ class Migration(migrations.Migration):
name='subject',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='genomic_interpretations', to='patients.individual'),
),
migrations.AlterField(
model_name='diagnosis',
name='id',
field=models.CharField(help_text="Unique ID, uses the parent Interpretation's id when ingesting.", max_length=200, primary_key=True, serialize=False),
),
]
6 changes: 4 additions & 2 deletions chord_metadata_service/phenopackets/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,8 @@ class Diagnosis(BaseTimeStamp):
FHIR: Condition
"""
id = models.CharField(primary_key=True, max_length=200,
help_text="Unique ID, uses the parent Interpretation's id when ingesting.")
disease = models.JSONField(null=True, blank=True, validators=[ontology_validator])
genomic_interpretations = models.ManyToManyField(
GenomicInterpretation, blank=True,
Expand Down Expand Up @@ -389,8 +391,8 @@ class Interpretation(BaseTimeStamp):
id = models.CharField(primary_key=True, max_length=200, help_text='An arbitrary identifier for the interpretation.')
progress_status = models.CharField(choices=PROGRESS_STATUS, max_length=200, blank=True,
help_text='The current status of work on the case.')
diagnosis = models.ForeignKey(Diagnosis, blank=True, null=True, on_delete=models.CASCADE,
help_text='One or more diagnoses, if made.')
diagnosis = models.OneToOneField(Diagnosis, blank=True, null=True, on_delete=models.CASCADE,
help_text='The diagnosis, if made.')
summary = models.CharField(max_length=200, blank=True, help_text='Free text summary of the interpretation.')
extra_properties = JSONField(blank=True, null=True,
help_text='Extra properties that are not supported by current schema')
Expand Down
11 changes: 10 additions & 1 deletion chord_metadata_service/phenopackets/serializers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from rest_framework import serializers

from chord_metadata_service.restapi.utils import computed_property
from .models import (
MetaData,
PhenotypicFeature,
Expand Down Expand Up @@ -159,12 +161,19 @@ def to_representation(self, instance):
response["variant_interpretation"] = VariantInterpretationSerializer(
instance.variant_interpretation, many=False, required=False).data

# subject_or_biosample_id is obtained from the referenced subject/biosample
# The 'subject_or_biosample_id' value is obtained from the referenced subject/biosample
# The '__related_type' property is added to extra_properties as a computed value ("__" prefix)
# This allows us to disambiguate on the client side for links
extra_properties = response.get("extra_properties", {})
computed_related_type = computed_property("related_type")
if instance.subject:
response["subject_or_biosample_id"] = instance.subject.id
extra_properties[computed_related_type] = "subject"
elif instance.biosample:
response["subject_or_biosample_id"] = instance.biosample.id
extra_properties[computed_related_type] = "biosample"

response["extra_properties"] = extra_properties
return response


Expand Down
3 changes: 2 additions & 1 deletion chord_metadata_service/phenopackets/tests/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,8 +603,9 @@ def valid_genomic_interpretation(gene_descriptor=None, variant_interpretation=No
return base


def valid_diagnosis(disease):
def valid_diagnosis(disease, id="interpretation:1"):
return dict(
id=id,
disease=disease,
extra_properties={
"comment": "test data"
Expand Down
4 changes: 3 additions & 1 deletion chord_metadata_service/phenopackets/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,11 +260,13 @@ class CreateDiagnosisTest(APITestCase):

def setUp(self):
self.disease_ontology = c.VALID_DISEASE_ONTOLOGY
self.diagnosis = c.valid_diagnosis(self.disease_ontology)
self.diagnosis = c.valid_diagnosis(self.disease_ontology, "interpretation:unique_id")

def test_diagnosis(self):
response = get_post_response('diagnoses-list', self.diagnosis)
self.assertEqual(response.status_code, status.HTTP_201_CREATED)

def test_serializer(self):
serializer = s.DiagnosisSerializer(data=self.diagnosis)
self.assertEqual(serializer.is_valid(), True)

Expand Down
9 changes: 9 additions & 0 deletions chord_metadata_service/restapi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
"biosample": pheno_models.Biosample,
}

COMPUTED_PROPERTY_PREFIX = "__"


class BinWithValue(TypedDict):
label: str
Expand Down Expand Up @@ -691,3 +693,10 @@ def bento_public_format_count_and_stats_list(annotated_queryset) -> tuple[int, l
stats_list.append({"label": label, "value": value})

return total, stats_list


def computed_property(name: str):
"""
Takes a name and returns it prefixed with "__"
"""
return COMPUTED_PROPERTY_PREFIX + name

0 comments on commit cdd9300

Please sign in to comment.