Merge pull request #107 from c3g/more-summary-data

Disease and phenotypic feature summary data
bento-platform · Apr 1, 2020 · 782a989 · 782a989
2 parents 38b74f9 + 521903a
commit 782a989
Show file tree

Hide file tree

Showing 8 changed files with 97 additions and 73 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,7 +1,8 @@
 dist: bionic
 language: python
 python:
-  - "3.7"
+  - "3.6"
+  - "3.8"
 addons:
   postgresql: "11"
   apt:

diff --git a/chord_metadata_service/chord/models.py b/chord_metadata_service/chord/models.py
@@ -122,7 +122,7 @@ def n_of_tables(self):
     keywords = ArrayField(JSONField(null=True, blank=True), blank=True, null=True,
                           help_text="Tags associated with the dataset, which will help in its discovery.")
     version = models.CharField(max_length=200, blank=True, default=version_default,
-                                  help_text="A release point for the dataset when applicable.")
+                               help_text="A release point for the dataset when applicable.")
     extra_properties = JSONField(blank=True, null=True,
                                  help_text="Extra properties that do not fit in the previous specified attributes.")
 

diff --git a/chord_metadata_service/chord/views_search.py b/chord_metadata_service/chord/views_search.py
@@ -14,7 +14,7 @@
 from chord_metadata_service.metadata.settings import DEBUG
 from chord_metadata_service.patients.models import Individual
 from chord_metadata_service.phenopackets.api_views import PHENOPACKET_PREFETCH
-from chord_metadata_service.phenopackets.models import Phenopacket, Biosample
+from chord_metadata_service.phenopackets.models import Phenopacket
 from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA
 from chord_metadata_service.phenopackets.serializers import PhenopacketSerializer
 from chord_metadata_service.metadata.elastic import es
@@ -103,24 +103,48 @@ def chord_table_summary(_request, table_id):
         table = Dataset.objects.get(identifier=table_id)
         phenopackets = Phenopacket.objects.filter(dataset=table)
 
-        biosamples_set = frozenset(
-            p["biosamples__id"] for p in phenopackets.prefetch_related("biosamples").values("biosamples__id"))
+        diseases_counter = Counter()
+        phenotypic_features_counter = Counter()
 
-        biosamples_cs = Counter(b.is_control_sample for b in Biosample.objects.filter(id__in=biosamples_set))
+        biosamples_set = set()
+        individuals_set = set()
 
-        biosamples_taxonomy = Counter(b.taxonomy["id"] for b in Biosample.objects.filter(id__in=biosamples_set)
-                                      if b.taxonomy is not None)
+        biosamples_cs = Counter()
+        biosamples_taxonomy = Counter()
 
-        individuals_set = frozenset({
-            *(p["subject"] for p in phenopackets.values("subject")),
-            *(p["biosamples__individual_id"]
-              for p in phenopackets.prefetch_related("biosamples").values("biosamples__individual_id")),
-        })
+        individuals_sex = Counter()
+        individuals_k_sex = Counter()
+        individuals_taxonomy = Counter()
+
+        def count_individual(ind):  # Tally an individual once, even if reached via several biosamples/subjects
+            if ind.id not in individuals_set:
+                individuals_set.add(ind.id)
+                individuals_sex.update((ind.sex,))
+                individuals_k_sex.update((ind.karyotypic_sex,))
+                individuals_taxonomy.update((ind.taxonomy["id"],) if ind.taxonomy is not None else ())
+
+        for p in phenopackets.prefetch_related("biosamples"):
+            for b in p.biosamples.all():
+                biosamples_set.add(b.id)
+                biosamples_cs.update((b.is_control_sample,))
+
+                if b.taxonomy is not None:
+                    biosamples_taxonomy.update((b.taxonomy["id"],))
+
+                if b.individual is not None:
+                    count_individual(b.individual)
+
+                for pf in b.phenotypic_features.all():
+                    phenotypic_features_counter.update((pf.pftype["id"],))
+
+            for d in p.diseases.all():
+                diseases_counter.update((d.term["id"],))
+
+            for pf in p.phenotypic_features.all():
+                phenotypic_features_counter.update((pf.pftype["id"],))
 
-        individuals_sex = Counter(i.sex for i in Individual.objects.filter(id__in=individuals_set))
-        individuals_k_sex = Counter(i.karyotypic_sex for i in Individual.objects.filter(id__in=individuals_set))
-        individuals_taxonomy = Counter(i.taxonomy["id"] for i in Individual.objects.filter(id__in=individuals_set)
-                                       if i.taxonomy is not None)
+            # Currently, phenopacket subject is required so we can assume it's not None
+            count_individual(p.subject)
 
         return Response({
             "count": phenopackets.count(),
@@ -130,14 +154,15 @@ def chord_table_summary(_request, table_id):
                     "is_control_sample": dict(biosamples_cs),
                     "taxonomy": dict(biosamples_taxonomy),
                 },
+                "diseases": dict(diseases_counter),
                 "individuals": {
                     "count": len(individuals_set),
                     "sex": {k: individuals_sex[k] for k in (s[0] for s in Individual.SEX)},
                     "karyotypic_sex": {k: individuals_k_sex[k] for k in (s[0] for s in Individual.KARYOTYPIC_SEX)},
-                    "diseases": {},
                     "taxonomy": dict(individuals_taxonomy),
                     # TODO: age histogram
                 },
+                "phenotypic_features": dict(phenotypic_features_counter),
             }
         })
 
@@ -271,42 +296,40 @@ def fhir_search(request, internal_data=False):
 
     res = es.search(index=settings.FHIR_INDEX_NAME, body=query)
 
-    subject_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Patient']
-    htsfile_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'DocumentReference']
-    disease_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Condition']
-    biosample_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Specimen']
-    phenotypicfeature_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Observation']
-    phenopacket_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Composition']
+    def hits_for(resource_type: str) -> frozenset:  # unique 2nd "|"-separated field of each matching hit's _id
+        return frozenset(hit["_id"].split("|")[1] for hit in res["hits"]["hits"]
+                         if hit['_source']['resourceType'] == resource_type)
 
-    if (not subject_ids and not htsfile_ids and not disease_ids
-        and not biosample_ids and not phenotypicfeature_ids and not phenopacket_ids):
+    subject_ids = hits_for('Patient')
+    htsfile_ids = hits_for('DocumentReference')
+    disease_ids = hits_for('Condition')
+    biosample_ids = hits_for('Specimen')
+    phenotypicfeature_ids = hits_for('Observation')
+    phenopacket_ids = hits_for('Composition')
+
+    if all((not subject_ids, not htsfile_ids, not disease_ids, not biosample_ids, not phenotypicfeature_ids,
+            not phenopacket_ids)):
         return Response(build_search_response([], start))
-    else:
-        phenopackets = phenopacket_filter_results(
-            subject_ids,
-            htsfile_ids,
-            disease_ids,
-            biosample_ids,
-            phenotypicfeature_ids,
-            phenopacket_ids
-        )
+
+    phenopackets = phenopacket_filter_results(
+        subject_ids,
+        htsfile_ids,
+        disease_ids,
+        biosample_ids,
+        phenotypicfeature_ids,
+        phenopacket_ids
+    )
 
     if not internal_data:
-        datasets = Dataset.objects.filter(
-            identifier__in = [
-                p.dataset_id for p in phenopackets
-            ]
-        )  # TODO: Maybe can avoid hitting DB here
+        # TODO: Maybe can avoid hitting DB here
+        datasets = Dataset.objects.filter(identifier__in=frozenset(p.dataset_id for p in phenopackets))
         return Response(build_search_response([{"id": d.identifier, "data_type": PHENOPACKET_DATA_TYPE_ID}
                                                for d in datasets], start))
     return Response(build_search_response({
         dataset_id: {
             "data_type": PHENOPACKET_DATA_TYPE_ID,
             "matches": list(PhenopacketSerializer(p).data for p in dataset_phenopackets)
-        } for dataset_id, dataset_phenopackets in itertools.groupby(
-            phenopackets,
-            key=lambda p: str(p.dataset_id)
-        )
+        } for dataset_id, dataset_phenopackets in itertools.groupby(phenopackets, key=lambda p: str(p.dataset_id))
     }, start))
 
 

diff --git a/chord_metadata_service/patients/models.py b/chord_metadata_service/patients/models.py
@@ -44,10 +44,11 @@ class Individual(models.Model, IndexableMixin):
     active = models.BooleanField(default=False, help_text='Whether this patient\'s record is in active use.')
     deceased = models.BooleanField(default=False, help_text='Indicates if the individual is deceased or not.')
     # mCode specific
-    # this field should be complex Ontology - clinical status and code - two Codeable concept - single, cl status has enum list of values
+    # this field should be complex Ontology - clinical status and code - two Codeable concept - single, cl status has
+    # enum list of values
     comorbid_condition = JSONField(blank=True, null=True, help_text='One or more conditions that occur with primary'
                                                                     ' condition.')
-    #TODO decide use ONTOLOGY_CLASS vs. CODEABLE_CONCEPT
+    # TODO decide use ONTOLOGY_CLASS vs. CODEABLE_CONCEPT
     ecog_performance_status = JSONField(blank=True, null=True, help_text='Value representing the Eastern Cooperative '
                                                                          'Oncology Group performance status.')
     karnofsky = JSONField(blank=True, null=True, help_text='Value representing the Karnofsky Performance status.')

diff --git a/examples/dataset.json b/examples/dataset.json
@@ -0,0 +1,17 @@
+{
+  "title": "Test Dataset",
+  "description": "This is a test dataset",
+  "project": "uuid-for-project-record",
+  "data_use": {
+    "consent_code": {
+      "primary_category": {"code": "GRU"},
+      "secondary_categories": [
+        {"code": "GSO"}
+      ]
+    },
+    "data_use_requirements": [
+      {"code":  "COL"},
+      {"code":  "PUB"}
+    ]
+  }
+}
diff --git a/examples/single_req.json b/examples/single_req.json
@@ -1,24 +1,6 @@
 {
   "table_id": "3df5e8b0-3949-4d3c-a37f-1c6a81940d50",
   "workflow_id": "phenopackets_json",
-  "workflow_metadata": {
-    "inputs": [
-      {
-        "id": "json_document",
-        "type": "file",
-        "extensions": [
-          ".json"
-        ]
-      }
-    ],
-    "outputs": [
-      {
-        "id": "json_document",
-        "type": "file",
-        "value": "{json_document}"
-      }
-    ]
-  },
   "workflow_params": {
     "phenopackets_json.json_document": "/home/dlougheed/git/chord_metadata_service/examples/single.json"
   },

diff --git a/requirements.txt b/requirements.txt
@@ -3,24 +3,24 @@ attrs==19.3.0
 Babel==2.8.0
 certifi==2019.11.28
 chardet==3.0.4
-chord-lib==0.6.0
-codecov==2.0.16
+chord-lib==0.7.0
+codecov==2.0.22
 colorama==0.4.3
 coreapi==2.3.3
 coreschema==0.0.4
-coverage==5.0.3
+coverage==5.0.4
 Django==2.2.11
 django-filter==2.2.0
 django-nose==1.4.6
 django-rest-swagger==2.2.0
 djangorestframework==3.10.3
 djangorestframework-camel-case==1.1.2
 docutils==0.16
-elasticsearch==7.1.0
+elasticsearch==7.6.0
 fhirclient==3.2.0
 idna==2.9
 imagesize==1.2.0
-importlib-metadata==1.5.0
+importlib-metadata==1.6.0
 isodate==0.6.0
 itypes==1.1.0
 Jinja2==2.11.1
@@ -34,10 +34,10 @@ packaging==20.3
 psycopg2-binary==2.8.4
 Pygments==2.6.1
 pyparsing==2.4.6
-pyrsistent==0.15.7
+pyrsistent==0.16.0
 python-dateutil==2.8.1
 pytz==2019.3
-PyYAML==5.3
+PyYAML==5.3.1
 rdflib==4.2.2
 rdflib-jsonld==0.4.0
 redis==3.4.1
@@ -58,6 +58,6 @@ sqlparse==0.3.1
 strict-rfc3339==0.7
 uritemplate==3.0.1
 urllib3==1.25.8
-Werkzeug==1.0.0
+Werkzeug==1.0.1
 wincertstore==0.2
 zipp==3.1.0
diff --git a/setup.py b/setup.py
@@ -16,7 +16,7 @@
 
     python_requires=">=3.6",
     install_requires=[
-        "chord_lib[django]==0.6.0",
+        "chord_lib[django]==0.7.0",
         "Django>=2.2,<3.0",
         "django-filter>=2.2,<3.0",
         "django-nose>=1.4,<2.0",