Skip to content

Commit

Permalink
Merge pull request #107 from c3g/more-summary-data
Browse files Browse the repository at this point in the history
Disease and phenotypic feature summary data
  • Loading branch information
davidlougheed committed Apr 1, 2020
2 parents 38b74f9 + 521903a commit 782a989
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 73 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
dist: bionic
language: python
python:
- "3.7"
- "3.6"
- "3.8"
addons:
postgresql: "11"
apt:
Expand Down
2 changes: 1 addition & 1 deletion chord_metadata_service/chord/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def n_of_tables(self):
keywords = ArrayField(JSONField(null=True, blank=True), blank=True, null=True,
help_text="Tags associated with the dataset, which will help in its discovery.")
version = models.CharField(max_length=200, blank=True, default=version_default,
help_text="A release point for the dataset when applicable.")
help_text="A release point for the dataset when applicable.")
extra_properties = JSONField(blank=True, null=True,
help_text="Extra properties that do not fit in the previous specified attributes.")

Expand Down
107 changes: 65 additions & 42 deletions chord_metadata_service/chord/views_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from chord_metadata_service.metadata.settings import DEBUG
from chord_metadata_service.patients.models import Individual
from chord_metadata_service.phenopackets.api_views import PHENOPACKET_PREFETCH
from chord_metadata_service.phenopackets.models import Phenopacket, Biosample
from chord_metadata_service.phenopackets.models import Phenopacket
from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA
from chord_metadata_service.phenopackets.serializers import PhenopacketSerializer
from chord_metadata_service.metadata.elastic import es
Expand Down Expand Up @@ -103,24 +103,48 @@ def chord_table_summary(_request, table_id):
table = Dataset.objects.get(identifier=table_id)
phenopackets = Phenopacket.objects.filter(dataset=table)

biosamples_set = frozenset(
p["biosamples__id"] for p in phenopackets.prefetch_related("biosamples").values("biosamples__id"))
diseases_counter = Counter()
phenotypic_features_counter = Counter()

biosamples_cs = Counter(b.is_control_sample for b in Biosample.objects.filter(id__in=biosamples_set))
biosamples_set = set()
individuals_set = set()

biosamples_taxonomy = Counter(b.taxonomy["id"] for b in Biosample.objects.filter(id__in=biosamples_set)
if b.taxonomy is not None)
biosamples_cs = Counter()
biosamples_taxonomy = Counter()

individuals_set = frozenset({
*(p["subject"] for p in phenopackets.values("subject")),
*(p["biosamples__individual_id"]
for p in phenopackets.prefetch_related("biosamples").values("biosamples__individual_id")),
})
individuals_sex = Counter()
individuals_k_sex = Counter()
individuals_taxonomy = Counter()

def count_individual(ind):
    """Tally one individual into the per-table summary counters, at most once.

    Records the individual's id in ``individuals_set`` and increments the
    sex, karyotypic-sex and (when present) taxonomy counters of the
    enclosing summary.

    The same individual can be passed in more than once: this helper is
    called for each biosample's individual and again for each phenopacket's
    subject. The summary reports ``len(individuals_set)`` as the individual
    count, so the breakdown counters must only be bumped the first time an
    id is seen; otherwise they would add up to more than that count.

    :param ind: object exposing ``id``, ``sex``, ``karyotypic_sex`` and
        ``taxonomy`` (``None`` or a mapping with an ``"id"`` key).
    """
    # Already tallied via another biosample / phenopacket: do not count twice.
    if ind.id in individuals_set:
        return

    individuals_set.add(ind.id)
    individuals_sex.update((ind.sex,))
    individuals_k_sex.update((ind.karyotypic_sex,))

    # Taxonomy is optional; only individuals that have one are counted.
    if ind.taxonomy is not None:
        individuals_taxonomy.update((ind.taxonomy["id"],))

for p in phenopackets.prefetch_related("biosamples"):
for b in p.biosamples.all():
biosamples_set.add(b.id)
biosamples_cs.update((b.is_control_sample,))

if b.taxonomy is not None:
biosamples_taxonomy.update((b.taxonomy["id"],))

if b.individual is not None:
count_individual(b.individual)

for pf in b.phenotypic_features.all():
phenotypic_features_counter.update((pf.pftype["id"],))

for d in p.diseases.all():
diseases_counter.update((d.term["id"],))

for pf in p.phenotypic_features.all():
phenotypic_features_counter.update((pf.pftype["id"],))

individuals_sex = Counter(i.sex for i in Individual.objects.filter(id__in=individuals_set))
individuals_k_sex = Counter(i.karyotypic_sex for i in Individual.objects.filter(id__in=individuals_set))
individuals_taxonomy = Counter(i.taxonomy["id"] for i in Individual.objects.filter(id__in=individuals_set)
if i.taxonomy is not None)
# Currently, phenopacket subject is required so we can assume it's not None
count_individual(p.subject)

return Response({
"count": phenopackets.count(),
Expand All @@ -130,14 +154,15 @@ def chord_table_summary(_request, table_id):
"is_control_sample": dict(biosamples_cs),
"taxonomy": dict(biosamples_taxonomy),
},
"diseases": dict(diseases_counter),
"individuals": {
"count": len(individuals_set),
"sex": {k: individuals_sex[k] for k in (s[0] for s in Individual.SEX)},
"karyotypic_sex": {k: individuals_k_sex[k] for k in (s[0] for s in Individual.KARYOTYPIC_SEX)},
"diseases": {},
"taxonomy": dict(individuals_taxonomy),
# TODO: age histogram
},
"phenotypic_features": dict(phenotypic_features_counter),
}
})

Expand Down Expand Up @@ -271,42 +296,40 @@ def fhir_search(request, internal_data=False):

res = es.search(index=settings.FHIR_INDEX_NAME, body=query)

subject_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Patient']
htsfile_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'DocumentReference']
disease_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Condition']
biosample_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Specimen']
phenotypicfeature_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Observation']
phenopacket_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Composition']
def hits_for(resource_type: str):
    """Collect the ids of search hits whose FHIR resource type matches.

    Walks the hits of the enclosing search response ``res`` and, for every
    hit whose ``_source.resourceType`` equals ``resource_type``, keeps the
    part of ``_id`` that follows the first ``|`` separator.

    :param resource_type: resource type name to select, e.g. ``"Patient"``.
    :return: frozenset of the selected id strings (duplicates collapse).
    """
    matching_ids = set()
    for hit in res["hits"]["hits"]:
        if hit['_source']['resourceType'] != resource_type:
            continue
        # The second "|"-separated field of the document id is the one kept.
        matching_ids.add(hit["_id"].split("|")[1])
    return frozenset(matching_ids)

if (not subject_ids and not htsfile_ids and not disease_ids
and not biosample_ids and not phenotypicfeature_ids and not phenopacket_ids):
subject_ids = hits_for('Patient')
htsfile_ids = hits_for('DocumentReference')
disease_ids = hits_for('Condition')
biosample_ids = hits_for('Specimen')
phenotypicfeature_ids = hits_for('Observation')
phenopacket_ids = hits_for('Composition')

if all((not subject_ids, not htsfile_ids, not disease_ids, not biosample_ids, not phenotypicfeature_ids,
not phenopacket_ids)):
return Response(build_search_response([], start))
else:
phenopackets = phenopacket_filter_results(
subject_ids,
htsfile_ids,
disease_ids,
biosample_ids,
phenotypicfeature_ids,
phenopacket_ids
)

phenopackets = phenopacket_filter_results(
subject_ids,
htsfile_ids,
disease_ids,
biosample_ids,
phenotypicfeature_ids,
phenopacket_ids
)

if not internal_data:
datasets = Dataset.objects.filter(
identifier__in = [
p.dataset_id for p in phenopackets
]
) # TODO: Maybe can avoid hitting DB here
# TODO: Maybe can avoid hitting DB here
datasets = Dataset.objects.filter(identifier__in=frozenset(p.dataset_id for p in phenopackets))
return Response(build_search_response([{"id": d.identifier, "data_type": PHENOPACKET_DATA_TYPE_ID}
for d in datasets], start))
return Response(build_search_response({
dataset_id: {
"data_type": PHENOPACKET_DATA_TYPE_ID,
"matches": list(PhenopacketSerializer(p).data for p in dataset_phenopackets)
} for dataset_id, dataset_phenopackets in itertools.groupby(
phenopackets,
key=lambda p: str(p.dataset_id)
)
} for dataset_id, dataset_phenopackets in itertools.groupby(phenopackets, key=lambda p: str(p.dataset_id))
}, start))


Expand Down
5 changes: 3 additions & 2 deletions chord_metadata_service/patients/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,11 @@ class Individual(models.Model, IndexableMixin):
active = models.BooleanField(default=False, help_text='Whether this patient\'s record is in active use.')
deceased = models.BooleanField(default=False, help_text='Indicates if the individual is deceased or not.')
# mCode specific
# this field should be complex Ontology - clinical status and code - two Codeable concept - single, cl status has enum list of values
# this field should be complex Ontology - clinical status and code - two Codeable concept - single, cl status has
# enum list of values
comorbid_condition = JSONField(blank=True, null=True, help_text='One or more conditions that occur with primary'
' condition.')
#TODO decide use ONTOLOGY_CLASS vs. CODEABLE_CONCEPT
# TODO decide use ONTOLOGY_CLASS vs. CODEABLE_CONCEPT
ecog_performance_status = JSONField(blank=True, null=True, help_text='Value representing the Eastern Cooperative '
'Oncology Group performance status.')
karnofsky = JSONField(blank=True, null=True, help_text='Value representing the Karnofsky Performance status.')
Expand Down
17 changes: 17 additions & 0 deletions examples/dataset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"title": "Test Dataset",
"description": "This is a test dataset",
"project": "uuid-for-project-record",
"data_use": {
"consent_code": {
"primary_category": {"code": "GRU"},
"secondary_categories": [
{"code": "GSO"}
]
},
"data_use_requirements": [
{"code": "COL"},
{"code": "PUB"}
]
}
}
18 changes: 0 additions & 18 deletions examples/single_req.json
Original file line number Diff line number Diff line change
@@ -1,24 +1,6 @@
{
"table_id": "3df5e8b0-3949-4d3c-a37f-1c6a81940d50",
"workflow_id": "phenopackets_json",
"workflow_metadata": {
"inputs": [
{
"id": "json_document",
"type": "file",
"extensions": [
".json"
]
}
],
"outputs": [
{
"id": "json_document",
"type": "file",
"value": "{json_document}"
}
]
},
"workflow_params": {
"phenopackets_json.json_document": "/home/dlougheed/git/chord_metadata_service/examples/single.json"
},
Expand Down
16 changes: 8 additions & 8 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,24 @@ attrs==19.3.0
Babel==2.8.0
certifi==2019.11.28
chardet==3.0.4
chord-lib==0.6.0
codecov==2.0.16
chord-lib==0.7.0
codecov==2.0.22
colorama==0.4.3
coreapi==2.3.3
coreschema==0.0.4
coverage==5.0.3
coverage==5.0.4
Django==2.2.11
django-filter==2.2.0
django-nose==1.4.6
django-rest-swagger==2.2.0
djangorestframework==3.10.3
djangorestframework-camel-case==1.1.2
docutils==0.16
elasticsearch==7.1.0
elasticsearch==7.6.0
fhirclient==3.2.0
idna==2.9
imagesize==1.2.0
importlib-metadata==1.5.0
importlib-metadata==1.6.0
isodate==0.6.0
itypes==1.1.0
Jinja2==2.11.1
Expand All @@ -34,10 +34,10 @@ packaging==20.3
psycopg2-binary==2.8.4
Pygments==2.6.1
pyparsing==2.4.6
pyrsistent==0.15.7
pyrsistent==0.16.0
python-dateutil==2.8.1
pytz==2019.3
PyYAML==5.3
PyYAML==5.3.1
rdflib==4.2.2
rdflib-jsonld==0.4.0
redis==3.4.1
Expand All @@ -58,6 +58,6 @@ sqlparse==0.3.1
strict-rfc3339==0.7
uritemplate==3.0.1
urllib3==1.25.8
Werkzeug==1.0.0
Werkzeug==1.0.1
wincertstore==0.2
zipp==3.1.0
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

python_requires=">=3.6",
install_requires=[
"chord_lib[django]==0.6.0",
"chord_lib[django]==0.7.0",
"Django>=2.2,<3.0",
"django-filter>=2.2,<3.0",
"django-nose>=1.4,<2.0",
Expand Down

0 comments on commit 782a989

Please sign in to comment.