In [1]:
import collections
import itertools
import pyld
import json
import pyhpo
import jsonpath_ng
import glob
import gzip
import csv
import logging
import yaml

logger = logging.getLogger(__name__)

In [2]:
def ensure_list(value):
    '''JSON-LD, being very RDF-y, will represent 0..* fields as single items if there's just one, and lists if more than one. Sometimes it's more convenient to be consistent.'''
    if isinstance(value, list):
        return value
    return [value]

In [3]:
with gzip.open('MONDO.csv.gz', 'rt') as csvf:
    '''Keys will be like "MONDO_0044647", values like "kyphosis-lateral tongue atrophy-myofibrillar myopathy syndrome"'''
    reader = csv.DictReader(csvf)
    class_prefix = 'http://purl.obolibrary.org/obo/'
    mondo_lookup = dict((row['Class ID'][len(class_prefix):], row['Preferred Label']) for row in reader if row['Class ID'].startswith(class_prefix))

In [4]:
pyhpo.Ontology()
logger.warning(f'HPO version: {pyhpo.Ontology.version()}')

HPO version: 2025-01-16


In [5]:
def obo_to_labeled_phenotype(obo):
    try:
        term = pyhpo.Ontology.get_hpo_object(obo.replace('obo:HP_', 'HP:'))
        return {'id': term.id, 'label': term.name}
    except:
        return {'id': obo}

dcsource_path = jsonpath_ng.parse('$.."dc:source"')

def id_or_value(x):
    if isinstance(x, dict):
        return x.get('id', '')
    return x

def proband_phenotype_report(proband):
    phenotypes = [obo_to_labeled_phenotype(p) for p in ensure_list(proband.get('phenotypes', []))]
    sources = dcsource_path.find(proband)
    pmids = list(set(id_or_value(s.value).replace('https://pubmed.ncbi.nlm.nih.gov/', '') for s in sources))
    return {
        'id': proband['id'],
        'pmids': pmids,
        'label': proband['rdfs:label'],
        'phenotypes': phenotypes,
        'sex': proband.get('sex', 'UNKNOWN'),
        'ageType': proband.get('ageType', ''),
        'ageValue': proband.get('ageValue', ''),
        'ageUnit': proband.get('ageUnit', ''),
        'variantObservations': [
            { 'alleles': [
                { 'canonical_reference': allele.get('https://terms.ga4gh.org/CanonicalReference', {'id': ''})['id'],
                  'label': allele['http://www.w3.org/2004/02/skos/core#prefLabel']
                } for allele in ensure_list(vo['allele'])],
              'alleleOrigin': vo.get('alleleOrigin', ''),
              'zygosity': vo.get('zygosity', ''),
            } for vo in ensure_list(proband['variant'])],
    }

In [6]:
def frame_probands(genegraph_data):
    frame = { '@context': genegraph_data['@context'], '@type': 'Proband', '@embed': '@link' }
    return pyld.jsonld.frame(genegraph_data, frame)

def frame_probands_from_file(f):
    with open(f) as inf:
        return frame_probands(json.load(inf))

In [9]:
frame_probands_from_file('input/gene-validity-202512/gg_027327c3-27fd-4703-be91-93467cb392e5v1.0.json')['@graph'][0]

{'id': 'https://genegraph.clinicalgenome.org/r/066ea23a-30b0-47d0-92a5-019d411a6c1a',
 'type': 'Proband',
 'dc:source': 'https://pubmed.ncbi.nlm.nih.gov/22693284',
 'rdfs:label': 'Thevenon Family 3 Proband',
 'ageType': 'AgeAtReport',
 'ageUnit': 'Years',
 'ageValue': 4,
 'allele': {'id': 'https://genegraph.clinicalgenome.org/r/fe32e7b8-c950-4866-a010-ac369a505269',
  'type': 'https://terms.ga4gh.org/VariationDescriptor',
  'http://www.w3.org/2004/02/skos/core#prefLabel': 'GRCh38 1p36.31(chr1:6794391-6843539)x1'},
 'detectionMethod': 'Illumina HumanCytoSNP-12 v2.1 chips\n',
 'firstTestingMethod': 'Chromosomal microarray',
 'phenotypeFreeText': 'Hypotonia and motor delay from birth, ataxia at 17 months, walking at 21 months.\n ',
 'phenotypes': ['obo:HP_0000750',
  'obo:HP_0001252',
  'obo:HP_0001270',
  'obo:HP_0001251'],
 'sex': 'Female',
 'variant': {'id': 'https://genegraph.clinicalgenome.org/r/779eb9ee-ca03-4765-a0ae-7c00de541e36_variant_evidence_item',
  'type': 'VariantObservatio

In [10]:
proband_summaries_all = {}
for f in glob.glob('input/gene-validity-202512/*.json'):
    probands_framed = frame_probands_from_file(f)
    if '@graph' in probands_framed:
        for p in probands_framed['@graph']:
            report = proband_phenotype_report(p)
            proband_summaries_all[report['id']] = report
    else:
        logger.warning(f'No proband data in {f}')

No proband data in input/gene-validity-202512/gg_58ac87e7-9052-42c2-9037-feefc9333576v1.0.json
No proband data in input/gene-validity-202512/gg_264745c2-d298-433e-b07c-461354e6d1d7v1.1.json
No proband data in input/gene-validity-202512/gg_9047de01-be16-4d1d-8544-03c6a5afb096v1.0.json
No proband data in input/gene-validity-202512/gg_8deefe2b-8506-4c36-93db-724c7256c34ev2.0.json
No proband data in input/gene-validity-202512/gg_bc29235c-ee56-4523-ab37-4d4d10e0f89bv1.0.json
No proband data in input/gene-validity-202512/gg_631b8092-820c-4517-99fe-c226ed6ec8d5v1.0.json
No proband data in input/gene-validity-202512/gg_b2b3e8a3-2edf-4c71-9a0c-ba086b1f4398v1.2.json
No proband data in input/gene-validity-202512/gg_152ff76a-8b2a-4dcf-af23-dda71389523dv2.0.json
No proband data in input/gene-validity-202512/gg_cf116c3b-df4f-4b3f-886b-c04508a77ccdv1.1.json
No proband data in input/gene-validity-202512/gg_2c29334c-3292-46f6-98d6-da56cd4137dbv1.2.json
No proband data in input/gene-validity-202512/gg_1

In [11]:
len(proband_summaries_all)

20499

In [12]:
proband_summaries_with_phenotypes = {k: v for (k, v) in proband_summaries_all.items() if len(v['phenotypes']) > 0}
len(proband_summaries_with_phenotypes)

14161

In [14]:
print(yaml.dump(list(proband_summaries_with_phenotypes.items())[:5]))

- !!python/tuple
  - https://genegraph.clinicalgenome.org/r/32920af3-22bf-4620-b780-e1c00208c36d
  - ageType: AgeAtDiagnosis
    ageUnit: Months
    ageValue: 30
    id: https://genegraph.clinicalgenome.org/r/32920af3-22bf-4620-b780-e1c00208c36d
    label: F369-1
    phenotypes:
    - id: HP:0000090
      label: Nephronophthisis
    pmids:
    - '19177160'
    sex: UnknownEthnicity
    variantObservations:
    - alleleOrigin:
        id: cg:GermlineAlleleOrigin
      alleles:
      - canonical_reference: http://reg.genome.network/allele/CA2621987
        label: NM_153240.5(NPHP3):c.2563C>T (p.Gln855Ter)
      zygosity:
        id: cg:TwoVariantsInTrans
- !!python/tuple
  - https://genegraph.clinicalgenome.org/r/5fcea5fa-7a54-41a9-ac74-30df952a49bb
  - ageType: AgeAtDiagnosis
    ageUnit: Months
    ageValue: 2
    id: https://genegraph.clinicalgenome.org/r/5fcea5fa-7a54-41a9-ac74-30df952a49bb
    label: '1348'
    phenotypes:
    - id: HP:0000090
      label: Nephronophthisis
    - id:

In [13]:
import statistics as stat

In [15]:
pheno_counts = [len(x['phenotypes']) for x in proband_summaries_with_phenotypes.values()]

In [16]:
stat.mean(pheno_counts)

5.767742391074076

In [17]:
stat.median(pheno_counts)

4

In [19]:
max(pheno_counts)

63