In [1]:
import collections
import itertools
import pyld
import json
import phenopackets as pps
import pyhpo
import jsonpath_ng
from google.protobuf.json_format import MessageToJson
from google.protobuf.timestamp_pb2 import Timestamp
import glob
import gzip
import csv
import statistics as stat

JSON-LD, being very RDF-y, will represent 0..* fields as single items if there's just one, and lists if more than one. Sometimes it's more convenient to be consistent.

#### Function checks whether the param is list or not and returns list

- Use ??

In [3]:
def ensure_list(value):
    if isinstance(value, list):
        return value
    return [value]

In [4]:
ensure_list(1)

[1]

In [8]:
##### Created a dictionary of mondo terms using the gzip file, and format MONDO_ID : label 

## Use case??

In [7]:
with gzip.open('MONDO.csv.gz', 'rt') as csvf:
    reader = csv.DictReader(csvf)
    class_prefix = 'http://purl.obolibrary.org/obo/'
    mondo_lookup = dict((row['Class ID'][len(class_prefix):], row['Preferred Label']) for row in reader if row['Class ID'].startswith(class_prefix))

In [4]:
list(mondo_lookup.items())[:3]

[('MONDO_0044647',
  'kyphosis-lateral tongue atrophy-myofibrillar myopathy syndrome'),
 ('MONDO_1010032', 'Jacobsen syndrome, non-human animal'),
 ('MONDO_0007146', 'obsolete apnea, central sleep')]

In [None]:
# Using pyhpo ontologies 

In [5]:
pyhpo.Ontology()
timestamp = Timestamp()

In [6]:
pyhpo.Ontology.get_hpo_object(321).name

'Square face'

In [12]:
j = json.load(open('input/gene-validity-jsonld-dec-2024/cggv_c5a4546e-f7a6-4d4a-bd0d-763e2b7dca74v1.0.json'))

In [15]:
# Good Approach to get disease and genes data for the subject 

In [14]:
j['subject']['disease']

'obo:MONDO_0014526'

The idea behind using JSON-LD framing is two-fold:
* It gives us a quick way to get down to the probands
* By specifying `embed` of `@always`, we should be able to avoid having to keep our own dictionary of objects (*for the most part*... pyld does not appear to handle re-embeddeding `dc:source` when needed below)

In [29]:
frame = {'@context': j['@context'], '@type': 'Proband', '@embed': '@always'}

In [30]:
framed = pyld.jsonld.frame(j, frame)

In [31]:
framed['@graph'][0].keys()

dict_keys(['id', 'type', 'ageType', 'ageUnit', 'ageValue', 'allele', 'detectionMethod', 'firstTestingMethod', 'phenotypeFreeText', 'phenotypes', 'previousTesting', 'previousTestingDescription', 'sex', 'variant', 'rdfs:label'])

*FIXME*: not really sure what to use for `resources`, so just going with what is in the phenopacket store for now...

In [16]:
## Looks like metadata of the phenopackets 

In [12]:
resources = [
  pps.Resource(**resource) for resource in [
    {
      "id": "geno",
      "name": "Genotype Ontology",
      "url": "http://purl.obolibrary.org/obo/geno.owl",
      "version": "2022-03-05",
      "namespace_prefix": "GENO",
      "iri_prefix": "http://purl.obolibrary.org/obo/GENO_"
    },
    {
      "id": "hgnc",
      "name": "HUGO Gene Nomenclature Committee",
      "url": "https://www.genenames.org",
      "version": "06/01/23",
      "namespace_prefix": "HGNC",
      "iri_prefix": "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/"
    },
    {
      "id": "omim",
      "name": "An Online Catalog of Human Genes and Genetic Disorders",
      "url": "https://www.omim.org",
      "version": "January 4, 2023",
      "namespace_prefix": "OMIM",
      "iri_prefix": "https://www.omim.org/entry/"
    },
    {
      "id": "so",
      "name": "Sequence types and features ontology",
      "url": "http://purl.obolibrary.org/obo/so.obo",
      "version": "2021-11-22",
      "namespace_prefix": "SO",
      "iri_prefix": "http://purl.obolibrary.org/obo/SO_"
    },
    {
      "id": "hp",
      "name": "human phenotype ontology",
      "url": "http://purl.obolibrary.org/obo/hp.owl",
      "version": "2024-07-01",
      "namespace_prefix": "HP",
      "iri_prefix": "http://purl.obolibrary.org/obo/HP_"
    }
  ]
]

In [17]:
## Phenotypic Feature population, getting hpo ids and storing their labels to it 

In [13]:
def phenotype_element_from_id(hpoid, **kwargs):
    term = pyhpo.Ontology.get_hpo_object(hpoid)
    return pps.PhenotypicFeature(type=pps.OntologyClass(id=term.id, label=term.name), **kwargs)

In [32]:
framed['@graph']

[{'id': 'cggv:11923ba7-9328-42f0-b624-7328f0923f32',
  'type': 'Proband',
  'ageType': 'AgeAtOnset',
  'ageUnit': 'Years',
  'ageValue': 49,
  'allele': [{'id': 'cggv:62891984-c008-40d5-8d73-edba47e382f1',
    'type': 'https://terms.ga4gh.org/VariationDescriptor',
    'http://www.w3.org/2004/02/skos/core#prefLabel': 'NM_004130.4(GYG1):c.143+3G>C',
    'https://terms.ga4gh.org/CanonicalReference': {'id': 'http://reg.genome.network/allele/CA175125'}},
   {'id': 'cggv:6403f94c-81c9-474e-b715-0422313deae4',
    'type': 'https://terms.ga4gh.org/VariationDescriptor',
    'http://www.w3.org/2004/02/skos/core#prefLabel': 'NM_004130.4(GYG1):c.970C>T (p.Arg324Ter)',
    'https://terms.ga4gh.org/CanonicalReference': {'id': 'http://reg.genome.network/allele/CA175126'}}],
  'detectionMethod': 'Previous genotyping ruled out disease-causing variants in the GBE1 or RBCK1 loci. Subsequent genotyping performed Sanger-based sequencing of cDNA from the GYG1 locus.',
  'firstTestingMethod': 'Sanger sequenc

In [14]:
# F1: extract labeled phenotype by using the HPO ids from gene graph data  , converting obo:HP to HP 


def obo_to_labeled_phenotype(obo):
    try:
        term = pyhpo.Ontology.get_hpo_object(obo.replace('obo:HP_', 'HP:'))
        return {'id': term.id, 'label': term.name}
    except:
        return {'id': obo}


## It will find all the dc:source items under the root dictionary ( .. deep scan operator )
dcsource_path = jsonpath_ng.parse('$.."dc:source"')


# Find value associated with id key for the argument 
def id_or_value(x):
    if isinstance(x, dict):
        return x.get('id', '')
    return x

def find_sources(proband):
    try:
        dc_sources = [proband['dc:source']]
    except KeyError:
        try:
          found = reversed(dcsource_path.find(proband))  # Why is reversed used ? ? 
            
          # Note: there is some hacky handling of non-embeded dc:source by testing
          #   for a string and ignoring (since it *should* be embedded elsewhere, then
          #   but this will probably break when there are multiple probands from some
          #   source in same file. Maybe pyld's @embed: @always will work better
          #   some day...
          sources_by_id = dict((x.value.get('id', 'UNKNOWN'), x.value)
                               for x in found
                               if not isinstance(x.value, str))
          dc_sources = list(sources_by_id.values())
        except Exception as e:
            print(f'ERROR finding dc:source with {proband=}')
            print(e)
            dc_sources = []
    return dc_sources

def proband_phenotype_report(proband):
    phenotypes = [obo_to_labeled_phenotype(p) for p in ensure_list(proband.get('phenotypes', []))]
    sources = find_sources(proband)
    #FIXME
    try:
        pmids = ','.join(set(s['id'].replace('https://pubmed.ncbi.nlm.nih.gov/', '') for s in sources)) # Clustered all the pmids with the associated proband
    except:
        pmids = ''
    return {
        'id': proband['id'],
        'pmids': pmids,
        'label': proband['rdfs:label'],
        'phenotypes': phenotypes,
    }

proband_phenotype_report(framed['@graph'][4])

{'id': 'cggv:a8ae038d-0f70-4e7b-bc6a-b09cc9767c3d',
 'pmids': '25272951',
 'label': 'Malfatti_2014_Proband_P1',
 'phenotypes': [{'id': 'HP:0008988', 'label': 'Pelvic girdle muscle atrophy'},
  {'id': 'HP:0034519',
   'label': 'Muscle fiber fuchsinophilic inclusion bodies'},
  {'id': 'HP:0003473', 'label': 'Fatigable weakness'},
  {'id': 'HP:0030225', 'label': 'Accumulation of muscle fiber desmin'},
  {'id': 'HP:0003236',
   'label': 'Elevated circulating creatine kinase concentration'},
  {'id': 'HP:0003749', 'label': 'Pelvic girdle muscle weakness'},
  {'id': 'HP:0003458', 'label': 'EMG: myopathic abnormalities'}]}

In [15]:
framed['@graph'][0]['id']

'cggv:11923ba7-9328-42f0-b624-7328f0923f32'

In [16]:
def proband_to_phenopacket(proband):
    references = []
    for dc_source in find_sources(proband):
      pmid = dc_source['id'].replace('https://pubmed.ncbi.nlm.nih.gov/', '')
      reference = pps.ExternalReference(id=f'PMID:{pmid}', reference=dc_source['id'], description=dc_source['dc:title'])
      referenceEvidence = pps.Evidence(reference=reference, evidence_code=pps.OntologyClass(id="ECO:0006017", label="author statement from published clinical study used in manual assertion"))
      references.append(reference)
    phenotypes = [phenotype_element_from_id(p.replace('obo:HP_', 'HP:'), evidence=[referenceEvidence]) for p in proband.get('phenotypes', [])]
    metadata = pps.MetaData(external_references=references,
                            created=timestamp.GetCurrentTime(),
                            created_by="Automated import from ClinGen GCI data",
                            phenopacket_schema_version="2.0",
                            resources=resources)
    subject_args = {'id': f'{pmid}:{proband['rdfs:label']}'}
    try:
        subject_args['sex'] = pps.Sex.Value(proband['sex'])
    except:
        subject_args['sex'] = pps.Sex.UNKNOWN_SEX
    individual = pps.Individual(**subject_args)
    phenopacket = pps.Phenopacket(id=proband['id'], subject=individual, phenotypic_features=phenotypes, meta_data=metadata)
    return phenopacket

proband_to_phenopacket(framed['@graph'][0])

id: "cggv:11923ba7-9328-42f0-b624-7328f0923f32"
subject {
  id: "25272951:Malfatti_2014_Proband_P7"
}
phenotypic_features {
  type {
    id: "HP:0003547"
    label: "Shoulder girdle muscle weakness"
  }
  evidence {
    evidence_code {
      id: "ECO:0006017"
      label: "author statement from published clinical study used in manual assertion"
    }
    reference {
      id: "PMID:25272951"
      reference: "https://pubmed.ncbi.nlm.nih.gov/25272951"
      description: "A new muscle glycogen storage disease associated with glycogenin-1 deficiency."
    }
  }
}
phenotypic_features {
  type {
    id: "HP:0009077"
    label: "Weakness of long finger extensor muscles"
  }
  evidence {
    evidence_code {
      id: "ECO:0006017"
      label: "author statement from published clinical study used in manual assertion"
    }
    reference {
      id: "PMID:25272951"
      reference: "https://pubmed.ncbi.nlm.nih.gov/25272951"
      description: "A new muscle glycogen storage disease associated wi

In [17]:
inputs = sorted(glob.glob('input/gene-validity-jsonld-dec-2024/*.json'))[::-1]
len(inputs)

2913

In [18]:
proband_summaries = {}
for f in inputs:
    with open(f) as jf:
        j = json.load(jf)
    framed = pyld.jsonld.frame(j, frame)
    for proband in framed.get('@graph', []):
        if proband['id'] in proband_summaries:
            print(f'WARNING: more than one record for {proband['id']}, not sure which one to use!')
        summary = proband_phenotype_report(proband)
        summary['evidenceStrength'] = j.get('evidenceStrength', '')
        disease_ids = [x.replace('obo:', '').replace(':', '_') for x in ensure_list(j.get('subject', {}).get('disease', 'unknown'))]
        summary['disease'] = [{'id': did, 'label': mondo_lookup.get(did, '')} for did in disease_ids]
        proband_summaries[proband['id']] = summary


In [19]:
probands_with_phenotypes = [ps for ps in proband_summaries.values() if len(ps['phenotypes'])> 0]
print(f'Probands with phenotypes: {len(probands_with_phenotypes)}')
print(f'Probands with phenotypes where evidence at least moderate: {len([x for x in probands_with_phenotypes if x.get('evidenceStrength') in ('Moderate', 'Strong', 'Definitive')])}')

Probands with phenotypes: 11520
Probands with phenotypes where evidence at least moderate: 11042


In [20]:
collections.Counter([x.get('disease', [{}])[0].get('label', '') for x in probands_with_phenotypes])

Counter({'nonsyndromic genetic hearing loss': 341,
         'complex neurodevelopmental disorder': 325,
         'mitochondrial disease': 308,
         'hypertrophic cardiomyopathy': 186,
         'Leigh syndrome': 133,
         'developmental and epileptic encephalopathy': 128,
         'autosomal recessive limb-girdle muscular dystrophy': 127,
         'dilated cardiomyopathy': 126,
         'ciliopathy': 122,
         'Charcot-Marie-Tooth disease': 115,
         'monogenic diabetes': 94,
         'syndromic intellectual disability': 76,
         'peroxisome biogenesis disorder': 69,
         'amyotrophic lateral sclerosis type 10': 66,
         'long QT syndrome': 64,
         'neuronal ceroid lipofuscinosis': 62,
         'hereditary pheochromocytoma-paraganglioma': 60,
         'X-linked syndromic intellectual disability': 57,
         'collagen 6-related myopathy': 50,
         'inherited retinal dystrophy': 46,
         'arrhythmogenic right ventricular cardiomyopathy': 42,
    

In [21]:
with open('probands_with_phenotypes.tsv', 'wt') as ouf:
    print('\t'.join(('evidence_strength', 'evidence_id', 'pmid', 'label', 'phenotypes', 'phenotype_labels', 'diseases', 'disease_labels')), file=ouf)
    for line in probands_with_phenotypes:
        phenotypes = '|'.join(x['id'] for x in line.get('phenotypes'))
        phenotype_labels = '|'.join(x.get('label', '') for x in line.get('phenotypes'))
        diseases = '|'.join(x['id'] for x in line.get('disease'))
        disease_labels = '|'.join(x['label'] for x in line.get('disease'))
        print('\t'.join((
            line['evidenceStrength'],
            line['id'],
            line['pmids'],
            line['label'],
            phenotypes,
            phenotype_labels,
            diseases,
            disease_labels,
        )), file=ouf)

In [24]:
phenotype_counts = [len(p['phenotypes']) for p in proband_summaries.values()]

In [32]:
countdata = phenotype_counts
print(f'Among all {len(countdata)} probands:')
print(f'mean: {stat.mean(countdata)}')
print(f'stdev: {stat.stdev(countdata)}')
print(f'median: {stat.median(countdata)}')
print(f'max: {max(countdata)}')

Among all 16635 probands:
mean: 3.934776074541629
stdev: 5.04361303690977
median: 2
max: 63


In [33]:
countdata = [pc for pc in phenotype_counts if pc != 0]
print(f'Among {len(countdata)} probands with at least one phenotype reported:')
print(f'mean: {stat.mean(countdata)}')
print(f'stdev: {stat.stdev(countdata)}')
print(f'median: {stat.median(countdata)}')
print(f'max: {max(countdata)}')

Among 11520 probands with at least one phenotype reported:
mean: 5.681857638888889
stdev: 5.177473698230645
median: 4.0
max: 63


In [42]:
terms_by_count = collections.Counter()
for p in proband_summaries.values():
    terms_by_count.update(pheno.get('label', pheno.get('id', 'UNKNOWN')) for pheno in p['phenotypes'])
terms_by_count

Counter({'Global developmental delay': 1180,
         'Seizure': 950,
         'Hypotonia': 778,
         'Microcephaly': 750,
         'Intellectual disability': 721,
         'Delayed speech and language development': 452,
         'Motor delay': 444,
         'Short stature': 429,
         'Failure to thrive': 419,
         'Feeding difficulties': 366,
         'Abnormal facial shape': 358,
         'Ataxia': 338,
         'Scoliosis': 325,
         'Thrombocytopenia': 317,
         'Hepatomegaly': 300,
         'Generalized hypotonia': 299,
         'Nystagmus': 295,
         'Hypertrophic cardiomyopathy': 291,
         'Elevated circulating creatine kinase concentration': 287,
         'Hypertelorism': 260,
         'Intellectual disability, severe': 253,
         'Muscle weakness': 249,
         'Bronchiectasis': 228,
         'Cerebellar atrophy': 223,
         'Hearing impairment': 220,
         'Growth delay': 219,
         'Sensorineural hearing impairment': 213,
         'Dy

In [33]:
import pandas as pd

In [34]:
df = pd.read_csv("probands_with_phenotypes.tsv", sep='\t')

In [35]:
df.head()

Unnamed: 0,evidence_strength,evidence_id,pmid,label,phenotypes,phenotype_labels,diseases,disease_labels
0,Definitive,cggv:0595520c-2e2f-477c-b8d7-6482e46a3087,30778343,P25,HP:0000278|HP:0000369|HP:0031691|HP:0004430,Retrognathia|Low-set ears|Severe viral infecti...,MONDO_0014423,severe combined immunodeficiency due to DNA-PK...
1,Definitive,cggv:4313ec4d-811f-4f4f-8740-baf1c7586553,30121298,P1,HP:0002028|HP:0030828|HP:0010976|HP:0002788|HP...,Chronic diarrhea|Wheezing|B lymphocytopenia|Re...,MONDO_0014423,severe combined immunodeficiency due to DNA-PK...
2,Definitive,cggv:a4e0f372-6094-418a-a34c-2d2567f1246a,26546606,P1,HP:0005403|HP:0010997|HP:0004430|HP:0010976,T lymphocytopenia|Chromosomal breakage induced...,MONDO_0014423,severe combined immunodeficiency due to DNA-PK...
3,Definitive,cggv:deb4feae-d140-4a6b-80d0-805a6750c685,19075392,ID177,HP:0010976|HP:0010997|HP:0004430|HP:0005403,B lymphocytopenia|Chromosomal breakage induced...,MONDO_0014423,severe combined immunodeficiency due to DNA-PK...
4,Definitive,cggv:efab95d2-93b5-434b-a7f3-8a065c1d9c52,25842288,Pt2,HP:0002110|HP:0006538|HP:0001888|HP:0032252|HP...,Bronchiectasis|Recurrent bronchopulmonary infe...,MONDO_0014423,severe combined immunodeficiency due to DNA-PK...


In [40]:
df[df["pmid"].map( lambda x : len([x])) >1]

Unnamed: 0,evidence_strength,evidence_id,pmid,label,phenotypes,phenotype_labels,diseases,disease_labels


In [41]:
## Issues / Doubts 

1) Why pmids were extracted and then reversed ? 
2) RDFS labels are not corrected as evident in my code
3) 