#### Checks to be conducted :

- Check every file for probands that are missing `dc:source`, and decide what can be done in such cases.
- Check whether the gene associated with a file is the same as the gene associated with its variants. Note that this need not be true.
- Check the disease associated with each file and decide whether to use it to fill the **interpretations** element of the phenopacket, since its **diagnosis** element requires a **disease**.
- For the phenotypes of a proband, how many PubMed IDs are associated with them (theoretically 1)?

#### Issues pending
- Error handling for certain dictionaries
- Should we add gene ontology for allele descriptions

In [None]:
# Library Imports
import csv
import glob
from google.protobuf.timestamp_pb2 import Timestamp
import gzip
import json
import jsonpath_ng
import mygene
import phenopackets.schema.v2 as pps2
import pyhpo
import pyld

#### Constants and housekeeping definitions

In [None]:
INPUT_FOLDER = '../../../Downloads/gene_disease_validity_20250616/'
PUBMED_URL_ROOT = 'https://pubmed.ncbi.nlm.nih.gov/'
timestamp = Timestamp()
mg = mygene.MyGeneInfo()

In [None]:
# Dictionary MONDO_ID : label
# The key is the part of 'Class ID' that follows the OBO PURL prefix; rows whose
# 'Class ID' does not start with that prefix are skipped.
# The encoding is pinned so labels decode the same way on every platform
# (assumes the export is UTF-8 -- TODO confirm), and newline="" follows the csv
# module's guidance for files handed to a reader.
with gzip.open("MONDO.csv.gz", "rt", encoding="utf-8", newline="") as csvzip:
    csvf = csv.DictReader(csvzip)
    class_prefix = 'http://purl.obolibrary.org/obo/'
    mondo_lookup = {
        row['Class ID'][len(class_prefix):]: row['Preferred Label']
        for row in csvf
        if row['Class ID'].startswith(class_prefix)
    }

# Test
list(mondo_lookup.items())[:4]

In [None]:
pyhpo.Ontology()

#### Function Definitions

In [None]:
# Normalise a value to a list: lists pass through, None becomes [], anything else is wrapped
def ensure_list(value):
    """Return *value* as a list.

    Args:
        value: Any object, or None.

    Returns:
        *value* itself when it is already a list, an empty list when it is
        None, and a one-element list containing *value* otherwise.
    """
    if isinstance(value, list):
        return value
    # Identity test on purpose: ``== None`` would defer to a custom ``__eq__``
    # and could misclassify (or fail on) objects that overload equality.
    if value is None:
        return []
    return [value]
# Test
ensure_list([])

In [None]:
# Look up `key` in `x` when `x` is a dictionary; otherwise hand `key` back unchanged
def check_value(x, key):
    """Fetch ``x[key]`` from a dictionary, tolerating non-dictionary input.

    Args:
        x: The object to look in.
        key: The key to look up.

    Returns:
        ``x.get(key)`` (None when the key is absent) if *x* is a dict;
        otherwise *key* itself.
    """
    if not isinstance(x, dict):
        return key
    return x.get(key)

# Test
check_value(obo_to_labelled_phenotype ('obo:HP_0034519'), 'label') # Function defined later

In [None]:
# Test file for framing
# Read through a context manager so the handle is closed, and build the path from
# INPUT_FOLDER (same '/'-joined form as the glob over the input files) instead of
# repeating the folder literal.
with open(f"{INPUT_FOLDER}/cggv_0a0e2c4c-4d2f-4157-a611-75866f061bacv1.2.json", encoding="utf-8") as test_fh:
    test_file = json.load(test_fh)

# JSON-LD Framing
# The frame reuses the document's own @context and selects nodes of type
# 'Proband' with '@embed': '@always'.
frame = {'@context': test_file['@context'], '@type': 'Proband', '@embed': '@always'}
framed = pyld.jsonld.frame(test_file, frame)

# Test 
framed['@graph'][0].keys()

In [None]:
def obo_to_labelled_phenotype(obo):
    """Turn an ``obo:HP_…`` identifier into a labelled phenotype dictionary.

    The ``obo:HP_`` prefix is rewritten to ``HP:`` and the term is looked up
    with pyhpo.

    Args:
        obo: Phenotype identifier, e.g. ``'obo:HP_0034519'``.

    Returns:
        ``{'id': <term id>, 'label': <term name>}`` when the lookup succeeds,
        otherwise ``{'id': obo}`` with the identifier exactly as it was given.
    """
    try:
        term = pyhpo.Ontology.get_hpo_object(obo.replace('obo:HP_', 'HP:'))
        return {'id': term.id, 'label': term.name}
    except Exception:
        # Best effort: any lookup failure falls back to the raw identifier.
        # `Exception` (rather than a bare except) keeps KeyboardInterrupt and
        # SystemExit from being swallowed here.
        return {'id': obo}

# Test
obo_to_labelled_phenotype ('obo:HP_0034519') 

In [None]:
# Fall back to jsonpath_ng if the proband doesn't have a 'dc:source' key
dcsource_path = jsonpath_ng.parse("$..'dc:source'") 

def find_sources(proband):
    """Collect the ``dc:source`` entries attached to a proband record.

    If the proband itself has a ``dc:source`` key, its value is returned
    untouched inside a one-element list.  Otherwise ``dcsource_path`` is used
    to search the whole record for nested ``dc:source`` values; the dictionary
    values found are de-duplicated on their ``'id'`` (``'UNKNOWN'`` when
    absent) and returned together as one inner list.

    NOTE: the two paths nest differently -- ``[value]`` for a direct key versus
    ``[[source, ...]]`` for the fallback -- so callers have to cope with both.

    Args:
        proband: Framed proband dictionary.

    Returns:
        A list as described above, or ``[]`` if the fallback search raised
        (the error is printed rather than propagated).
    """
    try:
        return [proband['dc:source']]
    except KeyError:
        pass

    try:
        sources_by_id = {}
        # Walk the matches back to front so that, when an id repeats, the
        # first match in document order is the one that is kept.
        for match in reversed(dcsource_path.find(proband)):
            source = match.value
            # The matched value (not the match wrapper) is what may be a bare
            # string; only dictionaries can be keyed on 'id', so skip the rest
            # instead of letting one string abort the whole search.
            if isinstance(source, dict):
                sources_by_id[source.get('id', 'UNKNOWN')] = source
        # Materialise the values so callers get a plain list, not a dict view.
        dc_sources = [list(sources_by_id.values())]
    except Exception as e:
        print(f"Error finding dc:source with {proband=}")
        print(e)
        dc_sources = []
    return dc_sources

# Test
print(find_sources(framed["@graph"][2]))

In [None]:
def variants_extraction(proband):
    """Collect the allele labels of every variant attached to a proband.

    Both ``proband['variant']`` and each variant's ``'allele'`` may be a single
    dictionary or a list of dictionaries.  A variant's ``'dc:source'`` may be a
    dictionary carrying an ``'id'`` or a plain string; in either case the
    PubMed URL root is stripped to form the ``PMID:`` key.

    Args:
        proband: Framed proband dictionary.

    Returns:
        A dictionary keyed ``'variant_<i>.<j>'`` (1-based variant and allele
        positions) whose values are one-entry dictionaries
        ``{'PMID:<id>': <allele prefLabel>}``.  Empty when the proband has no
        variants.  ``'PMID:UNKNOWN'`` is used when a variant has no usable
        ``dc:source``; alleles without a prefLabel are left out.
    """
    pref_label_key = "http://www.w3.org/2004/02/skos/core#prefLabel"
    variants = {}

    variants_data = ensure_list(proband.get('variant'))
    if not variants_data:
        print(f'No variant found in {proband=}')
        return variants

    for variant_number, item in enumerate(variants_data, start=1):
        if not isinstance(item, dict):
            # Nothing to read from a variant that was not embedded as a record.
            continue

        # Resolve the source for *this* variant only, so a variant without a
        # usable source never inherits the PMID of the previous one.
        source = item.get('dc:source')
        if isinstance(source, dict):
            source = source.get('id')
        if isinstance(source, str):
            pmid = 'PMID:' + source.replace(PUBMED_URL_ROOT, '')
        else:
            pmid = 'PMID:UNKNOWN'

        for allele_number, allele in enumerate(ensure_list(item.get('allele')), start=1):
            if not isinstance(allele, dict):
                continue
            label = allele.get(pref_label_key)
            if label is None:
                continue
            variants[f'variant_{variant_number}.{allele_number}'] = {pmid: label}

    return variants

# Test
variants_extraction(framed['@graph'][2])

In [None]:
def proband_to_phenotype_report(proband):
    """Summarise one framed proband as a plain dictionary.

    Args:
        proband: Framed proband dictionary; must contain ``'id'``.

    Returns:
        A dictionary with the keys ``'id'``, ``'pmids'`` (comma-separated,
        sorted, PubMed URL root removed), ``'phenotypes'`` (list of
        ``obo_to_labelled_phenotype`` results), ``'label'``, ``'age'``
        (``ageValue``, ``ageUnit`` and ``ageType`` joined by single spaces) and
        ``'variants'`` (result of ``variants_extraction``).

    Raises:
        KeyError: If the proband has no ``'id'``.
    """
    phenotypes = [obo_to_labelled_phenotype(p)
                  for p in ensure_list(proband.get('phenotypes', []))]

    source_ids = set()
    for entry in find_sources(proband):
        # find_sources can hand back a single source (dictionary or string) or
        # a collection of sources per entry, so flatten both shapes here.
        if isinstance(entry, (dict, str)):
            group = [entry]
        else:
            try:
                group = list(entry)
            except TypeError:
                continue
        for source in group:
            source_id = source.get('id') if isinstance(source, dict) else source
            if isinstance(source_id, str):
                source_ids.add(source_id.replace(PUBMED_URL_ROOT, ''))
    # Sorted so the same proband always yields the same string.
    pmids = ','.join(sorted(source_ids))

    age = f"{proband.get('ageValue', '')} {proband.get('ageUnit', '')} {proband.get('ageType', '')}"

    return {
        'id': proband['id'],
        'pmids': pmids,
        'phenotypes': phenotypes,
        'label': proband.get('rdfs:label', ''),
        'age': age,
        'variants': variants_extraction(proband),
    }
            

# Test
proband_to_phenotype_report(framed['@graph'][1])    

In [None]:
# Every '*.json' file directly inside INPUT_FOLDER, in sorted order; the second
# line only displays how many were found.
files = sorted(glob.glob(f'{INPUT_FOLDER}/*.json'))
len(list(files))

In [None]:
# Build one summary per proband id across all input files.
# NOTE: `framed` and `proband` are module-level names, so after the loop they
# stay bound to the last file's framed document and its last proband; the
# phenopacket test cell further down reads `proband`.
proband_summaries = {}
for file in files:
    with open(file, 'rt') as jf:
        json_file = json.load(jf)
        # Re-frame each document with the `frame` built from the test file.
        framed = pyld.jsonld.frame(json_file, frame)
        for proband in framed.get('@graph',[]):
            # A repeated id is only reported; the later record then overwrites
            # the earlier one in the assignment below.
            if proband['id'] in proband_summaries:
                print(f'WARNING: more than one record for {proband["id"]}, not sure which one to use.')
            summary = proband_to_phenotype_report(proband)
            # The evidence strength is read from the top level of the file, not from the proband.
            summary['evidence_strength'] = json_file.get('evidenceStrength','')
            proband_summaries[proband['id']] = summary        
            

#### The structure of the data is not consistent

- Sometimes the variants are a single dictionary; other times they are a list of dictionaries
- Sometimes the alleles in a variant are a single dictionary; other times they are a list of dictionaries
- Also, the dc:source of a variant is sometimes a dictionary with an 'id' attribute, and other times just a single element without an 'id' attribute

### Phenopacket Generation

#### Constants

In [None]:
# Description of external resources used for referencing an object
# Each dictionary below is expanded into a pps2.Resource; the resulting list is
# what gets passed as `resources` when the phenopacket MetaData is built.
# NOTE(review): the version strings are hard-coded -- confirm they match the
# ontology releases actually used for the lookups.
resources = [
  pps2.Resource(**resource) for resource in [
    {
      "id": "geno",
      "name": "Genotype Ontology",
      "url": "http://purl.obolibrary.org/obo/geno.owl",
      "version": "2022-03-05",
      "namespace_prefix": "GENO",
      "iri_prefix": "http://purl.obolibrary.org/obo/GENO_"
    },
    {
      "id": "hgnc",
      "name": "HUGO Gene Nomenclature Committee",
      "url": "https://www.genenames.org",
      "version": "06/01/23",
      "namespace_prefix": "HGNC",
      "iri_prefix": "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/"
    },
    {
      "id": "omim",
      "name": "An Online Catalog of Human Genes and Genetic Disorders",
      "url": "https://www.omim.org",
      "version": "January 4, 2023",
      "namespace_prefix": "OMIM",
      "iri_prefix": "https://www.omim.org/entry/"
    },
    {
      "id": "hp",
      "name": "human phenotype ontology",
      "url": "http://purl.obolibrary.org/obo/hp.owl",
      "version": "2024-07-01",
      "namespace_prefix": "HP",
      "iri_prefix": "http://purl.obolibrary.org/obo/HP_"
    },
    { "id": "mondo",
      "name": "Mondo Disease Ontology",
      "url": "http://purl.obolibrary.org/obo/mondo.owl",
      "version": "2025-06-03",
      "namespace_prefix": "MONDO",
      "iri_prefix": "http://purl.obolibrary.org/obo/MONDO_" 
    }
  ]
]

#### Functions

- Read the phenopacket documentation to understand the code; there was no API documentation available while writing this code ( https://phenopacket-schema.readthedocs.io/en/latest/toplevel.html )
- The levels of the hierarchy follow the structure given in the documentation

In [None]:
def phenotype_element_from_id(hpoid, **kwargs):
    """Build a PhenotypicFeature for an HPO term.

    Args:
        hpoid: Identifier handed to ``pyhpo.Ontology.get_hpo_object``.
        **kwargs: Extra fields forwarded unchanged to ``pps2.PhenotypicFeature``.

    Returns:
        A ``pps2.PhenotypicFeature`` whose ``type`` carries the id and name of
        the term that pyhpo returned.
    """
    hpo_term = pyhpo.Ontology.get_hpo_object(hpoid)
    feature_type = pps2.OntologyClass(id=hpo_term.id, label=hpo_term.name)
    return pps2.PhenotypicFeature(type=feature_type, **kwargs)

In [None]:
def disease_element_from_id(mondo_id):
    """Build the OntologyClass for a disease identifier.

    Every ``'obo:'`` in *mondo_id* is removed; the result is the key used in
    ``mondo_lookup`` for the label (empty string when it is not listed) and,
    with each ``'_'`` replaced by ``':'``, the id of the returned class.

    Args:
        mondo_id: Disease identifier string.

    Returns:
        A ``pps2.OntologyClass`` with that id and label.
    """
    lookup_key = mondo_id.replace('obo:', '')
    return pps2.OntologyClass(
        id=lookup_key.replace('_', ':'),
        label=mondo_lookup.get(lookup_key, ''),
    )
    

In [None]:
def _iter_source_records(sources):
    """Yield one dictionary per source, whatever nesting ``find_sources`` returned."""
    for entry in sources:
        # An entry is either a single source (dictionary or string) or a
        # collection of sources.
        if isinstance(entry, (dict, str)):
            group = [entry]
        else:
            try:
                group = list(entry)
            except TypeError:
                continue
        for source in group:
            if isinstance(source, str):
                # A bare string source is treated as the source id itself.
                yield {'id': source}
            elif isinstance(source, dict):
                yield source


def proband_to_phenopacket(proband, file):
    """Convert one framed proband into a Phenopacket.

    Args:
        proband: Framed proband dictionary; must contain ``'id'``.
        file: The parsed JSON document the proband came from; its
            ``['subject']['disease']`` supplies the diagnosis.

    Returns:
        A ``pps2.Phenopacket`` with the subject, the phenotypic features (each
        carrying the source evidence), one interpretation holding a genomic
        interpretation per extracted allele, and the metadata.

    Raises:
        KeyError: If ``proband['id']`` or ``file['subject']['disease']`` is
            missing.
    """
    references = []
    referenceEvidence = []
    # Placeholder for the subject id when the proband has no usable source.
    subject_pmid = 'UNKNOWN'
    for dc_source in _iter_source_records(find_sources(proband)):
        source_url = dc_source.get('id')
        if not isinstance(source_url, str) or not source_url:
            continue
        # As before, the subject id ends up using the most recently seen source.
        subject_pmid = source_url.replace(PUBMED_URL_ROOT, '')
        reference = pps2.ExternalReference(id=f'PMID:{subject_pmid}',
                                           reference=source_url,
                                           description=dc_source.get('dc:title') or '')
        referenceEvidence.append(pps2.Evidence(
            reference=reference,
            evidence_code=pps2.OntologyClass(
                id="ECO:0006017",
                label="author statement from published clinical study used in manual assertion")))
        references.append(reference)

    # ensure_list: a single phenotype string must not be iterated character by character.
    phenotypes = [phenotype_element_from_id(p.replace('obo:HP_', 'HP:'), evidence=referenceEvidence)
                  for p in ensure_list(proband.get('phenotypes'))]

    # GetCurrentTime() fills the Timestamp in place and returns None, so the
    # message itself -- not the call's return value -- is passed as `created`.
    created = Timestamp()
    created.GetCurrentTime()
    metadata = pps2.MetaData(external_references=references,
                             created=created,
                             created_by="Automated import from ClinGen GCI data",
                             phenopacket_schema_version="2.0",
                             resources=resources)

    # Double-quoted outer string: reusing the same quote inside an f-string is
    # a syntax error before Python 3.12.
    subject_id = f"PMID_{subject_pmid}:{proband.get('rdfs:label', '')}"
    try:
        sex = pps2.Sex.Value(str(proband.get('sex', '')).upper())
    except ValueError:
        # Missing or unrecognised values map to the schema's own "unknown"
        # member; plain 'UNKNOWN' is not a name of the Sex enum.
        sex = pps2.Sex.Value('UNKNOWN_SEX')
    individual = pps2.Individual(id=subject_id, sex=sex)

    # One accumulator for all variants, so earlier variants are not discarded
    # and a proband without variants yields an empty list.
    genomic_interpretations = []
    for key, value in variants_extraction(proband).items():
        for k, v in value.items():
            variant_interpretation = pps2.VariantInterpretation(
                acmg_pathogenicity_classification='NOT_PROVIDED',
                variation_descriptor=pps2.VariationDescriptor(
                    id=k + '_' + key,
                    expressions=[pps2.Expression(syntax='hgvs', value=v)]))
            genomic_interpretations.append(pps2.GenomicInterpretation(
                # Points at the Individual defined above so the reference
                # resolves inside this phenopacket.
                subject_or_biosample_id=subject_id,
                interpretation_status='UNKNOWN_STATUS',
                variant_interpretation=variant_interpretation))

    diagnosis = pps2.Diagnosis(disease=disease_element_from_id(file['subject']['disease']),
                               genomic_interpretations=genomic_interpretations)
    interpretation = pps2.Interpretation(id=proband['id'],
                                         progress_status='UNKNOWN_PROGRESS',
                                         diagnosis=diagnosis)

    phenopacket = pps2.Phenopacket(id=proband['id'], subject=individual,
                                   phenotypic_features=phenotypes, meta_data=metadata,
                                   interpretations=[interpretation])
    return phenopacket

# Test
proband_to_phenopacket(proband,test_file)

In [None]:
proband

### Scratch Code for testing

#### Subject 

In [None]:
# Scratch: build the subject from the proband's direct dc:source and label.
# The source id is only read when dc:source is a dictionary (indexing the ''
# default with ['id'] raised TypeError), and an unrecognised or missing sex
# falls back to 'UNKNOWN_SEX', which is the name the Sex enum actually defines.
scratch_source = proband.get('dc:source')
scratch_pmid = scratch_source.get('id', '').replace(PUBMED_URL_ROOT, '') if isinstance(scratch_source, dict) else ''
scratch_sex = str(proband.get('sex', '')).upper()
pps2.Individual(id=scratch_pmid + ':' + str(proband.get('rdfs:label', '')),
                sex=scratch_sex if scratch_sex in pps2.Sex.keys() else 'UNKNOWN_SEX')

#### Phenotypic Feature

In [None]:
# Scratch: a phenotypic feature with one piece of literature evidence.
# The ECO:0006017 label is spelled the same way as in proband_to_phenopacket.
pps2.PhenotypicFeature(type = pps2.OntologyClass(id='HP:0004758', label='Effort-induced polymorphic ventricular tachycardia'),
                      evidence=[pps2.Evidence(evidence_code = pps2.OntologyClass(id='ECO:0006017', label='author statement from published clinical study used in manual assertion'),
             reference =pps2.ExternalReference(id="PMID:15178757", reference='https://pubmed.ncbi.nlm.nih.gov/15178757',
                       description='A cardiac arrhythmia syndrome caused by loss of ankyrin-B function.'))])

#### Ontology Class

In [None]:
pps2.OntologyClass(id='HP:0004758', label='Effort-induced polymorphic ventricular tachycardia')

#### Evidence 

In [None]:
# Scratch: a single Evidence element.
# The ECO:0006017 label is spelled the same way as in proband_to_phenopacket.
pps2.Evidence(evidence_code = pps2.OntologyClass(id='ECO:0006017', label='author statement from published clinical study used in manual assertion'),
             reference =pps2.ExternalReference(id="PMID:15178757", reference='https://pubmed.ncbi.nlm.nih.gov/15178757',
                       description='A cardiac arrhythmia syndrome caused by loss of ankyrin-B function.') )

##### External Reference

In [None]:
pps2.ExternalReference(id="PMID:15178757", reference='https://pubmed.ncbi.nlm.nih.gov/15178757',
                       description='A cardiac arrhythmia syndrome caused by loss of ankyrin-B function.')

#### Metadata

In [None]:
# Scratch: MetaData element.
# GetCurrentTime() updates `timestamp` in place and returns None, so it is
# called first and the Timestamp object itself is passed as `created`.
timestamp.GetCurrentTime()
pps2.MetaData(
    created = timestamp,
    created_by = "Automated import from ClinGen GCI data",
    phenopacket_schema_version = "2.0",
    resources = resources,
    external_references = [pps2.ExternalReference(id="PMID:15178757", reference='https://pubmed.ncbi.nlm.nih.gov/15178757',
                       description='A cardiac arrhythmia syndrome caused by loss of ankyrin-B function.')]
)
    
    

In [None]:
# Scratch: an Interpretation wrapping a Diagnosis.
# NOTE: `genomic_interpretations` is only assigned in a later scratch cell of
# this notebook, so that cell has to be run before this one.
pps2.Interpretation(id='123456',
                    progress_status='UNKNOWN_PROGRESS',
                    diagnosis = pps2.Diagnosis( disease = pps2.OntologyClass(id='MONDO:0044647',label='kyphosis-lateral tongue atrophy-myofibrillar myopathy syndrome'),
               genomic_interpretations = genomic_interpretations))

                    

In [None]:
# Scratch: a Diagnosis element on its own.
# NOTE: `genomic_interpretations` is only assigned in a later scratch cell of
# this notebook, so that cell has to be run before this one.
pps2.Diagnosis( disease = pps2.OntologyClass(id='MONDO:0044647',label='kyphosis-lateral tongue atrophy-myofibrillar myopathy syndrome'),
               genomic_interpretations = genomic_interpretations)


In [None]:
# Scratch: a Disease element. Protobuf message constructors take keyword
# arguments only, so the ontology class is passed as the `term` field.
pps2.Disease(term=pps2.OntologyClass(id='MONDO:0044647',label='kyphosis-lateral tongue atrophy-myofibrillar myopathy syndrome'))

In [None]:
genomic_interpretations = [pps2.GenomicInterpretation( subject_or_biosample_id = 'AJBKS',
                            interpretation_status = 'UNKNOWN_STATUS',
                            variant_interpretation = pps2.VariantInterpretation( acmg_pathogenicity_classification = 'NOT_PROVIDED',
                           variation_descriptor = pps2.VariationDescriptor(id='12344',
                               expressions = [pps2.Expression(syntax = 'hgvs', value = 'NM_172056.2(KCNH2):c.1764C>A (p.Asn588Lys)')],
                                                                           
                                                                          )))]
genomic_interpretations                          

In [None]:
pps2.VariantInterpretation( acmg_pathogenicity_classification = 'NOT_PROVIDED',
                           variation_descriptor = pps2.VariationDescriptor(id='12344',
                               expressions = [pps2.Expression(syntax = 'hgvs', value = 'NM_172056.2(KCNH2):c.1764C>A (p.Asn588Lys)')],
                                                                           
                                                                          )
                          )
                           

In [None]:
pps2.Expression(syntax = 'hgvs', value = 'NM_172056.2(KCNH2):c.1764C>A (p.Asn588Lys)')

In [None]:
# Scratch: allele-origin term. The id is written as a CURIE with the 'GENO'
# namespace prefix declared in `resources`, like the HP: and MONDO: ids above.
pps2.OntologyClass(id = 'GENO:0000888', label = 'germline allele origin')

In [None]:
import requests

In [None]:
# Scratch: fetch the GENO term by its PURL. The term id belongs in the path:
# the second positional argument of requests.get is `params`, which would have
# been sent as a query string instead. A timeout keeps the cell from hanging.
requests.get('http://purl.obolibrary.org/obo/GENO_0000888', timeout=30)