In [1]:
# Library Imports 
import csv
import glob
from google.protobuf.json_format import MessageToJson
from google.protobuf.timestamp_pb2 import Timestamp
import gzip
import json
import jsonpath_ng
import logging
import os
import phenopackets.schema.v2 as pps2
import pyhpo
import pyld
import pronto
import requests

##### Initializing libraries

In [2]:
# Module-level logger; the helper functions below use it to emit warnings.
logger = logging.getLogger(__name__)
# Instantiate the pyhpo ontology once; later cells query it through
# pyhpo.Ontology.get_hpo_object.
pyhpo.Ontology()
# Load the Genotype Ontology from its OBO PURL (presumably needs network access
# on every run -- TODO confirm whether pronto caches the download).
geno = pronto.Ontology("http://purl.obolibrary.org/obo/geno.owl") # A similar pronto load could back the MONDO lookup in a later version
#logger.warning(f'HPO version: {pyhpo.Ontology.version()}')

  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(prop, curies)
  self._extract_object_property(pr

### Constants


In [3]:
# Folder holding the gene-validity JSON-LD export. The location is specific to
# one machine, so it can be overridden with the GENE_VALIDITY_INPUT_FOLDER
# environment variable; when the variable is unset the original path is used.
INPUT_FOLDER = os.environ.get(
    'GENE_VALIDITY_INPUT_FOLDER',
    '../../../Desktop/gene-validity-jsonld-2026_01_22/',
)
# Shared protobuf Timestamp message referenced when building phenopacket metadata.
timestamp = Timestamp()

##### Sample Framing
- Hack: a sample file supplies the `@context` for the frame used to extract proband information. Hardcoding the frame would be the better option, but that is difficult because the data structure of the files is inconsistent.

In [4]:
# Load one gene-validity document. Its '@context' is reused to build the frame,
# and the document itself is framed further down as a smoke test.
# The file is opened in a `with` block so the handle is closed deterministically,
# and read as UTF-8 (the JSON interchange encoding) rather than the locale default.
with open(
    os.path.join(INPUT_FOLDER, 'gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json'),
    encoding='utf-8',
) as sample_file:
    genegraph_validity_jsonld_sample = json.load(sample_file)

# Frame selecting every node typed 'Proband', with referenced nodes always embedded.
frame = {'@context': genegraph_validity_jsonld_sample['@context'], '@type': 'Proband', '@embed': '@always'}

#### Functions

In [5]:
# Used Prof's function as it is 
def ensure_list(value):
    '''Return ``value`` as a list.

    JSON-LD represents 0..* fields as a bare item when there is exactly one
    value and as a list when there are several. This helper evens that out:
    a list is handed back unchanged (same object), anything else is wrapped
    in a new one-element list.
    '''
    return value if isinstance(value, list) else [value]

In [6]:
# Creating a dictionary with mondo identifiers and labels 
# {"MONDO_0044647": "kyphosis-lateral tongue atrophy-myofibrillar myopathy syndrome",}
# newline='' is what the csv module asks for on text handles it reads from, and
# the encoding is pinned so the labels decode the same way on every platform
# (assumes the export is UTF-8 -- TODO confirm against the MONDO.csv.gz source).
with gzip.open("MONDO.csv.gz", 'rt', encoding='utf-8', newline='') as csvf:
    file_content = csv.DictReader(csvf)
    class_prefix = "http://purl.obolibrary.org/obo/"
    # Slice the literal prefix off the IRI. str.strip(class_prefix) would treat
    # the prefix as a *set of characters* and remove any of them from both ends
    # of the string, which can eat into the identifier itself.
    mondo_lookup = {
        row["Class ID"][len(class_prefix):]: row["Preferred Label"]
        for row in file_content
        if row["Class ID"].startswith(class_prefix)
    }

In [7]:
# geno lookup for tackling zygosity while creating phenopackets
# Maps each GENO term name to its term id; when several terms share a name the
# last one yielded by geno.terms() wins.
geno_lookup = dict((term.name, term.id) for term in geno.terms())

In [8]:
def hpo_to_labeled_phenotype(hpo):
    """Resolve an HPO reference to a dict holding its id and label.

    Args:
        hpo: HPO identifier as found in the proband data, e.g.
            ``'obo:HP_0007973'``. The ``'obo:HP_'`` prefix is rewritten to
            ``'HP:'`` before the lookup.

    Returns:
        ``{'id': <term id>, 'label': <term name>}`` when pyhpo resolves the
        term; otherwise ``{'id': hpo}`` with the input unchanged and no
        ``'label'`` key.
    """
    try:
        term = pyhpo.Ontology.get_hpo_object(hpo.replace('obo:HP_', 'HP:'))
        return {'id': term.id, 'label': term.name}
    except Exception:
        # Best-effort fallback for anything pyhpo cannot resolve. Catching
        # Exception (instead of a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long batch run can still be interrupted.
        return {'id': hpo}

In [9]:
# Again to tackle JSON-LD formatting
def id_or_value(x):
    """Return ``x['id']`` (or ``''`` if absent) when ``x`` is a dict, else ``x`` itself."""
    return x.get('id', '') if isinstance(x, dict) else x

In [10]:
def proband_summary(proband):
    """Flatten a proband node into a plain summary dict.

    Args:
        proband: dict for one proband. ``'id'`` is required; every other
            field is optional and replaced by a default when missing.

    Returns:
        Dict with the keys ``id``, ``sources``, ``label``, ``phenotypes``
        (each entry passed through ``hpo_to_labeled_phenotype``), ``sex``,
        ``ageType``, ``ageValue``, ``ageUnit`` and ``variants``.

    Raises:
        KeyError: if ``proband`` has no ``'id'``.
    """
    # Phenotypes are resolved first, exactly as before, so any lookup happens
    # ahead of the mandatory 'id' access.
    raw_phenotypes = ensure_list(proband.get('phenotypes', []))
    labeled = list(map(hpo_to_labeled_phenotype, raw_phenotypes))

    summary = {'id': proband['id']}
    summary['sources'] = proband.get('dc:source', '')
    summary['label'] = proband.get('rdfs:label', '')
    summary['phenotypes'] = labeled
    summary['sex'] = proband.get('sex', 'UNKNOWN')
    for age_key in ('ageType', 'ageValue', 'ageUnit'):
        summary[age_key] = proband.get(age_key, '')
    summary['variants'] = proband.get('variant', [])
    return summary

In [11]:
# Tackle sources with pmids as value
def get_article_information(proband_id, source):
    """Fetch the title and first author of a PubMed article from NCBI E-utilities.

    Args:
        proband_id: identifier of the proband; only used in the warning message.
        source: PubMed URL (``https://pubmed.ncbi.nlm.nih.gov/<pmid>``) or a
            bare PMID string.

    Returns:
        ``(title, first_author)`` on success, with the placeholders
        ``'NO TITLE FOUND'`` / ``'NO FIRST AUTHOR'`` for fields missing from
        the response. ``None`` when the service answers with a non-200 status.

    Raises:
        Whatever ``requests.get`` / ``response.json`` raise on network or
        decoding problems (including a timeout); callers handle those.
    """
    pubmed_prefix = 'https://pubmed.ncbi.nlm.nih.gov/'
    # Remove the literal URL prefix. str.strip(pubmed_prefix) would strip a
    # *set of characters* from both ends instead of the prefix.
    pmid = source[len(pubmed_prefix):] if source.startswith(pubmed_prefix) else source
    # Drop any query string and surrounding slashes left over from the URL.
    pmid = pmid.split('?', 1)[0].strip('/')

    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",
        params={"db": "pubmed", "id": pmid, "retmode": "json"},
        timeout=30,  # without a timeout a stalled connection blocks the notebook forever
    )
    if response.status_code != 200:
        logger.warning(f" {response.status_code} status code for the pubmed id {pmid} for the proband {proband_id}")
        return None

    # Tolerate a response without an entry for this PMID instead of raising KeyError.
    record = response.json().get("result", {}).get(pmid, {})
    title = record.get('title', 'NO TITLE FOUND')
    first_author = record.get("sortfirstauthor", 'NO FIRST AUTHOR')
    return (title, first_author)

In [12]:
# Tackle different source values
def source_description(proband_id, source):
    """Describe a proband's ``dc:source`` when it is a PubMed URL.

    Args:
        proband_id: identifier of the proband; used for the lookup and in the
            warning message.
        source: the ``dc:source`` value. Only strings starting with
            ``https://pubmed.ncbi.nlm.nih.gov/`` are looked up.

    Returns:
        Whatever ``get_article_information`` returns for a PubMed source
        (previously that result was computed and then thrown away). ``None``
        for any other source, or when the lookup raised.
    """
    try:
        if source.startswith('https://pubmed.ncbi.nlm.nih.gov/'):
            return get_article_information(proband_id, source)
    except Exception as e:
        # Reached when `source` is not a string (no .startswith) or the lookup
        # itself failed; the cause goes to the log instead of a bare print.
        logger.warning(f" {source} has replaced pmid for dc:source value, for the proband {proband_id} ({e})")
    return None

In [13]:
# Description needs further external resources, such as the ECO ontology; to be tackled in the next version

In [14]:
# Description of external resources used for referencing an object
# One pps2.Resource per ontology/database, spelled out with keyword arguments.
resources = [
    pps2.Resource(
        id="geno",
        name="Genotype Ontology",
        url="http://purl.obolibrary.org/obo/geno.owl",
        version="2022-03-05",
        namespace_prefix="GENO",
        iri_prefix="http://purl.obolibrary.org/obo/GENO_",
    ),
    pps2.Resource(
        id="hgnc",
        name="HUGO Gene Nomenclature Committee",
        url="https://www.genenames.org",
        version="06/01/23",
        namespace_prefix="HGNC",
        iri_prefix="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/",
    ),
    pps2.Resource(
        id="omim",
        name="An Online Catalog of Human Genes and Genetic Disorders",
        url="https://www.omim.org",
        version="January 4, 2023",
        namespace_prefix="OMIM",
        iri_prefix="https://www.omim.org/entry/",
    ),
    pps2.Resource(
        id="hp",
        name="human phenotype ontology",
        url="http://purl.obolibrary.org/obo/hp.owl",
        version="2024-07-01",
        namespace_prefix="HP",
        iri_prefix="http://purl.obolibrary.org/obo/HP_",
    ),
    pps2.Resource(
        id="mondo",
        name="Mondo Disease Ontology",
        url="http://purl.obolibrary.org/obo/mondo.owl",
        version="2025-06-03",
        namespace_prefix="MONDO",
        iri_prefix="http://purl.obolibrary.org/obo/MONDO_",
    ),
]

In [15]:
def _remove_prefix(text, prefix):
    """Return ``text`` without the literal leading ``prefix`` (unchanged if absent)."""
    # str.strip(prefix) is not equivalent: it removes any of the prefix's
    # *characters* from both ends of the string.
    return text[len(prefix):] if text.startswith(prefix) else text


def proband_summary_to_phenopacket(proband_summary):
    """Build a Phenopacket (schema v2) message from a proband summary dict.

    Args:
        proband_summary: dict with the keys ``id``, ``sources``, ``label``,
            ``phenotypes``, ``sex`` and ``variants``, as produced by the
            summary helper in this notebook. ``sources`` is expected to be a
            string; a PubMed URL yields an external reference plus evidence
            on every phenotypic feature.

    Returns:
        A ``pps2.Phenopacket`` with subject, phenotypic features, a single
        interpretation (one genomic interpretation per allele) and metadata.

    Raises:
        KeyError: if a required key is missing (e.g. a variant without
            ``allele``/``zygosity``) or the zygosity has no entry in
            ``geno_lookup``.
    """
    pubmed_prefix = 'https://pubmed.ncbi.nlm.nih.gov/'
    source = proband_summary["sources"]
    is_pubmed = source.startswith(pubmed_prefix)
    # For PubMed URLs keep only the numeric id; other sources are used verbatim.
    if is_pubmed:
        pmid = _remove_prefix(source, pubmed_prefix).split('?', 1)[0].strip('/')
    else:
        pmid = source
    subject_id = f"PMID_{pmid}:{proband_summary['label']}"

    # References: a proband carries at most one PubMed source, handled here.
    references = []
    reference_evidence = []
    if is_pubmed:
        try:
            article = get_article_information(proband_summary["id"], source)
            if article is None:
                logger.warning(
                    f"No article information for PMID {pmid} (proband {proband_summary['id']}); reference omitted"
                )
            else:
                description, _first_author = article
                reference = pps2.ExternalReference(
                    id=f'PMID:{pmid}',
                    reference=source,
                    description=description
                )
                reference_evidence.append(pps2.Evidence(
                    reference=reference,
                    evidence_code=pps2.OntologyClass(
                        id="ECO:0006017",
                        label="author statement from published clinical study used in manual assertion"
                    )
                ))
                references.append(reference)
        except Exception as e:
            # Best effort: a failed lookup must not prevent the phenopacket
            # from being built, but the cause is logged rather than printed.
            logger.warning(f"Could not build the reference for proband {proband_summary['id']}: {e}")

    # Phenotypic features. A phenotype that could not be resolved carries only
    # an 'id', so the label falls back to an empty string instead of a KeyError.
    phenotypes = [
        pps2.PhenotypicFeature(
            type=pps2.OntologyClass(id=phenotype["id"], label=phenotype.get("label", "")),
            evidence=reference_evidence
        ) for phenotype in proband_summary.get("phenotypes", [])
    ]

    # Variants: one genomic interpretation per allele of every variant.
    genomic_interpretations = []
    for variant in ensure_list(proband_summary["variants"]):
        for allele in ensure_list(variant["allele"]):
            # e.g. 'cg:Homozygous' -> 'homozygous', the key used by geno_lookup.
            zygosity = _remove_prefix(variant["zygosity"]["id"], 'cg:').lower()
            variation_descriptor = pps2.VariationDescriptor(
                id=allele["http://www.w3.org/2004/02/skos/core#prefLabel"],
                molecule_context='unspecified_molecule_context',
                allelic_state=pps2.OntologyClass(
                    id=geno_lookup[zygosity],
                    label=zygosity
                )
            )

            variant_interpretation = pps2.VariantInterpretation(
                acmg_pathogenicity_classification="NOT_PROVIDED",
                therapeutic_actionability="UNKNOWN_ACTIONABILITY",
                variation_descriptor=variation_descriptor
            )

            genomic_interpretations.append(pps2.GenomicInterpretation(
                subject_or_biosample_id=subject_id,
                interpretation_status='UNKNOWN_STATUS',
                variant_interpretation=variant_interpretation
            ))

    diagnosis = pps2.Diagnosis(
        disease=pps2.OntologyClass(id='MONDO:0700096', label='human disease'),
        genomic_interpretations=genomic_interpretations
    )

    interpretation = pps2.Interpretation(
        id=proband_summary["id"],
        progress_status='UNKNOWN_PROGRESS',
        diagnosis=diagnosis
    )

    # Metadata. Timestamp.GetCurrentTime() fills the message in place and
    # returns None, so the message itself must be passed as `created`;
    # passing the call's return value left the field unset.
    created = Timestamp()
    created.GetCurrentTime()
    metadata = pps2.MetaData(
        external_references=references,
        created=created,
        created_by="Automated import from ClinGen GCI data",
        phenopacket_schema_version="2.0",
        resources=resources
    )

    # Individual. The Sex enum has no member called 'UNKNOWN' (the summary's
    # default), so unrecognised names map to UNKNOWN_SEX instead of raising.
    sex_name = str(proband_summary.get('sex', 'UNKNOWN_SEX')).upper()
    try:
        sex = pps2.Sex.Value(sex_name)
    except ValueError:
        sex = pps2.Sex.Value('UNKNOWN_SEX')
    individual = pps2.Individual(id=subject_id, sex=sex)

    # Phenopacket
    phenopacket = pps2.Phenopacket(
        id=proband_summary['id'],
        subject=individual,
        phenotypic_features=phenotypes,
        meta_data=metadata,
        interpretations=[interpretation]
    )

    return phenopacket

In [16]:
# Smoke test on the sample document: apply the Proband frame, take the first
# framed node and flatten it into a summary dict.
test_frame = pyld.jsonld.frame(genegraph_validity_jsonld_sample, frame)
# Assumes the framed result has an '@graph' list with at least one proband -- TODO confirm for other files.
test_proband = test_frame["@graph"][0]
test_summary = proband_summary(test_proband)

In [17]:
# Convert the sample summary to a phenopacket and print its JSON serialisation.
print(MessageToJson(proband_summary_to_phenopacket(test_summary)))

{
  "id": "https://genegraph.clinicalgenome.org/r/0708860f-3679-4c7c-8809-631729d6490a",
  "subject": {
    "id": "PMID_11709191:Yoshida_Patient MK",
    "sex": "MALE"
  },
  "phenotypicFeatures": [
    {
      "type": {
        "id": "HP:0007973",
        "label": "Retinal dysplasia"
      },
      "evidence": [
        {
          "evidenceCode": {
            "id": "ECO:0006017",
            "label": "author statement from published clinical study used in manual assertion"
          },
          "reference": {
            "id": "PMID:11709191",
            "reference": "https://pubmed.ncbi.nlm.nih.gov/11709191",
            "description": "Muscular dystrophy and neuronal migration disorder caused by mutations in a glycosyltransferase, POMGnT1."
          }
        }
      ]
    },
    {
      "type": {
        "id": "HP:0007260",
        "label": "Type II lissencephaly"
      },
      "evidence": [
        {
          "evidenceCode": {
            "id": "ECO:0006017",
            "l