##### A quick notebook utilizing pandas to get quick stats out of the genegraph data / scratch

In [1]:
# Library Imports
import glob
import json
import jsonpath_ng
import os 
import pandas as pd
from pyld import jsonld
import 

In [2]:
# Constants
INPUT_FOLDER = './input/gene-validity-jsonld-20251030/gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json'
MONDO_MAPPINGS = './input/MONDO_20251030.csv.gz'

In [8]:
# mondo lookup to map ids to names
mondo_mappings = pd.read_csv(MONDO_MAPPINGS, compression='gzip')
mondo_lookup = mondo_mappings.loc[:,['Class ID', 'Preferred Label']]
mondo_lookup['Class ID'] = mondo_lookup['Class ID'].map( lambda x : x.strip('http://purl.obolibrary.org/obo/'))
mondo_lookup.head()

  mondo_mappings = pd.read_csv(MONDO_MAPPINGS, compression='gzip')


Unnamed: 0,Class ID,Preferred Label
0,MONDO_0044647,kyphosis-lateral tongue atrophy-myofibrillar m...
1,MONDO_1010032,"Jacobsen syndrome, non-human animal"
2,MONDO_0007146,"obsolete apnea, central sleep"
3,MONDO_0017964,"obsolete 46,XX disorder of sex development ind..."
4,MONDO_0014635,"microphthalmia, isolated, with coloboma 10"


In [24]:
# Reading a file to check what it has 
genegraph_validity_jsonld_sample = json.load(open(INPUT_FOLDER + '/gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json'))
genegraph_validity_jsonld_sample['subject']['disease']

'obo:MONDO_0700068'

- Used json_ld for further processing
- Trying to use @embed : always [ assumption it will resolve the labels to proper uris in the files ] 

In [17]:
# Getting a frame to extract all the entities associated with proband data along with the gene and disease
frame = {'@context': genegraph_validity_jsonld_sample['@context'], '@type': 'Proband', '@embed': '@always'}
framed_data = jsonld.frame(genegraph_validity_jsonld_sample, frame)
framed_data["@graph"][0]

{'id': 'https://genegraph.clinicalgenome.org/r/0708860f-3679-4c7c-8809-631729d6490a',
 'type': 'Proband',
 'dc:source': 'https://pubmed.ncbi.nlm.nih.gov/11709191',
 'rdfs:label': 'Yoshida_Patient MK',
 'ageType': 'AgeAtReport',
 'ageUnit': 'Years',
 'ageValue': 8,
 'allele': {'id': 'https://genegraph.clinicalgenome.org/r/9b9f6f3a-715d-4fcf-9271-10d1cd54cb8f',
  'type': 'https://terms.ga4gh.org/VariationDescriptor',
  'http://www.w3.org/2004/02/skos/core#prefLabel': 'NM_017739.3(POMGNT1):c.1649G>A (p.Ser550Asn)',
  'https://terms.ga4gh.org/CanonicalReference': {'id': 'http://reg.genome.network/allele/CA116540'}},
 'detectionMethod': 'The entire coding region (ex 1-22) and intron-exon flanking sequences of POMGNT1 were amplified and directly sequenced.',
 'firstTestingMethod': 'PCR',
 'phenotypeFreeText': 'Clinical features from PMID: 12588800: Moderate white matter lucency. Proband did not show head control at 8yo. CK levels at 4yo = 628 U/L.',
 'phenotypes': ['obo:HP_0007973',
  'obo:H

In [18]:
pd.DataFrame.from_dict([framed_data["@graph"][0]])

Unnamed: 0,id,type,dc:source,rdfs:label,ageType,ageUnit,ageValue,allele,detectionMethod,firstTestingMethod,phenotypeFreeText,phenotypes,previousTesting,previousTestingDescription,secondTestingMethod,sex,variant
0,https://genegraph.clinicalgenome.org/r/0708860...,Proband,https://pubmed.ncbi.nlm.nih.gov/11709191,Yoshida_Patient MK,AgeAtReport,Years,8,{'id': 'https://genegraph.clinicalgenome.org/r...,The entire coding region (ex 1-22) and intron-...,PCR,Clinical features from PMID: 12588800: Moderat...,"[obo:HP_0007973, obo:HP_0007260, obo:HP_000132...",True,Linkage analysis was performed to narrow down ...,Sanger sequencing,Male,{'id': 'https://genegraph.clinicalgenome.org/r...


In [21]:
# Directory 
genegraph_json_files_path = sorted(glob.glob(f'{INPUT_FOLDER}/*.json'))
len(genegraph_json_files_path)

3371

In [49]:
# Creating a common dataframe with all the probands data

all_proband_data = [] 

for file_path in genegraph_json_files_path:
    try:
        with open(file_path) as jf:
            file_content = json.load(jf)
            gene = file_content['subject']['gene'] 
            disease = file_content['subject']['disease'] 
            framed_data = jsonld.frame(file_content, frame)
            for proband in framed_data.get('@graph',[]):
                temp_proband_df = pd.DataFrame.from_dict([proband])
                temp_proband_df["gene"] = gene
                temp_proband_df["disease"]= disease
                temp_proband_df["source_file"] = os.path.basename(file_path)
                all_proband_data.append(temp_proband_df)
    except Exception as e:
        print(f"An exception {e} occur in reading file at {file_path}")
    

An exception Length of values (2) does not match length of index (1) occur in reading file at ./input/gene-validity-jsonld-20251030/gg_107f9b2d-8bc2-4165-9017-78c78b8151e0v1.0.json
An exception Length of values (2) does not match length of index (1) occur in reading file at ./input/gene-validity-jsonld-20251030/gg_7c114417-af28-41d9-8434-f99db9a16543v1.0.json
An exception 'subject' occur in reading file at ./input/gene-validity-jsonld-20251030/gg_87a36506-8821-4ddc-8ab8-6fa97936ee02.json


In [50]:
proband_data = pd.concat(all_proband_data, ignore_index=True)

In [51]:
proband_data.head()

Unnamed: 0,id,type,dc:source,rdfs:label,ageType,ageUnit,ageValue,allele,detectionMethod,firstTestingMethod,...,previousTesting,previousTestingDescription,secondTestingMethod,sex,variant,gene,disease,source_file,ethnicity,zygosity
0,https://genegraph.clinicalgenome.org/r/0708860...,Proband,https://pubmed.ncbi.nlm.nih.gov/11709191,Yoshida_Patient MK,AgeAtReport,Years,8.0,{'id': 'https://genegraph.clinicalgenome.org/r...,The entire coding region (ex 1-22) and intron-...,PCR,...,True,Linkage analysis was performed to narrow down ...,Sanger sequencing,Male,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,obo:MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,
1,https://genegraph.clinicalgenome.org/r/352d557...,Proband,https://pubmed.ncbi.nlm.nih.gov/26908613,Xu_Proband 2,AgeAtReport,Years,32.0,{'id': 'https://genegraph.clinicalgenome.org/r...,Reanalysis of WES data revealed the POMGNT1 va...,Genotyping,...,True,WES was performed but the cause of RP was unid...,Sanger sequencing,Female,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,obo:MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,
2,https://genegraph.clinicalgenome.org/r/5f3835c...,Proband,https://pubmed.ncbi.nlm.nih.gov/11709191,Yoshida_Patient SA,AgeAtReport,Years,7.0,{'id': 'https://genegraph.clinicalgenome.org/r...,The entire coding region (ex 1-22) and intron-...,PCR,...,True,Linkage analysis was performed to narrow down ...,Sanger sequencing,Female,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,obo:MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,
3,https://genegraph.clinicalgenome.org/r/612230d...,Proband,https://pubmed.ncbi.nlm.nih.gov/11709191,Yoshida_Patient YA,AgeAtReport,Years,6.0,{'id': 'https://genegraph.clinicalgenome.org/r...,The entire coding region (ex 1-22) and intron-...,PCR,...,True,Linkage analysis was performed to narrow down ...,Sanger sequencing,Male,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,obo:MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,
4,https://genegraph.clinicalgenome.org/r/69b2ace...,Proband,https://pubmed.ncbi.nlm.nih.gov/18195152,Clement_Proband,AgeAtOnset,Years,12.0,{'id': 'https://genegraph.clinicalgenome.org/r...,The POMGNT1 gene was screened for mutations.,Other,...,True,Abnormal glycosylation of α-DG was noted on mu...,Sanger sequencing,Female,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,obo:MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,


In [53]:
proband_data["disease"] = proband_data['disease'].map(lambda x: x.strip('obo:'))
proband_data.head()

Unnamed: 0,id,type,dc:source,rdfs:label,ageType,ageUnit,ageValue,allele,detectionMethod,firstTestingMethod,...,previousTesting,previousTestingDescription,secondTestingMethod,sex,variant,gene,disease,source_file,ethnicity,zygosity
0,https://genegraph.clinicalgenome.org/r/0708860...,Proband,https://pubmed.ncbi.nlm.nih.gov/11709191,Yoshida_Patient MK,AgeAtReport,Years,8.0,{'id': 'https://genegraph.clinicalgenome.org/r...,The entire coding region (ex 1-22) and intron-...,PCR,...,True,Linkage analysis was performed to narrow down ...,Sanger sequencing,Male,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,
1,https://genegraph.clinicalgenome.org/r/352d557...,Proband,https://pubmed.ncbi.nlm.nih.gov/26908613,Xu_Proband 2,AgeAtReport,Years,32.0,{'id': 'https://genegraph.clinicalgenome.org/r...,Reanalysis of WES data revealed the POMGNT1 va...,Genotyping,...,True,WES was performed but the cause of RP was unid...,Sanger sequencing,Female,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,
2,https://genegraph.clinicalgenome.org/r/5f3835c...,Proband,https://pubmed.ncbi.nlm.nih.gov/11709191,Yoshida_Patient SA,AgeAtReport,Years,7.0,{'id': 'https://genegraph.clinicalgenome.org/r...,The entire coding region (ex 1-22) and intron-...,PCR,...,True,Linkage analysis was performed to narrow down ...,Sanger sequencing,Female,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,
3,https://genegraph.clinicalgenome.org/r/612230d...,Proband,https://pubmed.ncbi.nlm.nih.gov/11709191,Yoshida_Patient YA,AgeAtReport,Years,6.0,{'id': 'https://genegraph.clinicalgenome.org/r...,The entire coding region (ex 1-22) and intron-...,PCR,...,True,Linkage analysis was performed to narrow down ...,Sanger sequencing,Male,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,
4,https://genegraph.clinicalgenome.org/r/69b2ace...,Proband,https://pubmed.ncbi.nlm.nih.gov/18195152,Clement_Proband,AgeAtOnset,Years,12.0,{'id': 'https://genegraph.clinicalgenome.org/r...,The POMGNT1 gene was screened for mutations.,Other,...,True,Abnormal glycosylation of α-DG was noted on mu...,Sanger sequencing,Female,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,


In [57]:
proband_data_w_disease_labels = pd.merge(proband_data,mondo_lookup, left_on='disease', right_on='Class ID', how='left')

In [59]:
proband_data_w_disease_labels.head()

Unnamed: 0,id,type,dc:source,rdfs:label,ageType,ageUnit,ageValue,allele,detectionMethod,firstTestingMethod,...,secondTestingMethod,sex,variant,gene,disease,source_file,ethnicity,zygosity,Class ID,Preferred Label
0,https://genegraph.clinicalgenome.org/r/0708860...,Proband,https://pubmed.ncbi.nlm.nih.gov/11709191,Yoshida_Patient MK,AgeAtReport,Years,8.0,{'id': 'https://genegraph.clinicalgenome.org/r...,The entire coding region (ex 1-22) and intron-...,PCR,...,Sanger sequencing,Male,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,,MONDO_0700068,myopathy caused by variation in POMGNT1
1,https://genegraph.clinicalgenome.org/r/352d557...,Proband,https://pubmed.ncbi.nlm.nih.gov/26908613,Xu_Proband 2,AgeAtReport,Years,32.0,{'id': 'https://genegraph.clinicalgenome.org/r...,Reanalysis of WES data revealed the POMGNT1 va...,Genotyping,...,Sanger sequencing,Female,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,,MONDO_0700068,myopathy caused by variation in POMGNT1
2,https://genegraph.clinicalgenome.org/r/5f3835c...,Proband,https://pubmed.ncbi.nlm.nih.gov/11709191,Yoshida_Patient SA,AgeAtReport,Years,7.0,{'id': 'https://genegraph.clinicalgenome.org/r...,The entire coding region (ex 1-22) and intron-...,PCR,...,Sanger sequencing,Female,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,,MONDO_0700068,myopathy caused by variation in POMGNT1
3,https://genegraph.clinicalgenome.org/r/612230d...,Proband,https://pubmed.ncbi.nlm.nih.gov/11709191,Yoshida_Patient YA,AgeAtReport,Years,6.0,{'id': 'https://genegraph.clinicalgenome.org/r...,The entire coding region (ex 1-22) and intron-...,PCR,...,Sanger sequencing,Male,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,,MONDO_0700068,myopathy caused by variation in POMGNT1
4,https://genegraph.clinicalgenome.org/r/69b2ace...,Proband,https://pubmed.ncbi.nlm.nih.gov/18195152,Clement_Proband,AgeAtOnset,Years,12.0,{'id': 'https://genegraph.clinicalgenome.org/r...,The POMGNT1 gene was screened for mutations.,Other,...,Sanger sequencing,Female,{'id': 'https://genegraph.clinicalgenome.org/r...,hgnc:19139,MONDO_0700068,gg_00140591-caa8-4d47-b4ca-3f0577b16d73v2.1.json,,,MONDO_0700068,myopathy caused by variation in POMGNT1


In [62]:
# Basic stats
proband_data_w_disease_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20490 entries, 0 to 20489
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          20490 non-null  object 
 1   type                        20490 non-null  object 
 2   dc:source                   20490 non-null  object 
 3   rdfs:label                  20490 non-null  object 
 4   ageType                     15022 non-null  object 
 5   ageUnit                     15020 non-null  object 
 6   ageValue                    12808 non-null  float64
 7   allele                      11769 non-null  object 
 8   detectionMethod             13773 non-null  object 
 9   firstTestingMethod          16570 non-null  object 
 10  phenotypeFreeText           11228 non-null  object 
 11  phenotypes                  14122 non-null  object 
 12  previousTesting             11858 non-null  object 
 13  previousTestingDescription  966

In [69]:
# Check for temporal data
temporal_data = proband_data_w_disease_labels[proband_data_w_disease_labels["ageValue"].notnull()]
temporal_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12808 entries, 0 to 20489
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          12808 non-null  object 
 1   type                        12808 non-null  object 
 2   dc:source                   12808 non-null  object 
 3   rdfs:label                  12808 non-null  object 
 4   ageType                     12743 non-null  object 
 5   ageUnit                     12761 non-null  object 
 6   ageValue                    12808 non-null  float64
 7   allele                      7542 non-null   object 
 8   detectionMethod             9085 non-null   object 
 9   firstTestingMethod          10880 non-null  object 
 10  phenotypeFreeText           7745 non-null   object 
 11  phenotypes                  9888 non-null   object 
 12  previousTesting             7830 non-null   object 
 13  previousTestingDescription  6367 non

In [73]:
pd.DataFrame(temporal_data['Preferred Label'].value_counts()).reset_index().to_csv("./output/disease_w_temoral_info.csv", index= False)

In [74]:
proband_data_w_disease_labels.to_csv("./output/all_proband_data.csv",index= False)