# Ontology Data Prep

**Purpose**: To transform ontologies into A-box formatted knowledge graphs.

In [112]:
import os
import gzip
import json
import rdflib
import pandas as pd
import numpy as np
from tqdm import tqdm
import requests
import io

Set up file locations

In [89]:
ontology_directory = '../resources/ontologies'
tmp_data = '../resources/tmp_data'
processed_data = '../resources/processed_data'

# Gene Ontology

In [2]:
go = rdflib.Graph().parse('../resources/ontologies/go-plus.owl').serialize(format='json-ld')

In [3]:
go = json.loads(go)

In [4]:
type_set = set()
for x in go:
    _type = x.get('@type',[])
    type_set.update(_type)
 
print(type_set)

{'http://www.w3.org/2003/11/swrl#ClassAtom', 'http://purl.obolibrary.org/obo/IAO_0000409', 'http://www.w3.org/2002/07/owl#Axiom', 'http://www.w3.org/2002/07/owl#ObjectProperty', 'http://purl.obolibrary.org/obo/IAO_0000225', 'http://www.w3.org/2002/07/owl#TransitiveProperty', 'http://www.w3.org/2002/07/owl#Ontology', 'http://www.w3.org/2002/07/owl#FunctionalProperty', 'http://www.w3.org/2003/11/swrl#Imp', 'http://www.w3.org/2002/07/owl#InverseFunctionalProperty', 'http://www.w3.org/2002/07/owl#AnnotationProperty', 'http://www.w3.org/2002/07/owl#Class', 'http://www.w3.org/2002/07/owl#NamedIndividual', 'http://www.w3.org/2003/11/swrl#Variable', 'http://www.w3.org/2002/07/owl#Restriction', 'http://purl.obolibrary.org/obo/IAO_0000078', 'http://www.w3.org/2003/11/swrl#IndividualPropertyAtom', 'http://www.w3.org/2002/07/owl#SymmetricProperty', 'http://www.w3.org/2003/11/swrl#AtomList', 'http://www.w3.org/2002/07/owl#AllDifferent', 'http://www.w3.org/2002/07/owl#AllDisjointClasses'}


In [25]:
classes = [x for x in go if 'http://www.w3.org/2002/07/owl#Class' in x.get('@type', [])]

In [26]:
classes[0]

{'@id': 'http://purl.obolibrary.org/obo/GO_0098641',
 '@type': ['http://www.w3.org/2002/07/owl#Class'],
 'http://purl.obolibrary.org/obo/IAO_0000115': [{'@value': 'Any cadherin binding that occurs as part of the process of cell-cell adhesion.'}],
 'http://www.geneontology.org/formats/oboInOwl#hasOBONamespace': [{'@value': 'molecular_function'}],
 'http://www.geneontology.org/formats/oboInOwl#id': [{'@value': 'GO:0098641'}],
 'http://www.w3.org/2000/01/rdf-schema#label': [{'@value': 'cadherin binding involved in cell-cell adhesion'}],
 'http://www.w3.org/2000/01/rdf-schema#subClassOf': [{'@id': 'http://purl.obolibrary.org/obo/GO_0045296'},
  {'@id': 'http://purl.obolibrary.org/obo/GO_0098632'}],
 'http://www.w3.org/2002/07/owl#equivalentClass': [{'@id': '_:N6f5d7faa01024e22aae05664f14a4ce5'}]}

In [80]:
class OntologyDataPack:
    def __init__(self, filepath, _graph, _graph_objects):
        # self._graph = rdflib.Graph().parse(filepath)
        # self._graph_objects = self._graph.serialize(format='json-ld')
        self._graph = _graph
        self._graph_objects = _graph_objects

        self._object_types = {
            'CLASS': 'http://www.w3.org/2002/07/owl#Class',
            'RELATIONSHIP': 'http://www.w3.org/2002/07/owl#ObjectProperty'
        }
        self.classes = self._extract_classes()
        self.relationships = self._extract_relationships()

    def _extract_classes(self):
        class_types = [x for x in self._graph_objects if self._object_types['CLASS'] in x.get('@type', [])]
        # Remove anonymous nodes
        # TODO: This happens because of having to connect hypernodes for equivalence classes that
        # describe more than one adjacency constraints
        identifiable_classes = [x for x in class_types if not x.get('@id').startswith('_:')]

        # Remove deprecated nodes
        non_deprecated = [x for x in identifiable_classes if 'http://www.w3.org/2002/07/owl#deprecated' not in x]
        return non_deprecated

    def _extract_relationships(self):
        rel_types = [x for x in self._graph_objects if self._object_types['RELATIONSHIP'] in x.get('@type', [])]
        return rel_types
  

In [81]:

def _uri_to_obo_short(uri):
    return uri.split('/').pop()

def _uri_to_curie(uri):
    obo_short = uri.split('/').pop()
    if not '_' in obo_short:
        return
    CURIE = obo_short.replace('_', ':')
    return CURIE

def _arr_value_extract(key, obj):
    if key in obj:
        arr1 = obj.get(key, [])
        arr2 = [x.get('@value') for x in arr1 if '@value' in x]
        return arr2

def _get_label(n):
    label = _arr_value_extract('http://www.w3.org/2000/01/rdf-schema#label', n)
    if label is not None:
        return ' '.join(label)
def _get_description(n):
    description = _arr_value_extract('http://purl.obolibrary.org/obo/IAO_0000115', n)
    if description is not None:
        return ' '.join(description)

def _get_parents(n):
    sco_key = 'http://www.w3.org/2000/01/rdf-schema#subClassOf'
    if sco_key in n:
        arr1 = n.get(sco_key, [])
        arr2 = [x for x in arr1 if x.get('@id', None) is not None]
        arr3 = [x.get('@id') for x in arr2 if not x.get('@id').startswith('_:')]
        arr4 = [_uri_to_curie(x) for x in arr3]
        return arr4
    
def obo_class2node(x, pkeys):
    uri = x.get('@id')
    obo_short = _uri_to_obo_short(uri)
    CURIE = _uri_to_curie(uri)
    if CURIE is None:
        return
    label = _get_label(x)
    description = _get_description(x)
    # parents = _get_parents(x)
    properties = {
        'uuid': CURIE,
        'displayName': label,
        'description': description if description is not None else '',
        'obo_short': obo_short,
        'CURIE': CURIE
    }

    for pk in pkeys:
        value = None
        if pk.get('collapse', True):
            arr1 = _arr_value_extract(pk['key'], x)
            if arr1 is not None:
                value = ' '.join(arr1)
        else:
            value = _arr_value_extract(pk['key'], x)

        if value is not None:
            properties[pk['name']] = value

    return properties

def obo_class2edges(x):
    uri = x.get('@id')
    CURIE = _uri_to_curie(uri)
    if CURIE is None:
        return []
    parents = _get_parents(x)
    if parents is None:
        return []

    edges = []

    for p in parents:
        edges.append(
            {
                'from': {
                    'uuid': CURIE
                },
                'to': {
                    'uuid': p
                },
                'label': 'is a',
                'properties': {}
            }
        )
    
    return edges


go_pkeys = [
    {
        'key': 'http://www.geneontology.org/formats/oboInOwl#hasOBONamespace',
        'name': 'go_type',
        'collapse': False
    }
]
    

In [82]:
dpGO = OntologyDataPack(_graph=go, _graph_objects=go, filepath='')

In [96]:
nodes = []
edges = []

for c in dpGO.classes:
    n = obo_class2node(c, go_pkeys)
    if n is not None and n['uuid'].startswith('GO:'):
        labels = None
        if n.get('go_type') is None:
            continue
        if 'molecular_function' in n.get('go_type'):
            labels = ['MolecularFunction']
        elif 'cellular_component' in n.get('go_type'):
            labels = ['CellularComponent']
        elif 'biological_process' in n.get('go_type'):
            labels = ['BiologicalProcess']

        if labels is None:
            continue
        node = {
            '_id': n.get('uuid'),
            'labels': labels,
            'properties': n
        }
        nodes.append(node)
    
    e = [e for e in obo_class2edges(c) if e is not None]
    
    edges.extend(e)

print(f'Nodes: {len(nodes)}')
print(f'Edges: {len(edges)}')

with gzip.open(os.path.join(processed_data, 'GO_node.jsonl.gz'), 'wt') as f:
    print('Writing Nodes to File')
    for n in tqdm(nodes):
        f.write(json.dumps(n) + '\n')
with gzip.open(os.path.join(processed_data, 'GO_edge.jsonl.gz'), 'wt') as f:
    print('Writing Edges to File')
    for e in tqdm(edges):
        f.write(json.dumps(e) + '\n')
metadata = {
    "_meta": {
        "version": "0.1.0",
        "date_updated": "2024-05-15",
        "maintainer": "BioBox Analytics"
    },
    "key": "gene_ontology",
    "name": "Gene Ontology (GO)",
    "description": "Contains classes in the Gene Ontology transformed as graph objects, assigned to one of the 3 gene ontology top-level concepts. Only direct parents subClassOf relations are preserved as `is a ` edges",
    "source": [
        {
            "uri": "https://geneontology.org/",
            "type": "doc"
        },
        {
            "uri": "http://purl.obolibrary.org/obo/go.owl",
            "type": "data",
            "version": "2024-04-24"
        }
    ],
    "concepts": {
        "BiologicalProcess": {
            "label": "Biological Process",
            "dbLabel": "BiologicalProcess",
            "definition": "A biological process is the execution of a genetically-encoded biological module or program. It consists of all the steps required to achieve the specific biological objective of the module. A biological process is accomplished by a particular set of molecular functions carried out by specific gene products (or macromolecular complexes), often in a highly regulated manner and in a particular temporal sequence.",
            "object_count": len([x for x in nodes if 'BiologicalProcess' in x.get('labels')])
        },
        "CellularComponent": {
            "label": "Cellular Component",
            "dbLabel": "CellularComponent",
            "definition": "A location, relative to cellular compartments and structures, occupied by a macromolecular machine. There are three types of cellular components described in the gene ontology: (1) the cellular anatomical entity where a gene product carries out a molecular function (e.g., plasma membrane, cytoskeleton) or membrane-enclosed compartments (e.g., mitochondrion); (2) virion components, where viral proteins act, and (3) the stable macromolecular complexes of which gene product are parts (e.g., the clathrin complex).",
            "object_count": len([x for x in nodes if 'CellularComponent' in x.get('labels')])
        },
        "MolecularFunction": {
            "label": "Molecular Function",
            "dbLabel": "MolecularFunction",
            "definition": "A molecular process that can be carried out by the action of a single macromolecular machine, usually via direct physical interactions with other molecular entities. Function in this sense denotes an action, or activity, that a gene product (or a complex) performs",
            "object_count": len([x for x in nodes if 'MolecularFunction' in x.get('labels')])
        }
    },
    "relationships": {}
}

with open(os.path.join(processed_data, 'GO_metadata.json'), 'w') as f:
    json.dump(metadata, f)

Nodes: 42255
Edges: 116687
Writing Nodes to File


100%|██████████████████████████████████████████████████████████████████████████| 42255/42255 [00:00<00:00, 45511.19it/s]


Writing Edges to File


100%|███████████████████████████████████████████████████████████████████████| 116687/116687 [00:01<00:00, 113749.42it/s]


# Gene Ontology - Annotations

In [97]:
def read_gaf(filepath):
    with gzip.open(filepath, 'rt') as file:
        for i, line in enumerate(file):
            if not line.startswith('!'):
                break
    df = pd.read_table(filepath, sep='\t', skiprows=i, header=None)
    return df

In [98]:
gaf = read_gaf(os.path.join(tmp_data, 'goa_human.gaf.gz'))

  df = pd.read_table(filepath, sep='\t', skiprows=i, header=None)


In [99]:
gaf.replace({np.nan: None}, inplace=True)

In [101]:
GAF_COLUMNS = [
    'DB',
    'DB Object ID',
    'DB Object Symbol',
    'Qualifier',
    'GO ID',
    'DB:Reference',
    'Evidence Code',
    'With (or) From',
    'Aspect',
    'DB Object Name',
    'DB Object Synonym',
    'DB Object Type',
    'Taxon',
    'Date',
    'Assigned By',
    'Annotation Extension',
    'Gene Product Form ID'
]
gaf.columns = GAF_COLUMNS

In [102]:
gaf.head()

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,DB:Reference,Evidence Code,With (or) From,Aspect,DB Object Name,DB Object Synonym,DB Object Type,Taxon,Date,Assigned By,Annotation Extension,Gene Product Form ID
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20240129,UniProt,,
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20240129,UniProt,,
2,UniProtKB,A0A024RBG1,NUDT4B,located_in,GO:0005829,GO_REF:0000052,IDA,,C,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20230619,HPA,,
3,UniProtKB,A0A075B6H7,IGKV3-7,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20240129,UniProt,,
4,UniProtKB,A0A075B6H7,IGKV3-7,located_in,GO:0005886,GO_REF:0000044,IEA,UniProtKB-SubCell:SL-0039,C,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20240129,UniProt,,


Make mapping file between Ensembl Gene stable IDs and UniprotKB IDs

In [117]:
if not os.path.exists(os.path.join(tmp_data, 'ensembl2uniprot.csv')):
    res = requests.get(
        'http://www.ensembl.org/biomart/martservice',
        params={
            'query': """<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query  virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default" ><Attribute name = "ensembl_gene_id" /><Attribute name = "uniprotswissprot" /></Dataset></Query>"""
        }
    )
    ensembl2uniprot = pd.read_table(io.StringIO(res.text))
    ensembl2uniprot.to_csv(os.path.join(tmp_data, 'ensembl2uniprot.csv'))

In [118]:
ensembl2uniprot.head()

Unnamed: 0,Gene stable ID,UniProtKB/Swiss-Prot ID
0,ENSG00000198888,P03886
1,ENSG00000198763,P03891
2,ENSG00000198804,P00395
3,ENSG00000198712,P00403
4,ENSG00000228253,P03928


We approach the mapping as follows:
1. For each Uniprot ID, where they are reviewed, we get the corresponding Ensembl Gene ID
2. Qualifiers are transformed as edges that connect the Ensembl Gene ID to the GO ID


In [123]:
merged = pd.merge(gaf, ensembl2uniprot, left_on="DB Object ID", right_on="UniProtKB/Swiss-Prot ID", how="left")

In [124]:
merged.head()

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,DB:Reference,Evidence Code,With (or) From,Aspect,DB Object Name,DB Object Synonym,DB Object Type,Taxon,Date,Assigned By,Annotation Extension,Gene Product Form ID,Gene stable ID,UniProtKB/Swiss-Prot ID
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20240129,UniProt,,,ENSG00000177144,A0A024RBG1
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20240129,UniProt,,,ENSG00000177144,A0A024RBG1
2,UniProtKB,A0A024RBG1,NUDT4B,located_in,GO:0005829,GO_REF:0000052,IDA,,C,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20230619,HPA,,,ENSG00000177144,A0A024RBG1
3,UniProtKB,A0A075B6H7,IGKV3-7,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20240129,UniProt,,,ENSG00000282310,A0A075B6H7
4,UniProtKB,A0A075B6H7,IGKV3-7,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20240129,UniProt,,,ENSG00000243063,A0A075B6H7


In [127]:
merged['Qualifier'].unique()

array(['enables', 'located_in', 'involved_in', 'part_of', 'NOT|enables',
       'NOT|involved_in', 'is_active_in', 'NOT|colocalizes_with',
       'colocalizes_with', 'acts_upstream_of_or_within', 'contributes_to',
       'NOT|located_in', 'NOT|part_of',
       'acts_upstream_of_positive_effect',
       'NOT|acts_upstream_of_or_within', 'acts_upstream_of',
       'acts_upstream_of_negative_effect',
       'acts_upstream_of_or_within_positive_effect',
       'acts_upstream_of_or_within_negative_effect', 'NOT|contributes_to',
       'NOT|acts_upstream_of_or_within_negative_effect',
       'NOT|is_active_in'], dtype=object)

In [128]:
edges = []
relationships = {}
for idx, row in tqdm(merged.iterrows()):
    d = row.to_dict()
    gene_id = d.get('Gene stable ID')
    go_id = d.get('GO ID')
    if gene_id is None:
        continue
    
    qualifier = d.get('Qualifier')
    
    aspect = d.get('Aspect')
    
    to_label = None
    
    if aspect == 'F':
        to_label = 'MolecularFunction'
    elif aspect == 'C':
        to_label = 'CellularComponent'
    elif aspect == 'P':
        to_label = 'BiologicalProcess'
        
    label = qualifier.replace('_', ' ').replace('|', ' ').lower()
    
    relationships[label] = {
        'from': 'Gene',
        'to': to_label
    }
    
    edge_properties = {k: v for k,v in {
        'evidence_code': d.get('Evidence Code'),
        'citation_reference': d.get('DB:Reference').split('|'),
        'with_or_from': d.get('With (or) From'),
        'assigning_authority': d.get('Assigned By'),
        'date': d.get('Date'),
        'annotation_extension': d.get('Annotation Extension')
    }.items() if v is not None}

    
    
    edge = {
        'from': {
            'uuid': gene_id
        },
        'to': {
            'uuid': go_id
        },
        'label': label,
        'properties': edge_properties
    }
    edges.append(edge)
    

1909271it [02:11, 14480.51it/s]


In [133]:
with gzip.open(os.path.join(processed_data, 'GAF_node.jsonl.gz'), 'wt') as f:
    print('Writing Nodes to File')
    for n in tqdm([]):
        f.write(json.dumps(n) + '\n')
with gzip.open(os.path.join(processed_data, 'GAF_edge.jsonl.gz'), 'wt') as f:
    print('Writing Edges to File')
    for e in tqdm(edges):
        f.write(json.dumps(e) + '\n')
metadata = {
    "_meta": {
        "version": "0.1.0",
        "date_updated": "2024-05-15",
        "maintainer": "BioBox Analytics"
    },
    "key": "gene_ontology_annotations",
    "name": "Gene Ontology - Annotations",
    "description": "A GO annotation is a statement about the function of a particular gene. GO annotations are created by associating a gene or gene product with a GO term. Together, these statements comprise a “snapshot” of current biological knowledge. Hence, GO annotations capture statements about how a gene functions at the molecular level, where in the cell it functions, and what biological processes (pathways, programs) it helps to carry out.",
    "source": [
        {
            "uri": "https://geneontology.org/docs/go-annotations/",
            "type": "doc"
        },
        {
            "uri": "https://current.geneontology.org/annotations/goa_human.gaf.gz",
            "type": "data",
            "version": "2024-04-24"
        }
    ],
    "concepts": {},
    "relationships": relationships
}

with open(os.path.join(processed_data, 'GAF_metadata.json'), 'w') as f:
    json.dump(metadata, f)

Writing Nodes to File


0it [00:00, ?it/s]


Writing Edges to File


100%|█████████████████████████████████████████████████████████████████████| 1909271/1909271 [00:16<00:00, 117784.03it/s]


In [130]:
relationships

{'enables': {'from': 'Gene', 'to': 'MolecularFunction'},
 'located in': {'from': 'Gene', 'to': 'CellularComponent'},
 'involved in': {'from': 'Gene', 'to': 'BiologicalProcess'},
 'part of': {'from': 'Gene', 'to': 'CellularComponent'},
 'not enables': {'from': 'Gene', 'to': 'MolecularFunction'},
 'not involved in': {'from': 'Gene', 'to': 'BiologicalProcess'},
 'is active in': {'from': 'Gene', 'to': 'CellularComponent'},
 'not colocalizes with': {'from': 'Gene', 'to': 'CellularComponent'},
 'colocalizes with': {'from': 'Gene', 'to': 'CellularComponent'},
 'acts upstream of or within': {'from': 'Gene', 'to': 'BiologicalProcess'},
 'contributes to': {'from': 'Gene', 'to': 'MolecularFunction'},
 'not located in': {'from': 'Gene', 'to': 'CellularComponent'},
 'not part of': {'from': 'Gene', 'to': 'CellularComponent'},
 'acts upstream of positive effect': {'from': 'Gene',
  'to': 'BiologicalProcess'},
 'not acts upstream of or within': {'from': 'Gene', 'to': 'BiologicalProcess'},
 'acts upstr

# Human Phenotype Ontology

This part will require you to download the hpo json serialized data from [here](https://hpo.jax.org/app/data/ontology). Place it hp.json file inside the ontologies directory

In [174]:
hp_owl = rdflib.Graph().parse('../resources/ontologies/hp.owl')
hp = json.loads(hp_owl.serialize(format='json-ld'))

In [175]:
hp[0]

{'@id': '_:Naa8d294b59a7411d8be7c1be36f887d2',
 '@type': ['http://www.w3.org/2002/07/owl#Axiom'],
 'http://www.geneontology.org/formats/oboInOwl#hasDbXref': [{'@value': 'OBOL:automatic'}],
 'http://www.w3.org/2002/07/owl#annotatedProperty': [{'@id': 'http://www.geneontology.org/formats/oboInOwl#hasExactSynonym'}],
 'http://www.w3.org/2002/07/owl#annotatedSource': [{'@id': 'http://purl.obolibrary.org/obo/UBERON_0004251'}],
 'http://www.w3.org/2002/07/owl#annotatedTarget': [{'@value': 'bone of inferior member middle limb segment'}]}

In [176]:
dpHP = OntologyDataPack(_graph=hp_owl, _graph_objects=hp, filepath='')

In [179]:
hp_classes = [x for x in dpHP.classes if x.get('@id').startswith('http://purl.obolibrary.org/obo/HP_')]


{'@id': 'http://purl.obolibrary.org/obo/HP_0100952',
 '@type': ['http://www.w3.org/2002/07/owl#Class'],
 'http://purl.obolibrary.org/obo/IAO_0000115': [{'@value': 'An increase in size of the subarachnoid space associated with the lateral cerebral sulcus (Sylvian fissure).'}],
 'http://purl.org/dc/terms/creator': [{'@id': 'https://orcid.org/0009-0006-4530-3154'}],
 'http://www.geneontology.org/formats/oboInOwl#creation_date': [{'@value': '2011-12-02T04:47:51Z'}],
 'http://www.geneontology.org/formats/oboInOwl#hasDbXref': [{'@value': 'UMLS:C4020921'}],
 'http://www.geneontology.org/formats/oboInOwl#hasExactSynonym': [{'@value': 'Enlarged lateral fissure'},
  {'@value': 'Enlarged lateral sulcus'},
  {'@value': 'Enlarged sylvian fissure'}],
 'http://www.geneontology.org/formats/oboInOwl#id': [{'@value': 'HP:0100952'}],
 'http://www.w3.org/2000/01/rdf-schema#label': [{'@value': 'Enlarged sylvian cistern'}],
 'http://www.w3.org/2000/01/rdf-schema#subClassOf': [{'@id': 'http://purl.obolibrary

In [180]:
all_hp_pkeys = set()
for c in hp_classes:
    all_hp_pkeys.update(c.keys())
all_hp_pkeys

{'@id',
 '@type',
 'http://purl.obolibrary.org/obo/IAO_0000115',
 'http://purl.obolibrary.org/obo/IAO_0000233',
 'http://purl.obolibrary.org/obo/RO_0002581',
 'http://purl.org/dc/elements/1.1/creator',
 'http://purl.org/dc/elements/1.1/date',
 'http://purl.org/dc/terms/contributor',
 'http://purl.org/dc/terms/creator',
 'http://purl.org/dc/terms/date',
 'http://www.geneontology.org/formats/oboInOwl#creation_date',
 'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId',
 'http://www.geneontology.org/formats/oboInOwl#hasBroadSynonym',
 'http://www.geneontology.org/formats/oboInOwl#hasDbXref',
 'http://www.geneontology.org/formats/oboInOwl#hasExactSynonym',
 'http://www.geneontology.org/formats/oboInOwl#hasNarrowSynonym',
 'http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym',
 'http://www.geneontology.org/formats/oboInOwl#id',
 'http://www.geneontology.org/formats/oboInOwl#inSubset',
 'http://www.w3.org/2000/01/rdf-schema#comment',
 'http://www.w3.org/2000/01/rdf-sch

In [None]:
nodes = []
edges = []
hp_pkeys = [
    {
        'name': 'hasDbXref',
        'key': 'http://www.geneontology.org/formats/oboInOwl#hasDbXref',
        'collapse': False
    },
    {
        'name': 'hasBroadSynonym',
        'key': 'http://www.geneontology.org/formats/oboInOwl#hasBroadSynonym',
        'collapse': False
    },
    {
        'name': 'hasExactSynonym',
        'key': 'http://www.geneontology.org/formats/oboInOwl#hasExactSynonym',
        'collapse': False
    }
]
for c in dpHP.classes:
    n = obo_class2node(c, hp_pkeys)
    if n is not None and n['uuid'].startswith('HP:'):
        labels = ['Phenotype']
        node = {
            '_id': n.get('uuid'),
            'labels': labels,
            'properties': n
        }
        nodes.append(node)
    
    e = [e for e in obo_class2edges(c) if e is not None]
    
    edges.extend(e)

print(f'Nodes: {len(nodes)}')
print(f'Edges: {len(edges)}')

with gzip.open(os.path.join(processed_data, 'HP_node.jsonl.gz'), 'wt') as f:
    print('Writing Nodes to File')
    for n in tqdm(nodes):
        f.write(json.dumps(n) + '\n')
with gzip.open(os.path.join(processed_data, 'HP_edge.jsonl.gz'), 'wt') as f:
    print('Writing Edges to File')
    for e in tqdm(edges):
        f.write(json.dumps(e) + '\n')
metadata = {
    "_meta": {
        "version": "0.1.0",
        "date_updated": "2024-05-15",
        "maintainer": "BioBox Analytics"
    },
    "key": "human_phenotype",
    "name": "Human Phenotype Ontology (HPO)",
    "description": "The Human Phenotype Ontology (HPO) provides a standardized vocabulary of phenotypic abnormalities and clinical features encountered in human disease.",
    "source": [
        {
            "uri": "https://hpo.jax.org/app/about",
            "type": "doc"
        },
        {
            "uri": "http://purl.obolibrary.org/obo/hp.owl",
            "type": "data",
            "version": "2024-04-26"
        }
    ],
    "concepts": {
        "BiologicalProcess": {
            "label": "Biological Process",
            "dbLabel": "BiologicalProcess",
            "definition": "A biological process is the execution of a genetically-encoded biological module or program. It consists of all the steps required to achieve the specific biological objective of the module. A biological process is accomplished by a particular set of molecular functions carried out by specific gene products (or macromolecular complexes), often in a highly regulated manner and in a particular temporal sequence.",
            "object_count": len([x for x in nodes if 'BiologicalProcess' in x.get('labels')])
        },
        "CellularComponent": {
            "label": "Cellular Component",
            "dbLabel": "CellularComponent",
            "definition": "A location, relative to cellular compartments and structures, occupied by a macromolecular machine. There are three types of cellular components described in the gene ontology: (1) the cellular anatomical entity where a gene product carries out a molecular function (e.g., plasma membrane, cytoskeleton) or membrane-enclosed compartments (e.g., mitochondrion); (2) virion components, where viral proteins act, and (3) the stable macromolecular complexes of which gene product are parts (e.g., the clathrin complex).",
            "object_count": len([x for x in nodes if 'CellularComponent' in x.get('labels')])
        },
        "MolecularFunction": {
            "label": "Molecular Function",
            "dbLabel": "MolecularFunction",
            "definition": "A molecular process that can be carried out by the action of a single macromolecular machine, usually via direct physical interactions with other molecular entities. Function in this sense denotes an action, or activity, that a gene product (or a complex) performs",
            "object_count": len([x for x in nodes if 'MolecularFunction' in x.get('labels')])
        }
    },
    "relationships": {}
}

with open(os.path.join(processed_data, 'HP_metadata.json'), 'w') as f:
    json.dump(metadata, f)

# Human Phenotype Annotation

In [186]:
def read_hpoa(filepath):
    with open(filepath, 'r') as file:
        for i, line in enumerate(file):
            if not line.startswith('#'):
                break
    df = pd.read_table(filepath, sep='\t', skiprows=i)
    df.replace({np.nan: None}, inplace=True)
    return df

In [187]:
hpoa_df = read_hpoa(os.path.join(tmp_data, 'phenotype.hpoa'))

  df = pd.read_table(filepath, sep='\t', skiprows=i)


In [188]:
hpoa_df.head()

Unnamed: 0,database_id,disease_name,qualifier,hpo_id,reference,evidence,onset,frequency,sex,modifier,aspect,biocuration
0,OMIM:619340,Developmental and epileptic encephalopathy 96,,HP:0011097,PMID:31675180,PCS,,1/2,,,P,HPO:probinson[2021-06-21]
1,OMIM:619340,Developmental and epileptic encephalopathy 96,,HP:0002187,PMID:31675180,PCS,,1/1,,,P,HPO:probinson[2021-06-21]
2,OMIM:619340,Developmental and epileptic encephalopathy 96,,HP:0001518,PMID:31675180,PCS,,1/2,,,P,HPO:probinson[2021-06-21]
3,OMIM:619340,Developmental and epileptic encephalopathy 96,,HP:0032792,PMID:31675180,PCS,,1/2,,,P,HPO:probinson[2021-06-21]
4,OMIM:619340,Developmental and epileptic encephalopathy 96,,HP:0011451,PMID:31675180,PCS,,1/2,,,P,HPO:probinson[2021-06-21]
