In [2]:
import re
from itertools import chain
import networkx
import requests
from mydisease.utils import read_obo
from networkx.readwrite import json_graph
import json
from collections import defaultdict
from typing import List

from pymongo import MongoClient
# Open a MongoDB connection; no arguments are given, so the driver's default
# connection settings are used.
client = MongoClient()
# NOTE: despite the name, `db` is reached as `client.mydisease.HPO`, i.e. the
# `HPO` collection inside the `mydisease` database, not a database handle.
db = client.mydisease.HPO

In [4]:
def graph_to_d(graph):
    """
    Flatten an ontology graph into a dict of MongoDB-ready documents.

    Every node's attribute dict becomes one document. Each edge is recorded
    on its source node under the edge key, as a list of target node ids, and
    the node's ``id`` field is moved to ``_id``.

    The ``source``/``target`` values of each link are used as positions in
    the node list returned by ``node_link_data``.

    :param graph: A networkx graph made from reading ontology
    :type graph: networkx.classes.multidigraph.MultiDiGraph
    :return: mapping of node id -> node document
    :rtype: dict
    """
    payload = json_graph.node_link_data(graph)
    records = payload['nodes']

    # position in the node list -> node id, used to resolve link targets
    position_to_id = dict((pos, rec['id']) for pos, rec in enumerate(records))

    # store the edges (links) on their source node, grouped by edge key
    for edge in payload['links']:
        relation, src, dst = edge['key'], edge['source'], edge['target']
        records[src].setdefault(relation, set()).add(position_to_id[dst])

    # for mongo insertion: 'id' becomes '_id' and sets become plain lists
    documents = {}
    for rec in records:
        rec['_id'] = rec.pop('id')
        set_fields = [field for field, val in rec.items() if isinstance(val, set)]
        for field in set_fields:
            rec[field] = list(rec[field])
        documents[rec['_id']] = rec

    return documents

In [51]:
def parse_synonym(line: str):
    """
    Extract the synonym text from an OBO ``synonym`` value.

    The synonym is the first double-quoted string in the line. Backslash
    escapes inside the quotes are honoured while locating the closing quote
    (the returned text is not unescaped), so escaped quotes in the synonym
    and quoted descriptions in the trailing dbxref list no longer cause the
    whole raw line to be returned.

    :param line: raw synonym value, e.g. '"text" EXACT [XREF:1]'
    :return: the text between the quotes, or ``line`` unchanged when no
        quoted string can be found
    """
    # line = "synonym: \"The other white meat\" EXACT MARKETING_SLOGAN [MEAT:00324, BACONBASE:03021]"
    quoted = re.search(r'"((?:[^"\\]|\\.)*)"', line)
    if quoted:
        return quoted.group(1)
    # Fallback for the one two-quote shape the pattern rejects (a backslash
    # directly before the closing quote): keep the historical slicing.
    if line.count('"') == 2:
        return line[line.index('"') + 1:line.rindex('"')]
    return line


def parse_def(line: str):
    """
    Split an OBO ``def`` value into its text and its trailing reference list.

    The definition is the first double-quoted string in the line; backslash
    escapes are honoured while locating the closing quote (the text is not
    unescaped). If the line ends with a bracketed list, its ", "-separated
    entries are returned as the references.

    :param line: raw def value, e.g. '"A description." [url:a, url:b]'
    :return: ``(definition, references)``. ``references`` is a list of
        stripped, non-empty entries when the line ends with ``[...]``
        (an empty list for ``[]``), otherwise ``None``.
    """
    # line = "\"A description.\" [url:http\://www.ncbi.goc/123, url:http\://www.ncbi.nlm.nih.gov/pubmed/15318016]"
    quoted = re.search(r'"((?:[^"\\]|\\.)*)"', line)
    if quoted:
        definition = quoted.group(1)
    elif line.count('"') == 2:
        # two-quote shape the pattern rejects: keep the historical slicing
        definition = line[line.index('"') + 1:line.rindex('"')]
    else:
        definition = line

    if line.endswith("]") and "[" in line:
        # Slice ends are exclusive, so stopping AT the closing bracket keeps
        # the final character of the last reference.
        inner = line[line.rindex("[") + 1:line.rindex("]")]
        stripped = [entry.strip() for entry in inner.split(", ")]
        # drop blanks so that an empty "[]" does not yield ['']
        endlist = [entry for entry in stripped if entry]
        return definition, endlist
    return definition, None

def parse_xref(xrefs: List[str]):
    """
    Reduce raw xref strings to bare identifiers with a uniform MeSH prefix.

    Entries without a ":" are discarded. For the rest, everything from the
    first space onwards is dropped, and identifiers starting with "MSH:",
    "MeSH:" or "Mesh:" have that spelling replaced by "MESH:".

    :param xrefs: raw xref values, e.g. 'MeSH:D008831 "Microcephaly"'
    :return: a new list of cleaned identifiers; the input list is not modified
    """
    mesh_spellings = ("MSH:", "MeSH:", "Mesh:")
    cleaned = []
    for raw in xrefs:
        if ":" not in raw:
            continue
        identifier = raw.split(" ")[0]
        for spelling in mesh_spellings:
            if identifier.startswith(spelling):
                identifier = identifier.replace(spelling, "MESH:")
        cleaned.append(identifier)

    return cleaned

In [52]:
"""
url = "http://purl.obolibrary.org/obo/doid.obo"
r = requests.get(url)
graph = read_obo(r.text.splitlines())
"""
# Read the local copy of the ontology file; the context manager closes the
# file handle once its lines have been read.
with open("/home/gstupp/projects/biothings/mydisease/mydisease/data/hp.obo") as obo_file:
    graph = read_obo(obo_file.readlines())
# Flatten the ontology graph into {term id: document}.
d = graph_to_d(graph)

In [53]:
# Show the xref values stored for one term as loaded from the ontology file.
d['HP:0000252']['xref']

['MeSH:D008831 "Microcephaly"', 'UMLS:C0025958 "Microcephaly"']

In [54]:
# Clean every term document in place: normalise xrefs, strip synonym
# decoration, and split each definition from its reference list.
for value in d.values():
    if 'xref' in value:
        value['xref'] = parse_xref(value['xref'])
    if 'synonym' in value:
        value['synonym'] = [parse_synonym(raw_synonym) for raw_synonym in value['synonym']]
    if 'def' in value:
        parsed_def = parse_def(value['def'])
        value['def'], ref = parsed_def
        # only record references when the definition actually carried some
        if ref:
            value['def_ref'] = ref

In [55]:
# Show the same term's xref values again, to compare with the earlier output.
d['HP:0000252']['xref']

['MESH:D008831', 'UMLS:C0025958']

In [56]:
from collections import Counter
# Tally the prefix (the text before the first ":") of every xref across all
# terms and display the 100 most frequent prefixes.
Counter(
    xref.split(":")[0]
    for term in d.values()
    for xref in term.get('xref', [])
).most_common(100)

[('UMLS', 3913),
 ('MESH', 1065),
 ('SNOMEDCT', 277),
 ('MEDDRA', 93),
 ('ICD-10', 36),
 ('pmid', 22),
 ('EPCC', 13),
 ('MP', 11),
 ('MPATH', 5),
 ('NCIT', 5),
 ('ICD-O', 3),
 ('UToronto', 3),
 ('ICD-9', 3),
 ('DOI', 2),
 ('ICD10', 2),
 ('http', 1),
 ('DOID', 1),
 ('v', 1),
 ('SNOMEDct', 1),
 ('UMLS_CUI', 1),
 ('PMID', 1),
 ('NCI', 1)]

In [60]:
# Insert every term document into the collection bound to `db`.
# NOTE(review): each document carries an explicit `_id`, so re-running this
# cell against an already-populated collection will presumably fail on
# duplicate keys — confirm before re-executing.
db.insert_many(d.values())

<pymongo.results.InsertManyResult at 0x7f76a63a0f78>

In [61]:
# Fetch one stored document by its id to spot-check the insert.
db.find_one('HP:0000252')

{'_id': 'HP:0000252',
 'alt_id': ['HP:0001366', 'HP:0005485', 'HP:0005489', 'HP:0005497'],
 'comment': 'Head circumference is measured from just above the glabella (the most prominent point on the frontal bone above the root of the nose) to the most posterior prominent point of the occipital bone using a tape measure. Some standard charts are organized by centiles [Hall et al. [2007]], others by standard deviations [Farkas, [1981]]. It is important to add an indication of how far below the normal standard the head circumference is if an accurate assessment of this can be made. Microcephaly is an absolute term. The term relative microcephaly can be used when the head size centile is less than the centile for height, for example, head size at the 3rd centile with height at the 75% for age and sex. On prenatal ultrasound, microcephaly is diagnosed if the head circumference or the biparietal diameter is more than three standard deviations below the mean.',
 'def': 'Occipito-frontal (head) 