In [39]:
import re
from itertools import chain
import networkx
import requests
from mydisease.utils import read_obo
from networkx.readwrite import json_graph
import json
from collections import defaultdict
from typing import List

from pymongo import MongoClient
# MongoClient() is called with no arguments, so pymongo's default connection settings apply.
client = MongoClient()
# NOTE: despite the name, `db` is the "DO" collection inside the "mydisease" database,
# not a database handle; the parsed ontology documents are inserted into it below.
db = client.mydisease.DO

In [58]:
def graph_to_d(graph):
    """
    Flatten an ontology graph into a dict of MongoDB-ready documents.

    Each node's attribute dict becomes one document. Every edge is folded into the
    document of its source node: the edge key becomes a field whose value is the
    list of target node ids. Finally the node's ``id`` field is renamed to ``_id``.

    Both layouts produced by ``json_graph.node_link_data`` are accepted: node ids
    in ``source``/``target`` (networkx >= 2.0) and positions in the ``nodes`` list
    (networkx 1.x). A reference is looked up as a node id first and as a position
    only if no node has that id, so graphs whose node ids are themselves small
    integers are ambiguous under the 1.x layout.

    :param graph: A networkx graph made from reading ontology
    :type graph: networkx.classes.multidigraph.MultiDiGraph
    :return: mapping of node id -> document (the node's attributes, its outgoing
        edges grouped by edge key, and ``_id``)
    :rtype: dict
    """
    node_link_data = json_graph.node_link_data(graph)
    nodes = node_link_data['nodes']
    node_by_id = {node['id']: node for node in nodes}

    def _node_for(ref):
        # Resolve a link endpoint to its node dict: by id, else by list position.
        if ref in node_by_id:
            return node_by_id[ref]
        return nodes[ref]

    for link in node_link_data['links']:
        # store the edges (links) within the graph
        key = link['key']
        source_node = _node_for(link['source'])
        target_id = _node_for(link['target'])['id']

        # A set de-duplicates parallel edges that share key and target.
        if key not in source_node:
            source_node[key] = set()
        source_node[key].add(target_id)

    # for mongo insertion
    for node in nodes:
        node['_id'] = node.pop('id')
        # Only existing keys are reassigned, so mutating while iterating is safe.
        for k, v in node.items():
            if isinstance(v, set):
                # sets are not BSON-serialisable
                node[k] = list(v)

    return {node['_id']: node for node in nodes}


def parse_synonym(line: str):
    """
    Extract the synonym text from an OBO ``synonym`` line.

    Example: ``synonym: "The other white meat" EXACT MARKETING_SLOGAN [MEAT:00324, BACONBASE:03021]``
    gives ``The other white meat``.

    The first double-quoted string is taken. Backslash-escaped characters inside
    it (such as ``\\"``) do not end the string and are returned exactly as
    written, i.e. still escaped. Quoted descriptions in the trailing dbxref list
    are ignored.

    :param line: synonym line or value
    :return: the quoted text, or ``line`` unchanged if it holds no complete
        double-quoted string
    """
    # Body of the quotes: any run of non-quote/non-backslash chars or backslash escapes.
    match = re.search(r'"((?:[^"\\]|\\.)*)"', line)
    return match.group(1) if match else line


def parse_def(line: str):
    """
    Split an OBO ``def`` value into its definition text and its reference list.

    Example: ``"A description." [url:http\\://www.ncbi.goc/123, url:http\\://www.ncbi.nlm.nih.gov/pubmed/15318016]``
    gives ``("A description.", ["url:http\\://www.ncbi.goc/123", "url:http\\://www.ncbi.nlm.nih.gov/pubmed/15318016"])``.

    The definition is the first double-quoted string. Backslash escapes inside it
    are kept as written and do not end the string. If there is no complete quoted
    string, the whole line is used as the definition. References are read from the
    last ``[...]`` group, and only when the line ends with ``]``. They are split
    on ``", "`` and stripped.

    :param line: value of a ``def`` tag
    :return: ``(definition, references)``; ``references`` is ``None`` when the
        line has no trailing bracket group or the group is empty
    """
    quoted = re.search(r'"((?:[^"\\]|\\.)*)"', line)
    definition = quoted.group(1) if quoted else line

    if not (line.endswith("]") and "[" in line):
        return definition, None

    # Slice ends are exclusive: stop AT the closing bracket so the last
    # character of the final reference is kept.
    inner = line[line.rfind("[") + 1:line.rfind("]")]
    references = [ref.strip() for ref in inner.split(", ")]
    # "[]" would otherwise produce [''], which is truthy for callers.
    references = [ref for ref in references if ref]
    return definition, references or None

def parse_xref(xrefs: List[str]):
    """
    Clean a list of cross-references.

    Entries without a ``:`` are dropped. Entries starting with ``MSH:`` or
    ``MeSH:`` have that text rewritten to ``MESH:``. The input list is not modified.

    :param xrefs: raw xref strings
    :return: new list of cleaned xref strings, in the original order
    """
    legacy_mesh_prefixes = ("MSH:", "MeSH:")
    cleaned = []
    for entry in xrefs:
        if ":" not in entry:
            continue
        for prefix in legacy_mesh_prefixes:
            if entry.startswith(prefix):
                entry = entry.replace(prefix, "MESH:")
                break
        cleaned.append(entry)
    return cleaned

# Sanity check: the MSH: entry should come back as MESH:, the other entries unchanged.
parse_xref(['MSH:D006954',  'SNOMEDCT_US_2016_03_01:190781009',  'SNOMEDCT_US_2016_03_01:34349009',  'UMLS_CUI:C0020481'])

['MESH:D006954',
 'SNOMEDCT_US_2016_03_01:190781009',
 'SNOMEDCT_US_2016_03_01:34349009',
 'UMLS_CUI:C0020481']

In [59]:
"""
url = "http://purl.obolibrary.org/obo/doid.obo"
r = requests.get(url)
graph = read_obo(r.text.splitlines())
"""
# Read the local copy of the Disease Ontology and flatten it into documents.
# NOTE: this absolute path is machine-specific; point it at your own checkout.
# The context manager closes the file handle; the encoding is given explicitly so
# the read does not depend on the machine's locale.
with open("/home/gstupp/projects/biothings/mydisease/mydisease/data/doid.obo", encoding="utf-8") as obo_file:
    graph = read_obo(obo_file.readlines())
d = graph_to_d(graph)

# Post-process the raw OBO fields of every document, in place.
for value in d.values():
    # Drop xrefs without a prefix and unify the MeSH prefix spellings.
    if 'xref' in value:
        value['xref'] = parse_xref(value['xref'])

    # Reduce each synonym line to its quoted text.
    if 'synonym' in value:
        value['synonym'] = [parse_synonym(raw) for raw in value['synonym']]

    # Split the definition into its text and, when present, its references.
    if 'def' in value:
        value['def'], ref = parse_def(value['def'])
        if ref:
            value['def_ref'] = ref

In [60]:
# Spot-check a single parsed record.
d['DOID:1171']

{'_id': 'DOID:1171',
 'comment': 'OMIM mapping confirmed by DO. [SN].',
 'is_a': ['DOID:1168'],
 'name': 'hyperlipoproteinemia type V',
 'synonym': ['familial hyperlipoproteinemia type V',
  'familial type 5 hyperlipoproteinemia (disorder)',
  'Fredrickson type V lipaemia'],
 'xref': ['MESH:D006954',
  'NCI:C35645',
  'OMIM:144650',
  'SNOMEDCT_US_2016_03_01:190781009',
  'SNOMEDCT_US_2016_03_01:34349009',
  'UMLS_CUI:C0020481']}

In [63]:
from collections import Counter

# Tally the xref prefixes (the text before the first ":") over all records
# and show the 100 most frequent ones.
prefix_counts = Counter(
    xref.split(":")[0]
    for record in d.values()
    for xref in record.get('xref', [])
)
prefix_counts.most_common(100)

[('SNOMEDCT_US_2016_03_01', 12445),
 ('UMLS_CUI', 6474),
 ('NCI', 4566),
 ('OMIM', 3815),
 ('ICD10CM', 3117),
 ('MESH', 2958),
 ('ICD9CM', 2624),
 ('ORDO', 520),
 ('EFO', 131),
 ('CSP', 39),
 ('KEGG', 39),
 ('url', 36),
 ('HP', 31),
 ('NCI2009_04D', 30),
 ('SNOMEDCT', 14),
 ('MEDDRA', 11),
 ('EFOpat_id', 10),
 ('SNOMEDCT_US_2015_03_01', 7),
 ('ICD10', 5),
 ('Orphanet', 4),
 ('CTV3', 3),
 ('OMM', 2),
 ('MTH', 1),
 ('NDFRT', 1),
 ('UMLS', 1),
 ('WHO', 1),
 ('NORD', 1),
 ('DERMO', 1),
 ('IC10CM', 1),
 ('ICD9', 1)]

In [65]:
# Bulk-insert every parsed document into the collection bound to `db`.
# NOTE(review): each document carries an explicit `_id`, so re-running this cell against
# an already-populated collection will presumably fail with duplicate-key errors — confirm,
# and empty the collection first (or switch to upserts) if the cell must be re-runnable.
db.insert_many(d.values())

<pymongo.results.InsertManyResult at 0x7ff53b2827e0>

In [66]:
# Read one document back from the collection to confirm the insert worked.
db.find_one()

{'_id': 'DOID:1171',
 'comment': 'OMIM mapping confirmed by DO. [SN].',
 'is_a': ['DOID:1168'],
 'name': 'hyperlipoproteinemia type V',
 'synonym': ['familial hyperlipoproteinemia type V',
  'familial type 5 hyperlipoproteinemia (disorder)',
  'Fredrickson type V lipaemia'],
 'xref': ['MESH:D006954',
  'NCI:C35645',
  'OMIM:144650',
  'SNOMEDCT_US_2016_03_01:190781009',
  'SNOMEDCT_US_2016_03_01:34349009',
  'UMLS_CUI:C0020481']}