In [1]:
import re
from itertools import chain
import networkx
import requests
from mydisease.utils import read_obo
from mydisease.utils.common import list2dict
from networkx.readwrite import json_graph
import json
from collections import defaultdict
from typing import List

from pymongo import MongoClient
# Connect to MongoDB using pymongo's default connection settings (no host/port given).
client = MongoClient()
# Handle for the "HPO" collection inside the "mydisease" database; later cells drop and refill it.
db = client.mydisease.HPO

In [63]:
## Copied from do.ipynb with a modified parse_xref. TODO: make generic and import it
def graph_to_d(graph):
    """
    Flatten an ontology graph into MongoDB-ready documents keyed by lowercased id.

    Each edge is folded into its source node under the edge key (e.g. ``is_a``)
    as a collection of target node ids. Afterwards every node gets a lowercased
    ``_id`` in place of ``id``, lowercased ``alt_id``/``is_a`` values, loses
    ``property_value``, and has any remaining sets converted to lists.

    Edge endpoints are accepted both as positions in the node list (what
    networkx 1.x ``node_link_data`` emits) and as node ids (networkx >= 2.0).
    Integer endpoints are always read as positions.

    :param graph: A networkx graph made from reading ontology
    :type graph: networkx.classes.multidigraph.MultiDiGraph
    :return: mapping of lowercased node id to that node's document
    :rtype: dict
    """
    node_link_data = json_graph.node_link_data(graph)
    nodes = node_link_data['nodes']

    # Two lookups so either endpoint representation can be resolved.
    idx_id = {idx: node['id'] for idx, node in enumerate(nodes)}
    id_node = {node['id']: node for node in nodes}

    for link in node_link_data['links']:
        # store the edges (links) within the graph
        key = link['key']
        source = link['source']
        target = link['target']

        # Positional endpoint -> index into the node list; otherwise it is a node id.
        source_node = nodes[source] if isinstance(source, int) else id_node[source]
        target_id = idx_id[target] if isinstance(target, int) else target

        if key not in source_node:
            source_node[key] = set()
        source_node[key].add(target_id)

    # for mongo insertion
    for node in nodes:
        node['_id'] = node['id'].lower()
        if "alt_id" in node:
            node['alt_id'] = [x.lower() for x in node['alt_id']]
        if "is_a" in node:
            node['is_a'] = [x.lower() for x in node['is_a']]
        if "property_value" in node:
            del node['property_value']
        del node['id']
        # Only existing keys are reassigned, so mutating during iteration is safe.
        for k, v in node.items():
            if isinstance(v, set):
                node[k] = list(v)

    return {node['_id']: node for node in nodes}


def parse_synonym(line: str):
    r"""
    Extract the quoted text from an OBO ``synonym`` value.

    Returns the first double-quoted string, honouring backslash escapes, with
    escaped quotes (``\"``) unescaped. Text after the closing quote (scope,
    type, dbxref list) is ignored even if it contains further quotes. A line
    without a well-formed quoted string is returned unchanged.

    >>> parse_synonym('"The other white meat" EXACT MARKETING_SLOGAN [MEAT:00324, BACONBASE:03021]')
    'The other white meat'
    >>> parse_synonym(r'"5\" scar" EXACT []')
    '5" scar'
    """
    # A quoted string: opening quote, then plain chars or backslash-escaped chars, closing quote.
    quoted = re.search(r'"((?:[^"\\]|\\.)*)"', line)
    if quoted:
        return quoted.group(1).replace('\\"', '"')
    # No well-formed quoted string: slice between the two quotes if there are exactly two.
    return line[line.index("\"")+1:line.rindex("\"")] if line.count("\"") == 2 else line


def parse_def(line: str):
    r"""
    Parse an OBO ``def`` value into its text and its trailing reference list.

    Returns a tuple ``(definition, references)``:

    * ``definition`` is the first double-quoted string, honouring backslash
      escapes, with escaped quotes (``\"``) unescaped. If the line has no
      well-formed quoted string it is the whole line.
    * ``references`` is the ``", "``-separated content of the final ``[...]``
      with all backslashes removed, or ``None`` when the line does not end in a
      bracketed list. An empty ``[]`` yields ``['']``.

    >>> parse_def(r'"A description." [url:http\://www.ncbi.goc/123, url:http\://www.ncbi.nlm.nih.gov/pubmed/15318016]')
    ('A description.', ['url:http://www.ncbi.goc/123', 'url:http://www.ncbi.nlm.nih.gov/pubmed/15318016'])
    >>> parse_def(r'"A \"quoted\" word." [HPO:probinson]')
    ('A "quoted" word.', ['HPO:probinson'])
    """
    # A quoted string: opening quote, then plain chars or backslash-escaped chars, closing quote.
    quoted = re.search(r'"((?:[^"\\]|\\.)*)"', line)
    if quoted:
        definition = quoted.group(1).replace('\\"', '"')
    else:
        # No well-formed quoted string: slice between the two quotes if there are exactly two.
        definition = line[line.index("\"")+1:line.rindex("\"")] if line.count("\"") == 2 else line

    if line.endswith("]") and line.count("["):
        # Content between the last '[' and the last ']' is the reference list.
        endliststr = line[line.rfind("[")+1:line.rfind("]")]
        # OBO escapes characters such as ':' with a backslash; strip them all.
        endlist = [x.strip().replace("\\", "") for x in endliststr.split(", ")]
        return definition, endlist
    return definition, None


def parse_xref(xrefs: List[str]):
    """
    Normalize ``PREFIX:ID`` cross-references and group them with ``list2dict``.

    For every input string:

    * entries without a ``:`` are dropped;
    * the prefix (text before the first ``:``) is lowercased;
    * everything from the first space onward is discarded
      (this step differs from the DO notebook's version);
    * the prefixes ``msh``, ``ordo``, ``umls`` and ``icd-10`` are renamed to
      ``mesh``, ``orphanet``, ``umls_cui`` and ``icd10cm``.

    The cleaned ``prefix:id`` strings are handed to ``list2dict`` and its result
    is returned (presumably a dict of prefix -> list of ids; confirm against
    ``mydisease.utils.common.list2dict``).
    """
    renamed_prefixes = {
        "msh": "mesh",
        "ordo": "orphanet",
        "umls": "umls_cui",
        "icd-10": "icd10cm",
    }

    cleaned = []
    for raw in xrefs:
        if ":" not in raw:
            continue
        prefix, _, identifier = raw.partition(":")
        # Lowercase only the prefix, then keep just the first space-delimited token.
        token = (prefix.lower() + ":" + identifier).split(" ", 1)[0]
        head, colon, tail = token.partition(":")
        if colon and head in renamed_prefixes:
            token = renamed_prefixes[head] + ":" + tail
        cleaned.append(token)
    return list2dict(cleaned)

In [64]:
# Location of the HPO ontology dump to load.
HPO_OBO_PATH = "/home/gstupp/projects/biothings/mydisease/mydisease/data/hp.obo"

# Read inside a context manager so the file handle is closed once the lines are loaded.
with open(HPO_OBO_PATH) as obo_file:
    graph = read_obo(obo_file.readlines())
d = graph_to_d(graph)

# Normalize the xref, synonym and def fields of every term document in place.
for value in d.values():
    if 'xref' in value:
        value['xref'] = parse_xref(value['xref'])
    if 'synonym' in value:
        value['synonym'] = list(map(parse_synonym, value['synonym']))
    if 'def' in value:
        value['def'], ref = parse_def(value['def'])
        if ref:
            if 'xref' in value:
                # Merge per prefix instead of dict.update(): update() would replace the
                # id list of any prefix present in both the xref field and the
                # definition references, silently losing ids.
                # Relies on parse_xref returning prefix -> list of ids.
                for prefix, ids in parse_xref(ref).items():
                    known = value['xref'].setdefault(prefix, [])
                    for identifier in ids:
                        if identifier not in known:
                            known.append(identifier)
            else:
                value['xref'] = parse_xref(ref)
            # Drop the "hpo" and "ddd" prefixes (only for terms whose definition carried references).
            value['xref'] = {k: v for k, v in value['xref'].items() if k not in {"hpo", "ddd"}}

In [65]:
# Spot-check the parsed xrefs of a single term.
d['hp:0000252']['xref']

{'mesh': ['D008831'],
 'pmid': ['19125436', '9683597'],
 'umls_cui': ['C0025958']}

In [66]:
from collections import Counter
# Tally how many terms carry each xref prefix (iterating an xref dict yields its keys),
# and show the 100 most frequent.
Counter(
    prefix.split(":")[0]
    for record in d.values()
    for prefix in record.get('xref', [])
).most_common(100)

[('umls_cui', 3706),
 ('pmid', 1093),
 ('mesh', 1066),
 ('snomedct', 266),
 ('utoronto', 121),
 ('meddra', 93),
 ('goc', 88),
 ('eurenomics', 64),
 ('orcid', 40),
 ('icd10cm', 36),
 ('mp', 30),
 ('eom', 19),
 ('neuromics', 16),
 ('emedicine', 13),
 ('epcc', 13),
 ('ki', 12),
 ('icm', 10),
 ('uk', 10),
 ('nihr', 7),
 ('uncl', 7),
 ('monarch', 6),
 ('ncit', 5),
 ('http', 5),
 ('hp', 5),
 ('mpath', 5),
 ('isbn', 5),
 ('https', 4),
 ('ukt', 4),
 ('gc', 3),
 ('icd-o', 3),
 ('cineas', 3),
 ('icd-9', 3),
 ('ukb', 3),
 ('phenotips', 3),
 ('icd10', 2),
 ('doi', 2),
 ('cochrane', 1),
 ('nci', 1),
 ('dsm-iv', 1),
 ('omim', 1),
 ('imm', 1),
 ('www', 1),
 ('v', 1),
 ('ordcid', 1),
 ('orcird', 1),
 ('isca', 1),
 ('doid', 1),
 ('hppo', 1),
 ('uhpo', 1),
 ('dd', 1),
 ('dsm', 1)]

In [67]:
# Replace the collection's contents: drop what is stored, then insert every parsed term document.
db.drop()
db.insert_many(d.values())

<pymongo.results.InsertManyResult at 0x7fa57855b480>

In [68]:
# Read one document back by its _id to confirm the insert.
db.find_one('hp:0000252')

{'_id': 'hp:0000252',
 'alt_id': ['hp:0001366', 'hp:0005485', 'hp:0005489', 'hp:0005497'],
 'comment': 'Head circumference is measured from just above the glabella (the most prominent point on the frontal bone above the root of the nose) to the most posterior prominent point of the occipital bone using a tape measure. Some standard charts are organized by centiles [Hall et al. [2007]], others by standard deviations [Farkas, [1981]]. It is important to add an indication of how far below the normal standard the head circumference is if an accurate assessment of this can be made. Microcephaly is an absolute term. The term relative microcephaly can be used when the head size centile is less than the centile for height, for example, head size at the 3rd centile with height at the 75% for age and sex. On prenatal ultrasound, microcephaly is diagnosed if the head circumference or the biparietal diameter is more than three standard deviations below the mean.',
 'def': 'Occipito-frontal (head) 