In [1]:
### Merge

from pymongo import MongoClient
from collections import Counter
import networkx as nx
import matplotlib.pyplot as plt
from pymongo import MongoClient
from collections import Counter
import seaborn as sns
from pprint import pprint
from collections import defaultdict
%matplotlib inline

In [2]:
# Collect every primary identifier (_id) found in any of the source collections.
db_names = ['orphanet','DO','ctdbase', 'disgenet', 'mesh','omim']
# Build the client once: every MongoClient owns its own connection pool, so
# constructing one per collection opens connections that are never reused.
client = MongoClient()
all_ids = set()
for db_name in db_names:
    db = client.mydisease[db_name]
    # Projection keeps only _id, which is all this cell needs from each document.
    all_ids.update(doc['_id'] for doc in db.find({}, {'_id': 1}))

In [3]:
# Number of collected identifiers per namespace prefix (the text before the first ':').
Counter(identifier.partition(":")[0] for identifier in all_ids)

Counter({'DOID': 6966,
         'MESH': 6798,
         'OMIM': 24903,
         'ObsoleteClass': 1,
         'Orphanet': 9947})

In [4]:
# build id graph
# Undirected graph: nodes are identifiers, and an edge joins a document's _id
# to each identifier listed in that document's 'xref' field.
g = nx.Graph()
db_names = ['orphanet','DO','ctdbase', 'disgenet','mesh','omim']
# Build the client once instead of once per collection (each MongoClient
# owns its own connection pool).
client = MongoClient()
for db_name in db_names:
    db = client.mydisease[db_name]
    # Only documents that actually carry an xref field contribute edges.
    for doc in db.find({'xref':{'$exists': True}},{'xref':1}):
        # assumes doc['xref'] is a list of ID strings in every source — TODO confirm
        g.add_edges_from((doc['_id'], xref) for xref in doc['xref'])

### How many DOIDs are within 1 or 2 hops from each type of ID?

In [5]:
def num_doids_in_sg(g, cutoff, ids=None):
    """
    Tally, per ID prefix, how many DOIDs lie within `cutoff` hops of each ID.

    Parameters
    ----------
    g : graph
        ID graph. Must support `in` membership tests and be accepted by
        `nx.single_source_shortest_path_length`.
    cutoff : int
        Maximum number of hops explored from each starting ID.
    ids : iterable of str, optional
        Starting identifiers. Defaults to the notebook-level `all_ids`.

    Returns
    -------
    dict
        `{prefix: Counter({n_doids: n_ids})}`: for each non-DOID prefix, how
        many starting IDs reach exactly `n_doids` DOID nodes within `cutoff`
        hops. IDs that start with "DOID:" or are absent from `g` are skipped.
    """
    if ids is None:
        ids = all_ids
    tallies = defaultdict(Counter)
    for node_id in ids:
        # DOIDs are the targets being counted, and IDs missing from the graph
        # have no cross-references at all, so neither is tallied.
        if node_id.startswith("DOID:") or node_id not in g:
            continue
        reachable = nx.single_source_shortest_path_length(g, node_id, cutoff=cutoff)
        # The start node is included in `reachable`, but it is never a DOID here.
        n_doids = sum(1 for other in reachable if other.split(":", 1)[0] == "DOID")
        tallies[node_id.split(":", 1)[0]][n_doids] += 1
    return dict(tallies)

In [6]:
# Per source prefix: how many non-DO IDs reach 0, 1, 2, ... DOIDs within 1 hop.
num_doids_in_sg(g, 1)

{'MESH': Counter({0: 540,
          1: 2333,
          2: 147,
          3: 32,
          4: 8,
          5: 2,
          6: 2,
          9: 1,
          16: 1}),
 'OMIM': Counter({0: 2652, 1: 3683, 2: 62, 3: 1}),
 'Orphanet': Counter({0: 5050, 1: 469, 2: 2, 3: 2, 4: 2, 5: 1})}

In [7]:
# Same tally, but allowing paths of up to 2 hops.
num_doids_in_sg(g, 2)

{'MESH': Counter({0: 505,
          1: 2361,
          2: 154,
          3: 32,
          4: 8,
          5: 2,
          6: 2,
          9: 1,
          16: 1}),
 'OMIM': Counter({0: 2523, 1: 3680, 2: 158, 3: 17, 4: 14, 5: 5, 6: 1}),
 'Orphanet': Counter({0: 3035,
          1: 2167,
          2: 211,
          3: 55,
          4: 20,
          5: 8,
          6: 9,
          7: 1,
          8: 5,
          9: 6,
          10: 4,
          11: 1,
          16: 1,
          18: 1,
          21: 2})}

In [8]:
# Same tally with a 100-hop cutoff.
# NOTE(review): presumably large enough to cover each ID's whole connected component — not verified.
num_doids_in_sg(g, 100)

{'MESH': Counter({0: 318,
          1: 1599,
          2: 372,
          3: 200,
          4: 143,
          5: 53,
          6: 66,
          7: 40,
          8: 43,
          9: 68,
          10: 22,
          11: 54,
          12: 16,
          13: 21,
          14: 16,
          15: 15,
          21: 3,
          30: 6,
          33: 11}),
 'OMIM': Counter({0: 2201,
          1: 1752,
          2: 704,
          3: 290,
          4: 246,
          5: 116,
          6: 60,
          7: 119,
          8: 132,
          9: 71,
          10: 188,
          11: 106,
          12: 6,
          13: 227,
          14: 79,
          15: 16,
          21: 21,
          30: 53,
          33: 11}),
 'Orphanet': Counter({0: 2934,
          1: 1236,
          2: 318,
          3: 194,
          4: 167,
          5: 111,
          6: 70,
          7: 57,
          8: 47,
          9: 63,
          10: 56,
          11: 86,
          12: 15,
          13: 75,
          14: 31,
          15: 17,
  

In [9]:
## Between one hop and two hops, MESH and OMIM barely change, but we can link ~2k more Orphanet IDs to a DOID.

In [10]:
def get_equiv_doid(g, did, cutoff=2):
    """
    For a given ID, get the DOIDs it is equivalent to within `cutoff` hops.

    Parameters
    ----------
    g : graph
        ID graph. Must support `in` membership tests and be accepted by
        `nx.single_source_shortest_path_length`.
    did : str
        Identifier to resolve, e.g. 'Orphanet:98306'.
    cutoff : int, optional
        Maximum number of hops to follow. Defaults to 2.

    Returns
    -------
    list of str
        `[did]` if `did` already starts with "DOID:"; `[]` if `did` is not a
        node of `g`; otherwise every node within `cutoff` hops whose name
        starts with "DOID:".
    """
    if did.startswith("DOID:"):
        # A DOID maps to itself; no graph lookup is needed.
        return [did]
    if did not in g:
        return []
    reachable = nx.single_source_shortest_path_length(g, did, cutoff=cutoff)
    return [node for node in reachable if node.startswith("DOID:")]

# Example: resolve an Orphanet ID to its DOID(s).
get_equiv_doid(g, 'Orphanet:98306')

['DOID:0050440']

In [17]:
# Example: resolve a UMLS CUI to its DOID(s).
get_equiv_doid(g, 'UMLS_CUI:C1720859')

['DOID:0050440']

In [18]:
## merge docs
# Build the client once and reuse it for every collection (each MongoClient
# owns its own connection pool).
client = MongoClient()
# make initial primary d with all DOID docs
db = client.mydisease.DO
d = {doc['_id']: {'disease_ontology': doc} for doc in db.find()}

# fill in from other sources
db_names = ['orphanet','ctdbase','disgenet','mesh','omim']
# DOIDs reachable through the xref graph that have no document in the DO
# collection. Kept for inspection rather than raising KeyError mid-merge.
missing_doids = set()
for db_name in db_names:
    db = client.mydisease[db_name]
    for doc in db.find():
        # Attach this source document to every DOID it resolves to.
        for doid in get_equiv_doid(g, doc['_id']):
            merged = d.get(doid)
            if merged is None:
                # The graph knows this DOID only as somebody's xref; there is
                # no primary DO record to merge into.
                missing_doids.add(doid)
                continue
            # Each source key holds the list of documents mapped to this DOID.
            merged.setdefault(db_name, []).append(doc)

In [19]:
# Which sources contributed records to this merged entry.
d['DOID:0050440'].keys()

dict_keys(['mesh', 'disease_ontology', 'ctdbase', 'orphanet', 'disgenet', 'omim'])

In [28]:
# For each DisGeNET record merged into this DOID: its _id mapped to the names of its genes.
[
    {record['_id']: [gene['geneName'] for gene in record['genes']]}
    for record in d['DOID:0050440']['disgenet']
]

[{'UMLS_CUI:C0271694': ['AKT2', 'LMNA', 'PPARG', 'PPP1R3A', 'CAV1']},
 {'UMLS_CUI:C1720859': ['LMNA']},
 {'UMLS_CUI:C1720860': ['LMNA']},
 {'UMLS_CUI:C1720861': ['PPARG']}]

In [20]:
# The Disease Ontology document stored under this merged entry.
d['DOID:0050440']['disease_ontology']

{'_id': 'DOID:0050440',
 'comment': 'Xref MGI.\\nOMIM mapping confirmed by DO. [SN].',
 'def': 'A lipodystrophy characterized by abnormal subcutaneous adipose tissue distribution beginning in late childhood or early adult life.',
 'def_ref': ['url:http\\://en.wikipedia.org/wiki/Familial_partial_lipodystrophy',
  'url:http\\://omim.org/entry/60860'],
 'is_a': ['DOID:811'],
 'name': 'familial partial lipodystrophy',
 'subset': ['DO_MGI_slim'],
 'synonym': ['Dunnigan Syndrome', 'Koberling-Dunnigan Syndrome'],
 'xref': ['MESH:D052496',
  'NCI:C84708',
  'OMIM:151660',
  'OMIM:604367',
  'OMIM:608600',
  'OMIM:613877',
  'OMIM:615238',
  'SNOMEDCT_US_2016_03_01:49292002',
  'UMLS_CUI:C0271694',
  'UMLS_CUI:C1720859',
  'UMLS_CUI:C1720860',
  'UMLS_CUI:C1720861']}

In [21]:
# _id of the first ctdbase document merged into this entry.
d['DOID:0050440']['ctdbase'][0]['_id']

'MESH:D052496'

In [22]:
# Gene associations of every Orphanet record merged into this DOID
# (an empty list where a record has none).
[
    record.get('disease_gene_associations', [])
    for record in d['DOID:0050440']['orphanet']
]

[[{'dga_status': 'Assessed',
   'dga_type': 'Disease-causing germline mutation(s) in',
   'gene_name': 'peroxisome proliferator activated receptor gamma',
   'gene_symbol': 'PPARG',
   'gene_type': 'gene with protein product',
   'loci': ['3p25']}],
 [],
 [{'dga_status': 'Not yet assessed',
   'dga_type': 'Candidate gene tested in',
   'gene_name': 'lamin A/C',
   'gene_symbol': 'LMNA',
   'gene_type': 'gene with protein product',
   'loci': ['1q22']}],
 [{'dga_status': 'Assessed',
   'dga_type': 'Disease-causing germline mutation(s) (loss of function) in',
   'gene_name': 'cell death inducing DFFA like effector c',
   'gene_symbol': 'CIDEC',
   'gene_type': 'gene with protein product',
   'loci': ['3p25']}],
 [{'dga_status': 'Assessed',
   'dga_type': 'Disease-causing germline mutation(s) in',
   'gene_name': 'lamin A/C',
   'gene_symbol': 'LMNA',
   'gene_type': 'gene with protein product',
   'loci': ['1q22']}],
 [{'dga_status': 'Assessed',
   'dga_type': 'Disease-causing germline m

In [23]:
# Full Orphanet documents merged into this entry.
d['DOID:0050440']['orphanet']

[{'_id': 'Orphanet:79083',
  'alternative_term': ['PPARG-related FPLD',
   'Familial partial lipodystrophy type 3',
   'FPLD3'],
  'ave_age_of_onset': ['Adult'],
  'disease_gene_associations': [{'dga_status': 'Assessed',
    'dga_type': 'Disease-causing germline mutation(s) in',
    'gene_name': 'peroxisome proliferator activated receptor gamma',
    'gene_symbol': 'PPARG',
    'gene_type': 'gene with protein product',
    'loci': ['3p25']}],
  'mapping': {'E': ['OMIM:604367'], 'NTBT': ['ICD10CM:E88.1']},
  'parents': ['Orphanet:377788'],
  'part_of': ['Orphanet:98306'],
  'preferred_label': 'PPARG-related familial partial lipodystrophy',
  'prevalence': [{'mean_value': '10.0',
    'prevalence_geographic': 'Worldwide',
    'prevalence_qualification': 'Case',
    'prevalence_type': 'Cases/families',
    'prevalence_validation_status': 'Validated',
    'source': '16409151[PMID]'},
   {'mean_value': '0.0',
    'prevalence_class': '<1 / 1 000 000',
    'prevalence_geographic': 'Worldwide',