In [1]:
### Merge
from pymongo import MongoClient
from collections import Counter
import networkx as nx
from tqdm import tqdm
import matplotlib.pyplot as plt
from pymongo import MongoClient
from collections import Counter
import seaborn as sns
from pprint import pprint
from collections import defaultdict
from mydisease.utils.common import dict2list
%matplotlib inline

In [2]:
# Source collections (inside the `mydisease` MongoDB database) that are merged below.
db_names = ['ctdbase', 'DO', 'disgenet', 'hpo', 'mesh', 'omim', 'orphanet']

# Gather the _id of every document across all source collections.
# One MongoClient is shared by every collection instead of opening a new
# client on each loop iteration.
client = MongoClient()
all_ids = set()
for db_name in db_names:
    db = client.mydisease[db_name]
    # Ask only for the _id field; nothing else from the documents is used here.
    all_ids.update(x['_id'] for x in db.find({}, {'_id': 1}))

In [3]:
# Tally the collected IDs by prefix (the text before the first ':').
Counter(curie.split(":", 1)[0] for curie in all_ids)

Counter({'doid': 6966,
         'mesh': 11438,
         'obsoleteclass': 1,
         'omim': 24910,
         'orphanet': 9947,
         'umls_cui': 7607})

In [4]:
# build id graph
# Undirected graph whose nodes are IDs: an edge links a document's _id to each
# of its cross-references, over every source collection.
client = MongoClient()  # one shared client rather than one per collection
g = nx.Graph()
for db_name in db_names:
    db = client.mydisease[db_name]
    # Only documents that carry an 'xref' field contribute edges.
    for doc in db.find({'xref': {'$exists': True}}, {'xref': 1}):
        # dict2list presumably flattens {'mesh': ['D052496']} into prefixed IDs
        # such as 'mesh:D052496' -- TODO confirm against mydisease.utils.common.
        for xref in dict2list(doc['xref']):
            g.add_edge(doc['_id'], xref)

### How many DOIDs are within 1 or 2 hops from each type of ID?

In [5]:
def num_doids_in_sg(g, cutoff, ids=None):
    """
    Count, per ID prefix, how many DOIDs are reachable from each non-DOID ID.

    Parameters
    ----------
    g : graph
        ID cross-reference graph. It must support ``in`` and be accepted by
        ``nx.single_source_shortest_path_length``.
    cutoff : int
        Maximum number of hops to follow from each starting ID.
    ids : iterable of str, optional
        IDs used as starting points. Defaults to the notebook-level
        ``all_ids`` set, which is what the two-argument form always used.

    Returns
    -------
    dict
        Maps each prefix (the text before the first ':') to a Counter of
        {number of DOIDs within ``cutoff`` hops: number of starting IDs}.
        IDs that start with "doid:" or that are absent from ``g`` are skipped.
    """
    if ids is None:
        ids = all_ids
    doid_counts = defaultdict(list)
    for start in ids:  # named `start`, not `id`, so the builtin is not shadowed
        if start.startswith("doid:") or start not in g:
            continue
        reachable = nx.single_source_shortest_path_length(g, start, cutoff=cutoff)
        # The starting node is in `reachable` too, but it is never a DOID here.
        n_doids = sum(1 for node in reachable if node.split(":", 1)[0] == "doid")
        doid_counts[start.split(":", 1)[0]].append(n_doids)
    return {prefix: Counter(counts) for prefix, counts in doid_counts.items()}

In [6]:
# Distribution of DOID counts reachable within 1 hop, per ID prefix.
num_doids_in_sg(g, 1)

{'mesh': Counter({0: 1052,
          1: 2482,
          2: 149,
          3: 32,
          4: 8,
          5: 2,
          6: 2,
          9: 1,
          16: 1}),
 'omim': Counter({0: 2655, 1: 3680, 2: 62, 3: 1}),
 'orphanet': Counter({0: 5049, 1: 470, 2: 2, 3: 2, 4: 2, 5: 1}),
 'umls_cui': Counter({0: 1026, 1: 1477, 2: 21})}

In [7]:
# Same distribution when up to 2 hops are allowed.
num_doids_in_sg(g, 2)

{'mesh': Counter({0: 1005,
          1: 2519,
          2: 158,
          3: 32,
          4: 9,
          5: 2,
          6: 2,
          9: 1,
          16: 1}),
 'omim': Counter({0: 2525, 1: 3678, 2: 158, 3: 17, 4: 14, 5: 5, 6: 1}),
 'orphanet': Counter({0: 3039,
          1: 2163,
          2: 211,
          3: 55,
          4: 20,
          5: 8,
          6: 9,
          7: 1,
          8: 5,
          9: 6,
          10: 4,
          11: 1,
          16: 1,
          18: 1,
          21: 2}),
 'umls_cui': Counter({0: 943, 1: 1551, 2: 29, 4: 1})}

In [8]:
# Same distribution with a very loose limit of 100 hops.
num_doids_in_sg(g, 100)

{'mesh': Counter({0: 751,
          1: 1583,
          2: 379,
          3: 215,
          4: 151,
          5: 54,
          6: 61,
          7: 77,
          8: 35,
          9: 42,
          10: 26,
          11: 81,
          12: 37,
          13: 32,
          14: 30,
          15: 18,
          16: 13,
          18: 11,
          21: 9,
          31: 6,
          33: 12,
          34: 25,
          65: 9,
          78: 36,
          81: 36}),
 'omim': Counter({0: 2203,
          1: 1605,
          2: 613,
          3: 317,
          4: 223,
          5: 155,
          6: 59,
          7: 187,
          8: 58,
          9: 46,
          10: 90,
          11: 233,
          12: 60,
          13: 231,
          14: 82,
          15: 16,
          16: 3,
          18: 1,
          21: 21,
          31: 7,
          33: 11,
          34: 6,
          65: 2,
          78: 115,
          81: 54}),
 'orphanet': Counter({0: 2937,
          1: 1151,
          2: 291,
          3: 199,
    

In [9]:
## Going from one hop to two hops barely changes the counts for mesh, omim and umls_cui, but it lets us link ~2k more orphanet IDs,
# because no source provides direct DOID <-> orphanet cross-references.

In [10]:
def get_equiv_doid(g, did, cutoff=2):
    """
    For a given ID, get the DOIDs it is equivalent to within `cutoff` hops.

    Parameters
    ----------
    g : graph
        ID cross-reference graph. It must support ``in`` and be accepted by
        ``nx.single_source_shortest_path_length``.
    did : str
        Prefixed ID to resolve, e.g. 'orphanet:98306'.
    cutoff : int, optional
        Maximum number of hops to follow. Defaults to 2, the distance this
        notebook found necessary to reach DOIDs from orphanet IDs.

    Returns
    -------
    list of str
        ``[did]`` if `did` is itself a DOID; an empty list if `did` is not a
        node of `g`; otherwise every node within `cutoff` hops whose ID starts
        with "doid:".
    """
    if did.startswith("doid:"):
        return [did]
    if did not in g:
        return []
    reachable = nx.single_source_shortest_path_length(g, did, cutoff=cutoff)
    return [node for node in reachable if node.startswith("doid:")]

# Example: resolve an orphanet ID to its DOID(s).
get_equiv_doid(g, 'orphanet:98306')

['doid:0050440']

In [11]:
# Example: resolve a umls_cui ID to its DOID(s).
get_equiv_doid(g, 'umls_cui:C1720859')

['doid:0050440']

In [12]:
## merge docs
# One client serves both the DO source collection and the merged target.
client = MongoClient()
mydisease = client.mydisease.mydisease
# Drop any previous result so the merged collection is rebuilt from scratch.
mydisease.drop()

# Seed the merged collection with one document per DO document, keeping the
# full DO record under the 'disease_ontology' key.
db = client.mydisease.DO
d = [{'_id': doc['_id'], 'disease_ontology': doc} for doc in db.find()]
mydisease.insert_many(d)

<pymongo.results.InsertManyResult at 0x7fd7dbe404c8>

In [13]:
# fill in from other sources
# Every non-DO source document is appended to each DOID document it maps to,
# under a list field named after its source collection.
# Iterating a list (not a set) keeps the processing order equal to db_names.
for db_name in tqdm([name for name in db_names if name != 'DO']):
    # Reuse the client created with the merged collection instead of opening
    # a new one for every source collection.
    db = client.mydisease[db_name]
    for doc in db.find():
        doids = get_equiv_doid(g, doc['_id'])
        for doid in doids:
            # upsert=True: a DOID reachable in the graph but without a document
            # yet gets a new one (it will have no 'disease_ontology' field).
            mydisease.update_one({'_id': doid}, {'$push': {db_name: doc}}, upsert=True)

100%|██████████| 6/6 [01:01<00:00, 10.29s/it]


In [14]:
# Spot-check the merged document for one DOID and list its top-level fields.
doc = mydisease.find_one('doid:0050440')
doc.keys()

dict_keys(['mesh', 'disease_ontology', 'disgenet', '_id', 'orphanet', 'ctdbase', 'omim'])

In [15]:
# The Disease Ontology record that seeded this merged document.
doc['disease_ontology']

{'_id': 'doid:0050440',
 'comment': 'Xref MGI.\\nOMIM mapping confirmed by DO. [SN].',
 'def': 'A lipodystrophy characterized by abnormal subcutaneous adipose tissue distribution beginning in late childhood or early adult life.',
 'is_a': ['doid:811'],
 'name': 'familial partial lipodystrophy',
 'subset': ['DO_MGI_slim'],
 'synonym': ['Dunnigan Syndrome', 'Koberling-Dunnigan Syndrome'],
 'xref': {'mesh': ['D052496'],
  'nci': ['C84708'],
  'omim': ['151660', '604367', '608600', '613877', '615238'],
  'snomedct_us_2016_03_01': ['49292002'],
  'umls_cui': ['C0271694', 'C1720859', 'C1720860', 'C1720861'],
  'url': ['http://en.wikipedia.org/wiki/Familial_partial_lipodystrophy',
   'http://omim.org/entry/608600']}}

In [16]:
# First two 'disgenet' records attached to this DOID document.
doc['disgenet'][:2]

[{'_id': 'umls_cui:C0271694',
  'genes': [{'#pmids': 1,
    '#snps': 0,
    'gene_id': 208,
    'gene_name': 'AKT2',
    'score': 0.12,
    'source': ['CTD_human']},
   {'#pmids': 82,
    '#snps': 7,
    'gene_id': 4000,
    'gene_name': 'LMNA',
    'score': 0.157432691085282,
    'source': ['CTD_human']},
   {'#pmids': 17,
    '#snps': 2,
    'gene_id': 5468,
    'gene_name': 'PPARG',
    'score': 0.12706742019575198,
    'source': ['CTD_human']},
   {'#pmids': 0,
    '#snps': 0,
    'gene_id': 5506,
    'gene_name': 'PPP1R3A',
    'score': 0.12,
    'source': ['CTD_human']},
   {'#pmids': 3,
    '#snps': 0,
    'gene_id': 857,
    'gene_name': 'CAV1',
    'score': 0.120542883744161,
    'source': ['CTD_human']}]},
 {'_id': 'umls_cui:C1720859',
  'genes': [{'#pmids': 1,
    '#snps': 0,
    'gene_id': 4000,
    'gene_name': 'LMNA',
    'score': 0.12027144187208001,
    'source': ['ORPHANET']}]}]

In [17]:
# For each merged source field, list just the _ids of the records that were attached.
{k:[x['_id'] for x in v] for k,v in doc.items() if k not in {'disease_ontology', '_id'}}

{'ctdbase': ['mesh:D052496'],
 'disgenet': ['umls_cui:C0271694',
  'umls_cui:C1720859',
  'umls_cui:C1720860',
  'umls_cui:C1720861'],
 'mesh': ['mesh:D052496'],
 'omim': ['omim:151660',
  'omim:604367',
  'omim:608600',
  'omim:613877',
  'omim:615238'],
 'orphanet': ['orphanet:79083',
  'orphanet:2348',
  'orphanet:98306',
  'orphanet:79084',
  'orphanet:280356',
  'orphanet:435651']}

In [18]:
# Gene names listed in each 'disgenet' record of this document, keyed by the record's _id.
[{x['_id']:[y['gene_name'] for y  in x['genes']]} for x in doc['disgenet']]

[{'umls_cui:C0271694': ['AKT2', 'LMNA', 'PPARG', 'PPP1R3A', 'CAV1']},
 {'umls_cui:C1720859': ['LMNA']},
 {'umls_cui:C1720860': ['LMNA']},
 {'umls_cui:C1720861': ['PPARG']}]

In [19]:
# Re-display the Disease Ontology record (same expression as in an earlier cell).
doc['disease_ontology']

{'_id': 'doid:0050440',
 'comment': 'Xref MGI.\\nOMIM mapping confirmed by DO. [SN].',
 'def': 'A lipodystrophy characterized by abnormal subcutaneous adipose tissue distribution beginning in late childhood or early adult life.',
 'is_a': ['doid:811'],
 'name': 'familial partial lipodystrophy',
 'subset': ['DO_MGI_slim'],
 'synonym': ['Dunnigan Syndrome', 'Koberling-Dunnigan Syndrome'],
 'xref': {'mesh': ['D052496'],
  'nci': ['C84708'],
  'omim': ['151660', '604367', '608600', '613877', '615238'],
  'snomedct_us_2016_03_01': ['49292002'],
  'umls_cui': ['C0271694', 'C1720859', 'C1720860', 'C1720861'],
  'url': ['http://en.wikipedia.org/wiki/Familial_partial_lipodystrophy',
   'http://omim.org/entry/608600']}}

In [20]:
# _id of the first 'ctdbase' record merged into this document.
doc['ctdbase'][0]['_id']

'mesh:D052496'

In [21]:
# Gene associations of each 'orphanet' record; records without that field yield an empty list.
[x.get('disease_gene_associations',[]) for x in doc['orphanet']]

[[{'dga_status': 'Assessed',
   'dga_type': 'Disease-causing germline mutation(s) in',
   'gene_name': 'peroxisome proliferator activated receptor gamma',
   'gene_symbol': 'PPARG',
   'gene_type': 'gene with protein product',
   'loci': ['3p25']}],
 [{'dga_status': 'Assessed',
   'dga_type': 'Disease-causing germline mutation(s) in',
   'gene_name': 'lamin A/C',
   'gene_symbol': 'LMNA',
   'gene_type': 'gene with protein product',
   'loci': ['1q22']}],
 [],
 [{'dga_status': 'Not yet assessed',
   'dga_type': 'Candidate gene tested in',
   'gene_name': 'lamin A/C',
   'gene_symbol': 'LMNA',
   'gene_type': 'gene with protein product',
   'loci': ['1q22']}],
 [{'dga_status': 'Assessed',
   'dga_type': 'Disease-causing germline mutation(s) in',
   'gene_name': 'perilipin 1',
   'gene_symbol': 'PLIN1',
   'gene_type': 'gene with protein product',
   'loci': ['15q26']}],
 [{'dga_status': 'Assessed',
   'dga_type': 'Disease-causing germline mutation(s) (loss of function) in',
   'gene_nam

In [22]:
# Full 'orphanet' records attached to this DOID document.
doc['orphanet']

[{'_id': 'orphanet:79083',
  'alternative_term': ['PPARG-related FPLD',
   'Familial partial lipodystrophy type 3',
   'FPLD3'],
  'ave_age_of_onset': ['Adult'],
  'disease_gene_associations': [{'dga_status': 'Assessed',
    'dga_type': 'Disease-causing germline mutation(s) in',
    'gene_name': 'peroxisome proliferator activated receptor gamma',
    'gene_symbol': 'PPARG',
    'gene_type': 'gene with protein product',
    'loci': ['3p25']}],
  'mapping': {'E': ['omim:604367'], 'NTBT': ['icd10cm:E88.1']},
  'parents': ['orphanet:377788'],
  'part_of': ['orphanet:98306'],
  'preferred_label': 'PPARG-related familial partial lipodystrophy',
  'prevalence': [{'mean_value': 10.0,
    'prevalence_geographic': 'Worldwide',
    'prevalence_qualification': 'Case',
    'prevalence_type': 'Cases/families',
    'prevalence_validation_status': 'Validated',
    'source': '16409151[PMID]'},
   {'mean_value': None,
    'prevalence_class': '<1 / 1 000 000',
    'prevalence_geographic': 'Worldwide',
  