# Testing environment for DoSE

## Setup

### Load libraries

In [1]:
import pandas as pd
import numpy as np 
import gseapy
from biothings_client import get_client

### Define data

In [2]:
seeds_file = "Input/0007079.txt"
betweenness_file = "Input/0007079_added_200_dmd_betweenness_hub_0.01.txt"
significance_file = "Input/0007079_added_200_dmd_significance_hub_1.txt"
diseases_file = "Input/ICD10_commROCG_raw.txt"
disease_clusters_file = "Input/ICD10_commROCG_cluster.txt"

### Load data

In [3]:
disease_id = "0007079"
seeds = pd.read_csv(seeds_file, sep="\t", header=None)[0]
betweenness = pd.read_csv(betweenness_file, sep="\t")['node']
significance = pd.read_csv(significance_file, sep="\t")['node']
diseases = pd.read_csv(diseases_file, sep="\t", header=None)
disease_clusters = pd.read_csv(disease_clusters_file, sep="\t", header=None)

In [4]:
import timeit
start = timeit.default_timer()

stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  4.699100099969655e-05


## Network from NeDReX

In [44]:
from urllib import request, parse
import json

In [45]:
url = 'https://api.nedrex.net/graph_builder'

In [46]:
myobj = {
    "nodes":["protein"],
    "edges":["protein_interacts_with_protein"],
    "iid_evidence":["exp"],
    "ppi_self_loops": True,
    "taxid":[9606],
    "concise": True,
    "include_omim": True,
    "disgenet_threshold": 0,
    "use_omim_ids": False,
}

In [47]:
data = json.dumps(myobj).encode('utf8')
req =  request.Request(url, data=data) # this will make the method "POST"
resp = request.urlopen(req)

In [48]:
print(resp.read().decode('utf8'))

"e1e15418-2c09-4c9e-8ad5-ed02aed4d5a8"


## Enriched gene values

In [4]:
import gseapy

In [9]:
ID_TYPE_KEY = {'entrez': 'entrezgene', 'ensembl': 'ensembl.gene', 'symbol': 'symbol', 'uniprot': 'uniprot.Swiss-Prot'}

In [51]:
ENRICH_KEY = {'GO_Molecular_Function_2015':'go.MF', 'GO_Biological_Process_2015':'go.BP', 'GO_Cellular_Component_2015':'go.CC', 'KEGG_2016':'pathway.kegg'}

In [5]:
seeds

0    P28223
1    P00325
2    P00326
3    P47869
Name: 0, dtype: object

In [8]:
gene_id_mapping = pd.read_csv("../mapping_files/gene_id_mapping.csv")
gene_id_mapping.head(5)

Unnamed: 0,entrezgene,symbol,uniprot.Swiss-Prot,ensembl.gene
0,1,A1BG,P04217,ENSG00000121410
1,2,A2M,P01023,ENSG00000175899
2,9,NAT1,P18440,ENSG00000171428
3,10,NAT2,P11245,ENSG00000156006
4,12,SERPINA3,P01011,ENSG00000196136


In [11]:
gene_id_mapping[gene_id_mapping[ID_TYPE_KEY['uniprot']].isin(seeds)]['symbol']

73       ADH1B
74       ADH1C
1502    GABRA2
1924     HTR2A
Name: symbol, dtype: object

In [30]:
enrichr_df_target = gseapy.enrichr(gene_list=list(gene_id_mapping[gene_id_mapping[ID_TYPE_KEY['uniprot']].isin(seeds)]['symbol']),
                                               description='atts',
                                               gene_sets=list(ENRICH_KEY.keys()),
                                               cutoff=0.05).results
enrichr_df_target = enrichr_df_target[enrichr_df_target['Adjusted P-value'] < 0.05]
enrichr_df_target



Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,GO_Molecular_Function_2015,alcohol dehydrogenase (NAD) activity (GO:0004022),2/8,8.396276e-07,0.000022,0,0,3331.666667,46611.040785,ADH1C;ADH1B
1,GO_Molecular_Function_2015,"oxidoreductase activity, acting on the CH-OH g...",2/115,1.951745e-04,0.002195,0,0,175.955752,1502.946605,ADH1C;ADH1B
2,GO_Molecular_Function_2015,"oxidoreductase activity, acting on CH-OH group...",2/131,2.532625e-04,0.002195,0,0,154.007752,1275.351143,ADH1C;ADH1B
3,GO_Molecular_Function_2015,serotonin binding (GO:0051378),1/8,1.599128e-03,0.009354,0,0,951.857143,6128.338527,HTR2A
4,GO_Molecular_Function_2015,amine binding (GO:0043176),1/9,1.798886e-03,0.009354,0,0,832.833333,5263.996189,HTR2A
...,...,...,...,...,...,...,...,...,...,...
189,KEGG_2016,Inflammatory mediator regulation of TRP channe...,1/98,1.945771e-02,0.023521,0,0,68.381443,269.389509,HTR2A
190,KEGG_2016,Retrograde endocannabinoid signaling Homo sapi...,1/101,2.004884e-02,0.023521,0,0,66.320000,259.283608,GABRA2
191,KEGG_2016,Metabolic pathways Homo sapiens hsa01100,2/1239,2.115550e-02,0.023521,0,0,15.164915,58.473719,ADH1C;ADH1B
192,KEGG_2016,Serotonergic synapse Homo sapiens hsa04726,1/112,2.221403e-02,0.023521,0,0,59.714715,227.335771,HTR2A


In [31]:
enrichr_df_target[enrichr_df_target["Gene_set"]=="KEGG_2016"]["Term"]

176            Tyrosine metabolism Homo sapiens hsa00350
177         Fatty acid degradation Homo sapiens hsa00071
178             Retinol metabolism Homo sapiens hsa00830
179    Glycolysis / Gluconeogenesis Homo sapiens hsa0...
180    Drug metabolism - cytochrome P450 Homo sapiens...
181    Metabolism of xenobiotics by cytochrome P450 H...
182        Chemical carcinogenesis Homo sapiens hsa05204
183    Neuroactive ligand-receptor interaction Homo s...
184             Nicotine addiction Homo sapiens hsa05033
185             Taste transduction Homo sapiens hsa04742
186              GABAergic synapse Homo sapiens hsa04727
187                   Gap junction Homo sapiens hsa04540
188             Morphine addiction Homo sapiens hsa05032
189    Inflammatory mediator regulation of TRP channe...
190    Retrograde endocannabinoid signaling Homo sapi...
191             Metabolic pathways Homo sapiens hsa01100
192           Serotonergic synapse Homo sapiens hsa04726
193      Calcium signaling path

In [37]:
if len(enrichr_df_target) > 0:
        enrichr_df_target.insert(2, 'Term_ID', enrichr_df_target['Term'].str.extract(r'(GO:[0-9]*|hsa[0-9]*)')[0])

In [50]:
pivot = enrichr_df_target[['Gene_set','Term_ID']].pivot(columns='Gene_set')
pivot['Term_ID']

Gene_set,GO_Biological_Process_2015,GO_Molecular_Function_2015,KEGG_2016
0,,GO:0004022,
1,,GO:0016616,
2,,GO:0016614,
3,,GO:0051378,
4,,GO:0043176,
...,...,...,...
189,,,hsa04750
190,,,hsa04723
191,,,hsa01100
192,,,hsa04726


In [49]:
set(pivot['Term_ID']['KEGG_2016'].dropna())

{'hsa00010',
 'hsa00071',
 'hsa00350',
 'hsa00830',
 'hsa00980',
 'hsa00982',
 'hsa01100',
 'hsa04020',
 'hsa04080',
 'hsa04540',
 'hsa04723',
 'hsa04726',
 'hsa04727',
 'hsa04742',
 'hsa04750',
 'hsa05032',
 'hsa05033',
 'hsa05204'}

In [54]:
dicto = dict()
for key in ENRICH_KEY.keys():
    if key in pivot['Term_ID']:
        dicto[ENRICH_KEY[key]] = set(pivot['Term_ID'][key].dropna())
    else:
        dicto[ENRICH_KEY[key]] = set()
dicto

{'go.MF': {'GO:0001965',
  'GO:0004022',
  'GO:0004890',
  'GO:0004993',
  'GO:0005230',
  'GO:0005253',
  'GO:0005254',
  'GO:0008144',
  'GO:0008227',
  'GO:0008270',
  'GO:0015103',
  'GO:0015108',
  'GO:0015276',
  'GO:0016614',
  'GO:0016616',
  'GO:0016917',
  'GO:0022834',
  'GO:0030594',
  'GO:0043176',
  'GO:0051378'},
 'go.BP': {'GO:0001505',
  'GO:0001659',
  'GO:0003013',
  'GO:0003018',
  'GO:0006066',
  'GO:0006067',
  'GO:0006069',
  'GO:0006109',
  'GO:0006110',
  'GO:0006805',
  'GO:0006821',
  'GO:0006836',
  'GO:0006939',
  'GO:0007200',
  'GO:0007202',
  'GO:0007204',
  'GO:0007210',
  'GO:0007214',
  'GO:0007268',
  'GO:0007613',
  'GO:0009266',
  'GO:0010517',
  'GO:0010518',
  'GO:0010675',
  'GO:0010676',
  'GO:0010863',
  'GO:0014059',
  'GO:0014065',
  'GO:0014820',
  'GO:0014824',
  'GO:0014829',
  'GO:0014848',
  'GO:0015698',
  'GO:0016048',
  'GO:0019229',
  'GO:0030431',
  'GO:0030534',
  'GO:0032845',
  'GO:0034308',
  'GO:0035150',
  'GO:0042220',
  'GO

In [None]:
import timeit
start = timeit.default_timer()

stop = timeit.default_timer()
print('Time: ', stop - start)  

In [3]:
disease_mapping = pd.read_csv('https://www.disgenet.org/static/disgenet_ap1/files/downloads/disease_mappings.tsv.gz',compression='gzip', sep='\t')
disease_mapping

Unnamed: 0,diseaseId,name,vocabulary,code,vocabularyName
0,C0018923,Hemangiosarcoma,DO,0001816,angiosarcoma
1,C0854893,Angiosarcoma non-metastatic,DO,0001816,angiosarcoma
2,C0033999,Pterygium,DO,0002116,pterygium
3,C0025517,Metabolic Diseases,DO,0014667,disease of metabolism
4,C0155862,Streptococcal pneumonia,DO,0040084,Streptococcus pneumonia
...,...,...,...,...,...
242884,C0279628,Adenocarcinoma Of Esophagus,ORDO,99976,Adenocarcinoma of the esophagus
242885,C0279626,Squamous cell carcinoma of esophagus,ORDO,99977,Squamous cell carcinoma of the esophagus
242886,C0206702,Klatskin Tumor,ORDO,99978,Klatskin tumor
242887,C0007462,Causalgia,ORDO,99994,Complex regional pain syndrome type 2


In [4]:
disease_var_mapping = pd.read_csv('https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_variant_disease_associations.tsv.gz', compression='gzip', sep='\t', dtype=str)
disease_gene_mapping = pd.read_csv('https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz', compression='gzip', sep='\t', dtype=str)
disease_gene_mapping

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,1,A1BG,0.7,0.538,C0001418,Adenocarcinoma,group,C04,Neoplastic Process,0.01,1,2008,2008,1,0,LHGDN
1,1,A1BG,0.7,0.538,C0002736,Amyotrophic Lateral Sclerosis,disease,C18;C10,Disease or Syndrome,0.01,1,2008,2008,1,0,BEFREE
2,1,A1BG,0.7,0.538,C0003578,Apnea,phenotype,C23;C08,Sign or Symptom,0.01,1,2017,2017,1,0,BEFREE
3,1,A1BG,0.7,0.538,C0003864,Arthritis,disease,C05,Disease or Syndrome,0.01,1,2019,2019,1,0,BEFREE
4,1,A1BG,0.7,0.538,C0008373,Cholesteatoma,disease,C17,Disease or Syndrome,0.01,1,2020,2020,1,0,BEFREE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1134937,115804232,CEROX1,,,C0005890,Body Height,phenotype,,Organism Attribute,0.10,1,2019,2019,1,0,GWASCAT
1134938,115891964,MIR223HG,0.861,0.077,C0023418,leukemia,disease,C04,Neoplastic Process,0.01,1,2016,2016,1,0,BEFREE
1134939,115891964,MIR223HG,0.861,0.077,C0023467,"Leukemia, Myelocytic, Acute",disease,C04,Neoplastic Process,0.01,1,2016,2016,1,0,BEFREE
1134940,115891964,MIR223HG,0.861,0.077,C0598766,Leukemogenesis,disease,C23;C04,Neoplastic Process,0.01,1,2016,2016,1,0,BEFREE


In [5]:
mondo_mapping = disease_mapping[disease_mapping['vocabulary']=='MONDO']
mondo_mapping

Unnamed: 0,diseaseId,name,vocabulary,code,vocabularyName
56511,C0022661,"Kidney Failure, Chronic",MONDO,0000001,disease or disorder
56512,C0405580,Adrenal cortical hypofunction,MONDO,0000004,adrenocortical insufficiency
56513,C0001623,Adrenal gland hypofunction,MONDO,0000004,adrenocortical insufficiency
56514,C1859877,Alopecia universalis congenita,MONDO,0000005,"alopecia, isolated"
56515,C0005129,Bernard-Soulier Syndrome,MONDO,0000009,"inherited bleeding disorder, platelet-type"
...,...,...,...,...,...
75018,C0002792,anaphylaxis,MONDO,0100053,anaphylaxis
75019,C0413235,Idiopathic anaphylaxis,MONDO,0100054,idiopathic anaphylaxis
75020,C0851578,Sleep Disorders,MONDO,0100081,sleep disorder
75021,C0175704,LEOPARD Syndrome,MONDO,0100082,LEOPARD syndrome 1


In [6]:
len(mondo_mapping['diseaseId'].unique())

13291

In [7]:
var_mapping = pd.merge(mondo_mapping[['diseaseId','code']], disease_var_mapping[['diseaseId','snpId']], on="diseaseId", how="left")
var_mapping = var_mapping.rename(columns={'code':'mondo', 'snpId':'variant'})
var_mapping = var_mapping[['mondo','variant']].fillna('').groupby(['mondo'], as_index=False).agg(combine_rows)
var_mapping

Unnamed: 0,mondo,variant
0,0000001,"{rs149454410, rs686548, rs112407915, rs1145077..."
1,0000004,"{rs104894897, rs6161, rs1564421528, rs14543280..."
2,0000005,"{rs773764015, rs7014851, rs121434451, rs121434..."
3,0000009,"{rs121908065, rs267606849, rs28933377, rs12190..."
4,0000022,{rs6313}
...,...,...
12349,0100039,"{rs121918792, rs121918622, rs727504136, rs7960..."
12350,0100053,"{rs121913507, rs121913682, rs699, rs1267969615}"
12351,0100054,"{rs121913507, rs121913682}"
12352,0100081,"{rs1481318368, rs104893877, rs1044396, rs20325..."


In [8]:
gene_mapping = pd.merge(mondo_mapping[['diseaseId','code']], disease_gene_mapping[['diseaseId','geneId']], on="diseaseId", how="left")
gene_mapping = gene_mapping.rename(columns={'code':'mondo', 'geneId':'entrezgene'})
gene_mapping = gene_mapping[['mondo','entrezgene']].fillna('').groupby(['mondo'], as_index=False).agg(combine_rows)
gene_mapping

Unnamed: 0,mondo,entrezgene
0,0000001,"{ 2717, 147, 4179, 55349, ..."
1,0000004,"{ 2908, 338433, 6770, 55703, ..."
2,0000005,{ 55806}
3,0000009,"{ 8013, 80739, 2335, 1950, ..."
4,0000022,"{ 278, 51540, 54796, 1326, ..."
...,...,...
12349,0100039,"{ 57468, 112476, 2563, 3785, ..."
12350,0100053,"{ 6007, 1773, 196, 27349, ..."
12351,0100054,{ 3815}
12352,0100081,"{ 148789, 10482, 7167, 7068, ..."


In [2]:
def combine_rows(x):
    return set(filter(None, ';'.join(x).split(';')))

In [21]:
temp = pd.merge(var_mapping[['mondo','variant']], gene_mapping[['mondo','entrezgene']], on="mondo", how="outer")
temp

Unnamed: 0,mondo,variant,entrezgene
0,0000001,"{rs149454410, rs686548, rs112407915, rs1145077...","{ 2717, 147, 4179, 55349, ..."
1,0000004,"{rs104894897, rs6161, rs1564421528, rs14543280...","{ 2908, 338433, 6770, 55703, ..."
2,0000005,"{rs773764015, rs7014851, rs121434451, rs121434...",{ 55806}
3,0000009,"{rs121908065, rs267606849, rs28933377, rs12190...","{ 8013, 80739, 2335, 1950, ..."
4,0000022,{rs6313},"{ 278, 51540, 54796, 1326, ..."
...,...,...,...
12349,0100039,"{rs121918792, rs121918622, rs727504136, rs7960...","{ 57468, 112476, 2563, 3785, ..."
12350,0100053,"{rs121913507, rs121913682, rs699, rs1267969615}","{ 6007, 1773, 196, 27349, ..."
12351,0100054,"{rs121913507, rs121913682}",{ 3815}
12352,0100081,"{rs1481318368, rs104893877, rs1044396, rs20325...","{ 148789, 10482, 7167, 7068, ..."


In [22]:
omim_to_hsa = pd.read_csv('http://rest.genome.jp/link/omim/hsa', names=['hsa','omim','dir'], sep="\t", dtype=str)
omim_to_hsa

Unnamed: 0,hsa,omim,dir
0,hsa:1,omim:138670,equivalent
1,hsa:10,omim:243400,reverse
2,hsa:10,omim:612182,equivalent
3,hsa:100,omim:102700,reverse
4,hsa:100,omim:608958,equivalent
...,...,...,...
22935,hsa:9993,omim:600594,equivalent
22936,hsa:9994,omim:606880,equivalent
22937,hsa:9997,omim:604272,equivalent
22938,hsa:9997,omim:604377,reverse


In [23]:
hsa_to_pathway = pd.read_csv('http://rest.kegg.jp/link/pathway/hsa', names=['hsa','pathway'], sep="\t", dtype=str)
hsa_to_pathway

Unnamed: 0,hsa,pathway
0,hsa:10327,path:hsa00010
1,hsa:124,path:hsa00010
2,hsa:125,path:hsa00010
3,hsa:126,path:hsa00010
4,hsa:127,path:hsa00010
...,...,...
35372,hsa:91860,path:hsa05418
35373,hsa:92,path:hsa05418
35374,hsa:93,path:hsa05418
35375,hsa:9446,path:hsa05418


In [24]:
omim_to_pathway = pd.merge(omim_to_hsa[['hsa','omim']], hsa_to_pathway[['hsa','pathway']], on="hsa", how="inner")[['omim','pathway']]
omim_to_pathway.omim = omim_to_pathway.omim.str.replace('omim:','')
omim_to_pathway.pathway = omim_to_pathway.pathway.str.replace('path:','')
omim_to_pathway

Unnamed: 0,omim,pathway
0,243400,hsa00232
1,243400,hsa00983
2,243400,hsa01100
3,243400,hsa05204
4,612182,hsa00232
...,...,...
66037,611493,hsa04971
66038,613693,hsa04971
66039,604272,hsa05230
66040,604377,hsa05230


In [25]:
disease_ids = pd.read_csv('../mapping_files/disease_id_mapping.csv', dtype=str)
disease_ids

Unnamed: 0,mondo,omim,snomedct,umls,orpha,mesh,doid,ICD-10
0,0002974,603956,363354003,,,,4362,"C53,C53.9"
1,0000311,,,,,,,
2,0001642,,1489008,C0019919,,,13134,"H00,H00.01,H00.03"
3,0000310,,,,,,0050308,
4,0001641,,,,,,13129,
...,...,...,...,...,...,...,...,...
24115,0019900,,,,96160,,,"Q93,Q93.5"
24116,0019902,,766716004,,96168,,,"Q93,Q93.5"
24117,0019901,,,,96164,,,"Q93,Q93.5"
24118,0007928,,,,,,,


In [26]:
omim_to_pathway = pd.merge(disease_ids[['mondo','omim']], omim_to_pathway[['omim','pathway']], on="omim", how="inner")[['mondo','pathway']]
omim_to_pathway

Unnamed: 0,mondo,pathway
0,0002974,hsa01521
1,0002974,hsa04010
2,0002974,hsa04014
3,0002974,hsa04015
4,0002974,hsa04020
...,...,...
30067,0007932,hsa02010
30068,0007930,hsa04512
30069,0007930,hsa04611
30070,0007930,hsa04613


In [27]:
omim_to_pathway = omim_to_pathway[['mondo','pathway']].fillna('').groupby(['mondo'], as_index=False).agg(combine_rows)
omim_to_pathway

Unnamed: 0,mondo,pathway
0,0000908,"{hsa05412, hsa04390, hsa04670, hsa05226, hsa05..."
1,0000909,{hsa04966}
2,0000914,"{hsa05165, hsa05206, hsa01522, hsa05224, hsa05..."
3,0001056,"{hsa04390, hsa04930, hsa04934, hsa04152, hsa04..."
4,0001187,"{hsa04934, hsa05200, hsa04919, hsa04914, hsa04..."
...,...,...
3565,0060764,"{hsa05165, hsa04390, hsa05010, hsa05206, hsa04..."
3566,0100082,"{hsa04920, hsa05220, hsa04670, hsa05211, hsa04..."
3567,0100083,"{hsa05221, hsa04659, hsa05200, hsa05202, hsa04..."
3568,0100104,"{hsa03013, hsa05014}"


In [28]:
print(len(set(omim_to_pathway.mondo)))
print(len(set(temp.mondo)))
print(len(set(temp.mondo).intersection(set(omim_to_pathway.mondo))))

3570
12354
3279


In [29]:
old = pd.read_csv('../mapping_files/disease_att_mapping.csv', dtype=str)
old

Unnamed: 0,mondo,disgenet.genes_related_to_disease,ctd.pathway_related_to_disease,disgenet.variants_related_to_disease
0,0016135,,,
1,0019449,,,
2,0044014,5536;5538;7173;140805;3111;6863;5617;1493;629;...,,rs231775
3,0022733,,,
4,0010674,3423.0,hsa_M00076;hsa_M00078;hsa01100;hsa00531;hsa041...,
...,...,...,...,...
24662,0014805,7874.0,hsa04068;hsa05169;hsa05203,
24663,0032894,,,
24664,0002087,3586;3845;3082;1429;1048;284;672;675;5670;5529...,,rs28897672
24665,0019373,2308;2313;1674;2314;10763;3600;3480;100124696;...,,


In [30]:
old = old[['mondo','ctd.pathway_related_to_disease']].dropna()

In [31]:
print(len(set(old.mondo).intersection(set(omim_to_pathway.mondo))))

3563


In [32]:
len(old)

5317

In [42]:
new_set = set(old.mondo) - set(old.mondo).intersection(set(omim_to_pathway.mondo))

In [43]:
new_set

{'0010091',
 '0016239',
 '0005828',
 '0021697',
 '0001150',
 '0017775',
 '0005440',
 '0004993',
 '0006774',
 '0015517',
 '0005066',
 '0019050',
 '0001347',
 '0002269',
 '0007179',
 '0015280',
 '0006541',
 '0001741',
 '0043472',
 '0002245',
 '0005477',
 '0002909',
 '0011827',
 '0043523',
 '0004849',
 '0005837',
 '0005885',
 '0005345',
 '0021085',
 '0002917',
 '0006564',
 '0018105',
 '0018829',
 '0012589',
 '0006714',
 '0006115',
 '0009832',
 '0018479',
 '0003947',
 '0005218',
 '0043243',
 '0006690',
 '0005314',
 '0003329',
 '0006663',
 '0005091',
 '0006939',
 '0016390',
 '0009441',
 '0003240',
 '0010787',
 '0005357',
 '0015545',
 '0000728',
 '0006642',
 '0002520',
 '0007778',
 '0008429',
 '0001085',
 '0002036',
 '0005517',
 '0001076',
 '0016466',
 '0016642',
 '0024575',
 '0002123',
 '0004922',
 '0018838',
 '0002492',
 '0016013',
 '0011669',
 '0005377',
 '0001476',
 '0005335',
 '0006715',
 '0002363',
 '0002869',
 '0005649',
 '0016296',
 '0019297',
 '0003225',
 '0007034',
 '0006716',
 '00

In [44]:
old[old['mondo'].isin(new_set)]

Unnamed: 0,mondo,ctd.pathway_related_to_disease
55,0005889,hsa05160;hsa04014;hsa05162;hsa04071;hsa04072;h...
58,0005002,hsa05211;hsa05160;hsa04720;hsa04917;hsa04330;h...
69,0018301,hsa04623;hsa05164;hsa04060;hsa04668;hsa04622;h...
74,0005324,hsa04970
97,0005109,hsa05211;hsa05160;hsa04932;hsa_M00351;hsa04514...
...,...,...
24550,0021100,hsa05211;hsa05160;hsa04720;hsa04917;hsa04330;h...
24564,0008627,hsa04659
24581,0004750,hsa05030;hsa04015;hsa05034;hsa04024;hsa04540;h...
24606,0009637,hsa05160;hsa05144;hsa04932;hsa05162;hsa05166;h...


In [35]:
mondo_set = list(set('MONDO:' + temp['mondo']))

In [36]:
md = get_client("disease")
mapping = md.getdiseases(mondo_set, fields='ctd.pathway_related_to_disease.kegg_pathway_id',
                         species='human', returnall=False, as_dataframe=True, df_index=False)
mapping

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-12354...done.


Unnamed: 0,query,_id,_version,ctd.pathway_related_to_disease,ctd.pathway_related_to_disease.kegg_pathway_id,notfound
0,MONDO:0014018,MONDO:0014018,1.0,,,
1,MONDO:0002654,MONDO:0002654,1.0,"[{'kegg_pathway_id': 'hsa04080'}, {'kegg_pathw...",,
2,MONDO:0004891,MONDO:0004891,1.0,,,
3,MONDO:0002033,MONDO:0002033,1.0,,,
4,MONDO:0006674,MONDO:0006674,1.0,,,
...,...,...,...,...,...,...
12349,MONDO:0000173,MONDO:0000173,1.0,,,
12350,MONDO:0020358,MONDO:0020358,1.0,,,
12351,MONDO:0001582,MONDO:0001582,1.0,,,
12352,MONDO:0014839,MONDO:0014839,1.0,,,


In [37]:
test = mapping[['query','ctd.pathway_related_to_disease']].dropna()
test

Unnamed: 0,query,ctd.pathway_related_to_disease
1,MONDO:0002654,"[{'kegg_pathway_id': 'hsa04080'}, {'kegg_pathw..."
9,MONDO:0009514,[{'kegg_pathway_id': 'hsa00564'}]
11,MONDO:0024327,"[{'kegg_pathway_id': 'hsa00260'}, {'kegg_pathw..."
12,MONDO:0005106,"[{'kegg_pathway_id': 'hsa00562'}, {'kegg_pathw..."
13,MONDO:0012240,"[{'kegg_pathway_id': 'hsa04260'}, {'kegg_pathw..."
...,...,...
12331,MONDO:0008759,"[{'kegg_pathway_id': 'hsa00020'}, {'kegg_pathw..."
12334,MONDO:0009615,"[{'kegg_pathway_id': 'hsa00280'}, {'kegg_pathw..."
12339,MONDO:0017439,"[{'kegg_pathway_id': 'hsa04150'}, {'kegg_pathw..."
12343,MONDO:0012276,"[{'kegg_pathway_id': 'hsa04022'}, {'kegg_pathw..."


In [38]:
test2 = mapping[['query','ctd.pathway_related_to_disease.kegg_pathway_id']].dropna()
test2

Unnamed: 0,query,ctd.pathway_related_to_disease.kegg_pathway_id
124,MONDO:0009650,hsa04142
133,MONDO:0011915,hsa04392
154,MONDO:0012420,hsa04530
185,MONDO:0013188,hsa00910
187,MONDO:0013453,hsa04390
...,...,...
11913,MONDO:0012471,hsa03030
11945,MONDO:0044318,hsa04115
11952,MONDO:0012025,hsa05202
12021,MONDO:0012448,hsa04144


In [39]:
test["query"] = test["query"].str.replace('MONDO:', '')
test

Unnamed: 0,query,ctd.pathway_related_to_disease
1,0002654,"[{'kegg_pathway_id': 'hsa04080'}, {'kegg_pathw..."
9,0009514,[{'kegg_pathway_id': 'hsa00564'}]
11,0024327,"[{'kegg_pathway_id': 'hsa00260'}, {'kegg_pathw..."
12,0005106,"[{'kegg_pathway_id': 'hsa00562'}, {'kegg_pathw..."
13,0012240,"[{'kegg_pathway_id': 'hsa04260'}, {'kegg_pathw..."
...,...,...
12331,0008759,"[{'kegg_pathway_id': 'hsa00020'}, {'kegg_pathw..."
12334,0009615,"[{'kegg_pathway_id': 'hsa00280'}, {'kegg_pathw..."
12339,0017439,"[{'kegg_pathway_id': 'hsa04150'}, {'kegg_pathw..."
12343,0012276,"[{'kegg_pathway_id': 'hsa04022'}, {'kegg_pathw..."


In [52]:
omim_to_pathway[omim_to_pathway['mondo'].isin(set(test["query"]))]

Unnamed: 0,mondo,pathway
0,0000908,"{hsa05412, hsa04390, hsa04670, hsa05226, hsa05..."
2,0000914,"{hsa05165, hsa05206, hsa01522, hsa05224, hsa05..."
5,0002629,"{hsa05206, hsa04218, hsa04934, hsa05200, hsa05..."
9,0005298,"{hsa05010, hsa04974, hsa04928, hsa04611, hsa05..."
10,0006277,"{hsa04151, hsa05165, hsa05231, hsa04152, hsa04..."
...,...,...
3461,0044320,"{hsa04930, hsa00052, hsa04066, hsa00500, hsa00..."
3464,0044339,"{hsa04151, hsa05165, hsa04145, hsa04974, hsa04..."
3471,0054549,{hsa04146}
3488,0054698,{hsa03050}


In [53]:
test[test['query'].isin(set(omim_to_pathway["mondo"]))]

Unnamed: 0,query,ctd.pathway_related_to_disease
9,0009514,[{'kegg_pathway_id': 'hsa00564'}]
13,0012240,"[{'kegg_pathway_id': 'hsa04260'}, {'kegg_pathw..."
53,0009061,"[{'kegg_pathway_id': 'hsa02010'}, {'kegg_pathw..."
60,0012392,"[{'kegg_pathway_id': 'hsa00071'}, {'kegg_pathw..."
61,0013910,[{'kegg_pathway_id': 'hsa04080'}]
...,...,...
12324,0009218,"[{'kegg_pathway_id': 'hsa00600'}, {'kegg_pathw..."
12331,0008759,"[{'kegg_pathway_id': 'hsa00020'}, {'kegg_pathw..."
12334,0009615,"[{'kegg_pathway_id': 'hsa00280'}, {'kegg_pathw..."
12343,0012276,"[{'kegg_pathway_id': 'hsa04022'}, {'kegg_pathw..."


In [76]:
pd.set_option('display.max_colwidth', None)
test[test['query']=='0044320']['ctd.pathway_related_to_disease']

3078    [{'kegg_pathway_id': 'hsa00010'}, {'kegg_pathway_id': 'hsa00051'}, {'kegg_pathway_id': 'hsa00052'}, {'kegg_pathway_id': 'hsa00500'}, {'kegg_pathway_id': 'hsa00520'}, {'kegg_pathway_id': 'hsa00524'}, {'kegg_pathway_id': 'hsa01100'}, {'kegg_pathway_id': 'hsa01200'}, {'kegg_pathway_id': 'hsa04066'}, {'kegg_pathway_id': 'hsa04910'}, {'kegg_pathway_id': 'hsa04930'}, {'kegg_pathway_id': 'hsa04973'}, {'kegg_pathway_id': 'hsa05230'}, {'kegg_pathway_id': 'hsa_M00001'}, {'kegg_pathway_id': 'hsa_M00549'}]
Name: ctd.pathway_related_to_disease, dtype: object

In [75]:
omim_to_pathway[omim_to_pathway['mondo']=='0044320']

Unnamed: 0,mondo,pathway
3461,44320,"{hsa04930, hsa00052, hsa04066, hsa00500, hsa00520, hsa04973, hsa05131, hsa05230, hsa01100, hsa01250, hsa04910, hsa00051, hsa00524, hsa00010, hsa01200}"
