# Testing environment for DoSE

## Setup

### Load libraries

In [1]:
import pandas as pd
import numpy as np 
import gseapy
from biothings_client import get_client

### Define data

In [2]:
seeds_file = "Input/0007079.txt"
betweenness_file = "Input/0007079_added_200_dmd_betweenness_hub_0.01.txt"
significance_file = "Input/0007079_added_200_dmd_significance_hub_1.txt"
diseases_file = "Input/ICD10_commROCG_raw.txt"
disease_clusters_file = "Input/ICD10_commROCG_cluster.txt"

### Load data

In [3]:
disease_id = "0007079"
seeds = pd.read_csv(seeds_file, sep="\t", header=None)[0]
betweenness = pd.read_csv(betweenness_file, sep="\t")['node']
significance = pd.read_csv(significance_file, sep="\t")['node']
diseases = pd.read_csv(diseases_file, sep="\t", header=None)
disease_clusters = pd.read_csv(disease_clusters_file, sep="\t", header=None)

In [4]:
import timeit
start = timeit.default_timer()

stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  4.699100099969655e-05


## Network from NeDReX

In [44]:
from urllib import request, parse
import json

In [45]:
url = 'https://api.nedrex.net/graph_builder'

In [46]:
myobj = {
    "nodes":["protein"],
    "edges":["protein_interacts_with_protein"],
    "iid_evidence":["exp"],
    "ppi_self_loops": True,
    "taxid":[9606],
    "concise": True,
    "include_omim": True,
    "disgenet_threshold": 0,
    "use_omim_ids": False,
}

In [47]:
data = json.dumps(myobj).encode('utf8')
req =  request.Request(url, data=data) # this will make the method "POST"
resp = request.urlopen(req)

In [48]:
print(resp.read().decode('utf8'))

"e1e15418-2c09-4c9e-8ad5-ed02aed4d5a8"


## TEST

In [1]:
FILES_DIR = "../mapping_files/"

In [2]:
file_names = {'gene_ids': FILES_DIR + 'gene_id_mapping.csv',
                  'gene_atts': FILES_DIR + 'gene_att_mapping.csv',
                  'disorder_ids': FILES_DIR + 'disease_id_mapping.csv',
                  'disorder_atts': FILES_DIR + 'disease_att_mapping.csv',
                  'go_BP': FILES_DIR + 'gene_dist_go_BP.csv',
                  'go_CC': FILES_DIR + 'gene_dist_go_CC.csv',
                  'go_MF': FILES_DIR + 'gene_dist_go_MF.csv',
                  'pathway_kegg': FILES_DIR + 'gene_dist_pathway_kegg.csv',
                  'related_genes': FILES_DIR + 'disease_dist_rel_genes.csv',
                  'related_variants': FILES_DIR + 'disease_dist_rel_variants.csv',
                  'related_pathways': FILES_DIR + 'disease_dist_rel_pathways.csv'}

In [None]:
def _load_file_distances(self, file_readers: list, sep: str, mapping_name: str):
    header=


with open(self.file_names['go_BP']) as f1, open(self.file_names['go_CC']) as f2, \
                    open(self.file_names['go_MF']) as f3, open(self.file_names['pathway_kegg']) as f4:
    

In [6]:
gene_ids = pd.read_csv(file_names['gene_ids'], dtype=str)
gene_ids

Unnamed: 0.1,Unnamed: 0,entrezgene,symbol,uniprot.Swiss-Prot,ensembl.gene
0,0,1,A1BG,P04217,ENSG00000121410
1,1,2,A2M,P01023,ENSG00000175899
2,3,9,NAT1,P18440,ENSG00000171428
3,4,10,NAT2,P11245,ENSG00000156006
4,6,12,SERPINA3,P01011,ENSG00000196136
...,...,...,...,...,...
15986,40547,101928917,HSFX3,A0A1B0GWH4,ENSG00000231473
15987,42116,102724560,LOC102724560,P35520,ENSG00000277983
15988,42327,102725035,LOC102725035,O75022,ENSG00000244184
15989,44589,105372280,GNG14,A0A1W2PPG7,ENSG00000275863


In [11]:
gene_ids[gene_ids['entrezgene']=='12']['symbol'].iloc[0]

'SERPINA3'

In [1]:
import graph_tool as gt

In [2]:
old_network = gt.load_graph("../entrez-go_BP-network.graphml")
new_network = gt.load_graph("../networks/entrez-go_BP-network.graphml")

In [6]:
print(len(old_network.get_vertices()))
print(len(new_network.get_vertices()))

14408
14408


In [7]:
print(len(old_network.get_edges()))
print(len(new_network.get_edges()))

5176518
5176518


In [17]:
old_network.vp.ID[old_network.vertex(9)]  

'10233'

In [18]:
new_network.vp.ID[new_network.vertex(9)]  

'10233'

In [22]:
for e in old_network.iter_edges():
    print(e)
    print(old_network.ep.weight(e))

[0, 1]


TypeError: 'EdgePropertyMap' object is not callable

In [20]:
for e in new_network.iter_edges():
    print(e)
    print(new_network.ep.weight(e))

[0, 1]
[0, 2]
[0, 3]
[0, 4]
[0, 5]
[0, 6]
[0, 7]
[0, 8]
[0, 9]
[0, 10]
[0, 11]
[0, 12]
[0, 13]
[0, 14]
[0, 15]
[0, 16]
[0, 17]
[0, 18]
[0, 19]
[0, 20]
[0, 21]
[0, 22]
[0, 23]
[0, 24]
[0, 25]
[0, 26]
[0, 27]
[0, 28]
[0, 29]
[0, 30]
[0, 31]
[0, 32]
[0, 33]
[0, 34]
[0, 35]
[0, 36]
[0, 37]
[0, 38]
[0, 39]
[0, 40]
[0, 41]
[0, 42]
[0, 43]
[0, 44]
[0, 45]
[0, 46]
[0, 47]
[0, 48]
[0, 49]
[0, 50]
[0, 51]
[0, 52]
[0, 53]
[0, 54]
[0, 55]
[0, 56]
[0, 57]
[0, 58]
[0, 59]
[0, 60]
[0, 61]
[0, 62]
[0, 63]
[0, 64]
[0, 65]
[0, 66]
[0, 67]
[0, 68]
[0, 69]
[0, 70]
[0, 71]
[0, 72]
[0, 73]
[0, 74]
[0, 75]
[0, 76]
[0, 77]
[0, 78]
[0, 79]
[0, 80]
[0, 81]
[0, 82]
[0, 83]
[0, 84]
[0, 85]
[0, 86]
[0, 87]
[0, 88]
[0, 89]
[0, 90]
[0, 91]
[0, 92]
[0, 93]
[0, 94]
[0, 95]
[0, 96]
[0, 97]
[0, 98]
[0, 99]
[0, 100]
[0, 101]
[0, 102]
[0, 103]
[0, 104]
[0, 105]
[0, 106]
[0, 107]
[0, 108]
[0, 109]
[0, 110]
[0, 111]
[0, 112]
[0, 113]
[0, 114]
[0, 115]
[0, 116]
[0, 117]
[0, 118]
[0, 119]
[0, 120]
[0, 121]
[0, 122]
[0, 123]
[

[5, 22]
[5, 23]
[5, 24]
[5, 25]
[5, 26]
[5, 27]
[5, 28]
[5, 29]
[5, 30]
[5, 31]
[5, 32]
[5, 33]
[5, 34]
[5, 35]
[5, 36]
[5, 37]
[5, 38]
[5, 39]
[5, 40]
[5, 41]
[5, 42]
[5, 43]
[5, 44]
[5, 45]
[5, 46]
[5, 47]
[5, 48]
[5, 49]
[5, 50]
[5, 51]
[5, 52]
[5, 53]
[5, 54]
[5, 55]
[5, 56]
[5, 57]
[5, 58]
[5, 59]
[5, 60]
[5, 61]
[5, 62]
[5, 63]
[5, 64]
[5, 65]
[5, 66]
[5, 67]
[5, 68]
[5, 69]
[5, 70]
[5, 71]
[5, 72]
[5, 73]
[5, 74]
[5, 75]
[5, 76]
[5, 77]
[5, 78]
[5, 79]
[5, 80]
[5, 81]
[5, 82]
[5, 83]
[5, 84]
[5, 85]
[5, 86]
[5, 87]
[5, 88]
[5, 89]
[5, 90]
[5, 91]
[5, 92]
[5, 93]
[5, 94]
[5, 95]
[5, 96]
[5, 97]
[5, 98]
[5, 99]
[5, 100]
[5, 101]
[5, 102]
[5, 103]
[5, 104]
[5, 105]
[5, 106]
[5, 107]
[5, 108]
[5, 109]
[5, 110]
[5, 111]
[5, 112]
[5, 113]
[5, 114]
[5, 12123]
[5, 115]
[5, 116]
[5, 117]
[5, 118]
[5, 119]
[5, 120]
[5, 121]
[5, 122]
[5, 123]
[5, 124]
[5, 125]
[5, 126]
[5, 127]
[5, 128]
[5, 129]
[5, 130]
[5, 131]
[5, 132]
[5, 133]
[5, 134]
[5, 135]
[5, 136]
[5, 137]
[5, 138]
[5, 139]
[5, 1

[11, 844]
[11, 845]
[11, 1697]
[11, 180]
[11, 181]
[11, 182]
[11, 183]
[11, 184]
[11, 2671]
[11, 2672]
[11, 185]
[11, 186]
[11, 187]
[11, 188]
[11, 189]
[11, 190]
[11, 191]
[11, 192]
[11, 193]
[11, 2676]
[11, 6557]
[11, 874]
[11, 2678]
[11, 2679]
[11, 194]
[11, 2680]
[11, 2681]
[11, 878]
[11, 2682]
[11, 2683]
[11, 2684]
[11, 880]
[11, 2685]
[11, 2686]
[11, 195]
[11, 2687]
[11, 2688]
[11, 2690]
[11, 2691]
[11, 196]
[11, 197]
[11, 198]
[11, 199]
[11, 200]
[11, 1716]
[11, 1717]
[11, 201]
[11, 892]
[11, 202]
[11, 2702]
[11, 2703]
[11, 2705]
[11, 2706]
[11, 899]
[11, 2708]
[11, 2709]
[11, 1726]
[11, 2710]
[11, 203]
[11, 2713]
[11, 204]
[11, 205]
[11, 4970]
[11, 2714]
[11, 206]
[11, 2715]
[11, 2716]
[11, 2719]
[11, 2720]
[11, 2722]
[11, 2723]
[11, 1731]
[11, 2724]
[11, 920]
[11, 921]
[11, 922]
[11, 2730]
[11, 2731]
[11, 936]
[11, 1740]
[11, 2733]
[11, 2734]
[11, 207]
[11, 2736]
[11, 2738]
[11, 1745]
[11, 943]
[11, 2740]
[11, 944]
[11, 2741]
[11, 2742]
[11, 2743]
[11, 2746]
[11, 1746]
[11, 27

[14, 4492]
[14, 4493]
[14, 4494]
[14, 4495]
[14, 4496]
[14, 4497]
[14, 4498]
[14, 4499]
[14, 313]
[14, 4500]
[14, 314]
[14, 4501]
[14, 4502]
[14, 4503]
[14, 4504]
[14, 4505]
[14, 4506]
[14, 4507]
[14, 4508]
[14, 4509]
[14, 4510]
[14, 4511]
[14, 4512]
[14, 4513]
[14, 4514]
[14, 4515]
[14, 4516]
[14, 4517]
[14, 315]
[14, 4518]
[14, 4519]
[14, 4520]
[14, 4521]
[14, 4522]
[14, 4523]
[14, 4524]
[14, 4525]
[14, 4526]
[14, 4527]
[14, 4528]
[14, 4529]
[14, 4530]
[14, 4531]
[14, 4532]
[14, 4533]
[14, 4534]
[14, 4535]
[14, 4536]
[14, 316]
[14, 317]
[14, 4537]
[14, 4538]
[14, 4539]
[14, 4540]
[14, 4541]
[14, 4542]
[14, 318]
[14, 319]
[14, 4543]
[14, 4544]
[14, 4545]
[14, 4546]
[14, 4547]
[14, 4548]
[14, 4549]
[14, 4550]
[14, 4551]
[14, 4552]
[14, 4553]
[14, 4554]
[14, 1986]
[14, 4555]
[14, 320]
[14, 4556]
[14, 321]
[14, 322]
[14, 323]
[14, 324]
[14, 325]
[14, 4557]
[14, 326]
[14, 327]
[14, 3125]
[14, 328]
[14, 4558]
[14, 4559]
[14, 329]
[14, 330]
[14, 4560]
[14, 4561]
[14, 4562]
[14, 4563]
[14, 4

[24, 6290]
[24, 9829]
[24, 320]
[24, 9830]
[24, 321]
[24, 322]
[24, 323]
[24, 324]
[24, 325]
[24, 326]
[24, 327]
[24, 328]
[24, 329]
[24, 330]
[24, 331]
[24, 332]
[24, 333]
[24, 334]
[24, 335]
[24, 336]
[24, 337]
[24, 338]
[24, 339]
[24, 340]
[24, 6734]
[24, 2001]
[24, 341]
[24, 342]
[24, 343]
[24, 344]
[24, 345]
[24, 346]
[24, 6747]
[24, 347]
[24, 348]
[24, 349]
[24, 350]
[24, 351]
[24, 352]
[24, 353]
[24, 354]
[24, 355]
[24, 356]
[24, 357]
[24, 358]
[24, 3173]
[24, 359]
[24, 360]
[24, 361]
[24, 362]
[24, 363]
[24, 6299]
[24, 364]
[24, 365]
[24, 366]
[24, 367]
[24, 368]
[24, 369]
[24, 370]
[24, 371]
[24, 372]
[24, 373]
[24, 9831]
[24, 374]
[24, 375]
[24, 376]
[24, 377]
[24, 378]
[24, 379]
[24, 5471]
[24, 380]
[24, 381]
[24, 8150]
[24, 382]
[24, 383]
[24, 384]
[24, 6302]
[24, 385]
[24, 386]
[24, 8646]
[24, 387]
[24, 388]
[24, 8647]
[24, 389]
[24, 390]
[24, 391]
[24, 392]
[24, 393]
[24, 8649]
[24, 8650]
[24, 394]
[24, 395]
[24, 9832]
[24, 396]
[24, 2058]
[24, 397]
[24, 398]
[24, 399]
[2

KeyboardInterrupt: 

In [21]:
old_network.list_properties()

ID             (vertex)  (type: string)
weight         (edge)    (type: double)


In [25]:
print(new_network.ep.weight)

<EdgePropertyMap object with value type 'double', for Graph 0x7f8949c705e0, at 0x7f8949c70f10>


In [41]:
df = pd.read_csv('../mapping_files/gene_att_mapping.csv', header=0, dtype=str)
df

Unnamed: 0,entrezgene,go.BP,go.CC,go.MF,pathway.kegg
0,5747,"{'GO:0018108', 'GO:0043066', 'GO:0007229', 'GO...","{'GO:0005737', 'GO:0036064', 'GO:0005829', 'GO...","{'GO:0004713', 'GO:0019901', 'GO:0019903', 'GO...","{'hsa04510', 'hsa05163', 'hsa04360', 'hsa05417..."
1,84666,"{'GO:0007165', 'GO:0050673'}","{'GO:0005615', 'GO:0005575'}","{'GO:0005179', 'GO:0003674'}",set()
2,4712,"{'GO:0042775', 'GO:0006120', 'GO:0032981'}","{'GO:0005654', 'GO:0031966', 'GO:0005747', 'GO...","{'GO:0008137', 'GO:0005515'}","{'hsa05415', 'hsa05016', 'hsa04714', 'hsa05022..."
3,3073,"{'GO:0005975', 'GO:0030203', 'GO:0006689', 'GO...","{'GO:0070062', 'GO:0005829', 'GO:0016020', 'GO...","{'GO:0004563', 'GO:0005515', 'GO:0046982', 'GO...","{'hsa00531', 'hsa04142', 'hsa00603', 'hsa00520..."
4,27130,"{'GO:0090090', 'GO:0016055', 'GO:1904108'}","{'GO:0005737', 'GO:0005874', 'GO:0005929', 'GO...","{'GO:0005516', 'GO:0005515'}",{'hsa04310'}
...,...,...,...,...,...
15986,152559,"{'GO:0034067', 'GO:0010977', 'GO:0033137', 'GO...","{'GO:0016021', 'GO:0005794', 'GO:0000139'}","{'GO:0038023', 'GO:0005515'}",set()
15987,23371,"{'GO:0035264', 'GO:0048871', 'GO:0035556', 'GO...","{'GO:0043025', 'GO:0005886', 'GO:0030425', 'GO...","{'GO:0019900', 'GO:0042802', 'GO:0005515', 'GO...",set()
15988,84518,{'GO:0031424'},"{'GO:0001533', 'GO:0005737'}",{'GO:0005515'},set()
15989,2560,"{'GO:0007268', 'GO:0006811', 'GO:1902476', 'GO...","{'GO:0034707', 'GO:1902711', 'GO:0030425', 'GO...","{'GO:0015276', 'GO:0004890', 'GO:0050811', 'GO...","{'hsa04723', 'hsa04727', 'hsa04080', 'hsa05032..."


In [45]:
print(type(df['go.BP'].iloc[0]))
print(type(df['entrezgene'].iloc[0]))

<class 'str'>
<class 'str'>


In [47]:
test=[{},{"GO","TI"}]
test

[{}, {'GO', 'TI'}]

In [49]:
for t in test:
    print(";".join(t))


GO;TI


In [17]:
def combine_rows(x):
    return set(filter(None, x.split(';')))
df[df.columns[1:]] = df[df.columns[1:]].fillna('').applymap(combine_rows)

In [18]:
df

Unnamed: 0,entrezgene,go.BP,go.CC,go.MF,pathway.kegg
0,339345,"{GO:0007283, GO:0017148, GO:0045835, GO:190015...","{GO:0005737, GO:0000932, GO:0048471, GO:0005634}","{GO:0008270, GO:0003729, GO:0005515}",{}
1,349667,"{GO:0007166, GO:0010977, GO:0031103}","{GO:0070062, GO:0043005, GO:0005576, GO:004665...",{GO:0038023},{}
2,56204,{},{},{GO:0005515},{}
3,8492,"{GO:0031638, GO:0006887, GO:0006897}","{GO:0043083, GO:0043195, GO:0005886, GO:003042...","{GO:0004252, GO:0008236, GO:0005044}",{}
4,3630,"{GO:0060267, GO:0045818, GO:0060266, GO:003288...","{GO:0005788, GO:0005576, GO:0000139, GO:003311...","{GO:0042802, GO:0005179, GO:0005158, GO:000551...","{hsa04913, hsa04910, hsa04140, hsa04950, hsa04..."
...,...,...,...,...,...
15986,10085,"{GO:0010811, GO:0007155}","{GO:0070062, GO:0062023, GO:1903561}","{GO:0005178, GO:0005201, GO:0005509}",{}
15987,9470,"{GO:1905618, GO:0031047, GO:0006413, GO:0017148}","{GO:0000932, GO:0005829, GO:0016281, GO:000573...","{GO:0003743, GO:0008135, GO:0000339, GO:000372...","{hsa04211, hsa04150, hsa04910, hsa04151, hsa04..."
15988,1673,"{GO:0050830, GO:0031640, GO:0042742, GO:005082...","{GO:0005576, GO:0005796, GO:0005615}","{GO:0031731, GO:0042056, GO:0005515}","{hsa05150, hsa04621, hsa04657}"
15989,8386,"{GO:0007186, GO:0050911, GO:0007608}","{GO:0016021, GO:0005886}","{GO:0004984, GO:0004930}",{hsa04740}


In [18]:
for index, value in enumerate(df.entrezgene[15986:], start=15986):
    print(index, value)

15986 10085
15987 9470
15988 1673
15989 8386
15990 6509


In [19]:
for index, value in enumerate(df.entrezgene[15986:]):
    print(index, value)

0 10085
1 9470
2 1673
3 8386
4 6509


In [50]:
import scipy.sparse as sp

In [51]:
row  = [0, 3, 1, 0]
col  = [0, 3, 1, 2]
data = [4, 5, 7, 9]
mat = sp.coo_matrix((data, (row, col)), shape=(4, 4))
mat.toarray()

array([[4, 0, 9, 0],
       [0, 7, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 5]])

In [52]:
row  = [0, 4, 0]
col  = [4, 2, 0]
data = [7, 7, 7]
mat2 = sp.coo_matrix((data, (row, col)), shape=(5, 5))
mat2.toarray()

array([[7, 0, 0, 0, 7],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 7, 0, 0]])

In [53]:
mat.data

array([4, 5, 7, 9])

In [54]:
mat2.data

array([7, 7, 7])

In [55]:
row  = np.concatenate((mat.row, mat2.row), axis=None)
col  = np.concatenate((mat.col, mat2.col), axis=None)
data = np.concatenate((mat.data, mat2.data), axis=None)
mat3 = sp.csr_matrix((data, (row, col)), shape=(5, 5))
mat3.toarray()

array([[11,  0,  9,  0,  7],
       [ 0,  7,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  5,  0],
       [ 0,  0,  7,  0,  0]])

In [50]:
for index, value in enumerate(mat3.row):
    print(index, value, mat3.col[index])

0 0 0
1 3 3
2 1 1
3 0 2
4 0 4
5 4 2
6 0 0


In [49]:
mat3.row

array([0, 3, 1, 0, 0, 4, 0], dtype=int32)

In [51]:
mat3.col

array([0, 3, 1, 2, 4, 2, 0], dtype=int32)

In [87]:
mat3.tocsc()[0,0]

11

In [56]:
print(mat3)

  (0, 0)	11
  (0, 2)	9
  (0, 4)	7
  (1, 1)	7
  (3, 3)	5
  (4, 2)	7


In [57]:
print(mat3.tocsc())

  (0, 0)	11
  (1, 1)	7
  (0, 2)	9
  (4, 2)	7
  (3, 3)	5
  (0, 4)	7


In [58]:
print(mat3.tocsr())

  (0, 0)	11
  (0, 2)	9
  (0, 4)	7
  (1, 1)	7
  (3, 3)	5
  (4, 2)	7


In [62]:
mati = mat3.tocoo()

In [64]:
print(mati.row)

[0 0 0 1 3 4]


In [67]:
import timeit
start = timeit.default_timer()
for i in range(10000):
    mat3.tocsc()[(0,0)]
stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  0.7228160180020495


In [68]:
import timeit
start = timeit.default_timer()
mat4 =  mat3.tocsc()
for i in range(10000):
    mat4[(0,0)]
stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  0.11830322400055593


In [81]:
def transform_disgenet_mapping(mapping: pd.DataFrame, file, col_old, col_new):
    disease_mapping = pd.read_csv(file, compression='gzip', sep='\t', dtype=str)
    df = pd.merge(mapping[['diseaseId', 'mondo']], disease_mapping[['diseaseId', col_old]],
                  on="diseaseId", how="left")
    df = df.rename(columns={col_old: col_new})
    df[col_new] = df[col_new].str.strip()
    df = df[['mondo', col_new]].fillna('').groupby(['mondo'], as_index=False).agg(combine_rows)
    return df

def combine_rows(x):
    return set(filter(None, ';'.join(x).split(';')))

In [82]:
disgenet_mapping = pd.read_csv("https://www.disgenet.org/static/disgenet_ap1/files/downloads/disease_mappings.tsv.gz", compression='gzip', sep='\t', dtype=str)
disgenet_mapping = disgenet_mapping[disgenet_mapping['vocabulary'] == 'MONDO'][['diseaseId', 'code']]
disgenet_mapping = disgenet_mapping.rename(columns={'code': 'mondo'})

var_mapping = transform_disgenet_mapping(mapping=disgenet_mapping, file="https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_variant_disease_associations.tsv.gz", col_old='snpId',
                                                col_new='disgenet.variants_related_to_disease')
gene_mapping = transform_disgenet_mapping(mapping=disgenet_mapping, file="https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz", col_old='geneId',
                                                 col_new='disgenet.genes_related_to_disease')
disease_att_mapping = pd.merge(var_mapping[['mondo', 'disgenet.variants_related_to_disease']],
                                   gene_mapping[['mondo', 'disgenet.genes_related_to_disease']],
                                   on="mondo", how="outer")

In [83]:
disease_att_mapping

Unnamed: 0,mondo,disgenet.variants_related_to_disease,disgenet.genes_related_to_disease
0,0000001,"{rs1952034, rs374118649, rs6954996, rs7830, rs...","{3606, 3630, 23771, 4282, 5730, 54106, 10269, ..."
1,0000004,"{rs104894118, rs770374710, rs121918654, rs6161...","{1583, 51322, 5579, 4791, 1452, 4173, 2710, 62..."
2,0000005,"{rs7014851, rs121434448, rs773764015, rs121434...",{55806}
3,0000009,"{rs759081917, rs121908061, rs1012488531, rs121...","{1432, 23218, 117195, 259249, 55278, 947, 9370..."
4,0000022,{rs6313},"{627, 83881, 60498, 359, 412, 276, 1326, 3855,..."
...,...,...,...
12349,0100039,"{rs267608511, rs398123593, rs267608493, rs1219...","{170302, 728458, 10678, 2290, 2044, 501, 6335,..."
12350,0100053,"{rs699, rs121913682, rs1267969615, rs121913507}","{3606, 7220, 26511, 3559, 105371045, 6566, 570..."
12351,0100054,"{rs121913682, rs121913507}",{3815}
12352,0100081,"{rs10766075, rs1481892, rs11824092, rs34637584...","{8243, 60675, 8492, 23394, 57007, 259249, 4502..."


In [73]:
def set_to_string(x, sep: str = ';'):
    return sep.join(x)

In [84]:
df = disease_att_mapping.copy()
df[df.columns[1:]] = df[df.columns[1:]].fillna('').applymap(set_to_string)
df

Unnamed: 0,mondo,disgenet.variants_related_to_disease,disgenet.genes_related_to_disease
0,0000001,rs1952034;rs374118649;rs6954996;rs7830;rs93494...,3606;3630;23771;4282;5730;54106;10269;4772;478...
1,0000004,rs104894118;rs770374710;rs121918654;rs6161;rs7...,1583;51322;5579;4791;1452;4173;2710;6288;16902...
2,0000005,rs7014851;rs121434448;rs773764015;rs121434451,55806
3,0000009,rs759081917;rs121908061;rs1012488531;rs1219080...,1432;23218;117195;259249;55278;947;9370;3553;7...
4,0000022,rs6313,627;83881;60498;359;412;276;1326;3855;278;554;...
...,...,...,...
12349,0100039,rs267608511;rs398123593;rs267608493;rs12190967...,170302;728458;10678;2290;2044;501;6335;6513;63...
12350,0100053,rs699;rs121913682;rs1267969615;rs121913507,3606;7220;26511;3559;105371045;6566;570;100505...
12351,0100054,rs121913682;rs121913507,3815
12352,0100081,rs10766075;rs1481892;rs11824092;rs34637584;rs6...,8243;60675;8492;23394;57007;259249;4502;25814;...


In [87]:
disease_mapping = pd.read_csv("../mapping_files/disease_id_mapping.csv", dtype=str)
disease_mapping

Unnamed: 0,mondo,omim,snomedct,umls,orpha,mesh,doid,ICD-10
0,0002974,603956,363354003,,,,4362,"C53.9,C53"
1,0000311,,,,,,,
2,0001642,,1489008,C0019919,,,13134,"H00.01,H00,H00.03"
3,0000310,,,,,,0050308,
4,0001641,,,,,,13129,
...,...,...,...,...,...,...,...,...
24115,0019900,,,,96160,,,"Q93,Q93.5"
24116,0019902,,766716004,,96168,,,"Q93,Q93.5"
24117,0019901,,,,96164,,,"Q93,Q93.5"
24118,0007928,,,,,,,


In [89]:
omim_to_hsa = pd.read_csv("http://rest.genome.jp/link/omim/hsa", names=['hsa', 'omim', 'dir'], sep="\t", dtype=str)
hsa_to_pathway = pd.read_csv("http://rest.kegg.jp/link/pathway/hsa", names=['hsa', 'pathway'], sep="\t", dtype=str)
omim_to_pathway = pd.merge(omim_to_hsa[['hsa', 'omim']], hsa_to_pathway[['hsa', 'pathway']], on="hsa", how="inner")
omim_to_pathway.omim = omim_to_pathway.omim.str.replace('omim:', '')
omim_to_pathway.pathway = omim_to_pathway.pathway.str.replace('path:', '')
omim_to_pathway = pd.merge(disease_mapping[['mondo', 'omim']], omim_to_pathway[['omim', 'pathway']], on="omim",
                               how="inner")
omim_to_pathway = omim_to_pathway[['mondo', 'pathway']].fillna('').groupby(['mondo'], as_index=False).agg(
        combine_rows)
omim_to_pathway.rename(columns={'pathway': 'ctd.pathway_related_to_disease'}, inplace=True)


In [99]:
omim_to_pathway[omim_to_pathway['mondo']=="0000004"]

Unnamed: 0,mondo,ctd.pathway_related_to_disease


In [98]:
omim_to_pathway

Unnamed: 0,mondo,ctd.pathway_related_to_disease
0,0000908,"{hsa04390, hsa05100, hsa05412, hsa05213, hsa05..."
1,0000909,{hsa04966}
2,0000914,"{hsa04371, hsa04658, hsa05206, hsa04919, hsa05..."
3,0001056,"{hsa04640, hsa04380, hsa04360, hsa04936, hsa04..."
4,0001187,"{hsa04360, hsa04720, hsa05213, hsa05230, hsa05..."
...,...,...
3565,0060764,"{hsa04390, hsa05206, hsa04310, hsa05217, hsa05..."
3566,0100082,"{hsa04722, hsa04360, hsa05211, hsa05130, hsa05..."
3567,0100083,"{hsa05202, hsa04659, hsa05220, hsa04530, hsa05..."
3568,0100104,"{hsa05014, hsa03013}"


In [None]:
mapping = dm.get_attributes_from_database(
        missing=['MONDO:' + x for x in set(disease_att_mapping.mondo) - set(omim_to_pathway.mondo)],
        attributes=['ctd.pathway_related_to_disease'])
mapping = pd.concat([omim_to_pathway, mapping])