# Testing environment for DoSE

## Setup

### Load libraries

In [1]:
import pandas as pd
import numpy as np 
import gseapy
from biothings_client import get_client

### Define data

In [2]:
seeds_file = "Input/0007079.txt"
betweenness_file = "Input/0007079_added_200_dmd_betweenness_hub_0.01.txt"
significance_file = "Input/0007079_added_200_dmd_significance_hub_1.txt"
diseases_file = "Input/ICD10_commROCG_raw.txt"
disease_clusters_file = "Input/ICD10_commROCG_cluster.txt"

### Load data

In [3]:
disease_id = "0007079"
seeds = pd.read_csv(seeds_file, sep="\t", header=None)[0]
betweenness = pd.read_csv(betweenness_file, sep="\t")['node']
significance = pd.read_csv(significance_file, sep="\t")['node']
diseases = pd.read_csv(diseases_file, sep="\t", header=None)
disease_clusters = pd.read_csv(disease_clusters_file, sep="\t", header=None)

In [4]:
import timeit
start = timeit.default_timer()

stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  4.699100099969655e-05


## Network from NeDReX

In [44]:
from urllib import request, parse
import json

In [45]:
url = 'https://api.nedrex.net/graph_builder'

In [46]:
myobj = {
    "nodes":["protein"],
    "edges":["protein_interacts_with_protein"],
    "iid_evidence":["exp"],
    "ppi_self_loops": True,
    "taxid":[9606],
    "concise": True,
    "include_omim": True,
    "disgenet_threshold": 0,
    "use_omim_ids": False,
}

In [47]:
data = json.dumps(myobj).encode('utf8')
req =  request.Request(url, data=data) # this will make the method "POST"
resp = request.urlopen(req)

In [48]:
print(resp.read().decode('utf8'))

"e1e15418-2c09-4c9e-8ad5-ed02aed4d5a8"


## TEST

In [1]:
FILES_DIR = "../mapping_files/"

In [2]:
file_names = {'gene_ids': FILES_DIR + 'gene_id_mapping.csv',
                  'gene_atts': FILES_DIR + 'gene_att_mapping.csv',
                  'disorder_ids': FILES_DIR + 'disease_id_mapping.csv',
                  'disorder_atts': FILES_DIR + 'disease_att_mapping.csv',
                  'go_BP': FILES_DIR + 'gene_dist_go_BP.csv',
                  'go_CC': FILES_DIR + 'gene_dist_go_CC.csv',
                  'go_MF': FILES_DIR + 'gene_dist_go_MF.csv',
                  'pathway_kegg': FILES_DIR + 'gene_dist_pathway_kegg.csv',
                  'related_genes': FILES_DIR + 'disease_dist_rel_genes.csv',
                  'related_variants': FILES_DIR + 'disease_dist_rel_variants.csv',
                  'related_pathways': FILES_DIR + 'disease_dist_rel_pathways.csv'}

In [None]:
def _load_file_distances(self, file_readers: list, sep: str, mapping_name: str):
    header=


with open(self.file_names['go_BP']) as f1, open(self.file_names['go_CC']) as f2, \
                    open(self.file_names['go_MF']) as f3, open(self.file_names['pathway_kegg']) as f4:
    

In [6]:
gene_ids = pd.read_csv(file_names['gene_ids'], dtype=str)
gene_ids

Unnamed: 0.1,Unnamed: 0,entrezgene,symbol,uniprot.Swiss-Prot,ensembl.gene
0,0,1,A1BG,P04217,ENSG00000121410
1,1,2,A2M,P01023,ENSG00000175899
2,3,9,NAT1,P18440,ENSG00000171428
3,4,10,NAT2,P11245,ENSG00000156006
4,6,12,SERPINA3,P01011,ENSG00000196136
...,...,...,...,...,...
15986,40547,101928917,HSFX3,A0A1B0GWH4,ENSG00000231473
15987,42116,102724560,LOC102724560,P35520,ENSG00000277983
15988,42327,102725035,LOC102725035,O75022,ENSG00000244184
15989,44589,105372280,GNG14,A0A1W2PPG7,ENSG00000275863


In [11]:
gene_ids[gene_ids['entrezgene']=='12']['symbol'].iloc[0]

'SERPINA3'

In [1]:
import graph_tool as gt

In [4]:
old_network = gt.load_graph("Input/mondo-related_variants-network.graphml")
new_network = gt.load_graph("Input/mondo_variant_based.gt")

In [5]:
print(len(old_network.get_vertices()))
print(len(new_network.get_vertices()))

7509
8743


In [6]:
print(len(old_network.get_edges()))
print(len(new_network.get_edges()))

677241
607188


In [7]:
old_network.vp.ID[old_network.vertex(9)]  

'0000358'

In [8]:
new_network.vp.ID[new_network.vertex(9)]  

'mondo.0008855'

In [16]:
old_ids = set()
for vertex in old_network.get_vertices():
    old_ids.add('mondo.'+old_network.vp.ID[old_network.vertex(vertex)])
old_ids

{'mondo.0009757',
 'mondo.0024677',
 'mondo.0014210',
 'mondo.0011645',
 'mondo.0007043',
 'mondo.0008671',
 'mondo.0014904',
 'mondo.0020320',
 'mondo.0024544',
 'mondo.0010393',
 'mondo.0012083',
 'mondo.0011603',
 'mondo.0006896',
 'mondo.0014586',
 'mondo.0010261',
 'mondo.0007252',
 'mondo.0020645',
 'mondo.0013582',
 'mondo.0007001',
 'mondo.0043079',
 'mondo.0014392',
 'mondo.0011838',
 'mondo.0011702',
 'mondo.0005814',
 'mondo.0019307',
 'mondo.0010137',
 'mondo.0009518',
 'mondo.0019716',
 'mondo.0006745',
 'mondo.0011057',
 'mondo.0012056',
 'mondo.0005424',
 'mondo.0010580',
 'mondo.0014405',
 'mondo.0006037',
 'mondo.0008322',
 'mondo.0013197',
 'mondo.0021129',
 'mondo.0008995',
 'mondo.0017658',
 'mondo.0013455',
 'mondo.0019341',
 'mondo.0013755',
 'mondo.0024333',
 'mondo.0007950',
 'mondo.0012912',
 'mondo.0024570',
 'mondo.0019404',
 'mondo.0000334',
 'mondo.0003227',
 'mondo.0012273',
 'mondo.0011512',
 'mondo.0000153',
 'mondo.0005098',
 'mondo.0004069',
 'mondo.00

In [17]:
new_ids = set()
for vertex in new_network.get_vertices():
    new_ids.add(new_network.vp.ID[new_network.vertex(vertex)])
new_ids

{'mondo.0009757',
 'mondo.0024677',
 'mondo.0014210',
 'mondo.0011645',
 'mondo.0007043',
 'mondo.0008671',
 'mondo.0014904',
 'mondo.0020320',
 'mondo.0024544',
 'mondo.0010393',
 'mondo.0012083',
 'mondo.0011603',
 'mondo.0008895',
 'mondo.0013764',
 'mondo.0011533',
 'mondo.0011966',
 'mondo.0013353',
 'mondo.0006896',
 'mondo.0014586',
 'mondo.0018991',
 'mondo.0014218',
 'mondo.0010261',
 'mondo.0007252',
 'mondo.0020645',
 'mondo.0013582',
 'mondo.0007001',
 'mondo.0043079',
 'mondo.0014392',
 'mondo.0011639',
 'mondo.0011838',
 'mondo.0014478',
 'mondo.0010503',
 'mondo.0011702',
 'mondo.0019307',
 'mondo.0005814',
 'mondo.0010137',
 'mondo.0009518',
 'mondo.0019716',
 'mondo.0005831',
 'mondo.0011057',
 'mondo.0006745',
 'mondo.0012056',
 'mondo.0005424',
 'mondo.0010580',
 'mondo.0014405',
 'mondo.0006037',
 'mondo.0008322',
 'mondo.0013197',
 'mondo.0021129',
 'mondo.0008995',
 'mondo.0017658',
 'mondo.0013455',
 'mondo.0019341',
 'mondo.0013755',
 'mondo.0015025',
 'mondo.00

In [18]:
new_ids-old_ids

{'mondo.0011664',
 'mondo.0010713',
 'mondo.0014622',
 'mondo.0012285',
 'mondo.0009252',
 'mondo.0008659',
 'mondo.0010387',
 'mondo.0054861',
 'mondo.0012792',
 'mondo.0010053',
 'mondo.0013349',
 'mondo.0008885',
 'mondo.0009501',
 'mondo.0008895',
 'mondo.0013764',
 'mondo.0011533',
 'mondo.0013834',
 'mondo.0007272',
 'mondo.0012731',
 'mondo.0011966',
 'mondo.0014280',
 'mondo.0013353',
 'mondo.0018991',
 'mondo.0014218',
 'mondo.0009434',
 'mondo.0011639',
 'mondo.0014080',
 'mondo.0013886',
 'mondo.0014478',
 'mondo.0013840',
 'mondo.0010503',
 'mondo.0021013',
 'mondo.0014542',
 'mondo.0013384',
 'mondo.0013787',
 'mondo.0014889',
 'mondo.0014821',
 'mondo.0005831',
 'mondo.0033013',
 'mondo.0015009',
 'mondo.0013473',
 'mondo.0011336',
 'mondo.0015025',
 'mondo.0014996',
 'mondo.0060585',
 'mondo.0013190',
 'mondo.0014518',
 'mondo.0015001',
 'mondo.0044724',
 'mondo.0014257',
 'mondo.0014317',
 'mondo.0009927',
 'mondo.0013875',
 'mondo.0012251',
 'mondo.0013605',
 'mondo.00

In [23]:
old_network.list_properties()

ID             (vertex)  (type: string)
weight         (edge)    (type: double)


In [24]:
new_network.list_properties()

ID             (vertex)  (type: string)
JI             (edge)    (type: double)


In [28]:
import graph_tool.all as gt

In [36]:
gt.find_vertex(old_network, old_network.vp.ID, '0011664')

[]

In [37]:
gt.find_vertex(new_network, new_network.vp.ID, 'mondo.0011664')

[<Vertex object with index '2023' at 0x7f3892940040>]

In [43]:
new_network.vertex('2023')

<Vertex object with index '2023' at 0x7f3892940740>

In [44]:
new_network.vp.ID[new_network.vertex('2023')]

'mondo.0011664'

In [48]:
for e in new_network.vertex('2023').out_edges():
       print(e)

In [66]:
for e in old_network.edges():
    source = 'mondo.'+old_network.vp.ID[e.source()]
    target = 'mondo.'+old_network.vp.ID[e.target()]
    source_new = gt.find_vertex(new_network, new_network.vp.ID, source)
    target_new = gt.find_vertex(new_network, new_network.vp.ID, target)
    found = False
    for w in source_new[0].out_neighbors():
        if target == new_network.vp.ID[w]:
            found = True
    if not found:
        print(source, target)
        break

mondo.0000001 mondo.0000266


In [21]:
import pandas as pd
df = pd.read_csv('../mapping_files/disease_att_mapping.csv', header=0, dtype=str)
df

Unnamed: 0,mondo,disgenet.variants_related_to_disease,disgenet.genes_related_to_disease,ctd.pathway_related_to_disease
0,0000001,rs347685;rs281874762;rs3761548;rs9349417;rs161...,3558;3493;2806;4968;27031;26762;7421;84735;713...,
1,0000004,rs6161;rs104894897;rs775130992;rs770374710;rs1...,6662;5551;51741;89884;145624;23742;4214;1452;6...,hsa_M00107;hsa04913;hsa01100;hsa04925;hsa00140
2,0000005,rs121434451;rs7014851;rs773764015;rs121434448,55806,
3,0000009,rs121909752;rs879254764;rs1012488531;rs1305880...,2160;1432;3239;10598;3690;1351;7965;3553;2815;...,
4,0000022,rs6313,4842;51540;60498;6783;165918;1326;627;276;551;...,
...,...,...,...,...
24115,0400000,,,
24116,0400002,,,
24117,0400003,,,
24118,0400004,,,


In [74]:
split1 = set(df[df['mondo']=='0000001']['disgenet.variants_related_to_disease'].iloc[0].split(';'))
set(split1)

{'rs10109414',
 'rs10137082',
 'rs10178409',
 'rs1020120',
 'rs1020608562',
 'rs102274',
 'rs102275',
 'rs1024611',
 'rs1033182',
 'rs10404257',
 'rs1042636',
 'rs1042713',
 'rs1044261',
 'rs1044498',
 'rs1045642',
 'rs1047891',
 'rs104894833',
 'rs1049255',
 'rs1061170',
 'rs1066621',
 'rs10774021',
 'rs1077989',
 'rs10783124',
 'rs10794720',
 'rs10808565',
 'rs10887800',
 'rs10906850',
 'rs10951982',
 'rs11011653',
 'rs11089781',
 'rs11089788',
 'rs11123169',
 'rs11128347',
 'rs112201728',
 'rs112329286',
 'rs112407915',
 'rs11320420',
 'rs114425659',
 'rs1145077',
 'rs115007604',
 'rs115489112',
 'rs11571317',
 'rs115747230',
 'rs11614913',
 'rs11622435',
 'rs1162592300',
 'rs11643718',
 'rs11645800',
 'rs116510623',
 'rs11662622',
 'rs1171614',
 'rs1171616',
 'rs117329947',
 'rs117897666',
 'rs117935223',
 'rs11864909',
 'rs1188383936',
 'rs11959928',
 'rs12032578',
 'rs12134854',
 'rs12137135',
 'rs1217691063',
 'rs121908525',
 'rs121908529',
 'rs121913059',
 'rs121917864',
 'rs12

In [75]:
split2 = df[df['mondo']=='0000266']['disgenet.variants_related_to_disease'].iloc[0].split(';')
set(split2)

{'rs1840680', 'rs368060', 'rs4986790', 'rs76763715'}

In [78]:
set(split1).intersection(set(split2))

{'rs4986790'}

In [49]:
for t in test:
    print(";".join(t))


GO;TI


In [17]:
def combine_rows(x):
    return set(filter(None, x.split(';')))
df[df.columns[1:]] = df[df.columns[1:]].fillna('').applymap(combine_rows)

In [18]:
df

Unnamed: 0,entrezgene,go.BP,go.CC,go.MF,pathway.kegg
0,339345,"{GO:0007283, GO:0017148, GO:0045835, GO:190015...","{GO:0005737, GO:0000932, GO:0048471, GO:0005634}","{GO:0008270, GO:0003729, GO:0005515}",{}
1,349667,"{GO:0007166, GO:0010977, GO:0031103}","{GO:0070062, GO:0043005, GO:0005576, GO:004665...",{GO:0038023},{}
2,56204,{},{},{GO:0005515},{}
3,8492,"{GO:0031638, GO:0006887, GO:0006897}","{GO:0043083, GO:0043195, GO:0005886, GO:003042...","{GO:0004252, GO:0008236, GO:0005044}",{}
4,3630,"{GO:0060267, GO:0045818, GO:0060266, GO:003288...","{GO:0005788, GO:0005576, GO:0000139, GO:003311...","{GO:0042802, GO:0005179, GO:0005158, GO:000551...","{hsa04913, hsa04910, hsa04140, hsa04950, hsa04..."
...,...,...,...,...,...
15986,10085,"{GO:0010811, GO:0007155}","{GO:0070062, GO:0062023, GO:1903561}","{GO:0005178, GO:0005201, GO:0005509}",{}
15987,9470,"{GO:1905618, GO:0031047, GO:0006413, GO:0017148}","{GO:0000932, GO:0005829, GO:0016281, GO:000573...","{GO:0003743, GO:0008135, GO:0000339, GO:000372...","{hsa04211, hsa04150, hsa04910, hsa04151, hsa04..."
15988,1673,"{GO:0050830, GO:0031640, GO:0042742, GO:005082...","{GO:0005576, GO:0005796, GO:0005615}","{GO:0031731, GO:0042056, GO:0005515}","{hsa05150, hsa04621, hsa04657}"
15989,8386,"{GO:0007186, GO:0050911, GO:0007608}","{GO:0016021, GO:0005886}","{GO:0004984, GO:0004930}",{hsa04740}


In [18]:
for index, value in enumerate(df.entrezgene[15986:], start=15986):
    print(index, value)

15986 10085
15987 9470
15988 1673
15989 8386
15990 6509


In [19]:
for index, value in enumerate(df.entrezgene[15986:]):
    print(index, value)

0 10085
1 9470
2 1673
3 8386
4 6509


In [50]:
import scipy.sparse as sp

In [51]:
row  = [0, 3, 1, 0]
col  = [0, 3, 1, 2]
data = [4, 5, 7, 9]
mat = sp.coo_matrix((data, (row, col)), shape=(4, 4))
mat.toarray()

array([[4, 0, 9, 0],
       [0, 7, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 5]])

In [52]:
row  = [0, 4, 0]
col  = [4, 2, 0]
data = [7, 7, 7]
mat2 = sp.coo_matrix((data, (row, col)), shape=(5, 5))
mat2.toarray()

array([[7, 0, 0, 0, 7],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 7, 0, 0]])

In [53]:
mat.data

array([4, 5, 7, 9])

In [54]:
mat2.data

array([7, 7, 7])

In [4]:
for i in range(2,2):
    print(i)

In [55]:
row  = np.concatenate((mat.row, mat2.row), axis=None)
col  = np.concatenate((mat.col, mat2.col), axis=None)
data = np.concatenate((mat.data, mat2.data), axis=None)
mat3 = sp.csr_matrix((data, (row, col)), shape=(5, 5))
mat3.toarray()

array([[11,  0,  9,  0,  7],
       [ 0,  7,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  5,  0],
       [ 0,  0,  7,  0,  0]])

In [50]:
for index, value in enumerate(mat3.row):
    print(index, value, mat3.col[index])

0 0 0
1 3 3
2 1 1
3 0 2
4 0 4
5 4 2
6 0 0


In [49]:
mat3.row

array([0, 3, 1, 0, 0, 4, 0], dtype=int32)

In [51]:
mat3.col

array([0, 3, 1, 2, 4, 2, 0], dtype=int32)

In [87]:
mat3.tocsc()[0,0]

11

In [56]:
print(mat3)

  (0, 0)	11
  (0, 2)	9
  (0, 4)	7
  (1, 1)	7
  (3, 3)	5
  (4, 2)	7


In [57]:
print(mat3.tocsc())

  (0, 0)	11
  (1, 1)	7
  (0, 2)	9
  (4, 2)	7
  (3, 3)	5
  (0, 4)	7


In [58]:
print(mat3.tocsr())

  (0, 0)	11
  (0, 2)	9
  (0, 4)	7
  (1, 1)	7
  (3, 3)	5
  (4, 2)	7


In [62]:
mati = mat3.tocoo()

In [64]:
print(mati.row)

[0 0 0 1 3 4]


In [67]:
import timeit
start = timeit.default_timer()
for i in range(10000):
    mat3.tocsc()[(0,0)]
stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  0.7228160180020495


In [68]:
import timeit
start = timeit.default_timer()
mat4 =  mat3.tocsc()
for i in range(10000):
    mat4[(0,0)]
stop = timeit.default_timer()
print('Time: ', stop - start)  

Time:  0.11830322400055593


In [81]:
def transform_disgenet_mapping(mapping: pd.DataFrame, file, col_old, col_new):
    disease_mapping = pd.read_csv(file, compression='gzip', sep='\t', dtype=str)
    df = pd.merge(mapping[['diseaseId', 'mondo']], disease_mapping[['diseaseId', col_old]],
                  on="diseaseId", how="left")
    df = df.rename(columns={col_old: col_new})
    df[col_new] = df[col_new].str.strip()
    df = df[['mondo', col_new]].fillna('').groupby(['mondo'], as_index=False).agg(combine_rows)
    return df

def combine_rows(x):
    return set(filter(None, ';'.join(x).split(';')))

In [82]:
disgenet_mapping = pd.read_csv("https://www.disgenet.org/static/disgenet_ap1/files/downloads/disease_mappings.tsv.gz", compression='gzip', sep='\t', dtype=str)
disgenet_mapping = disgenet_mapping[disgenet_mapping['vocabulary'] == 'MONDO'][['diseaseId', 'code']]
disgenet_mapping = disgenet_mapping.rename(columns={'code': 'mondo'})

var_mapping = transform_disgenet_mapping(mapping=disgenet_mapping, file="https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_variant_disease_associations.tsv.gz", col_old='snpId',
                                                col_new='disgenet.variants_related_to_disease')
gene_mapping = transform_disgenet_mapping(mapping=disgenet_mapping, file="https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz", col_old='geneId',
                                                 col_new='disgenet.genes_related_to_disease')
disease_att_mapping = pd.merge(var_mapping[['mondo', 'disgenet.variants_related_to_disease']],
                                   gene_mapping[['mondo', 'disgenet.genes_related_to_disease']],
                                   on="mondo", how="outer")

In [83]:
disease_att_mapping

Unnamed: 0,mondo,disgenet.variants_related_to_disease,disgenet.genes_related_to_disease
0,0000001,"{rs1952034, rs374118649, rs6954996, rs7830, rs...","{3606, 3630, 23771, 4282, 5730, 54106, 10269, ..."
1,0000004,"{rs104894118, rs770374710, rs121918654, rs6161...","{1583, 51322, 5579, 4791, 1452, 4173, 2710, 62..."
2,0000005,"{rs7014851, rs121434448, rs773764015, rs121434...",{55806}
3,0000009,"{rs759081917, rs121908061, rs1012488531, rs121...","{1432, 23218, 117195, 259249, 55278, 947, 9370..."
4,0000022,{rs6313},"{627, 83881, 60498, 359, 412, 276, 1326, 3855,..."
...,...,...,...
12349,0100039,"{rs267608511, rs398123593, rs267608493, rs1219...","{170302, 728458, 10678, 2290, 2044, 501, 6335,..."
12350,0100053,"{rs699, rs121913682, rs1267969615, rs121913507}","{3606, 7220, 26511, 3559, 105371045, 6566, 570..."
12351,0100054,"{rs121913682, rs121913507}",{3815}
12352,0100081,"{rs10766075, rs1481892, rs11824092, rs34637584...","{8243, 60675, 8492, 23394, 57007, 259249, 4502..."


In [73]:
def set_to_string(x, sep: str = ';'):
    return sep.join(x)

In [84]:
df = disease_att_mapping.copy()
df[df.columns[1:]] = df[df.columns[1:]].fillna('').applymap(set_to_string)
df

Unnamed: 0,mondo,disgenet.variants_related_to_disease,disgenet.genes_related_to_disease
0,0000001,rs1952034;rs374118649;rs6954996;rs7830;rs93494...,3606;3630;23771;4282;5730;54106;10269;4772;478...
1,0000004,rs104894118;rs770374710;rs121918654;rs6161;rs7...,1583;51322;5579;4791;1452;4173;2710;6288;16902...
2,0000005,rs7014851;rs121434448;rs773764015;rs121434451,55806
3,0000009,rs759081917;rs121908061;rs1012488531;rs1219080...,1432;23218;117195;259249;55278;947;9370;3553;7...
4,0000022,rs6313,627;83881;60498;359;412;276;1326;3855;278;554;...
...,...,...,...
12349,0100039,rs267608511;rs398123593;rs267608493;rs12190967...,170302;728458;10678;2290;2044;501;6335;6513;63...
12350,0100053,rs699;rs121913682;rs1267969615;rs121913507,3606;7220;26511;3559;105371045;6566;570;100505...
12351,0100054,rs121913682;rs121913507,3815
12352,0100081,rs10766075;rs1481892;rs11824092;rs34637584;rs6...,8243;60675;8492;23394;57007;259249;4502;25814;...


In [87]:
disease_mapping = pd.read_csv("../mapping_files/disease_id_mapping.csv", dtype=str)
disease_mapping

Unnamed: 0,mondo,omim,snomedct,umls,orpha,mesh,doid,ICD-10
0,0002974,603956,363354003,,,,4362,"C53.9,C53"
1,0000311,,,,,,,
2,0001642,,1489008,C0019919,,,13134,"H00.01,H00,H00.03"
3,0000310,,,,,,0050308,
4,0001641,,,,,,13129,
...,...,...,...,...,...,...,...,...
24115,0019900,,,,96160,,,"Q93,Q93.5"
24116,0019902,,766716004,,96168,,,"Q93,Q93.5"
24117,0019901,,,,96164,,,"Q93,Q93.5"
24118,0007928,,,,,,,


In [89]:
omim_to_hsa = pd.read_csv("http://rest.genome.jp/link/omim/hsa", names=['hsa', 'omim', 'dir'], sep="\t", dtype=str)
hsa_to_pathway = pd.read_csv("http://rest.kegg.jp/link/pathway/hsa", names=['hsa', 'pathway'], sep="\t", dtype=str)
omim_to_pathway = pd.merge(omim_to_hsa[['hsa', 'omim']], hsa_to_pathway[['hsa', 'pathway']], on="hsa", how="inner")
omim_to_pathway.omim = omim_to_pathway.omim.str.replace('omim:', '')
omim_to_pathway.pathway = omim_to_pathway.pathway.str.replace('path:', '')
omim_to_pathway = pd.merge(disease_mapping[['mondo', 'omim']], omim_to_pathway[['omim', 'pathway']], on="omim",
                               how="inner")
omim_to_pathway = omim_to_pathway[['mondo', 'pathway']].fillna('').groupby(['mondo'], as_index=False).agg(
        combine_rows)
omim_to_pathway.rename(columns={'pathway': 'ctd.pathway_related_to_disease'}, inplace=True)


In [99]:
omim_to_pathway[omim_to_pathway['mondo']=="0000004"]

Unnamed: 0,mondo,ctd.pathway_related_to_disease


In [98]:
omim_to_pathway

Unnamed: 0,mondo,ctd.pathway_related_to_disease
0,0000908,"{hsa04390, hsa05100, hsa05412, hsa05213, hsa05..."
1,0000909,{hsa04966}
2,0000914,"{hsa04371, hsa04658, hsa05206, hsa04919, hsa05..."
3,0001056,"{hsa04640, hsa04380, hsa04360, hsa04936, hsa04..."
4,0001187,"{hsa04360, hsa04720, hsa05213, hsa05230, hsa05..."
...,...,...
3565,0060764,"{hsa04390, hsa05206, hsa04310, hsa05217, hsa05..."
3566,0100082,"{hsa04722, hsa04360, hsa05211, hsa05130, hsa05..."
3567,0100083,"{hsa05202, hsa04659, hsa05220, hsa04530, hsa05..."
3568,0100104,"{hsa05014, hsa03013}"


In [None]:
mapping = dm.get_attributes_from_database(
        missing=['MONDO:' + x for x in set(disease_att_mapping.mondo) - set(omim_to_pathway.mondo)],
        attributes=['ctd.pathway_related_to_disease'])
mapping = pd.concat([omim_to_pathway, mapping])