In [2]:
import pandas as pd
import os 
import json
import csv
from biomed_apis import *
from biomedkg_utils import *

### Alignment: KEGG Disease to MeSH Disease (for Disease-to-Pathway edges)
MeSH Disease, KEGG Disease

In [3]:
# Load the KEGG disease cross-reference table. The next cell reads its
# 'KEGG Disease' and 'MeSH' columns to build the KEGG <-> MeSH disease lookups.
kegg_dis2mesh_df = pd.read_csv('input/KEGG/kegg_disease_to_mesh_and_omim.csv')

In [15]:
# Two-way lookups between MeSH disease IDs and KEGG disease IDs:
#   mesh2kegg_disease : MeSH ID         -> set of KEGG disease IDs
#   kegg_disease2mesh : KEGG disease ID -> set of MeSH IDs
mesh2kegg_disease = dict()
kegg_disease2mesh = dict()

# Walk the two columns in lockstep instead of indexing row by row with .iloc.
# A missing/renamed column now raises KeyError immediately rather than being
# silently swallowed (which previously produced two empty dictionaries).
for kegg_disease, mesh_field in zip(kegg_dis2mesh_df['KEGG Disease'],
                                    kegg_dis2mesh_df['MeSH']):

    # pandas represents empty cells as NaN (a float), which cannot be split;
    # rows lacking either identifier carry no alignment and are skipped.
    if not isinstance(kegg_disease, str) or not isinstance(mesh_field, str):
        continue

    # KEGG Disease -is- MeSH Disease
    # The 'MeSH' cell may list several IDs separated by '; '.
    for mesh_disease in mesh_field.split('; '):
        mesh2kegg_disease.setdefault(mesh_disease, set()).add(kegg_disease)
        kegg_disease2mesh.setdefault(kegg_disease, set()).add(mesh_disease)

print(len(mesh2kegg_disease), 'MeSH-is-KEGG diseases')
print(len(kegg_disease2mesh), 'KEGG-is-MeSH diseases')

2156 MeSH-is-KEGG diseases
2088 KEGG-is-MeSH diseases


### Edges: Disease to Pathway
MeSH Disease, KEGG Pathway

In [16]:
# Download the KEGG disease-to-pathway link table from the KEGG REST API and
# save it as a TSV; the next cell parses it as tab-separated (disease, pathway) pairs.
! curl https://rest.kegg.jp/link/pathway/disease > input/KEGG/kegg_pathway_to_disease.tsv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1944    0  1944    0     0   2670      0 --:--:-- --:--:-- --:--:--  2674


In [21]:
# Translate the KEGG disease -> KEGG pathway links into MeSH disease -> KEGG
# pathway edges using the KEGG -> MeSH alignment built earlier (kegg_disease2mesh).
#   mesh_disease2kegg_pathway : MeSH ID      -> set of KEGG pathway IDs
#   kegg_pathway2mesh_disease : KEGG pathway -> set of MeSH IDs
mesh_disease2kegg_pathway = dict()
kegg_pathway2mesh_disease = dict()

# The context manager closes the TSV handle even if parsing fails.
with open('input/KEGG/kegg_pathway_to_disease.tsv') as fin:
    for line in fin:
        fields = line.split('\t')

        # Blank or malformed lines have no pathway column; skip them explicitly
        # instead of relying on a catch-all exception handler.
        if len(fields) < 2:
            continue

        # MeSH Disease, KEGG Pathway
        kegg_disease = fields[0]
        kegg_pathway = fields[1].strip().replace('path:','path_')

        # MeSH Disease - KEGG Pathway
        # KEGG diseases without a MeSH alignment yield no edges.
        for mesh_disease in kegg_disease2mesh.get(kegg_disease, ()):
            mesh_disease2kegg_pathway.setdefault(mesh_disease,set()).add(kegg_pathway)
            kegg_pathway2mesh_disease.setdefault(kegg_pathway,set()).add(mesh_disease)

print(len(mesh_disease2kegg_pathway), 'MeSH Diseases -',
      len(kegg_pathway2mesh_disease), 'KEGG Pathways')

# Write the edge file once, then copy it into the shared edge folders.
file = 'Disease_(MeSH)_2_Pathway_(KEGG).csv'
outpath = os.path.join('output/disease2pathway',file)
output_edgefile_onerel_noweight(
    outpath = outpath,
    columns = ['Disease (MeSH)','Pathway (KEGG)','Relationship'],
    dictionary = mesh_disease2kegg_pathway,
    rel = '-disease_involves->',
    prefix_col1='MeSH_Disease:',
    prefix_col2='KEGG_Pathway:'
)
df = pd.read_csv(outpath)
df.to_csv(os.path.join('output/edges', file), index=False)
df.to_csv(os.path.join('output/edges to use/', file), index=False)

91 MeSH Diseases - 63 KEGG Pathways


# Reactome Pathway

In [2]:
import pandas as pd
import csv
import math
import json

In [None]:
# Download the Reactome database as a Neo4j graph: https://reactome.org/download-data/
# https://reactome.org/download/current/reactome.graphdb.tgz or https://reactome.org/download/current/reactome.graphdb.dump

Neo4j Query:
    MATCH p=(pw:Pathway)-[]-(dis:Disease)
    WHERE toLower(pw.speciesName) = 'homo sapiens'
    RETURN dis

In [3]:
# Disease nodes exported from the Reactome graph: keep only the node id and
# its 'identifier' (used later as the numeric part of a 'DOID:' key).
disease_nodes = (
    pd.read_csv('input/human_disease_with_pathways_involved.csv')
    [['_id', 'identifier']]
    .drop_duplicates()
)

# Step 1: raw node id -> identifier (a later row wins when an id repeats).
db2doid = dict(zip(disease_nodes['_id'], disease_nodes['identifier']))

# Step 2: discard pairs where either side is NaN and cast both sides to int.
db2doid = {
    int(node_id): int(disease_id)
    for node_id, disease_id in db2doid.items()
    if not (math.isnan(disease_id) or math.isnan(node_id))
}

In [4]:
# Inspect the node-id -> disease-identifier mapping built in the previous cell.
db2doid

{135593: 162,
 139026: 80600,
 168230: 5520,
 168232: 1612,
 168237: 3247,
 190150: 1909,
 190362: 9256,
 223885: 9111,
 245187: 10652,
 245231: 1686,
 293985: 5614,
 344221: 104,
 418757: 934,
 419870: 8469,
 522497: 2945,
 522543: 526,
 531658: 399,
 604837: 50117,
 920440: 417,
 1003151: 1324,
 1003213: 3490,
 1003462: 60233,
 1005364: 50469,
 1005505: 14291,
 1006076: 1107,
 1006131: 8552,
 1006312: 11984,
 1006705: 3717,
 1006836: 8712,
 1007005: 3068,
 1150238: 60053,
 1150279: 150,
 1150358: 9970,
 1173642: 1919,
 1192956: 80001,
 1284935: 14667,
 1447557: 2355,
 1447584: 14735,
 1447815: 12678,
 1447817: 7427,
 1539646: 12134,
 1548362: 12259,
 1548758: 60002,
 1549251: 2452,
 1551182: 5419,
 1551230: 50156,
 1553487: 80599,
 1553576: 11573,
 1613784: 100,
 1614541: 11976,
 1614955: 11338,
 1615721: 11405,
 1615808: 50338,
 1619680: 80006,
 1619841: 4972,
 1619843: 5600,
 1619845: 3264,
 1622120: 12960,
 1622121: 2340,
 1623302: 3910,
 1624967: 234,
 1625551: 1540,
 1625735: 26

In [12]:
# Pathway nodes and pathway-disease edges exported from the Reactome graph.
pathway_columns = ['_id', 'stId','displayName', 'name']
pathway_nodes = (
    pd.read_excel('input/human_pathways_involved_in_diseases.xlsx')
    [pathway_columns]
    .drop_duplicates()
)
edges = (
    pd.read_excel('input/human_pathway_disease_edges.xlsx')
    .drop_duplicates()
)

# Raw node id -> Reactome stable id ('stId'); a later row wins on repeated ids.
db2stID = dict(zip(pathway_nodes['_id'], pathway_nodes['stId']))

# Same mapping with integer keys; entries whose node id is NaN are dropped.
db2reactome_id = {
    int(node_id): stable_id
    for node_id, stable_id in db2stID.items()
    if not math.isnan(node_id)
}

# Optional inspection of the loaded tables:
#display(disease_nodes.tail())
#display(pathway_nodes.tail())
#display(edges.tail())

In [6]:
# Inspect the pathway-disease edge table; '_start' and '_end' hold graph node
# ids that the next cell translates via db2reactome_id and db2doid.
edges

Unnamed: 0,_start,_end,_type
0,1727823,1727620,disease
1,1727361,1727366,disease
2,1618882,531658,disease
3,1630862,135593,disease
4,1674335,1674341,disease
...,...,...,...
801,1674785,1674813,disease
802,1674785,1674811,disease
803,1815048,1815054,disease
804,598415,139026,disease


In [7]:
# Edges
# Translate each (pathway node id, disease node id) pair into
# [Reactome stable id, disease identifier]. Pairs where either endpoint has no
# mapping are printed with their raw node ids and skipped.
edge_list = list()

for pathway_db_id, disease_db_id in zip(edges['_start'], edges['_end']):
    # Explicit lookups replace the former catch-all `except`, so only a
    # genuinely missing mapping is treated as "skip this edge".
    pathway = db2reactome_id.get(pathway_db_id)
    disease = db2doid.get(disease_db_id)

    if pathway is None or disease is None:
        print(pathway_db_id, disease_db_id)
        continue

    edge_list.append([pathway, disease])

In [8]:
# Load the DOID -> MeSH mapping; the output cell looks it up with 'DOID:<number>'
# keys. The context manager closes the file handle once the JSON is parsed.
with open('output/disease2disease/doid2mesh.json') as fin:
    doid2mesh = json.load(fin)

In [9]:
# Output Edges
# Write one row per (MeSH disease, Reactome pathway) pair, then publish a
# de-duplicated copy of the file to the shared edge folders.
file = 'Disease_(MeSH)_2_Pathway_(Reactome).csv'
edge_path = os.path.join('output/disease2pathway', file)
relationship = '-disease_involves->'

# newline='' is the csv module's documented way to open an output file; without
# it, extra blank rows appear on platforms that translate '\n' to '\r\n'.
with open(edge_path, 'w', newline='') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Disease (MeSH)','Pathway (Reactome)','Relationship'])

    for pathway, doid_disease in edge_list:
        # A pathway lacking a stable id arrives as NaN (float) and cannot be
        # prefixed; such edges were silently dropped before and still are.
        if not isinstance(pathway, str):
            continue

        # Diseases with no MeSH mapping (missing key or null value) add no rows.
        mesh_diseases = doid2mesh.get('DOID:'+str(doid_disease)) or []
        for mesh_disease in mesh_diseases:
            writer.writerow(['MeSH_Disease:'+mesh_disease, 'Reactome_Pathway:'+pathway, relationship])

# Several DOIDs can map to the same MeSH ID, so de-duplicate before publishing.
df = pd.read_csv(edge_path).drop_duplicates()
df.to_csv(os.path.join('output/edges/', file), index=False)
df.to_csv(os.path.join('output/edges to use/', file), index=False)

In [13]:
# Scratch example: pad a short list up to a target length by cycling through
# its own entries (whole copies first, then a partial copy for the remainder).
desc_length = 10
desc_embed = [[1],[2],[3]]

gap = desc_length - len(desc_embed)
whole_copies, leftover = divmod(gap, len(desc_embed))

# Build the padding from the original entries before extending in place.
padding = desc_embed * whole_copies
padding.extend(desc_embed[:leftover])
desc_embed.extend(padding)

print('gap', gap)
print('padded', desc_embed)

gap 7
padded [[1], [2], [3], [1], [2], [3], [1], [2], [3], [1]]


In [57]:
# Display the de-duplicated Reactome edge table.
# NOTE(review): the saved output shows 'Reactome_Pathway:' values under
# 'Disease (MeSH)' and 'MeSH_Disease:' values under 'Pathway (Reactome)', which
# does not match the column order written by the output cell. It looks like
# output from an earlier run; re-run the notebook to refresh and confirm.
df

Unnamed: 0,Disease (MeSH),Pathway (Reactome),Relationship
0,Reactome_Pathway:R-HSA-2206308,MeSH_Disease:D009083,-disease_involves->
1,Reactome_Pathway:R-HSA-3828062,MeSH_Disease:D006008,-disease_involves->
2,Reactome_Pathway:R-HSA-9635644,MeSH_Disease:D014375,-disease_involves->
3,Reactome_Pathway:R-HSA-9634285,MeSH_Disease:D009369,-disease_involves->
4,Reactome_Pathway:R-HSA-4755609,MeSH_Disease:D012174,-disease_involves->
...,...,...,...
656,Reactome_Pathway:R-HSA-5683177,MeSH_Disease:D007003,-disease_involves->
657,Reactome_Pathway:R-HSA-3645790,MeSH_Disease:D009369,-disease_involves->
658,Reactome_Pathway:R-HSA-9649948,MeSH_Disease:D009369,-disease_involves->
659,Reactome_Pathway:R-HSA-9702632,MeSH_Disease:D009369,-disease_involves->


In [54]:
# Number of distinct values in the 'Disease (MeSH)' column of the edge table.
len(set(df['Disease (MeSH)']))

590

In [55]:
# Number of distinct values in the 'Pathway (Reactome)' column of the edge table.
len(set(df['Pathway (Reactome)']))

154

In [56]:
# Total number of rows (edges) in the de-duplicated edge table.
df.shape[0]

660