In [1]:
# Import libraries
import numpy as np
import pandas as pd
from urllib.request import urlopen
import xml.etree.ElementTree as ET

In [2]:
# Get a list of all human pathways
# The KEGG REST "list" endpoint returns one tab-separated record per line,
# with the pathway ID in the first column.
with urlopen('https://rest.kegg.jp/list/pathway/hsa') as response:
    # The context manager closes the HTTP connection as soon as the body is read
    pathway_hsa = response.read().decode('utf-8').split('\n')

# The response ends with a newline, so the split leaves an empty last record;
# skip blank lines so that no empty string ends up in the ID list.
path_list = [line.split('\t')[0] for line in pathway_hsa if line.strip()]
print('The first five human pathways are ', path_list[0:5])

The first five human pathways are  ['hsa01100', 'hsa01200', 'hsa01210', 'hsa01212', 'hsa01230']


In [3]:
# Define a function called network_table with two inputs: the webpage for a KEGG pathway kgml file and the current network table
# The function appends new interactions to the network table and then outputs that updated table
def network_table(webpage, network):
    """Append the interactions found in one KGML file to ``network``.

    Parameters
    ----------
    webpage : str
        URL of the KGML (XML) file to read; it is opened with ``urlopen``.
    network : list
        Current network table: a list of ``[source, target, type]`` rows.
        It is extended in place with rows that are not already present.

    Returns
    -------
    list
        The same ``network`` object that was passed in, including when the
        file contains no ``relation`` elements.

    Notes
    -----
    Each ``relation`` element contributes one row per (source, target) pair,
    where the sources/targets are the space-separated names of the ``entry``
    elements referenced by ``entry1``/``entry2``. Only names starting with
    ``cpd`` or ``hsa`` are kept. Relations missing ``entry1``, ``entry2`` or
    ``type`` are skipped.
    """
    # Import data
    with urlopen(webpage) as f:
        root = ET.parse(f).getroot()

    # Create a dictionary linking each entry ID to a KEGG ID.
    # Entries lacking either attribute cannot be resolved and are left out.
    id_dict = {}
    for entry in root.iter('entry'):
        entry_id = entry.get('id')
        entry_name = entry.get('name')
        if entry_id is not None and entry_name is not None:
            id_dict[entry_id] = entry_name

    # Hashable view of the rows already in the table, so that each duplicate
    # check is a set lookup rather than a scan of the whole list.
    seen = {tuple(row) for row in network}
    kept_prefixes = ('cpd', 'hsa')

    # Create the network table
    for relation in root.iter('relation'):
        entry1 = relation.get('entry1')
        entry2 = relation.get('entry2')
        reaction_type = relation.get('type')
        if entry1 is None or entry2 is None or reaction_type is None:
            continue

        # Replace the entry IDs with the KEGG IDs; an ID with no matching
        # entry is kept as-is and is then dropped by the prefix filter.
        sources = id_dict.get(entry1, entry1).split(' ')
        targets = id_dict.get(entry2, entry2).split(' ')

        for source in sources:
            if not source.startswith(kept_prefixes):
                continue
            for target in targets:
                if not target.startswith(kept_prefixes):
                    continue
                row = (source, target, reaction_type)
                if row not in seen:
                    seen.add(row)
                    network.append([source, target, reaction_type])

    return network

# Build the combined network by feeding every human pathway's KGML file
# through network_table, which extends network_list in place.
network_list = []
for path in path_list:
    if not path:
        # Blank IDs have no KGML file to request
        continue
    kegg_webpage = f'https://rest.kegg.jp/get/{path}/kgml'
    network_table(kegg_webpage, network_list)

# Turn the accumulated [source, target, type] rows into a DataFrame and preview it
network_df = pd.DataFrame(network_list, columns = ['source', 'target', 'type'])
network_df.head()

Unnamed: 0,source,target,type
0,hsa:130589,hsa:3098,ECrel
1,hsa:130589,hsa:3099,ECrel
2,hsa:130589,hsa:3101,ECrel
3,hsa:130589,hsa:80201,ECrel
4,hsa:130589,hsa:2645,ECrel


In [4]:
# Number of rows (interactions) in the combined network table
network_df.shape[0]

58350