### KEGG Pathways 

**Output:** A CSV file listing each human KEGG pathway (code and name) along with its associated gene symbols.

In [1]:
import requests, os
import numpy as np
import pandas as pd 
from io import StringIO

<div class="alert alert-block alert-info">
<b>Note:</b>      
KEGG Pathways and their associated genes are obtained using the KEGG REST API.
</div>

In [2]:
def get_local_data_path(folders, fname):
    """Build the normalised path of a file stored under ``../local_data``.

    Parameters
    ----------
    folders : iterable of str
        Sub-folder names, outermost first, located below ``../local_data``.
    fname : str
        Name of the file inside the innermost folder.

    Returns
    -------
    str
        The path after ``os.path.normpath`` has collapsed redundant separators.
    """
    subdirs = '/'.join(folders)
    raw_path = '../local_data/' + '/' + subdirs + '/' + fname
    return os.path.normpath(raw_path)


# Destination of the processed pathway table written at the end of the notebook.
file_kegg_pathways_info = get_local_data_path(['processed'], 'kegg_pathways.csv')

In [3]:
# Gets the list of pathways belonging to Homo sapiens (organism code "hsa").
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting
# an error page be parsed as the pathway table.
pathway_response = requests.get("http://rest.kegg.jp/list/pathway/hsa", timeout=30)
pathway_response.raise_for_status()

In [4]:
# Parse the tab-separated listing (it has no header row) and label its columns.
pathways = (
    pd.read_csv(StringIO(pathway_response.text), sep='\t', header=None)
    .rename(columns={0: 'Code', 1: 'Name'})
)
print("Dimensions: ", pathways.shape)
pathways.head(2)

Dimensions:  (347, 2)


Unnamed: 0,Code,Name
0,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...
1,path:hsa00020,Citrate cycle (TCA cycle) - Homo sapiens (human)


In [5]:
# Manipulating the output to suit our needs:
#  - drop the "path:" prefix from the code, but only when it is present, so that
#    re-running this cell (or receiving codes without the prefix) does not chop
#    the first five characters off an already-clean code;
#  - keep only the pathway title, discarding everything from "- Homo" onwards.
pathways['Code'] = pathways['Code'].apply(
    lambda code: code[5:] if code.startswith('path:') else code
)
pathways['Name'] = pathways['Name'].apply(lambda name: name.split('- Homo')[0].strip())
pathways[:2]

Unnamed: 0,Code,Name
0,hsa00010,Glycolysis / Gluconeogenesis
1,hsa00020,Citrate cycle (TCA cycle)


In [6]:
query = "http://rest.kegg.jp/get/"

def get_genes_by_pathway(pathway_code):
    """Return the gene symbols listed in the GENE section of a KEGG pathway entry.

    The entry for ``pathway_code`` is downloaded from the KEGG REST ``get``
    endpoint and scanned line by line. The section keyword sits in the first
    12 characters of a line; continuation lines leave that field blank and
    therefore belong to the most recently named section.

    Parameters
    ----------
    pathway_code : str
        Pathway identifier appended to ``query`` (e.g. ``'hsa00010'``).

    Returns
    -------
    list of str or float
        Gene symbols in order of appearance, or ``np.nan`` when the entry
        contains no GENE section. GENE lines without a ``"; "`` separator or
        without a second identifier token are skipped.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status, so a failed download
        is not silently treated as "pathway without genes".
    """
    response = requests.get(query + pathway_code, timeout=30)
    response.raise_for_status()

    genes_by_pathway = []
    current_section = None       # defined up front: the first line may have a blank keyword field
    seen_gene_section = False
    for line in response.text.split("\n"):
        section = line[:12].strip()  # section names appear in the first 12 characters only
        if section:                  # a non-blank keyword field starts a new section
            if seen_gene_section and section != "GENE":
                break                # the GENE section is over => skip the rest of the lines
            current_section = section
            if section == "GENE":
                seen_gene_section = True

        if current_section == "GENE" and '; ' in line:
            # Split on the first "; " only: the description may itself contain "; ".
            gene_identifiers = line[12:].split("; ", 1)[0]
            identifier_tokens = gene_identifiers.split()
            if len(identifier_tokens) > 1:   # expected layout: "<gene id>  <symbol>"
                genes_by_pathway.append(identifier_tokens[1])

    # Decide on the parsed structure rather than on a substring search, so the
    # word "GENE" occurring in free text does not count as a GENE section.
    if not seen_gene_section:
        return np.nan
    return genes_by_pathway

In [7]:
# Look up the genes of every pathway; get_genes_by_pathway is called once per code.
genes_per_pathway = pathways['Code'].map(get_genes_by_pathway)
pathways['Genes'] = genes_per_pathway

In [8]:
# Discard the pathways whose 'Genes' value is missing (NaN).
pathways = pathways.dropna(subset=['Genes'])
# Create the target folder if needed so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(file_kegg_pathways_info), exist_ok=True)
pathways.to_csv(file_kegg_pathways_info, index=False)
pathways[:2]

Unnamed: 0,Code,Name,Genes
0,hsa00010,Glycolysis / Gluconeogenesis,"[HK3, HK1, HK2, HKDC1, GCK, GPI, PFKM, PFKP, P..."
1,hsa00020,Citrate cycle (TCA cycle),"[CS, ACLY, ACO2, ACO1, IDH1, IDH2, IDH3B, IDH3..."
