## Load modules

In [38]:
import re
import pandas as pd
from numpy import NaN as NaN
import json
from collections import defaultdict
from bs4 import BeautifulSoup

## File paths

In [2]:
!pwd -P

/home/jovyan/work


In [3]:
# file paths in
reactions_file_name       = '/home/jovyan/data/KEGG_dowload/KEGG_reactions.tsv'
compounds_file_name       = '/home/jovyan/data/KEGG_dowload/KEGG_compounds.tsv'
glycans_file_name         = '/home/jovyan/data/KEGG_dowload/KEGG_glycans.tsv'

glycans_link_name         = '/home/jovyan/data/KEGG_dowload/gl-to-cpd-api.txt'
compounds_brite_file_name = '/home/jovyan/data/KEGG_dowload/br08001.json'

In [4]:
# file paths out
compound_entities_file      = '/home/jovyan/data/import/KEGG_compound_entities.tsv'
reaction_entities_file      = '/home/jovyan/data/import/KEGG_reaction_entities.tsv'
product_relationship_file   = '/home/jovyan/data/import/KEGG_relationship_PRODUCT.tsv'
substrate_relationship_file = '/home/jovyan/data/import/KEGG_relationship_SUBSTRATE.tsv'

## Nonspecific functions

In [5]:
def list_to_string(x):
    return ",".join([str(i) for i in x])

## Reaction vertices

In [6]:
def detag(x):
    try:
        soup = BeautifulSoup(x, "html5lib")
        return list(set([x.strip('\"').upper() for x in soup.text.split(" // ")]))
    except Exception as e:
        return [] 

In [7]:
df_reactions = pd.read_csv(reactions_file_name, sep="\t", na_values=[": NULL"])

In [8]:
# Create synonym list
df_reactions["SYNONYMS"] = [df_reactions["NAME"][i].split("; ") if df_reactions["NAME"][i] is not NaN
                        else [df_reactions["ENTRY"][i]] 
                        for i in range(df_reactions.shape[0])]
pd.isnull(df_reactions["SYNONYMS"]).sum()

0

In [9]:
# replace name
df_reactions["NAME"] = df_reactions["SYNONYMS"].apply(lambda x: '"%s"'%x[0].replace('"', "'"))

In [10]:
# split enzymes
df_reactions['ENZYME'] = df_reactions['ENZYME'].apply(lambda x: ["%s"%t for t in re.split("\s+", x)] 
                                                      if x is not NaN 
                                                      else [])

In [11]:
# get the pathway ids 
df_reactions["PATHWAY"] = [re.findall(r'rn[0-9]*', s) if s is not NaN 
                           else [] 
                           for s in df_reactions["PATHWAY"]]

In [12]:
# double quotes for the rest
df_reactions['EQUATION'] = df_reactions['EQUATION'].apply(lambda x: '"%s"'%x)
df_reactions['DEFINITION'] = df_reactions['DEFINITION'].apply(lambda x: '"%s"'%x.strip().replace('"', "'"))

In [13]:
# all lists to strings
df_reactions["PATHWAY"] = df_reactions["PATHWAY"].apply(list_to_string)
df_reactions["SYNONYMS"] = df_reactions["SYNONYMS"].apply(list_to_string)
df_reactions["ENZYME"] = df_reactions["ENZYME"].apply(list_to_string)

In [14]:
df_reactions = df_reactions[["ENTRY", "NAME", "SYNONYMS", "DEFINITION", "EQUATION", "ENZYME", "PATHWAY"]]

In [15]:
df_reactions.columns = ['ID', 
                        'NAME',
                        'SYNONYMS',
                        'NAME_EQUATION', 
                        'EQUATION', 
                        'EC_NUMBERS', 
                        'PATHWAY']

In [16]:
df_reactions.set_index("ID", drop=False, inplace=True)

### Some reactions are defined by glycans

In [17]:
# glycan links
f = lambda x:x.split(":")[1]
df_gl_to_c = pd.read_csv(glycans_link_name, sep="\t", 
                         header=None, index_col=0, names=["gl", "compound"], 
                         converters={0:f, 1:f})
# remove glycans with multiple compound links (need to add seperately)
df_gl_to_c = df_gl_to_c[~df_gl_to_c.index.duplicated(False)]

### Edges

In [18]:
glycans_to_add = []
with open(product_relationship_file, 'w') as out_product:
    with open(substrate_relationship_file, 'w') as out_substrate:
        out_product.write(  '%s\t%s\t%s\n'%('rxnID', 'cpdID', 'STOICHIOMETRY') )
        out_substrate.write('%s\t%s\t%s\n'%('rxnID', 'cpdID', 'STOICHIOMETRY') )
        for i, row in df_reactions.iterrows():
            ID = row['ID']
            eqn = row['EQUATION']

            try:
                substrates, products = eqn.strip('"').split(' <=> ')
            except ValueError:
                print('Failed at reaction %s, eqn is %s'%(row['ID'], eqn))
                break

            substrate_list = []
            product_list = []
            stochiometry_dict = {}

            pattern = '^(.*?)\s*([a-zA-Z]{1}[\d]+)(.*?)$'
            for targets, direction, compound_list in [
                (substrates, 'substrate', substrate_list), 
                (products,   'product',   product_list)]:

                for t in targets.split(' + '):
                    stoichiometry_a, target, stoichiometry_b = re.match(pattern, t).groups()
                    if target[0] == "G":
                        try:
                            target = df_gl_to_c.loc[target]["compound"]
                        except KeyError:
                            glycans_to_add.append(target)

                    compound_list.append(target)

                    if stoichiometry_a: 
                        stoichiometry = stoichiometry_a.strip("(").strip(")")
                    elif stoichiometry_b:
                        stoichiometry = stoichiometry_b.strip("(").strip(")")
                    else:
                        stoichiometry = 1

                    stochiometry_dict[target] = stoichiometry

            for compound_list, file_ in ([substrate_list, out_substrate], 
                                         [product_list,   out_product]):
                
                for target in compound_list:
                    stoichiometry = stochiometry_dict[target]
                    s = '%s\t%s\t%s\n'%(ID, target, stoichiometry)
                    file_.write(s)

In [19]:
df_reactions.to_csv(reaction_entities_file, encoding="utf-8", quoting=3, sep='\t', index=False)

## Compound vertices
Includes all compounds, and any glycans referenced to in reactions that were not replaced by a compound. 

In [20]:
df_compounds = pd.read_csv(compounds_file_name, sep="\t", na_values=[": NULL"])

In [56]:
df_glycans = pd.read_csv(glycans_file_name, sep="\t", na_values=[": NULL"])
df_glycans.set_index("ENTRY", drop=False, inplace=True)
df_glycans = df_glycans.loc[list(set(glycans_to_add))]
df_glycans.columns = df_compounds.columns

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
df_compounds = df_compounds.append(df_glycans)

In [60]:
df_compounds.reset_index(drop=True, inplace=True)

In [64]:
# Create synonym list
df_compounds["SYNONYMS"] = [df_compounds["NAME"][i].split("; ") if df_compounds["NAME"][i] is not NaN
                            else [df_compounds["ENTRY"][i]] 
                            for i in range(df_compounds.shape[0])]
pd.isnull(df_compounds["SYNONYMS"]).sum()

0

In [65]:
# replace name
df_compounds["NAME"] = df_compounds["SYNONYMS"].apply(lambda x: "%s"%x[0])

In [66]:
# get the pathway ids 
df_compounds["PATHWAY"] = [re.findall(r'map[0-9]*', s) if s is not NaN 
                           else [] 
                           for s in df_compounds["PATHWAY"]]

In [67]:
# brite hierachy of compounds
with open(compounds_brite_file_name, "r") as handle:
    j = json.load(handle)

def tree(k):
    parent = re.sub('\[.*\]', '', k["name"]).strip()
    children = k["children"]
    for d in children:
        child = re.sub('\[.*\]', '', d["name"]).strip()
        if not "children" in d.keys():
            child = child.split(" ")[0]
            
        parents[child].append(parent)
        if "children" in d.keys():
            tree(d)
parents = defaultdict(list)
tree(j)

In [68]:
# dumb peptides linking to peptides
parents["Peptides"].pop(1)

'Peptides'

In [69]:
def get_ancestors(c):
    
    def recursive_ancestors(c, ancestors):
        if c in parents:
            for p in parents[c]:
                ancestors.append(p)
                ancestors = recursive_ancestors(p, ancestors)
        return ancestors
    
    ancestors = recursive_ancestors(c, [])
    return ancestors

In [70]:
df_compounds["BRITE_HIERARCHY"] = df_compounds["ENTRY"].apply(get_ancestors)

In [71]:
# all lists to strings
df_compounds["PATHWAY"] = df_compounds["PATHWAY"].apply(list_to_string)
df_compounds["SYNONYMS"] = df_compounds["SYNONYMS"].apply(list_to_string)
df_compounds["BRITE_HIERARCHY"] = df_compounds["BRITE_HIERARCHY"].apply(list_to_string)

In [72]:
df_compounds = df_compounds[["ENTRY", "NAME", "SYNONYMS", "FORMULA", "PATHWAY", "BRITE_HIERARCHY"]]

In [73]:
df_compounds.columns = ['ID', 
                        'NAME', 
                        'SYNONYMS', 
                        'FORMULA', 
                        'PATHWAY', 
                        'BRITE_HIERARCHY']

In [74]:
df_compounds.to_csv(compound_entities_file, encoding="utf-8", quoting=3, sep='\t', index=False)