In [1]:
# Import Biopython modules to interact with KEGG
from Bio.KEGG import REST

# Import Pandas, so we can use dataframes
import pandas as pd
import io
import numpy as np

In [2]:
#https://widdowquinn.github.io/2018-03-06-ibioic/02-sequence_databases/09-KEGG_programming.html#kegg_info

In [3]:
def to_df(result):
    return pd.read_table(io.StringIO(result), header=None)

In [4]:
result = REST.kegg_info("kegg").read()
to_df(result)

Unnamed: 0,0
0,kegg Kyoto Encyclopedia of Genes a...
1,"kegg Release 104.0+/10-04, Oct 22"
2,Kanehisa Laboratories
3,"pathway 969,028 entries"
4,"brite 327,261 entries"
5,module 536 entries
6,"orthology 25,420 entries"
7,"genome 20,280 entries"
8,"genes 43,753,269 entries"
9,"compound 18,991 entries"


In [5]:
# Find all compounds with mass between 300 and 310 units
result = REST.kegg_find("compound", "300-310/mol_weight").read()
to_df(result)

Unnamed: 0,0,1
0,cpd:C00051,307.323480
1,cpd:C00200,306.336960
2,cpd:C00219,304.466880
3,cpd:C00239,307.197122
4,cpd:C00270,309.269860
...,...,...
522,cpd:C22103,304.509940
523,cpd:C22382,309.269860
524,cpd:C22393,307.969102
525,cpd:C22509,306.166002


In [6]:
# Find all compounds with mass between 300 and 310 units

#REST.kegg_find("compound", "C7H10O5", "formula") ## for chemical formula "C7H10O5"
#REST.kegg_find("compound", 174.05, "exact_mass") ## for 174.045 =< exact mass < 174.055
#REST.kegg_find("compound", 300:310, "mol_weight") ## for 300 =< molecular weight =< 310

In [7]:
kegg_ids = [
    'cpd:C00031',
    'cpd:C00089',
    'cpd:C00794',
    'cpd:C00392',
    'cpd:C00095',
    'cpd:C00033',
    'cpd:C00025',
    'cpd:C00158',
    'cpd:C00072',
    'cpd:C00180',
    'cpd:C00714',
    'cpd:C08822',
    'cpd:C01935',
    'cpd:C12285',
    'cpd:C11045',
    'cpd:C09189',
    'cpd:C00009',
    'cpd:C12603',
    'cpd:C00157',
    'cpd:C01768',
    'cpd:C08818',
    'cpd:C08815',
    'cpd:C00369',
    'cpd:C01424',
    'cpd:C01424',
    'D01544',
    'K10122',
    'D06333',
    'C03323',
    'D05355',
    'D07622',
    'D00936',
    'D02060'
    
]
len(kegg_ids)    

33

In [8]:
#COMPOUND https://www.genome.jp/entry/C00031
#DRUG https://www.genome.jp/entry/D02060
#ORTHOLOGY https://www.genome.jp/entry/K10122

In [9]:
result = REST.kegg_get("D02060").read()
print(result)

ENTRY       D02060                      Drug
NAME        Potassium chloride (JP18/USP);
            K-dur (TN);
            KCL (TN);
            Kaon-Cl (TN);
            Klor-con (TN);
            Klotrix (TN)
PRODUCT     POTASSIUM CHLORIDE (ATLANTIC BIOLOGICALS CORP.) cc912caf-f4d9-4c08-ba9f-45c5e8ed8060
            K-TAB (AbbVie) 6594df99-d8ce-49b9-3fbe-9ec7cdc9199b
            POTASSIUM CHLORIDE (Aphena Pharma Solutions - Tennessee) 81a53ecd-2612-4efe-9491-415fff3f86e5
            KLOR-CON (Aphena Pharma Solutions - Tennessee) e02f9d3e-d744-4020-b90c-215f2ae69e16
            POTASSIUM CHLORIDE (AvKARE) 8bb030b7-104a-373b-e053-2995a90ab580
            POTASSIUM CHLORIDE (Baxter Healthcare Corporation) 092ddee4-572d-4771-8d95-880cea01097e
            POTASSIUM CHLORIDE (Clinical Solutions Wholesale) 850b5b94-05a7-468c-8b90-77a1d45cd5c1
            POTASSIUM CHLORIDE (DIRECT RX) f5adfe63-62a8-4453-a0f0-db3076e01d78
            POTASSIUM CHLORIDE (Genus Lifesciences) 14cd12ee-a4a3-465

In [10]:
def _get_kegg(kegg_id):
    kegg_output = REST.kegg_get(kegg_id).read()
    results = {}
    for line in kegg_output.split('\n'):
        splits = line.split()
        if not line.startswith(' '):    
            if len(splits) > 0:
                key = splits[0]
                value = ' '.join(splits[1:])
                results[key] = value
        else:
            results[key] += ' '.join(splits)
    return pd.DataFrame(results, index=[kegg_id])


_get_kegg_v = np.vectorize(_get_kegg)

def get_kegg_info(kegg_ids):
    if isinstance(kegg_ids, str):
        kegg_ids = [kegg_ids]
    return pd.concat(_get_kegg_v(kegg_ids), sort=False)

In [11]:
d = _get_kegg("D02060")
d.columns

Index(['ENTRY', 'NAME', 'PRODUCT', 'FORMULA', 'EXACT_MASS', 'MOL_WEIGHT',
       'CLASS', 'REMARK', 'EFFICACY', 'INTERACTION', 'BRITE', 'DBLINKS',
       'ATOM', 'BOND', '///'],
      dtype='object')

In [12]:
columns = list(d.columns)
columns

['ENTRY',
 'NAME',
 'PRODUCT',
 'FORMULA',
 'EXACT_MASS',
 'MOL_WEIGHT',
 'CLASS',
 'REMARK',
 'EFFICACY',
 'INTERACTION',
 'BRITE',
 'DBLINKS',
 'ATOM',
 'BOND',
 '///']

In [13]:
df = pd.DataFrame([], columns = columns)
df

Unnamed: 0,ENTRY,NAME,PRODUCT,FORMULA,EXACT_MASS,MOL_WEIGHT,CLASS,REMARK,EFFICACY,INTERACTION,BRITE,DBLINKS,ATOM,BOND,///


In [14]:
for k in kegg_ids:
    d = _get_kegg(k)
    df = pd.concat([df, d])
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,ENTRY,NAME,PRODUCT,FORMULA,EXACT_MASS,MOL_WEIGHT,CLASS,REMARK,EFFICACY,INTERACTION,...,///,COMMENT,REACTION,PATHWAY,ENZYME,MODULE,BRACKET,SYMBOL,GENES,REFERENCE
0,C00031 Compound,D-Glucose;Grape sugar;Dextrose;Glucose;D-Gluco...,,C6H12O6,180.0634,180.1559,,Same as: D00009,,,...,,alpha-D-Glucose [CPD:C00267]beta-D-Glucose [CP...,R00010 R00015 R00028 R00049 R00063 R00299 R003...,map00010 Glycolysis / Gluconeogenesismap00030 ...,1.1.1.118 1.1.1.119 1.1.1.121 1.1.1.3591.1.1.3...,,,,,
1,C00089 Compound,Sucrose;Cane sugar;Saccharose;1-alpha-D-Glucop...,,C12H22O11,342.1162,342.2965,,Same as: D00025 G00370,,,...,,,R00015 R00039 R00801 R00802 R00803 R00805 R008...,map00052 Galactose metabolismmap00500 Starch a...,1.1.99.13 2.4.1.4 2.4.1.5 2.4.1.72.4.1.9 2.4.1...,,,,,
2,C00794 Compound,D-Sorbitol;D-Glucitol;L-Gulitol;Sorbitol,,C6H14O6,182.079,182.1718,,Same as: D00096,,,...,,,R00874 R00875 R01697 R01787 R02865 R02866 R028...,map00051 Fructose and mannose metabolismmap000...,1.1.1.14 1.1.1.15 1.1.1.21 1.1.1.2891.1.3.41 1...,,,,,
3,C00392 Compound,Mannitol;D-Mannitol,,C6H14O6,182.079,182.1718,,Same as: D00062,,,...,,,R00865 R00868 R00870 R02167 R02704 R05698 R07135,map00051 Fructose and mannose metabolismmap011...,1.1.1.11 1.1.1.67 1.1.1.138 1.1.1.2551.1.2.2 1...,,,,,
4,C00095 Compound,D-Fructose;Levulose;Fruit sugar;D-arabino-Hexu...,,C6H12O6,180.0634,180.1559,,,,,...,,D-Fructose (pyranose) [CPD:C05003]D-Fructose (...,R00760 R00801 R00803 R00806 R00865 R00866 R008...,map00051 Fructose and mannose metabolismmap000...,1.1.1.11 1.1.1.14 1.1.1.15 1.1.1.671.1.1.124 1...,,,,,
5,C00033 Compound,Acetate;Acetic acid;Ethanoic acid,,C2H4O2,60.0211,60.052,,Same as: D00010,,,...,,,R00227 R00229 R00235 R00315 R00316 R00317 R003...,map00010 Glycolysis / Gluconeogenesismap00430 ...,1.1.1.318 1.1.1.319 1.2.1.3 1.2.1.41.2.1.5 1.2...,"M00357 Methanogenesis, acetate => methaneM0057...",,,,
6,C00025 Compound,L-Glutamate;L-Glutamic acid;L-Glutaminic acid;...,,C5H9NO4,147.0532,147.1293,,Same as: D00007,,,...,,,R00021 R00093 R00114 R00239 R00243 R00245 R002...,map00220 Arginine biosynthesismap00250 Alanine...,1.2.1.88 1.4.1.2 1.4.1.3 1.4.1.41.4.1.13 1.4.1...,"M00015 Proline biosynthesis, glutamate => prol...",,,,
7,C00158 Compound,"Citrate;Citric acid;2-Hydroxy-1,2,3-propanetri...",,C6H8O7,192.027,192.1235,,Same as: D00037,,,...,,,R00351 R00352 R00362 R01322 R01323 R01324 R013...,map00020 Citrate cycle (TCA cycle)map00250 Ala...,2.3.3.1 2.3.3.3 2.3.3.8 2.3.3.162.8.3.10 4.1.3...,"M00009 Citrate cycle (TCA cycle, Krebs cycle)M...",,,,
8,C00072 Compound,Ascorbate;Ascorbic acid;L-Ascorbate;L-Ascorbic...,,C6H8O6,176.0321,176.1241,,Same as: D00018,,,...,,,R00068 R00095 R00640 R00643 R00644 R00645 R006...,map00053 Ascorbate and aldarate metabolismmap0...,1.1.3.8 1.3.2.3 1.3.3.12 1.6.5.41.8.5.1 1.10.3...,"M00114 Ascorbate biosynthesis, plants, fructos...",,,,
9,C00180 Compound,Benzoate;Benzoic acid;Benzenecarboxylic acid;P...,,C7H6O2,122.0368,122.1213,,Same as: D00038,,,...,,,R01295 R01419 R01420 R01421 R01422 R01423 R014...,map00362 Benzoate degradationmap00621 Dioxin d...,1.2.1.7 1.2.1.28 1.14.12.10 1.14.13.-1.14.14.9...,"M00538 Toluene degradation, toluene => benzoat...",,,,


In [15]:
df[['ENTRY', 'NAME']]

Unnamed: 0,ENTRY,NAME
0,C00031 Compound,D-Glucose;Grape sugar;Dextrose;Glucose;D-Gluco...
1,C00089 Compound,Sucrose;Cane sugar;Saccharose;1-alpha-D-Glucop...
2,C00794 Compound,D-Sorbitol;D-Glucitol;L-Gulitol;Sorbitol
3,C00392 Compound,Mannitol;D-Mannitol
4,C00095 Compound,D-Fructose;Levulose;Fruit sugar;D-arabino-Hexu...
5,C00033 Compound,Acetate;Acetic acid;Ethanoic acid
6,C00025 Compound,L-Glutamate;L-Glutamic acid;L-Glutaminic acid;...
7,C00158 Compound,"Citrate;Citric acid;2-Hydroxy-1,2,3-propanetri..."
8,C00072 Compound,Ascorbate;Ascorbic acid;L-Ascorbate;L-Ascorbic...
9,C00180 Compound,Benzoate;Benzoic acid;Benzenecarboxylic acid;P...


In [16]:
#df.to_csv('kegg.csv', index=False)

In [17]:
df.columns

Index(['ENTRY', 'NAME', 'PRODUCT', 'FORMULA', 'EXACT_MASS', 'MOL_WEIGHT',
       'CLASS', 'REMARK', 'EFFICACY', 'INTERACTION', 'BRITE', 'DBLINKS',
       'ATOM', 'BOND', '///', 'COMMENT', 'REACTION', 'PATHWAY', 'ENZYME',
       'MODULE', 'BRACKET', 'SYMBOL', 'GENES', 'REFERENCE'],
      dtype='object')