In [1]:
import pubchempy as pcp
import pandas as pd
from libchebipy import ChebiEntity, Name, Comment, Formula, DatabaseAccession, Reference, CompoundOrigin, Relation
from chembl_webresource_client.new_client import new_client
from Bio.KEGG import REST
import io
import requests
import numpy as np
from chemspipy import ChemSpider

In [2]:
compound_names = ['glucose']
chebi_id = "17234"
chembl_ids = ['CHEMBL2108700']
kegg_ids = ['cpd:C00031']
chemspider = [96749]
short_compound_list = 'glucose'

In [3]:
columns = ['cid',
 'name',
 'atom_stereo_count',
 'atoms',
 'bond_stereo_count',
 'bonds',
 'cactvs_fingerprint',
 'canonical_smiles',
 'charge',
 'complexity',
 'conformer_id_3d',
 'conformer_rmsd_3d',
 'coordinate_type',
 'covalent_unit_count',
 'defined_atom_stereo_count',
 'defined_bond_stereo_count',
 'effective_rotor_count_3d',
 'elements',
 'exact_mass',
 'feature_selfoverlap_3d',
 'fingerprint',
 'h_bond_acceptor_count',
 'h_bond_donor_count',
 'heavy_atom_count',
 'inchi',
 'inchikey',
 'isomeric_smiles',
 'isotope_atom_count',
 'iupac_name',
 'mmff94_energy_3d',
 'mmff94_partial_charges_3d',
 'molecular_formula',
 'molecular_weight',
 'monoisotopic_mass',
 'multipoles_3d',
 'pharmacophore_features_3d',
 'record',
 'rotatable_bond_count',
 'shape_fingerprint_3d',
 'shape_selfoverlap_3d',
 'tpsa',
 'undefined_atom_stereo_count',
 'undefined_bond_stereo_count',
 'volume_3d',
 'xlogp']

In [4]:
df = pd.DataFrame([], columns = columns)
for compound_name in compound_names:
    c =  pcp.get_compounds(compound_name, 'name')
    if len(c) != 0:
        c = c[0]
        print(f"{compound_name}:{c}")
        d = pcp.compounds_to_frame(c)
        d.insert(0, "name", compound_name)
        d.insert(0, "cid", c.cid)
        df = pd.concat([df, d])
df.reset_index(drop=True, inplace=True)

glucose:Compound(5793)


In [5]:
pubchem = df[['cid', 'name', 'atoms', 'atom_stereo_count', 'bonds', 'bond_stereo_count', 'charge', 'exact_mass', 'inchi', 'inchikey']]
pubchem

Unnamed: 0,cid,name,atoms,atom_stereo_count,bonds,bond_stereo_count,charge,exact_mass,inchi,inchikey
0,5793,glucose,"[{'aid': 1, 'number': 8, 'element': 'O', 'x': ...",5,"[{'aid1': 1, 'aid2': 9, 'order': 1}, {'aid1': ...",0,0,180.0633881,InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2...,WQZGKKKJIJFFOK-GASJEMHNSA-N


In [6]:
chebi_entity = ChebiEntity(chebi_id=chebi_id)
name = chebi_entity.get_name()
names = chebi_entity.get_names()
charge = chebi_entity.get_charge()
formula = chebi_entity.get_formula()
formulae = chebi_entity.get_formulae()
inchi = chebi_entity.get_inchi()
inchi_key = chebi_entity.get_inchi_key()
mol = chebi_entity.get_mol()
smiles = chebi_entity.get_smiles()
mass = chebi_entity.get_mass()
columns = [
    'id', 'name', 'names', 'charge', 'formula', 'formulae', 'inchi', 'inchi_key', 'mol', 'smiles', 'mass'
] 

row = [chebi_id, name, names, charge, formula, formulae, inchi, inchi_key, mol, smiles, mass]
chebi = pd.DataFrame([row], columns =columns)
chebi

Unnamed: 0,id,name,names,charge,formula,formulae,inchi,inchi_key,mol,smiles,mass
0,17234,glucose,"[{'_Name__name': 'Glucose', '_Name__typ': 'SYN...",0,C6H12O6,"[{'_Formula__formula': 'C6H12O6', '_Formula__s...",,,,,180.15588


In [7]:
molecule = new_client.molecule
mols = molecule.filter(molecule_chembl_id__in=chembl_ids)
display(len(mols))
df = pd.DataFrame(mols)
chembl = df[['pref_name', 'chebi_par_id', 'molecule_chembl_id']]
chembl

1

Unnamed: 0,pref_name,chebi_par_id,molecule_chembl_id
0,GLUCOSE OXIDASE,,CHEMBL2108700


In [8]:
def _get_kegg(kegg_id):
    kegg_output = REST.kegg_get(kegg_id).read()
    results = {}
    for line in kegg_output.split('\n'):
        splits = line.split()
        if not line.startswith(' '):    
            if len(splits) > 0:
                key = splits[0]
                value = ' '.join(splits[1:])
                results[key] = value
        else:
            results[key] += ' '.join(splits)
    return pd.DataFrame(results, index=[kegg_id])


_get_kegg_v = np.vectorize(_get_kegg)

def get_kegg_info(kegg_ids):
    if isinstance(kegg_ids, str):
        kegg_ids = [kegg_ids]
    return pd.concat(_get_kegg_v(kegg_ids), sort=False)
d = _get_kegg(kegg_ids[0])

In [9]:
kegg = d[['ENTRY', 'NAME', 'FORMULA', 'EXACT_MASS', 'MOL_WEIGHT', 'ATOM', 'BOND']]
kegg

Unnamed: 0,ENTRY,NAME,FORMULA,EXACT_MASS,MOL_WEIGHT,ATOM,BOND
cpd:C00031,C00031 Compound,D-Glucose;Grape sugar;Dextrose;Glucose;D-Gluco...,C6H12O6,180.0634,180.1559,121 C1y C 24.2781 -16.53722 C1y C 24.2781 -17....,121 1 2 12 1 3 13 1 4 1 #Up4 2 5 15 2 6 1 #Dow...


In [10]:
base_url = 'https://chem.nlm.nih.gov/api/data/name/inlist'
data = 'complete'
out_format = 'tsv'  # tsv json
api_url = f"{base_url}/{short_compound_list}?data={data}&format={out_format}"
response = requests.get(api_url)

In [11]:
def to_df(result):
    return pd.read_table(io.StringIO(result))

df = to_df(response.text)
chemid = df[['RN / ID', 'Name', 'InChIKey', 'Formula', 'MeSH', 'InChI', 'SMILES']].iloc[[0]]
chemid

Unnamed: 0,RN / ID,Name,InChIKey,Formula,MeSH,InChI,SMILES
0,50-99-7,Glucose [JAN],GZCGUPFRVQAUEE-SLPGGIOYSA-N,C6-H12-O6,Glucose,InChI=1S/C6H12O6/c7-1-3(9)5(11)6(12)4(10)2-8/h...,OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C=O


In [12]:
df.columns

Index(['RN / ID', 'Last Modified', 'Name', 'InChIKey', 'Formula', 'Weight',
       'Has 3D', 'MeSH', 'Citations', 'InChI', 'SMILES', 'Type', 'Type ID',
       'Data', 'Sources', 'URL', 'Tox Organism', 'Tox Test Type', 'Tox Route',
       'Tox Dose', 'Tox Effects', 'Tox PubMed ID', 'PP Property', 'PP Data',
       'PP Value', 'PP Units', 'PP Temperature', 'PP Source'],
      dtype='object')

In [13]:
cs = ChemSpider('ud4RRF0uPgvBMTU844GFzF80QlNeSMsL')
info = cs.get_details(chemspider[0])
df = pd.DataFrame.from_dict(info, orient='index').T
chemspider = df[['id', 'smiles', 'formula', 'averageMass', 'molecularWeight', 'commonName', 'mol2D', 'mol3D']]
chemspider

Unnamed: 0,id,smiles,formula,averageMass,molecularWeight,commonName,mol2D,mol3D
0,96749,C([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)O,C_{6}H_{12}O_{6},180.1559,180.1559,D-(+)-Glucose,MolHeader\n ELEMENTL09121714332D\nDotmatics E...,107526\n Marvin 01040808023D \n\n 2...


In [14]:
display(pubchem, chebi, chembl, kegg, chemid, chemspider)


Unnamed: 0,cid,name,atoms,atom_stereo_count,bonds,bond_stereo_count,charge,exact_mass,inchi,inchikey
0,5793,glucose,"[{'aid': 1, 'number': 8, 'element': 'O', 'x': ...",5,"[{'aid1': 1, 'aid2': 9, 'order': 1}, {'aid1': ...",0,0,180.0633881,InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2...,WQZGKKKJIJFFOK-GASJEMHNSA-N


Unnamed: 0,id,name,names,charge,formula,formulae,inchi,inchi_key,mol,smiles,mass
0,17234,glucose,"[{'_Name__name': 'Glucose', '_Name__typ': 'SYN...",0,C6H12O6,"[{'_Formula__formula': 'C6H12O6', '_Formula__s...",,,,,180.15588


Unnamed: 0,pref_name,chebi_par_id,molecule_chembl_id
0,GLUCOSE OXIDASE,,CHEMBL2108700


Unnamed: 0,ENTRY,NAME,FORMULA,EXACT_MASS,MOL_WEIGHT,ATOM,BOND
cpd:C00031,C00031 Compound,D-Glucose;Grape sugar;Dextrose;Glucose;D-Gluco...,C6H12O6,180.0634,180.1559,121 C1y C 24.2781 -16.53722 C1y C 24.2781 -17....,121 1 2 12 1 3 13 1 4 1 #Up4 2 5 15 2 6 1 #Dow...


Unnamed: 0,RN / ID,Name,InChIKey,Formula,MeSH,InChI,SMILES
0,50-99-7,Glucose [JAN],GZCGUPFRVQAUEE-SLPGGIOYSA-N,C6-H12-O6,Glucose,InChI=1S/C6H12O6/c7-1-3(9)5(11)6(12)4(10)2-8/h...,OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C=O


Unnamed: 0,id,smiles,formula,averageMass,molecularWeight,commonName,mol2D,mol3D
0,96749,C([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)O,C_{6}H_{12}O_{6},180.1559,180.1559,D-(+)-Glucose,MolHeader\n ELEMENTL09121714332D\nDotmatics E...,107526\n Marvin 01040808023D \n\n 2...


In [23]:
pubchem.inchi[0], chebi.inchi[0], chemid.InChI[0]

('InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1',
 None,
 'InChI=1S/C6H12O6/c7-1-3(9)5(11)6(12)4(10)2-8/h1,3-6,8-12H,2H2/t3-,4+,5+,6+/m0/s1')

In [24]:
pubchem.inchikey[0], chebi.inchi_key[0], chemid.InChIKey[0]

('WQZGKKKJIJFFOK-GASJEMHNSA-N', None, 'GZCGUPFRVQAUEE-SLPGGIOYSA-N')

In [27]:
pubchem.name[0], chebi.name[0], chebi.names[0], chembl.pref_name[0], kegg.NAME[0], chemid.Name[0], chemspider.commonName[0]

('glucose',
 'glucose',
 [{'_Name__name': 'Glucose', '_Name__typ': 'SYNONYM', '_Name__source': 'KEGG COMPOUND', '_Name__adapted': False, '_Name__language': 'en'},
  {'_Name__name': 'glucose', '_Name__typ': 'IUPAC NAME', '_Name__source': 'IUPAC', '_Name__adapted': False, '_Name__language': 'en'},
  {'_Name__name': 'Glukose', '_Name__typ': 'SYNONYM', '_Name__source': 'ChEBI', '_Name__adapted': False, '_Name__language': 'en'},
  {'_Name__name': 'Glc', '_Name__typ': 'SYNONYM', '_Name__source': 'JCBN', '_Name__adapted': False, '_Name__language': 'en'},
  {'_Name__name': 'gluco-hexose', '_Name__typ': 'IUPAC NAME', '_Name__source': 'IUPAC', '_Name__adapted': False, '_Name__language': 'en'},
  {'_Name__name': 'DL-glucose', '_Name__typ': 'SYNONYM', '_Name__source': 'ChEBI', '_Name__adapted': False, '_Name__language': 'en'}],
 'GLUCOSE OXIDASE',
 'D-Glucose;Grape sugar;Dextrose;Glucose;D-Glucopyranose',
 'Glucose [JAN]',
 'D-(+)-Glucose')

In [28]:
chebi.formula[0], kegg.FORMULA[0], chemid.Formula[0], chemspider.formula[0]

('C6H12O6', 'C6H12O6', 'C6-H12-O6', 'C_{6}H_{12}O_{6}')

In [29]:
pubchem.exact_mass[0], chebi.mass[0], kegg.EXACT_MASS[0], chemspider.averageMass[0]

('180.06338810', 180.15588, '180.0634', 180.1559)

In [33]:
chebi.smiles[0], chemid.SMILES[0], chemspider.smiles[0]

(None,
 'OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C=O',
 'C([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)O')

In [35]:
kegg.MOL_WEIGHT[0], chemspider.molecularWeight[0]

('180.1559', 180.1559)

In [42]:
chebi.mol[0], print(chemspider.mol2D[0]), print(chemspider.mol3D[0])

MolHeader
  ELEMENTL09121714332D
Dotmatics Elemental
 12 11  0  0  1  0            999 V2000
    0.0000   -3.4538    0.0000 O   0  0  0  0  0
    1.3318   -3.4538    0.0000 C   0  0  0  0  0
    1.9976   -2.3050    0.0000 C   0  0  0  0  0
    1.3318   -1.1488    0.0000 O   0  0  0  0  0
    3.3294   -2.3050    0.0000 C   0  0  0  0  0
    3.9879   -3.4538    0.0000 O   0  0  0  0  0
    3.9879   -1.1488    0.0000 C   0  0  0  0  0
    3.3294    0.0000    0.0000 O   0  0  0  0  0
    5.3197   -1.1488    0.0000 C   0  0  0  0  0
    5.9856   -2.3050    0.0000 O   0  0  0  0  0
    5.9856    0.0000    0.0000 C   0  0  0  0  0
    7.3173    0.0000    0.0000 O   0  0  0  0  0
  1  2  1  0  0  0
  2  3  1  0  0  0
  3  4  1  1  0  0
  3  5  1  0  0  0
  5  6  1  6  0  0
  5  7  1  0  0  0
  7  8  1  6  0  0
  7  9  1  0  0  0
  9 10  1  6  0  0
  9 11  1  0  0  0
 11 12  2  0  0  0
M  END

107526
  Marvin  01040808023D          

 24 23  0  0  1  0            999 V2000
    2.1549   -0.3369 

(None, None, None)