# Notebook to extract ChEBI data and hierarchies

In [1]:
import rdflib
import pandas as pd
import gzip

ChEBI database obtained from the OWL ontology file

https://www.ebi.ac.uk/chebi/downloadsForward.do

release from 2024-08-01

In [4]:
# Relative path of the ChEBI OWL file; it is resolved against the working directory.
CHEBI = 'chebi.owl'

# Read the ontology into an in-memory RDF graph, parsing the file as RDF/XML.
c = rdflib.Graph()
c.parse(CHEBI, format='xml')

<Graph identifier=N9bb99d134e554131b146d81dbaf7ba8b (<class 'rdflib.graph.Graph'>)>

### ChEBI dataframe

In [7]:
# Select every ChEBI entity together with its chemical annotations.
# All triple patterns below are mandatory (there is no OPTIONAL), so an entity
# that lacks any one of charge/formula/smiles/inchi/inchikey/mass/label is not
# returned, and an entity with several values for one property produces one
# row per combination of values.
query="""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX chebi: <http://purl.obolibrary.org/obo/chebi/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT ?chebi ?charge ?smiles ?inchi ?inchikey ?label ?formula ?mass 
WHERE {
  ?chebi chebi:charge ?charge .
  ?chebi chebi:formula ?formula .
  ?chebi chebi:smiles ?smiles .
  ?chebi chebi:inchi ?inchi .
  ?chebi chebi:inchikey ?inchikey .
  ?chebi chebi:mass ?mass .
  ?chebi rdfs:label ?label 
}
"""
result = c.query(query)

# Turn each solution into plain strings (variable names and bound terms) before
# building the frame. This replaces DataFrame.applymap, which is deprecated
# since pandas 2.1, without depending on DataFrame.map, which older pandas lacks.
chebi_rows = [
    {str(var): str(term) for var, term in binding.items()}
    for binding in result.bindings
]

# One row per distinct combination of values, with a fresh 0..n-1 index.
chebi_df = pd.DataFrame(chebi_rows).drop_duplicates(ignore_index=True)
chebi_df.head()

Unnamed: 0,charge,chebi,formula,inchi,inchikey,label,mass,smiles
0,0,http://purl.obolibrary.org/obo/CHEBI_10,C36H38N2O6,InChI=1S/C36H38N2O6/c1-37-13-11-23-18-31(41-3)...,XGEAUXVPBXUBKN-NSOVKSMOSA-N,(+)-Atherospermoline,594.698,COc1cc2CCN(C)[C@H]3Cc4ccc(Oc5cc(C[C@@H]6N(C)CC...
1,0,http://purl.obolibrary.org/obo/CHEBI_100,C16H14O4,InChI=1S/C16H14O4/c1-18-10-3-5-11-13-8-19-14-6...,NSRJSISNDPOJOP-BBRMVZONSA-N,(-)-medicarpin,270.27996,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21
2,0,http://purl.obolibrary.org/obo/CHEBI_10000,C25H30O5,InChI=1S/C25H30O5/c1-15(2)6-5-7-16(3)8-9-30-19...,KZPCPZBBGCTGCN-LZYBPNLTSA-N,Vismione D,410.504,CC(C)=CCC\C(C)=C\COc1cc(O)c2c(O)c3C(=O)CC(C)(O...
3,0,http://purl.obolibrary.org/obo/CHEBI_100000,C22H26N2O3,InChI=1S/C22H26N2O3/c1-27-15-21(26)24-19(13-23...,FOQJOAXBJHEYGA-URVUXULASA-N,"(2S,3S,4R)-3-[4-(3-cyclopentylprop-1-ynyl)phen...",366.454,COCC(=O)N1[C@H]([C@H]([C@H]1C#N)C2=CC=C(C=C2)C...
4,0,http://purl.obolibrary.org/obo/CHEBI_100001,C22H25F3N4O4,"InChI=1S/C22H25F3N4O4/c23-22(24,25)15-3-5-16(6...",JHFKDWZHMFMOKL-QYZOEREBSA-N,"N-[(2R,3S,6R)-2-(hydroxymethyl)-6-[2-[[oxo-[4-...",466.454,C1C[C@@H]([C@@H](O[C@H]1CCNC(=O)NC2=CC=C(C=C2)...


In [11]:
# Sanity check: (number of rows, number of columns) of the deduplicated compound table.
chebi_df.shape

(178148, 8)

In [19]:
# Save the compound table as a gzip-compressed, tab-separated file (index column omitted).
with gzip.open('chebiDf.tsv.gz', mode='wb') as gz_out:
    chebi_df.to_csv(gz_out, sep='\t', index=False)

### ChEBI hierarchy

In [8]:
# Collect the direct rdfs:subClassOf edges of the ontology as (child, parent) pairs.
# The triple pattern is stated directly rather than inside OPTIONAL: as the only
# pattern in the WHERE clause, OPTIONAL contributed nothing except a single
# all-unbound solution when the graph holds no subclass triples.
query="""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX chebi: <http://purl.obolibrary.org/obo/chebi/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT ?child ?parent
WHERE {
  ?child rdfs:subClassOf ?parent
}
"""
hierarchy_result = c.query(query)

# Convert every solution to plain strings before building the frame. This
# replaces DataFrame.applymap, which is deprecated since pandas 2.1. Naming the
# columns explicitly keeps the frame well-formed even when there are no rows.
edge_rows = [
    {str(var): str(term) for var, term in binding.items()}
    for binding in hierarchy_result.bindings
]
edges_all = pd.DataFrame(edge_rows, columns=['child', 'parent'])

# Keep only edges whose child AND parent identifiers contain the literal text
# 'CHEBI' (plain substring match, no regex); anything else is dropped.
child_is_chebi = edges_all['child'].str.contains('CHEBI', regex=False, na=False)
parent_is_chebi = edges_all['parent'].str.contains('CHEBI', regex=False, na=False)

# Distinct edges only, with a fresh 0..n-1 index.
h1 = edges_all[child_is_chebi & parent_is_chebi].drop_duplicates(ignore_index=True)
h1.head()

Unnamed: 0,child,parent
0,http://purl.obolibrary.org/obo/CHEBI_10,http://purl.obolibrary.org/obo/CHEBI_133004
1,http://purl.obolibrary.org/obo/CHEBI_11,http://purl.obolibrary.org/obo/CHEBI_133004
2,http://purl.obolibrary.org/obo/CHEBI_132893,http://purl.obolibrary.org/obo/CHEBI_133004
3,http://purl.obolibrary.org/obo/CHEBI_132895,http://purl.obolibrary.org/obo/CHEBI_133004
4,http://purl.obolibrary.org/obo/CHEBI_16777,http://purl.obolibrary.org/obo/CHEBI_133004


In [10]:
# Sanity check: (number of rows, number of columns) of the hierarchy edge table held in h1.
h1.shape

(279063, 2)

In [23]:
# Save the child/parent edge table as a gzip-compressed, tab-separated file (index column omitted).
with gzip.open('chebiHierarchy.tsv.gz', mode='wb') as gz_out:
    h1.to_csv(gz_out, sep='\t', index=False)