# Notebook to extract ChEBI data and hierarchies

In [1]:
import rdflib
import pandas as pd
import gzip
import networkx as nx
import pickle

ChEBI database obtained from the OWL ontology file

https://www.ebi.ac.uk/chebi/downloadsForward.do

release from 2024-08-01

In [2]:
# Path of the ChEBI ontology file (OWL, parsed below as RDF/XML).  It is a
# relative path, so it is resolved against the current working directory.
# A plain string literal is used: there are no placeholders to interpolate,
# so an f-string prefix is unnecessary.
CHEBI = 'chebi.owl'

# Load the ontology into an in-memory rdflib graph.  `c` is the graph that the
# SPARQL cells further down run their queries against.
c = rdflib.Graph()
# Kept as the last expression of the cell so Jupyter echoes the graph's repr.
c.parse(CHEBI, format='xml')

<Graph identifier=N46e56bbd97064ca181c37ea65f5bea6d (<class 'rdflib.graph.Graph'>)>

### ChEBI dataframe

In [7]:
# SPARQL query: one row per ChEBI entity with its charge, formula, SMILES,
# InChI, InChIKey, mass and label.  Every triple pattern is mandatory, so an
# entity is returned only when it carries all seven properties.
query="""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX chebi: <http://purl.obolibrary.org/obo/chebi/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT ?chebi ?charge ?smiles ?inchi ?inchikey ?label ?formula ?mass 
WHERE {
  ?chebi chebi:charge ?charge .
  ?chebi chebi:formula ?formula .
  ?chebi chebi:smiles ?smiles .
  ?chebi chebi:inchi ?inchi .
  ?chebi chebi:inchikey ?inchikey .
  ?chebi chebi:mass ?mass .
  ?chebi rdfs:label ?label 
}
"""
result = c.query(query)

# `result.bindings` supplies the rows.  Every cell value and every column label
# is converted to a plain str.  The conversion goes column by column through
# Series.map because DataFrame.applymap is deprecated (pandas >= 2.1) and
# emits a FutureWarning; the resulting values are the same.
chebi_df = (
    pd.DataFrame(result.bindings)
    .apply(lambda column: column.map(str))
    .rename(columns=str)
    # Identical rows are collapsed and the index renumbered from 0.
    .drop_duplicates(ignore_index=True)
)
chebi_df.head()

Unnamed: 0,charge,chebi,formula,inchi,inchikey,label,mass,smiles
0,0,http://purl.obolibrary.org/obo/CHEBI_10,C36H38N2O6,InChI=1S/C36H38N2O6/c1-37-13-11-23-18-31(41-3)...,XGEAUXVPBXUBKN-NSOVKSMOSA-N,(+)-Atherospermoline,594.698,COc1cc2CCN(C)[C@H]3Cc4ccc(Oc5cc(C[C@@H]6N(C)CC...
1,0,http://purl.obolibrary.org/obo/CHEBI_100,C16H14O4,InChI=1S/C16H14O4/c1-18-10-3-5-11-13-8-19-14-6...,NSRJSISNDPOJOP-BBRMVZONSA-N,(-)-medicarpin,270.27996,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21
2,0,http://purl.obolibrary.org/obo/CHEBI_10000,C25H30O5,InChI=1S/C25H30O5/c1-15(2)6-5-7-16(3)8-9-30-19...,KZPCPZBBGCTGCN-LZYBPNLTSA-N,Vismione D,410.504,CC(C)=CCC\C(C)=C\COc1cc(O)c2c(O)c3C(=O)CC(C)(O...
3,0,http://purl.obolibrary.org/obo/CHEBI_100000,C22H26N2O3,InChI=1S/C22H26N2O3/c1-27-15-21(26)24-19(13-23...,FOQJOAXBJHEYGA-URVUXULASA-N,"(2S,3S,4R)-3-[4-(3-cyclopentylprop-1-ynyl)phen...",366.454,COCC(=O)N1[C@H]([C@H]([C@H]1C#N)C2=CC=C(C=C2)C...
4,0,http://purl.obolibrary.org/obo/CHEBI_100001,C22H25F3N4O4,"InChI=1S/C22H25F3N4O4/c23-22(24,25)15-3-5-16(6...",JHFKDWZHMFMOKL-QYZOEREBSA-N,"N-[(2R,3S,6R)-2-(hydroxymethyl)-6-[2-[[oxo-[4-...",466.454,C1C[C@@H]([C@@H](O[C@H]1CCNC(=O)NC2=CC=C(C=C2)...


In [11]:
# Size check: (number of rows, number of columns) of the extracted table.
chebi_df.shape

(178148, 8)

In [19]:
# Write the ChEBI table as a gzip-compressed, tab-separated file.  The row
# index is left out, so the file holds only the data columns plus a header.
with gzip.open('chebiDf.tsv.gz', mode='wb') as gz_out:
    chebi_df.to_csv(path_or_buf=gz_out, sep='\t', index=False)

### ChEBI hierarchy

In [3]:
# SPARQL query: every (child, parent) pair connected by rdfs:subClassOf.
# The OPTIONAL wrapper is kept exactly as in the original query text.
query="""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX chebi: <http://purl.obolibrary.org/obo/chebi/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT ?child ?parent
WHERE {
  OPTIONAL { ?child rdfs:subClassOf ?parent }
}
"""
# The raw query result gets its own name so that `h1` only ever refers to the
# DataFrame.
subclass_result = c.query(query)

# Convert every cell value and column label to a plain str.  Series.map is
# applied per column because DataFrame.applymap is deprecated (pandas >= 2.1)
# and emits a FutureWarning.
h1 = (
    pd.DataFrame(subclass_result.bindings)
    .apply(lambda column: column.map(str))
    .rename(columns=str)
)

# Keep only rows where both ends contain the literal text 'CHEBI'; rows with
# anything else on either side are dropped (presumably blank nodes or
# non-ChEBI classes -- confirm against the ontology).
both_chebi = (
    h1['child'].str.contains('CHEBI', regex=False)
    & h1['parent'].str.contains('CHEBI', regex=False)
)
h1 = h1[both_chebi].drop_duplicates(ignore_index=True)
h1.head()

  h1 = pd.DataFrame(h1.bindings).applymap(str).rename(columns=str)


Unnamed: 0,child,parent
0,http://purl.obolibrary.org/obo/CHEBI_10,http://purl.obolibrary.org/obo/CHEBI_133004
1,http://purl.obolibrary.org/obo/CHEBI_11,http://purl.obolibrary.org/obo/CHEBI_133004
2,http://purl.obolibrary.org/obo/CHEBI_132893,http://purl.obolibrary.org/obo/CHEBI_133004
3,http://purl.obolibrary.org/obo/CHEBI_132895,http://purl.obolibrary.org/obo/CHEBI_133004
4,http://purl.obolibrary.org/obo/CHEBI_16777,http://purl.obolibrary.org/obo/CHEBI_133004


In [10]:
# Size check: (number of rows, number of columns) of the child/parent table.
h1.shape

(279063, 2)

In [23]:
# Write the child/parent table as a gzip-compressed, tab-separated file
# without the row index.
# NOTE(review): a later cell overwrites h1['child'] and h1['parent'] in place
# with integer IDs, so what gets written here depends on whether this cell is
# run before or after that one -- confirm which form the file should hold.
with gzip.open('chebiHierarchy.tsv.gz', mode='wb') as gz_out:
    h1.to_csv(path_or_buf=gz_out, sep='\t', index=False)

Try to build a tree (stored as a directed graph) based on the hierarchy

In [4]:
# Replace the URIs in both columns by the integer that follows 'CHEBI_'
# (e.g. '.../CHEBI_133004' -> 133004).  h1 is updated in place because the
# graph-building cell reads these integer IDs from h1.
for _col in ('child', 'parent'):
    # Guard that makes the cell safe to re-run: once a column holds integers
    # the `.str` accessor is no longer available and the extraction would
    # raise, so already-converted columns are left untouched.
    if not pd.api.types.is_integer_dtype(h1[_col]):
        # expand=False returns a Series (not a one-column DataFrame), which is
        # the natural thing to assign back to a single column.
        h1[_col] = h1[_col].str.extract(r'CHEBI_(\d+)', expand=False).astype(int)

In [5]:
# Directed graph of the hierarchy.  Each row of h1 contributes one edge that
# points from the parent to the child, so walking along edge direction moves
# towards subclasses.
G = nx.DiGraph()
G.add_edges_from(zip(h1['parent'], h1['child']))

def get_all_descendants(graph, node):
    """Return all nodes that can be reached from ``node`` in ``graph``.

    Thin wrapper around ``networkx.descendants`` that turns its result into
    a list.

    Parameters
    ----------
    graph
        Graph passed on to ``networkx.descendants``.
    node
        Starting node; it is looked up in ``graph`` by networkx.

    Returns
    -------
    list
        The reachable nodes, in the iteration order of the networkx result.
    """
    reachable = nx.descendants(graph, node)
    return [*reachable]

# Serialise the graph to disk with pickle (presumably so it can be reloaded
# elsewhere without re-parsing the ontology -- confirm with the consumers).
# NOTE: only unpickle files you produced yourself; pickle.load can execute
# arbitrary code.
with open("hierarchy_chebi.pkl", mode="wb") as pkl_out:
    pickle.dump(G, pkl_out)

In [6]:
# Example
descendants = get_all_descendants(G, 36916)
print(f"Total descendants: {len(descendants)}")
print(f"Descendants: {descendants}")

Total descendants: 4310
Descendants: [90114, 32775, 32776, 32786, 32787, 49183, 73760, 73769, 196658, 196659, 16436, 196660, 32822, 24636, 32828, 196669, 196670, 196671, 32833, 196672, 73793, 73794, 90181, 32837, 41032, 32841, 196682, 32846, 90191, 32852, 57431, 32856, 57434, 16475, 57436, 32860, 41055, 32864, 57443, 32868, 32872, 90218, 131183, 73841, 57459, 196729, 196730, 73851, 73850, 73853, 73854, 16512, 57473, 73858, 196741, 196742, 196743, 196744, 196745, 57482, 90251, 57484, 73868, 73865, 180369, 32914, 57491, 57492, 57493, 180370, 73873, 78186, 57505, 8354, 196772, 196774, 196775, 196776, 196777, 196778, 57515, 196779, 196780, 229554, 41139, 229556, 229557, 229555, 229559, 229558, 196787, 196793, 196794, 196795, 196796, 196797, 89589, 78191, 78192, 78193, 196808, 229577, 89592, 196811, 229582, 73936, 73937, 57554, 73938, 196822, 32991, 32992, 32993, 32994, 32995, 89597, 106723, 180458, 180459, 33004, 33005, 33006, 57580, 57578, 57581, 33002, 33009, 33003, 33008, 33007, 155887,

In [14]:
def get_ancestry(graph, node):
    """Return all nodes from which ``node`` can be reached in ``graph``.

    Thin wrapper around ``networkx.ancestors`` that turns its result into
    a list.

    Parameters
    ----------
    graph
        Graph passed on to ``networkx.ancestors``.
    node
        Node whose ancestors are wanted; it is looked up in ``graph`` by
        networkx.

    Returns
    -------
    list
        The ancestor nodes, in the iteration order of the networkx result.
    """
    upstream = nx.ancestors(graph, node)
    return [*upstream]

# Example: find all ancestors of node 10 (CHEBI:10), i.e. every node from which
# node 10 can be reached by following the parent -> child edges of G.
ancestors = get_ancestry(G, 10)
print(f"Ancestors: {ancestors}")

Ancestors: [33285, 51143, 23367, 22315, 133004, 50860, 33675, 33582, 33579, 24431, 33302, 35352, 24921, 22750]
