In [36]:

from aurelian.agents.chemistry_agent import get_chebi_adapter, ChemicalStructure

CLASSES = {
    "monoterpenoid": 10,
    "sesquiterpenoid": 15,
    "diterpenoid": 20,
    "sesterterpenoid": 25,
    "triterpenoid": 30,
    "other": None,
}

In [37]:
chebi = get_chebi_adapter()
session = chebi.session

In [38]:
def get_formula(curie: str):
    return chebi.entity_metadata_map(curie).get("obo:chebi/formula", None)

In [39]:
from semsql.sqla.semsql import Statements
from oaklib.datamodels.vocabulary import IS_A

suspects = []
for c, expected_carbons in CLASSES.items():
    if not expected_carbons:
        continue
    print(f"Searching for {c}")
    curies = chebi.curies_by_label(c)
    curie = curies[0] if curies else None
    structures = list(chebi.descendants(curie, [IS_A]))
    q = session.query(Statements.subject, Statements.value).filter(Statements.subject.in_(structures))
    q = q.filter(Statements.predicate == "obo:chebi/formula")
    for s, formula in q:
        # split into atoms; e.g. C10H16 => {"C": 10, "H": 16}
        import re
        atom_counts = dict(re.findall(r'([A-Z][a-z]*)(\d*)', formula))
        ac = atom_counts.get("C", 0)
        cc = int(ac) if ac else 1
        if cc >= expected_carbons * 2 or cc * 2 <= expected_carbons:
            #print(s, formula)
            suspects.append((c, s, cc, formula))

Searching for monoterpenoid
Searching for sesquiterpenoid
Searching for diterpenoid
Searching for sesterterpenoid
Searching for triterpenoid


In [41]:
len(suspects)

775

In [43]:
len([s for s in suspects if s[0] == "triterpenoid"])

51

In [44]:
suspects.reverse()

In [45]:
from pydantic import BaseModel
from pydantic_ai import Agent

class Classification(BaseModel):
    chemical_class: str
    explanation: str
    confidence: str

agent = Agent(
    model='openai:gpt-4o',
      system_prompt=f"""You are an expert chemist, able to interpret
      chemical structure diagrams and classify them.
      Look at the provided structure and classify it in one of the following categories:
      {", ".join(CLASSES.keys())}.
      When providing the explanation, show your full reasoning, explaining any anomalies in the structure.
      Actually look at the structure, and describe it, rather than just counting carbons.
      I want a detailed description of what you see.
      If you don't know, say so. Give a confidence level for all classifications, LOW, MEDIUM, or HIGH.
      """,
    result_type=Classification)

In [46]:
from pydantic_ai import ImageUrl
import nest_asyncio

nest_asyncio.apply()

results = []
for c, s, cc, formula in suspects:
    
    structure = ChemicalStructure.from_id(s)
    structure.name = chebi.label(s)
    img_url = ImageUrl(url=structure.chebi_image_url)
    q = (
        f"This structure is classified in CHEBI as a {c}, yet it has {cc} carbons."
        f" Its formula is {formula}."
        f" And its name is {structure.name}."
        f" Give the correct classification."
    )
    q = f"""What is the correct classification for this structure?
    I suspect it is a terpenoid, in one of the following categories:
    {", ".join(CLASSES.keys())}.
    """
    result = agent.run_sync([q, img_url])
    agrees = result.data.chemical_class.lower() == c.lower()
    print(agrees, c, s, cc, formula)
    print(result.data)
    results.append((agrees, c, s, q, result))

True triterpenoid CHEBI:9771 63 C63H98O29
chemical_class='triterpenoid' explanation='The chemical structure in the image shows a large carbon skeleton with six isoprene units (30 carbon atoms organized into rings). The structure includes a multi-ring system typical for a triterpene, specifically resembling a cucurbitane skeleton in its core, characteristic of triterpenes. Additionally, there are multiple glycosidic linkages (the sugar moieties attached) which are common modifications on triterpenoid aglycones, often seen in triterpenoid saponins. These features match the classification of a triterpenoid.' confidence='HIGH'
True triterpenoid CHEBI:9110 70 C70H104O32
chemical_class='triterpenoid' explanation='The structure displayed consists of 30 carbon atoms forming a series of interconnected rings. This configuration is characteristic of triterpenoids, which are comprised of six isoprene units. The additional functional groups and ring configurations are common in complex triterpenoid

ModelHTTPError: status_code: 400, model_name: gpt-4o, body: {'message': 'Timeout while downloading https://www.ebi.ac.uk/chebi/displayImage.do?defaultImage=true&imageIndex=0&chebiId=71980.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_image_url'}

In [47]:
print("Done")

Done


In [48]:
len([r for r in results if r[0]])

43

In [49]:
import pandas as pd

In [50]:
df = pd.DataFrame(results, columns=["agrees", "expected", "structure", "question", "result"])

In [52]:
df

Unnamed: 0,agrees,expected,structure,question,result
0,True,triterpenoid,CHEBI:9771,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...
1,True,triterpenoid,CHEBI:9110,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...
2,True,triterpenoid,CHEBI:74453,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...
3,True,triterpenoid,CHEBI:74448,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...
4,True,triterpenoid,CHEBI:69373,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...
...,...,...,...,...,...
102,False,diterpenoid,CHEBI:8046,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...
103,False,diterpenoid,CHEBI:80196,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...
104,False,diterpenoid,CHEBI:80117,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...
105,False,diterpenoid,CHEBI:79068,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...


In [53]:
IMG_URL_TMPL = "https://www.ebi.ac.uk/chebi/displayImage.do?defaultImage=true&imageIndex=0&chebiId={n}"


def img_url(chebi_id: str) -> str:
    url = IMG_URL_TMPL.format(n=chebi_id.replace("CHEBI:", ""))
    return f'<img src="{url}" width="120" style="max-height: 100px; object-fit: contain; display: block; margin: auto;" onerror="this.style.display=\'none\';"/>'


def cell_div(text: str, img: str) -> str:
    return f'''
        <div style="display: flex; flex-direction: column; align-items: center; text-align: center;">
            <span style="font-size: 12px; margin-bottom: 5px;">{text}</span>
            {img}
        </div>
        '''

def cell_from_row(row):
    return cell_div(chebi.label(row["structure"]), img_url(row["structure"]))

In [54]:
df["img"] = df.apply(cell_from_row, axis=1)

In [55]:
df["llm_classification"] = df["result"].apply(lambda r: r.data.chemical_class)
df["llm_explanation"] = df["result"].apply(lambda r: r.data.explanation)

In [56]:
df

Unnamed: 0,agrees,expected,structure,question,result,img,llm_classification,llm_explanation
0,True,triterpenoid,CHEBI:9771,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...,"\n <div style=""display: flex; flex-dire...",triterpenoid,The chemical structure in the image shows a la...
1,True,triterpenoid,CHEBI:9110,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...,"\n <div style=""display: flex; flex-dire...",triterpenoid,The structure displayed consists of 30 carbon ...
2,True,triterpenoid,CHEBI:74453,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...,"\n <div style=""display: flex; flex-dire...",triterpenoid,The structure displayed contains a series of s...
3,True,triterpenoid,CHEBI:74448,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...,"\n <div style=""display: flex; flex-dire...",triterpenoid,This chemical structure can be classified as a...
4,True,triterpenoid,CHEBI:69373,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...,"\n <div style=""display: flex; flex-dire...",triterpenoid,"The structure provided is quite complex, featu..."
...,...,...,...,...,...,...,...,...
102,False,diterpenoid,CHEBI:8046,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...,"\n <div style=""display: flex; flex-dire...",triterpenoid,The structure in the image is a complex molecu...
103,False,diterpenoid,CHEBI:80196,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...,"\n <div style=""display: flex; flex-dire...",other,The structure in the image is of beta-carotene...
104,False,diterpenoid,CHEBI:80117,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...,"\n <div style=""display: flex; flex-dire...",triterpenoid,The structure is a large molecule consisting o...
105,False,diterpenoid,CHEBI:79068,What is the correct classification for this st...,AgentRunResult(data=Classification(chemical_cl...,"\n <div style=""display: flex; flex-dire...",other,The structure shown is not a typical terpenoid...


In [57]:
!mkdir -p output

In [58]:
df[[
    "agrees",
    "expected",
    "structure",
    "img",
    "llm_classification",
    "llm_explanation",
]].to_html("output/terpenoids.html", escape=False, render_links=True, index=False)

In [59]:
df.to_csv("output/terpenoids.csv", index=False)


In [61]:
df.to_markdown("output/terpenoids.md", index=False, tablefmt="github")


In [60]:
orig_results = results