In [1]:
from typing import Literal

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel

# Load variables from a local .env file into the process environment;
# override=True lets values from the file replace ones that are already set.
# NOTE(review): presumably this supplies the API key for the argument-less
# OpenAI() client created later in the notebook — confirm.
load_dotenv(override=True)

True

In [2]:
# Read the dataset from an Excel workbook; the path is relative, so the file
# must sit in the notebook's working directory.
nanozymes_df = pd.read_excel("nanozymes.xlsx")

In [3]:
# Crystal-system name -> integer code.
syngony_dict = {
    "amorphous": 0,
    "triclinic": 1,
    "monoclinic": 2,
    "orthorhombic": 3,
    "tetragonal": 4,
    "trigonal": 5,
    "hexagonal": 6,
    "cubic": 7,
}

# Integer code -> crystal-system name. Derived from syngony_dict rather than
# written out a second time, so the two lookups cannot drift apart.
syngony_dict_reversed = {code: name for name, code in syngony_dict.items()}

In [4]:
# Translate the numeric "Syngony" codes into readable crystal-system names,
# stored in a separate lowercase "syngony" column.
nanozymes_df = nanozymes_df.assign(
    syngony=nanozymes_df["Syngony"].map(syngony_dict_reversed)
)

In [5]:
# Crystal-system labels accepted as valid values of the "syngony" column.
main_cs = [
    'amorphous',
    'triclinic',
    'monoclinic',
    'orthorhombic',
    'tetragonal',
    'trigonal',
    'hexagonal',
    'cubic',
]

In [6]:
# Rows whose "syngony" value is not one of the accepted labels
# (this includes rows where the code-to-name mapping produced no value).
has_known_syngony = nanozymes_df['syngony'].isin(main_cs)
not_in_main_cs = nanozymes_df.loc[~has_known_syngony]

In [7]:
# Remove the rows collected in not_in_main_cs, matching them by index label.
nanozymes_df = nanozymes_df.drop(index=not_in_main_cs.index)

In [8]:
# Renumber the remaining rows from 0; the previous index is kept as a column.
# NOTE(review): re-running this cell adds another index column each time.
nanozymes_df = nanozymes_df.reset_index()

In [9]:
class Response(BaseModel):
    # Structured-output schema for one crystal-system prediction.
    # The field is restricted to the eight labels used in this notebook so the
    # model cannot answer with free text (different casing, extra words) that
    # would later be scored as a mismatch. Plain comments are used instead of a
    # docstring so nothing extra is added to the generated JSON schema.
    syngony: Literal[
        "amorphous",
        "triclinic",
        "monoclinic",
        "orthorhombic",
        "tetragonal",
        "trigonal",
        "hexagonal",
        "cubic",
    ]


# Client used for the chat-completion requests; created without explicit
# arguments, so its configuration comes from outside this cell.
client = OpenAI()

In [10]:
# System message sent with every request: describes the task and lists the
# allowed crystal-system labels.
prompt = (
    "You are an expert in the field of nanozymes. "
    "Your task is to analyze formulas of nanomaterials and provide the "
    "crystal system (syngony) for each. "
    "The syngony should be one of the following: "
    "['amorphous', 'triclinic', 'monoclinic', 'orthorhombic', "
    "'tetragonal', 'trigonal', 'hexagonal', 'cubic']"
)

In [11]:
# Start every row with an empty prediction; values are filled in later for the
# sampled rows only.
nanozymes_df = nanozymes_df.assign(predicted_syngony='')

In [12]:
# Evaluate on a random subset of 100 rows. The seed is fixed so that a fresh
# kernel run selects the same rows and the reported accuracy is comparable
# between runs.
df_sampled = nanozymes_df.sample(n=100, random_state=42)

In [13]:
# Renumber the sampled rows from 0; the previous index is kept as a column.
# NOTE(review): re-running this cell adds another index column each time.
df_sampled = df_sampled.reset_index()

In [15]:
# Preview the sampled rows before querying the model; predicted_syngony is
# still empty at this point.
df_sampled

Unnamed: 0,level_0,index,formula,activity,Syngony,"length, nm","width, nm","depth, nm",surface,pol,...,ReactionType,"C min, mM","C max, mM","C(const), mM",Ccat(mg/mL),ph,"temp, °C",link,syngony,predicted_syngony
0,318,329,Pt,peroxidase,7.0,4.40,4.40,4.40,naked,0,...,TMB + H2O2,0.0625,0.5,1,,4.0,35.0,https://doi.org/10.1016/j.snb.2017.07.108,cubic,
1,868,1017,Fe2O3,peroxidase,5.0,30.00,30.00,30.00,naked,0,...,TMB + H2O2,0.0100,1.0,700,0.0625,3.5,,https://doi.org/10.1021/acsami.7b13835,trigonal,
2,78,78,Fe3O4,peroxidase,7.0,246.60,246.60,246.60,naked,0,...,TMB + H2O2,0.0500,0.3,20,0.166243,4.0,40.0,https://doi.org/10.1016/j.procbio.2019.05.014,cubic,
3,463,494,CoS,peroxidase,6.0,3000.00,3000.00,3000.00,naked,0,...,H2O2 + TMB,0.0010,20.0,0.15,0.5,3.5,25.0,https://doi.org/10.1039/C6RA16619A,hexagonal,
4,610,665,ZnS,peroxidase,7.0,10.00,10.00,10.00,naked,0,...,TMB + H2O2,0.0010,1.0,10,0.00625,4.0,40.0,https://doi.org/10.1021/acssuschemeng.9b04043,cubic,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,870,1019,Fe2O3,peroxidase,5.0,33.00,33.00,33.00,naked,0,...,TMB + H2O2,0.0100,1.0,700,0.0625,3.5,,https://doi.org/10.1021/acsami.7b13835,trigonal,
96,419,448,Co4N,peroxidase,7.0,2500.00,80.00,80.00,naked,0,...,H2O2 + TMB,1.0000,80.0,0.8,0.015,4.0,25.0,https://doi.org/10.1021/acsami.7b09861,cubic,
97,61,61,Co3O4/CeO2,peroxidase,7.0,2000.00,200.00,200.00,naked,poly(N-Vinylpyrrolidone),...,TMB + H2O2,1.0000,3.0,1.2,3,6.0,30.0,https://doi.org/10.1016/j.snb.2019.01.068,cubic,
98,673,751,NiS,peroxidase,7.0,320.00,320.00,320.00,naked,0,...,H2O2 + TMB,0.6000,20.0,0.25,0.025,4.0,25.0,https://doi.org/10.1007/s00216-018-1423-x,cubic,


In [16]:
# Ask the model for the crystal system of every sampled row.
# The request depends only on the formula, so each distinct formula is sent
# once and the answer is reused for repeated rows; this avoids paying for
# duplicate calls and keeps predictions consistent for identical inputs.
# Refusals are not cached, so a later row with the same formula is retried.
predicted_by_formula = {}
for row_label in df_sampled.index:
    formula = df_sampled.at[row_label, 'formula']
    if formula not in predicted_by_formula:
        completion = client.beta.chat.completions.parse(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": f"Formula: {formula}"},
            ],
            response_format=Response,
        )
        message = completion.choices[0].message
        if not message.parsed:
            # No parsed answer (e.g. a refusal): report it and leave the
            # prediction empty, which is scored as a mismatch later.
            print(row_label, formula, message.refusal)
            continue
        # Normalise case and surrounding whitespace so that answers such as
        # "Cubic" are compared fairly against the lowercase reference labels.
        predicted_by_formula[formula] = message.parsed.syngony.strip().lower()
    df_sampled.at[row_label, 'predicted_syngony'] = predicted_by_formula[formula]

In [17]:
# Flag each row where the model's answer matches the recorded crystal system.
df_sampled['equal'] = df_sampled['syngony'].eq(df_sampled['predicted_syngony'])

In [19]:
# Share of sampled rows with a correct prediction (accuracy on the sample).
int(df_sampled['equal'].sum()) / df_sampled.shape[0]

0.86


In [23]:
# Inspect the sample together with the predictions and the per-row `equal` flag.
# NOTE(review): the row order shown below differs from the earlier preview and
# the execution counts skip 20-22, so cells that are not part of this notebook
# probably reordered df_sampled — confirm before relying on a fresh run.
df_sampled

Unnamed: 0,level_0,index,formula,activity,Syngony,"length, nm","width, nm","depth, nm",surface,pol,...,"C min, mM","C max, mM","C(const), mM",Ccat(mg/mL),ph,"temp, °C",link,syngony,predicted_syngony,equal
88,59,59,CoO/CeO2,peroxidase,7.0,1500.0,100.0,100.0,naked,0,...,0.300,3.00,0.1,0.083333,3.6,25.0,https://doi.org/10.1016/j.apmt.2019.03.009,cubic,cubic,True
26,474,505,ZnS,peroxidase,7.0,7.5,7.5,7.5,Tetrakis(4-carboxyphenyl)porphine,0,...,0.040,0.10,250,0.04,3.8,25.0,https://doi.org/10.1016/j.snb.2017.05.069,cubic,cubic,True
28,485,516,CoS,peroxidase,6.0,130.0,130.0,130.0,naked,dextran,...,0.001,1000.00,10,0.02,7.4,,https://doi.org/10.1002/cbic.202000066,hexagonal,hexagonal,True
5,662,740,Fe3O4,peroxidase,7.0,650.0,650.0,650.0,histidine,0,...,0.005,0.80,900,0.002,4.5,,https://doi.org/10.1039/C6CC08542C,cubic,cubic,True
58,827,952,Fe2O3,peroxidase,7.0,38.8,38.8,38.8,naked,poly(N-Vinylpyrrolidone),...,0.185,0.37,390,0.003,3.0,,https://doi.org/10.1088/1361-6528/aaddc2,cubic,trigonal,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22,946,1108,Fe2O3,peroxidase,7.0,35.0,35.0,35.0,naked,0,...,0.080,0.32,987,0.1,4.0,,https://doi.org/10.1039/C6RA00963H,cubic,trigonal,False
87,923,1083,Hg0.01Pt,peroxidase,7.0,2.5,2.5,2.5,citric acid,0,...,,,,,,,https://doi.org/10.1021/ac503544w,cubic,cubic,True
48,600,645,Ce2(WO4)3,peroxidase,2.0,500.0,500.0,50.0,naked,0,...,1.000,25.00,0.8,0.07,4.0,25.0,https://doi.org/10.1016/j.saa.2020.118499,monoclinic,monoclinic,True
8,279,290,CeO2,oxidase,7.0,5.0,5.0,5.0,poly(acrylic acid),0,...,0.300,1.50,0,,7.0,37.0,https://doi.org/10.1002/anie.200805279,cubic,cubic,True
