In [14]:
import pandas as pd
# Upper bound on the number of reference rows kept by filter_molecular_weight;
# larger matches are randomly down-sampled to this size.
max_N=50

In [15]:
# Reference table: prepare_prompt filters it by molecular weight and renders
# the matching rows into the {QM9_REFERENCE} placeholder of the prompt.
dataset_df = pd.read_csv('dataset/qm9_with_mol.csv')

In [16]:
def filter_molecular_weight(input_weight, dataset_df, tolerance=5, max_n=None, random_state=None):
    """Select rows whose 'molecular weight' lies within +/- tolerance of input_weight.

    Args:
        input_weight: Target molecular weight (same unit as the column).
        dataset_df: DataFrame with a 'molecular weight' column.
        tolerance: Half-width of the inclusive weight window (default 5).
        max_n: Maximum number of rows to return. Defaults to the notebook-level
            ``max_N`` constant when omitted.
        random_state: Seed forwarded to ``DataFrame.sample`` so the down-sampled
            subset can be reproduced. ``None`` keeps the original unseeded behavior.

    Returns:
        The matching rows, randomly down-sampled to ``max_n`` rows when there
        are more matches than that.
    """
    if max_n is None:
        max_n = max_N  # fall back to the notebook-level cap

    # Series.between is inclusive on both ends by default, matching the
    # original ">= low & <= high" comparison.
    in_window = dataset_df['molecular weight'].between(
        input_weight - tolerance, input_weight + tolerance
    )
    filtered_df = dataset_df[in_window]
    if len(filtered_df) > max_n:
        filtered_df = filtered_df.sample(max_n, random_state=random_state)
    return filtered_df
    
# Column labels from "Hydroxyl" through "Anhydride" (both inclusive), taken as
# a label slice so no explicit "+1" on the end position is needed.
functional_group_columns = dataset_df.loc[:, "Hydroxyl":"Anhydride"].columns

def process_row(row, group_columns=None):
    """Convert one dataset row into a dict of the fields worth showing to the model.

    Args:
        row: A pandas Series (e.g. from ``DataFrame.iterrows``) containing
            'SMILES', 'molecular weight', the computed-property columns and
            the functional-group flag columns.
        group_columns: Labels of the functional-group flag columns. Defaults
            to the notebook-level ``functional_group_columns``.

    Returns:
        dict with 'SMILES' and 'molecular weight', every computed property
        that is not NaN, and - only when at least one flag is set - a
        'functional group' list naming the set flags.
    """
    if group_columns is None:
        group_columns = functional_group_columns

    # Basic structure of the JSON object
    json_item = {
        "SMILES": row["SMILES"],
        "molecular weight": row["molecular weight"],
    }

    # Computed properties: missing values are omitted rather than emitted as NaN
    for prop in ("mu", "alpha", "homo", "lumo", "gap", "U0", "U", "H", "G"):
        value = row[prop]
        if pd.notna(value):
            json_item[prop] = value

    # Functional groups: coerce each flag explicitly so 0/1 integers work the
    # same as True/False (using a non-boolean Series as an index mask does not
    # select by truthiness), and treat missing flags as unset.
    flags = row[group_columns]
    present = [name for name, flag in flags.items() if pd.notna(flag) and bool(flag)]
    if present:
        json_item["functional group"] = present

    return json_item
    
def to_reference_text(filtered_df):
    """Render every row of filtered_df as one line of 'key:value' fields.

    Each row is passed through process_row; list values are flattened to a
    comma-separated string, fields are separated by two spaces and rows by
    newlines. An empty frame yields an empty string.
    """
    def _format_record(record):
        # One text line per record: "key:value  key:value  ..."
        fields = []
        for name, val in record.items():
            text = ", ".join(val) if isinstance(val, list) else val
            fields.append(f"{name}:{text}")
        return "  ".join(fields)

    return "\n".join(
        _format_record(process_row(record_row))
        for _, record_row in filtered_df.iterrows()
    )

In [17]:
from openai import OpenAI
from tqdm import tqdm
import json
import re
# Later cells call client.chat.completions.create, so the client must exist on
# a fresh kernel. OpenAI() reads the key from the OPENAI_API_KEY environment
# variable, which keeps the secret out of the notebook.
client = OpenAI()

In [18]:
# Read the test set: one JSON object per line. Blank lines are skipped so a
# stray empty line does not crash json.loads.
with open('testset.jsonl', 'r') as file:
    json_line = [line for line in file if line.strip()]

gt_list = []        # ground-truth SMILES, one per test example
property_list = []  # remaining fields of each example (SMILES removed)
for i in json_line:
    item = json.loads(i)
    gt_list.append(item.pop('SMILES'))
    property_list.append(item)

In [19]:
def prepare_prompt(properties):
    """Build the two prompt texts for one test example.

    Args:
        properties: dict of target properties. Must contain 'molecular weight',
            either as a number or as a string such as "123.4g/mol". List
            values are rendered comma-separated.

    Returns:
        (user_prompt, assi_prompt): the user template with {PROPERTIES} filled
        in, and the assistant template with {QM9_REFERENCE} replaced by
        reference rows of similar molecular weight drawn from dataset_df.
    """
    property_lines = []
    for key, value in properties.items():
        if isinstance(value, list):
            value = ", ".join(value)  # Convert list to string
        property_lines.append(f"{key}: {value}")
    plain_property = "\n".join(property_lines)

    with open('rag_mol_prompt_user.txt', 'r') as f:
        user_prompt = f.read()
    user_prompt = user_prompt.replace('{PROPERTIES}', plain_property)

    # Going through str() lets an already-numeric weight pass instead of
    # failing on .replace; float() tolerates surrounding whitespace.
    mol_weight = float(str(properties['molecular weight']).replace('g/mol', ''))
    filtered_dataset = filter_molecular_weight(mol_weight, dataset_df=dataset_df)

    reference_text = to_reference_text(filtered_dataset)

    with open('rag_mol_rule_auto_cot_prompt_assistant.txt', 'r') as f:
        assi_prompt = f.read()
    assi_prompt = assi_prompt.replace('{QM9_REFERENCE}', reference_text)

    return user_prompt, assi_prompt

In [20]:
model = "gpt-3.5-turbo"
top10_list = []  # one entry per example: 1 if the ground-truth SMILES appears in the reply, else 0

for gt, properties in tqdm(zip(gt_list, property_list), total=len(gt_list)):

    user_prompt, assi_prompt = prepare_prompt(properties)

    # NOTE(review): the reference prompt is sent with role "system" *after*
    # the user message - confirm this ordering is intended.
    completion = client.chat.completions.create(
          model=model,
          temperature=0,
          messages=[{"role": "user", "content": user_prompt},
                   {"role": "system", "content": assi_prompt}])
    # message.content can be None; count that as a miss instead of raising
    # a TypeError partway through the run.
    reply = completion.choices[0].message.content or ""
    top10_list.append(1 if gt in reply else 0)

100%|███████████████████████████████████████████| 50/50 [11:38<00:00, 13.97s/it]


In [22]:
# Divide by the number of scored examples rather than a hard-coded 50, so the
# figure stays correct if the test set size changes; max(..., 1) avoids a
# ZeroDivisionError on an empty run.
print("Model:%s  Accuracy:%.4f" %(model, sum(top10_list)/max(len(top10_list), 1)))

Model:gpt-3.5-turbo  Accuracy:0.2400


In [23]:
model = "gpt-4-turbo"
top10_list = []  # one entry per example: 1 if the ground-truth SMILES appears in the reply, else 0

for gt, properties in tqdm(zip(gt_list, property_list), total=len(gt_list)):

    user_prompt, assi_prompt = prepare_prompt(properties)

    # NOTE(review): the reference prompt is sent with role "system" *after*
    # the user message - confirm this ordering is intended.
    completion = client.chat.completions.create(
          model=model,
          temperature=0,
          messages=[{"role": "user", "content": user_prompt},
                   {"role": "system", "content": assi_prompt}])
    # message.content can be None; count that as a miss instead of raising
    # a TypeError partway through the run.
    reply = completion.choices[0].message.content or ""
    top10_list.append(1 if gt in reply else 0)

100%|███████████████████████████████████████████| 50/50 [22:15<00:00, 26.70s/it]


In [24]:
# Divide by the number of scored examples rather than a hard-coded 50, so the
# figure stays correct if the test set size changes; max(..., 1) avoids a
# ZeroDivisionError on an empty run.
print("Model:%s  Accuracy:%.4f" %(model, sum(top10_list)/max(len(top10_list), 1)))

Model:gpt-4-turbo  Accuracy:0.2600


In [7]:
import openai
# Point the module-level openai client at a local OpenAI-compatible server;
# the key is a placeholder value.
openai.api_key = "EMPTY"
openai.base_url = "http://localhost:8000/v1/"

model = "vicuna-7b-v1.5-16k"
top10_list = []  # one entry per example: 1 if the ground-truth SMILES appears in the reply, else 0

for gt, properties in tqdm(zip(gt_list, property_list), total=len(gt_list)):

    user_prompt, assi_prompt = prepare_prompt(properties)

    # NOTE(review): the reference prompt is sent with role "system" *after*
    # the user message - confirm this ordering is intended.
    completion = openai.chat.completions.create(
          model=model,
          temperature=0,
          max_tokens=500,
          messages=[{"role": "user", "content": user_prompt},
                   {"role": "system", "content": assi_prompt}])
    # message.content can be None; count that as a miss instead of raising
    # a TypeError partway through the run.
    reply = completion.choices[0].message.content or ""
    top10_list.append(1 if gt in reply else 0)

# Divide by the number of scored examples rather than a hard-coded 50.
print("Model:%s  Accuracy:%.4f" %(model, sum(top10_list)/max(len(top10_list), 1)))

100%|███████████████████████████████████████████| 50/50 [21:29<00:00, 25.79s/it]

Model:vicuna-7b-v1.5-16k  Accuracy:0.0400





In [20]:
import openai
# Point the module-level openai client at a local OpenAI-compatible server;
# the key is a placeholder value.
openai.api_key = "EMPTY"
openai.base_url = "http://localhost:8001/v1/"

model = "vicuna-13b-v1.5-16k"
top10_list = []  # one entry per example: 1 if the ground-truth SMILES appears in the reply, else 0

for gt, properties in tqdm(zip(gt_list, property_list), total=len(gt_list)):

    user_prompt, assi_prompt = prepare_prompt(properties)

    # NOTE(review): the reference prompt is sent with role "system" *after*
    # the user message - confirm this ordering is intended.
    completion = openai.chat.completions.create(
          model=model,
          temperature=0,
          max_tokens=500,
          messages=[{"role": "user", "content": user_prompt},
                   {"role": "system", "content": assi_prompt}])
    # message.content can be None; count that as a miss instead of raising
    # a TypeError partway through the run.
    reply = completion.choices[0].message.content or ""
    top10_list.append(1 if gt in reply else 0)

# Divide by the number of scored examples rather than a hard-coded 50.
print("Model:%s  Accuracy:%.4f" %(model, sum(top10_list)/max(len(top10_list), 1)))

100%|███████████████████████████████████████████| 50/50 [33:59<00:00, 40.78s/it]

Model:vicuna-13b-v1.5-16k  Accuracy:0.0200



