In [125]:
import ast
import csv
import os
import re
import urllib

import pandas as pd
import requests
from openai import OpenAI
# Chat model used for every OpenAI request made in this notebook.
model = "gpt-3.5-turbo"
# The API key is taken from the OPENAI_API_KEY environment variable so that it is
# never stored in the notebook; export it before starting the kernel.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Upper bound on how many reference molecules filter_molecular_weight returns.
max_N=100
# Reference table; the functions below read its 'SMILES', 'molecular weight',
# computed-property and functional-group columns.
dataset_df = pd.read_csv('dataset/qm9_with_mol.csv')

def filter_molecular_weight(input_weight, dataset_df, tolerance=5, max_n=None, random_state=None): #filter related molecules
    """Return the rows whose molecular weight is close to ``input_weight``.

    Args:
        input_weight: Target molecular weight, in the same unit as the
            ``'molecular weight'`` column of ``dataset_df``.
        dataset_df: DataFrame containing a numeric ``'molecular weight'`` column.
        tolerance: Half-width of the accepted window. A row is kept when its
            weight lies in ``[input_weight - tolerance, input_weight + tolerance]``
            (both bounds inclusive). Defaults to 5, the previously hard-coded value.
        max_n: Maximum number of rows to return. ``None`` (default) falls back
            to the module-level ``max_N``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so that
            the down-sampling can be reproduced. ``None`` keeps it random.

    Returns:
        A DataFrame with the matching rows. When more than the limit match,
        a random sample of exactly that many rows is returned instead.
    """
    limit = max_N if max_n is None else max_n
    weights = dataset_df['molecular weight']
    in_window = (weights <= input_weight + tolerance) & (weights >= input_weight - tolerance)
    filtered_df = dataset_df[in_window]
    if len(filtered_df) > limit:
        filtered_df = filtered_df.sample(limit, random_state=random_state)
    return filtered_df

# Labels of every dataset column from "Hydroxyl" through "Anhydride", both ends
# included. process_row treats these as per-molecule functional-group flags
# (presumably boolean columns -- confirm against the CSV).
functional_group_columns = dataset_df.columns[
    slice(
        dataset_df.columns.get_loc("Hydroxyl"),
        dataset_df.columns.get_loc("Anhydride") + 1,
    )
]

def process_row(row):
    """Turn one dataset row into a plain dict describing the molecule.

    The result always holds ``"SMILES"`` and ``"molecular weight"``. Each
    computed property that is not missing in the row is then added under its
    own name, and finally ``"functional group"`` lists the labels of the
    functional-group columns that are set for this row (the key is omitted
    when none is set).
    """
    record = {
        "SMILES": row["SMILES"],
        "molecular weight": row["molecular weight"],
    }

    # Computed properties: only the ones that actually have a value.
    property_names = ("mu", "alpha", "homo", "lumo", "gap", "U0", "U", "H", "G")
    record.update(
        {name: row[name] for name in property_names if pd.notna(row[name])}
    )

    # Functional groups: the flag values double as a mask over their own labels.
    group_flags = row[functional_group_columns]
    if group_flags.any():
        record["functional group"] = group_flags.index[group_flags].tolist()

    return record
    
def to_reference_text(filtered_df):
    """Render every row of ``filtered_df`` as one line of ``key:value`` pairs.

    Each row goes through ``process_row``; list values are flattened to a
    comma-separated string, pairs within a line are separated by two spaces,
    and lines are separated by newlines.
    """
    def _as_text(field_value):
        # Lists (e.g. functional groups) become "a, b, c"; anything else is used as is.
        return ", ".join(field_value) if isinstance(field_value, list) else field_value

    lines = [
        "  ".join(
            f"{name}:{_as_text(field_value)}"
            for name, field_value in process_row(record).items()
        )
        for _, record in filtered_df.iterrows()
    ]
    return "\n".join(lines)

def _format_properties(query):
    """Render a query dict as one ``key: value`` line per property."""
    lines = []
    for key, value in query.items():
        if isinstance(value, list):
            # str() each item so non-string entries do not break the join.
            value = ", ".join(map(str, value))
        lines.append(f"{key}: {value}")
    return "\n".join(lines)


def _read_template(path):
    """Return the full text of the prompt template stored at ``path``."""
    with open(path, 'r') as f:
        return f.read()


def query2prompt(query):
    """Build the LLM prompt(s) for a property query.

    Args:
        query: Mapping of property name to value. Values may be strings,
            numbers or lists (lists are joined with ``", "``). If the key
            ``'molecular weight'`` is present, the retrieval-augmented prompt
            is built; its value may be a number or a string such as
            ``'98.07 g/mol'``.

    Returns:
        ``(use_rag, prompt)``. When ``use_rag`` is False, ``prompt`` is a single
        string built from ``simple_prompt.txt``. When it is True, ``prompt`` is
        a ``(user_prompt, assi_prompt)`` tuple: the user template with
        ``{PROPERTIES}`` filled in, and the assistant template with
        ``{QM9_REFERENCE}`` replaced by reference molecules of similar weight.

    Raises:
        ValueError: If the molecular weight cannot be parsed as a float.
        OSError: If a prompt template file cannot be read.
    """
    use_rag = 'molecular weight' in query
    plain_property = _format_properties(query)

    if not use_rag:
        prompt = _read_template('simple_prompt.txt').replace('{PROPERTIES}', plain_property)
        return use_rag, prompt

    user_prompt = _read_template('rag_mol_prompt_user.txt').replace('{PROPERTIES}', plain_property)

    # str() first so a plain number (98.07) is accepted as well as '98.07 g/mol'.
    mol_weight = float(str(query['molecular weight']).replace('g/mol', ''))
    filtered_dataset = filter_molecular_weight(mol_weight, dataset_df=dataset_df)
    reference_text = to_reference_text(filtered_dataset)

    assi_prompt = _read_template('rag_mol_rule_prompt_assistant.txt').replace('{QM9_REFERENCE}', reference_text)

    return use_rag, (user_prompt, assi_prompt)
    

def _extract_python_list(text):
    """Return the first Python list literal found in ``text``, or None if there is none."""
    start = text.find("[")
    while start != -1:
        # Try every closing bracket after this opening one: SMILES strings may
        # themselves contain "]" (e.g. "C[N+](C)(C)C"), so the first "]" is not
        # necessarily the end of the list.
        end = text.find("]", start)
        while end != -1:
            try:
                # literal_eval only accepts literals, so model output is never executed.
                candidate = ast.literal_eval(text[start:end + 1])
            except (ValueError, TypeError, SyntaxError):
                candidate = None
            if isinstance(candidate, list):
                return candidate
            end = text.find("]", end + 1)
        start = text.find("[", start + 1)
    return None


def findsmilesurl(output_string): #find SMILES
    """Ask the LLM to extract SMILES strings from ``output_string`` and print their PubChem URLs.

    The model is asked to answer with a Python list. That list is located in
    the reply, parsed as a literal, de-duplicated (first occurrence wins, so
    the order is stable), and each SMILES is printed as ``"<n>: <smiles> <url>"``
    with the URL obtained from ``smiles2url``.

    Args:
        output_string: Free text that may mention SMILES expressions.

    Returns:
        None. Nothing is printed when no list can be parsed from the reply.
    """
    prompt = "Please review this paragraph and extract the SMILES expressions, then return give me a python list of the expressions that you just extraced:\n{PARAGRAPH}\nPython list:"
    prompt = prompt.replace("{PARAGRAPH}", output_string)
    completion = client.chat.completions.create(
          model=model,
          temperature=0,
          messages=[{"role": "user", "content": prompt}])

    # The content can be None for an empty reply; treat that as "no list".
    extracted_output = completion.choices[0].message.content or ""

    parsed = _extract_python_list(extracted_output)
    if parsed is None:
        return

    # Keep string entries only and drop duplicates while preserving order.
    smiles_list = list(dict.fromkeys(item for item in parsed if isinstance(item, str)))
    for i, smiles in enumerate(smiles_list):
        print("%d: %s %s"%(i+1,smiles,smiles2url(smiles)))
        


def smiles2url(smiles, timeout=10):  # return reference website
    """Look up a SMILES string on PubChem and return the compound page URL.

    Args:
        smiles: SMILES expression to search for.
        timeout: Seconds to wait for PubChem before giving up.

    Returns:
        ``'https://pubchem.ncbi.nlm.nih.gov/compound/<cid>'`` for the first CID
        PubChem reports, or ``''`` when the request fails, the status is not
        200, or PubChem has no matching compound (CID ``0``).
    """
    # use PubChem to search url
    # The SMILES is sent as a query argument rather than a path segment, because
    # characters such as "/", "#" and "+" are legal in SMILES but would corrupt the path.
    search_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/cids/TXT"
    try:
        response = requests.get(search_url, params={"smiles": smiles}, timeout=timeout)
    except requests.RequestException:
        # Best-effort lookup: a network problem is reported like "not found".
        return ''
    if response.status_code != 200:
        return ''

    # The body holds one CID per line; use the first so the URL never contains a newline.
    cids = response.text.split()
    if not cids or cids[0] == "0":
        return ''
    return f'https://pubchem.ncbi.nlm.nih.gov/compound/{cids[0]}'

In [133]:
def search_molecule_with_LLM(qeury):
    """Query the LLM for molecules matching the given properties and print the result.

    The prompt is built with ``query2prompt``. For a retrieval-augmented query
    the user prompt and the reference prompt are sent as two messages;
    otherwise a single user message is sent. The model's answer is printed and
    then passed to ``findsmilesurl``, which prints the PubChem links.

    Args:
        qeury: Mapping of property name to value (see ``query2prompt``). The
            parameter keeps its original spelling so existing keyword callers
            continue to work.

    Returns:
        None.
    """
    # Use the argument itself; the previous body read a global named ``query`` instead.
    use_rag, prompt = query2prompt(qeury)

    if use_rag:
        user_prompt, assi_prompt = prompt
        messages = [{"role": "user", "content": user_prompt},
                    {"role": "system", "content": assi_prompt}]
    else:
        messages = [{"role": "user", "content": prompt}]

    completion = client.chat.completions.create(
          model=model,
          temperature=0,
          messages=messages)

    answer = completion.choices[0].message.content
    print(answer)
    findsmilesurl(answer)

In [134]:
# Example search: target properties of the molecule to look for. Because the
# 'molecular weight' key is present, query2prompt builds the retrieval-augmented prompt.
query = {
    "molecular weight": "98.07 g/mol",
    "density": "1.038 g/cm3",
    "melting point": "-148 C",
    "boiling point": "-12 C",
    "vapor pressure": "2770 mmHg",
    "functional group": ["Fluoro"],
}

In [135]:
# Run the search for the example query: prints the model's answer followed by
# one numbered "SMILES URL" line per distinct SMILES extracted from it.
search_molecule_with_LLM(query)

1. SMILES: CC1=NC=C(O)N1
2. SMILES: CC1=CN=CC=C1
3. SMILES: CC1=CN=CN=C1
4. SMILES: CC1=C(N)OC=C1
5. SMILES: CC1=C(N)N=NN1
6. SMILES: CC1=CNN=C1C
7. SMILES: CC1=C(O)N=CN1
8. SMILES: CC1=CC(O)=CO1
9. SMILES: CC1=CC(O)=NO1
10. SMILES: CC1=CN=CN=C1
1: CC1=C(N)N=NN1 https://pubchem.ncbi.nlm.nih.gov/compound/12284866
2: CC1=CC(O)=CO1 https://pubchem.ncbi.nlm.nih.gov/compound/54160801
3: CC1=CN=CC=C1 https://pubchem.ncbi.nlm.nih.gov/compound/7970
4: CC1=CC(O)=NO1 https://pubchem.ncbi.nlm.nih.gov/compound/24781
5: CC1=CN=CN=C1 https://pubchem.ncbi.nlm.nih.gov/compound/74859
6: CC1=CNN=C1C https://pubchem.ncbi.nlm.nih.gov/compound/137735
7: CC1=NC=C(O)N1 https://pubchem.ncbi.nlm.nih.gov/compound/21622656
8: CC1=C(O)N=CN1 https://pubchem.ncbi.nlm.nih.gov/compound/54352844
9: CC1=C(N)OC=C1 https://pubchem.ncbi.nlm.nih.gov/compound/19918535
