In [1]:
import json
import os
import pickle
import pprint
from collections import defaultdict

import jsonlines
import langchain
import openai
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from tqdm import tqdm

In [3]:
# Take the OpenAI key from the OPENAI_API_KEY environment variable so the real
# secret never has to be typed into (or committed with) the notebook. The
# original placeholder is kept as the fallback when the variable is unset.
api_key = os.environ.get('OPENAI_API_KEY', 'your_api_key')

# Few-shot

In [2]:
# Fold index -> list of records read from that fold's JSONL file.
train_data, test_data = defaultdict(list), defaultdict(list)

# For every fold 0..4 read `train_<fold>.jsonl` first, then `test_<fold>.jsonl`,
# appending each parsed line to the matching bucket.
for fold in range(5):
    for split_name, bucket in (('train', train_data), ('test', test_data)):
        with jsonlines.open(f'{split_name}_{fold}.jsonl') as reader:
            for record in reader.iter():
                bucket[fold].append(record)

In [3]:
# Number of worked examples to place in the few-shot prompt.
n_example = 100
system_prompt = train_data[0][0]['messages'][0]['content']


def _escape_braces(text):
    """Double every brace so PromptTemplate treats it as literal text."""
    return text.replace("{", "{{").replace("}", "}}")


# Everything taken from the data is brace-escaped: the finished string is used
# as a PromptTemplate, where a single `{` or `}` would be read as a variable.
prompt = f"""{_escape_braces(system_prompt)}

begin!

"""

rn = 0  # examples accepted so far
# Look at no more than the first n_example * 5 training records of fold 0.
# Slicing (rather than indexing up to that bound) keeps this safe when the fold
# holds fewer records than that.
for example in train_data[0][:n_example * 5]:
    precursor = example['messages'][1]['content']
    output = example['messages'][2]['content']

    # Keep only examples whose precursor text contains `", "` -- presumably
    # those listing more than one precursor; confirm against the data format.
    if "\", \"" not in precursor:
        continue

    prompt += f"Precursor: {_escape_braces(precursor)}\nOutput: {_escape_braces(output)}\n\n"
    rn += 1
    if rn >= n_example:
        break

# The only real template variable: the precursor of the sample being queried.
prompt += "Precursor: {precursor}\nOutput: "


In [4]:
# Preview the raw template text; braces still appear doubled here because the
# template has not been formatted yet.
print(prompt)

You act like a MOF synthesis expert. I will give you precursors of MOF and you have to suggest the appropriate synthesis conditions for this MOF. You have to suggest the synthesis conditions in JSON format and contain these categories : ['precursor', 'synthesis_method', 'solvent', 'temperature', 'time', 'pressure', 'cooling', 'pH_adjustment', 'washing', 'filtration', 'drying'].

begin!

Precursor: ["Cd(NO3)2\u22c54H2O", "H2bismip", "4,4'-bipy"]
Output: {{"precursors": ["Cd(NO3)2\u22c54H2O", "H2bismip", "4,4'-bipy"], "synthesis_method": "chemical synthesis", "solvent": ["water", "DMF"], "temperature": "110 \u00b0C", "time": "1 day", "pressure": "1 atm", "cooling": true, "pH_adjustment": false, "washing": false, "filtration": false, "drying": false}}

Precursor: ["NiCl2\u22c56H2O", "1,4-butylenediphosphonic acid"]
Output: {{"precursors": ["NiCl2\u22c56H2O", "1,4-butylenediphosphonic acid"], "synthesis_method": "chemical synthesis", "solvent": ["water", "DMSO"], "temperature": "25 \u00b0C

In [5]:
# Wrap the prompt text in a template whose single variable is `precursor`.
template = PromptTemplate(
    input_variables=['precursor'],
    template=prompt,
)

In [8]:
model = 'gpt-3.5-turbo'

llm = ChatOpenAI(model=model, openai_api_key=api_key)
chain = LLMChain(prompt=template, llm=llm)

# Parallel lists: entry k of each comes from the same test sample. A sample is
# left out of both when its reference or the model reply is not valid JSON, or
# when the API call itself fails.
true_label, prediction_label = list(), list()

for test in tqdm(test_data[0]):
    precursor = test['messages'][1]['content']
    reference = test['messages'][2]['content']

    try:
        # Parse the reference first so a malformed label is skipped without
        # spending an API call on it.
        true = json.loads(reference.strip())

        reply = chain.run(precursor=precursor, stop=['Precursor:']).strip()
        # The reply may arrive wrapped in a markdown code fence (with or without
        # a `json` tag); unwrap it, as the gpt-4o cells of this notebook do, so
        # the sample is not dropped for formatting alone.
        if len(reply) >= 6 and reply.startswith("```") and reply.endswith("```"):
            reply = reply[3:-3].strip()
            if reply[:4].lower() == "json":
                reply = reply[4:]
        prediction = json.loads(reply.strip())
    except Exception as e:
        # Best effort: report the problem and move on to the next sample.
        print(e)
    else:
        true_label.append(true)
        prediction_label.append(prediction)

 15%|█▌        | 324/2094 [11:27<57:58,  1.96s/it]  Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: HTTP code 520 from API (<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>


<title>api.openai.com | 520: Web server is returning an unknown error</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />


</head>
<body>
<div id="cf-wra

In [9]:
# Persist the few-shot gpt-3.5 references and predictions, one pickle each.
# (`pickle` is already imported in the notebook's first cell.)
for path, labels in (
    ('true_label_fewshot100_gpt3_save.pickle', true_label),
    ('prediction_label_fewshot100_gpt3_save.pickle', prediction_label),
):
    with open(path, 'wb') as fh:
        pickle.dump(labels, fh)

In [10]:
model = 'gpt-4o'

llm = ChatOpenAI(model=model, openai_api_key=api_key)
chain = LLMChain(prompt=template, llm=llm)

# Parallel lists: entry k of each comes from the same test sample. A sample is
# left out of both when its reference or the model reply is not valid JSON, or
# when the API call itself fails.
true_label_4o, prediction_label_4o = list(), list()

for test in tqdm(test_data[0]):
    precursor = test['messages'][1]['content']
    reference = test['messages'][2]['content']

    try:
        # Parse the reference first so a malformed label is skipped without
        # spending an API call on it.
        true = json.loads(reference.strip())

        reply = chain.run(precursor=precursor, stop=['Precursor:']).strip()
        # Unwrap a surrounding markdown code fence, with or without a `json`
        # tag. Only the outer fence is removed, so backticks inside the JSON
        # payload are left alone.
        if len(reply) >= 6 and reply.startswith("```") and reply.endswith("```"):
            reply = reply[3:-3].strip()
            if reply[:4].lower() == "json":
                reply = reply[4:]
        prediction = json.loads(reply.strip())
    except Exception as e:
        # Best effort: report the problem and move on to the next sample.
        print(e)
    else:
        true_label_4o.append(true)
        prediction_label_4o.append(prediction)

100%|██████████| 2094/2094 [1:59:04<00:00,  3.41s/it]  


In [11]:
# Persist the few-shot gpt-4o references and predictions, one pickle each.
for path, labels in (
    ('true_label_fewshot100_gpt4o_save.pickle', true_label_4o),
    ('prediction_label_fewshot100_gpt4o_save.pickle', prediction_label_4o),
):
    with open(path, 'wb') as fh:
        pickle.dump(labels, fh)

# Zero-shot

In [17]:
system_prompt = train_data[0][0]['messages'][0]['content']
# The finished string is used as a PromptTemplate: literal braces must be
# doubled (`{{` / `}}`), which is why the JSON skeleton below is written that
# way and why the system prompt taken from the data is escaped too.
# NOTE: the wording below fixes two slips in the instructions (the misspelling
# of "autogenous" and the `synthesis_method` key name), so replies may differ
# from runs made with the earlier wording.
prompt = system_prompt.replace("{", "{{").replace("}", "}}") + """
Output format must be:
{{
    "precursors": List[str],
    "synthesis_method": str,
    "solvent": List[str],
    "temperature": str (e.g. 25 °C),
    "time": str (e.g. 2 h),
    "pressure": str (e.g. 1 atm),
    "cooling": bool,
    "pH_adjustment": bool,
    "washing": False or List[str],
    "filtration": bool,
    "drying": bool
}}

- Each element of solvent and washing must be one of this list:
['water', 'DMF', 'MeOH', 'EtOH', 'acetonitrile', 'DMSO', 'DMA', 'CH2Cl2', 'CHCl3', 'THF', 'toluene', 'acetone', 'Et3N', 'DEF', 'pyridine', 'benzene', 'NMP', 'isopropanol', 'AcOH', 'dioxane', 'diethyl ether', 'ethylene glycol', 'DMAC', 'cyclohexane', 'formic acid', 'hexane', 'TEA', 'BuOH', 'DMI', 'nitrobenzene', 'Ethanol', 'DEA', 'DMAE', 'dichlorobenzene', 'TBA', 'DME', 'formamide', 'ethyl acetate', 'TPA', 'isobutanol', 'NMA', 'DMAc']
- "synthesis_method" must be one of this list:
['chemical synthesis', 'solvothermal synthesis', 'sonochemical synthesis', 'hydrothermal synthesis']
- If pressure is autogenous, write "autogenous"
- You can use "min", "h", "days", and "weeks" in unit of time.

begin!

"""
# The only real template variable: the precursor of the sample being queried.
prompt += "Precursor: {precursor}\nOutput: "


In [18]:
# Preview the raw zero-shot template text; braces still appear doubled here
# because the template has not been formatted yet.
print(prompt)

You act like a MOF synthesis expert. I will give you precursors of MOF and you have to suggest the appropriate synthesis conditions for this MOF. You have to suggest the synthesis conditions in JSON format and contain these categories : ['precursor', 'synthesis_method', 'solvent', 'temperature', 'time', 'pressure', 'cooling', 'pH_adjustment', 'washing', 'filtration', 'drying'].
Output format must be:
{{
    "precursors": List[str],
    "synthesis_method": str,
    "solvent": List[str],
    "temperature": str (e.g. 25 °C),
    "time": str (e.g. 2 h),
    "pressure": str (e.g. 1 atm),
    "cooling": bool,
    "pH_adjustment": bool,
    "washing": False or List[str],
    "filtration": bool,
    "drying": bool
}}

- Each element of solvent and washing must be one of this list:
['water', 'DMF', 'MeOH', 'EtOH', 'acetonitrile', 'DMSO', 'DMA', 'CH2Cl2', 'CHCl3', 'THF', 'toluene', 'acetone', 'Et3N', 'DEF', 'pyridine', 'benzene', 'NMP', 'isopropanol', 'AcOH', 'dioxane', 'diethyl ether', 'ethylen

In [19]:
# Wrap the zero-shot prompt text in a template whose single variable is `precursor`.
template = PromptTemplate(
    input_variables=['precursor'],
    template=prompt,
)

In [24]:
model = 'gpt-3.5-turbo'

llm = ChatOpenAI(model=model, openai_api_key=api_key)
chain = LLMChain(prompt=template, llm=llm)

# Parallel lists: entry k of each comes from the same test sample. A sample is
# left out of both when its reference or the model reply is not valid JSON, or
# when the API call itself fails.
true_label, prediction_label = list(), list()

for test in tqdm(test_data[0]):
    precursor = test['messages'][1]['content']
    reference = test['messages'][2]['content']

    try:
        # Parse the reference first so a malformed label is skipped without
        # spending an API call on it.
        true = json.loads(reference.strip())

        reply = chain.run(precursor=precursor, stop=['Precursor:']).strip()
        # The reply may arrive wrapped in a markdown code fence (with or without
        # a `json` tag); unwrap it, as the gpt-4o cells of this notebook do, so
        # the sample is not dropped for formatting alone.
        if len(reply) >= 6 and reply.startswith("```") and reply.endswith("```"):
            reply = reply[3:-3].strip()
            if reply[:4].lower() == "json":
                reply = reply[4:]
        prediction = json.loads(reply.strip())
    except Exception as e:
        # Best effort: report the problem and move on to the next sample.
        print(e)
    else:
        true_label.append(true)
        prediction_label.append(prediction)

  6%|▌         | 118/2094 [03:13<56:08,  1.70s/it]  Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: HTTP code 520 from API (<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>


<title>api.openai.com | 520: Web server is returning an unknown error</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />


</head>
<body>
<div id="cf-wra

Invalid \escape: line 2 column 28 (char 29)


100%|██████████| 2094/2094 [59:35<00:00,  1.71s/it] 


In [25]:
# Sanity check: display whatever value the evaluation loop last bound to `prediction`.
prediction

{'precursors': ['ZnSO4⋅7H2O', 'Hmtz', 'Hnic', 'NaOH'],
 'synthesis_method': 'hydrothermal synthesis',
 'solvent': ['water', 'DMF'],
 'temperature': '120 °C',
 'time': '2 days',
 'pressure': 'autogenous',
 'cooling': True,
 'pH_adjustment': True,
 'washing': ['water'],
 'filtration': True,
 'drying': True}

In [26]:
# Persist the zero-shot gpt-3.5 references and predictions, one pickle each.
# (`pickle` is already imported in the notebook's first cell.)
for path, labels in (
    ('true_label_zeroshot_gpt3_save.pickle', true_label),
    ('prediction_label_zeroshot_gpt3_save.pickle', prediction_label),
):
    with open(path, 'wb') as fh:
        pickle.dump(labels, fh)

In [27]:
model = 'gpt-4o'

llm = ChatOpenAI(model=model, openai_api_key=api_key)
chain = LLMChain(prompt=template, llm=llm)

# Parallel lists: entry k of each comes from the same test sample. A sample is
# left out of both when its reference or the model reply is not valid JSON, or
# when the API call itself fails.
true_label_4o, prediction_label_4o = list(), list()

for test in tqdm(test_data[0]):
    precursor = test['messages'][1]['content']
    reference = test['messages'][2]['content']

    try:
        # Parse the reference first so a malformed label is skipped without
        # spending an API call on it.
        true = json.loads(reference.strip())

        reply = chain.run(precursor=precursor, stop=['Precursor:']).strip()
        # Unwrap a surrounding markdown code fence, with or without a `json`
        # tag. Only the outer fence is removed, so backticks inside the JSON
        # payload are left alone.
        if len(reply) >= 6 and reply.startswith("```") and reply.endswith("```"):
            reply = reply[3:-3].strip()
            if reply[:4].lower() == "json":
                reply = reply[4:]
        prediction = json.loads(reply.strip())
    except Exception as e:
        # Best effort: report the problem and move on to the next sample.
        print(e)
    else:
        true_label_4o.append(true)
        prediction_label_4o.append(prediction)

100%|██████████| 2094/2094 [1:24:54<00:00,  2.43s/it]


In [28]:
# Sanity check: display whatever value the evaluation loop last bound to `prediction`.
prediction

{'precursors': ['ZnSO4∙7H2O', 'Hmtz', 'Hnic', 'NaOH'],
 'synthesis_method': 'hydrothermal synthesis',
 'solvent': ['water'],
 'temperature': '120 °C',
 'time': '24 h',
 'pressure': 'autogenous',
 'cooling': True,
 'pH_adjustment': True,
 'washing': ['water'],
 'filtration': True,
 'drying': True}

In [29]:
# Persist the zero-shot gpt-4o references and predictions, one pickle each.
for path, labels in (
    ('true_label_zeroshot_gpt4o_save.pickle', true_label_4o),
    ('prediction_label_zeroshot_gpt4o_save.pickle', prediction_label_4o),
):
    with open(path, 'wb') as fh:
        pickle.dump(labels, fh)