In [None]:
import pubchempy as pcp
import re

In [37]:
def pubchem_common_name_to_smiles(common_name: str): 
    try: 
        c = pcp.get_cids(common_name, 'name')
        if len(c) ==1: 
            compound = pcp.Compound.from_cid(c[0])
            c_smiles = compound.isomeric_smiles
        return c_smiles, common_name
    except: 
        return None, common_name

In [98]:
def split_chemicals(value:str):
    components = [comp.strip() for comp in value.split('|')]
    result = []

    for component in components:
        match = re.match(r'(.+?)(\s*(\([^\)]+\)|\[[^\]]+\]))?\s*$', component)
        if match:
            chemical_name = match.group(1).strip()
            quantity = match.group(2).strip().replace('(', '').replace(')', '').replace('[', '').replace(']', '') if match.group(2) else None
        
            result.append((chemical_name, quantity))
        
    return result

In [99]:
rxn_dict = {
    "Electrolytes": "TBAB", 
    "Additives": "TMS3Si-H (1.5 equiv), MVK (2.5 equiv)",
    "Solvents": "CH3NO2 (5 equiv) | CH2Cl2 (~5:1)"
}
value = rxn_dict["Additives"]
split_chemicals(value)

[('TMS3Si-H (1.5 equiv), MVK', '2.5 equiv')]

In [None]:
# TODO: how about photocatalysts and catalysts? 
# TODO: How about mixed systems (can be A/B or A:B or some other form...)
keys = ['Electrolytes', 'Solvents', 'Chemicals', 'Additives']

def entity_resolution_rxn_dict(rxn_dict: dict, keys):
    for key in keys: 
        value = rxn_dict[key]
        split_value = split_chemicals(value)
        

('CN(C)C=O', 'DMF')

In [5]:
import os
import pandas as pd
import json

directory = '../ocr_eval_results/ocr_eval/organic_synthesis/'
data = []

for filename in os.listdir(directory):
    if filename.endswith('_bleuscore.json'):
        file_name = filename.replace('_bleuscore.json', '')
        filepath = os.path.join(directory, filename)
        
        with open(filepath, 'r') as file:
            json_data = json.load(file)
            for key, values in json_data.items():
                data.append([file_name, key, *values])

df = pd.DataFrame(data, columns=['File Name', 'Key', 'Value 1', 'Value 2', 'Value 3'])

# Save the DataFrame to a CSV file
df.to_csv('../ocr_eval_results/ocr_eval/organic_synthesis/combined_data.csv', index=False)

print("Data saved to combined_data.csv")


Data saved to combined_data.csv
