# In [13]:
from ord_schema.proto import dataset_pb2
from ord_schema.proto import reaction_pb2
from ord_schema import message_helpers
from rdkit import Chem
from rdkit import RDLogger                                                                                                                                                               
RDLogger.DisableLog('rdApp.*')                                                                                                                                                           
import pandas as pd

# Load the raw results table; every row is later turned into one Reaction.
results_path = "5. zz210326_final_results.csv"
data = pd.read_csv(results_path)

# Component roles expected as columns of the results table.
component_types = [
    "Electrophile",
    "Nucleophile",
    "Catalyst1",
    "Ligand1",
    "Ligand2",
    "Catalyst2",
    "BaseAcid",
    "ReductantOxidant",
    "Additive",
    "Solvent1",
    "Solvent2",
    "Product",
]
# Derived output column names: one SMILES and one charge column per role.
component_smiles = [f"{role}_SMILES" for role in component_types]
component_charges = [f"{role} charge" for role in component_types]

# Column accumulator: one independent empty list per output column. The key
# order (names, SMILES, charges, then the per-reaction columns) determines the
# column order of the DataFrame built from this dict.
_output_columns = [
    *component_types,
    *component_smiles,
    *component_charges,
    "Pd/IS",
    "Plate Position",
    "Reaction_type",
]
convert_data = {column: [] for column in _output_columns}
    
def _canonical_smiles(raw):
    """Return the RDKit-canonical SMILES for *raw*, or "None" if unusable.

    "Unusable" covers a missing cell, the literal text "None", and any string
    RDKit cannot parse.
    """
    # pandas stores missing CSV cells as NaN, which a comparison against None
    # never matches; pd.isna catches both NaN and None before RDKit sees them.
    if pd.isna(raw) or raw == "None":
        return "None"
    mol = Chem.MolFromSmiles(raw)
    return Chem.MolToSmiles(mol) if mol is not None else "None"


# Normalise every row of the raw table into the flat column lists.
for _, row in data.iterrows():
    for role in component_types:
        # A missing name is stored as the "None" marker so that the later
        # `== "None"` skip check also covers empty cells.
        # NOTE(review): depending on the pandas version, the literal text
        # "None" in the CSV may itself be parsed as missing - confirm.
        name = row[role]
        convert_data[role].append("None" if pd.isna(name) else name)

        # Solvents have no SMILES column in the input table.
        if role in ("Solvent1", "Solvent2"):
            convert_data[role + "_SMILES"].append("None")
        else:
            convert_data[role + "_SMILES"].append(
                _canonical_smiles(row[role + " SMILES"])
            )

        # Solvents and the product have no concentration column.
        if role in ("Solvent1", "Solvent2", "Product"):
            convert_data[role + " charge"].append("None")
        else:
            convert_data[role + " charge"].append(f"{row[role + ' Conc (M)']}")

    convert_data["Pd/IS"].append(row["Output Value"])
    # Plate position is the row label followed by the column value, e.g. "A" + "1".
    convert_data["Plate Position"].append(row["Row"] + str(row["Column"]))
    convert_data["Reaction_type"].append("sp3-sp3 deaminative-decarboxylative C-C coupling")
out_data = pd.DataFrame(convert_data)

# Roles added as reaction inputs; the solvent and product are handled
# separately inside the loop.
reactant_roles = (
    "Electrophile",
    "Nucleophile",
    "Catalyst1",
    "Ligand1",
    "Ligand2",
    "Catalyst2",
    "BaseAcid",
    "ReductantOxidant",
    "Additive",
)

# Every Reaction built in the loop is accumulated here; the Dataset written at
# the end is constructed from this list.
reactions = []
for _, row in out_data.iterrows():
    reaction = reaction_pb2.Reaction()

    for role in reactant_roles:
        # "None" marks a component that is absent from this reaction.
        if row[role] == "None":
            continue
        solute = reaction.inputs[role].components.add()
        solute.CopyFrom(
            message_helpers.build_compound(
                name=row[role],
                smiles=row[role + "_SMILES"],
                role="reactant",
                # Concentration (M) scaled by 0.1 to give mmol - presumably the
                # 100 uL reaction volume; confirm.
                amount=f"{float(row[role + ' charge']) * .1} mmol",
                prep=None,
                is_limiting=False,
                prep_details=None,
            )
        )

    solvent = reaction.inputs["Solvent1"].components.add()
    solvent.CopyFrom(
        message_helpers.build_compound(
            name=row["Solvent1"],
            # NOTE(review): a literal "placeholder" is stored as the solvent
            # SMILES identifier; replace with real SMILES if available.
            smiles="placeholder",
            role="solvent",
            amount="100 uL",
            prep=None,
            is_limiting=False,
            prep_details=None,
        )
    )
    solvent.amount.volume_includes_solutes = True

    outcome = reaction.outcomes.add()
    product = outcome.products.add(is_desired_product=True)
    product.identifiers.add(type="SMILES", value=row["Product_SMILES"])
    product.identifiers.add(type="NAME", value=row["Product"])
    product.reaction_role = reaction_pb2.ReactionRole.PRODUCT
    # NOTE(review): no entry with this key is added to outcome.analyses here;
    # check whether schema validation requires one.
    product.measurements.add(
        type="YIELD",
        analysis_key="UPLC-MS Integration",
        percentage=dict(value=row["Pd/IS"], precision=5),
        uses_internal_standard=True,
    )

    reactions.append(reaction)

dataset = dataset_pb2.Dataset(
    name="sp3-sp3 deaminative-decarboxylative C-C coupling zz210326",
    description="DOI: 10.1002/anie.202112454",
    reactions=reactions,
)
message_helpers.write_message(dataset, 'ord-zz210326.pbtxt')