In [20]:
# Accumulator for the Reaction messages that the conversion cell appends to.
reactions = list()

In [50]:
from ord_schema.proto import dataset_pb2
from ord_schema.proto import reaction_pb2
from ord_schema import message_helpers
from rdkit import Chem
from rdkit import RDLogger                                                                                                                                                               
RDLogger.DisableLog('rdApp.*')                                                                                                                                                           
import pandas as pd

# Load the compiled screening spreadsheet; each of its rows is later turned
# into one Reaction message.
data = pd.read_excel(io="all_CC_data_compiled_20221027.xlsx")

# Component slots recorded for every reaction, in output-column order.
component_types = [
    "Electrophile",
    "Nucleophile",
    "Catalyst1",
    "Ligand1",
    "Ligand2",
    "Catalyst2",
    "BaseAcid",
    "ReductantOxidant",
    "Additive",
    "Solvent1",
    "Product",
]
# Derived output-column names: "<component>_SMILES" and "<component> charge".
component_smiles = [f"{name}_SMILES" for name in component_types]
component_charges = [f"{name} charge" for name in component_types]

# One independent, initially empty list per output column: the component
# name/SMILES/charge columns first, followed by the per-reaction metadata.
convert_data = {
    column: []
    for column in (
        *component_types,
        *component_smiles,
        *component_charges,
        "Pd/IS",
        "Plate Position",
        "Reaction_type",
        "Temperature",
    )
}
    
# Flatten every spreadsheet row into the columns prepared in convert_data:
# for each component type its name, a canonical SMILES and its "charge"
# (the concentration column rendered as text), followed by the per-reaction
# metadata. The string "None" is the sentinel for "no value" throughout.
for _, row in data.iterrows():
    for component in component_types:
        name = row[component]
        # Missing cells can arrive as NaN/None (for instance when pandas parses
        # the text "None" as NA). Store the sentinel instead, so the later
        # `== "None"` skip test recognises empty component slots.
        convert_data[component].append("None" if pd.isna(name) else name)

        # No SMILES is read for solvent slots.
        if component in ("Solvent1", "Solvent2"):
            raw_smiles = None
        else:
            raw_smiles = row[component + " SMILES"]

        if raw_smiles is None or raw_smiles == "None" or pd.isna(raw_smiles):
            canonical = "None"
        else:
            # Round-trip through RDKit to canonicalise; MolFromSmiles returns
            # None for strings it cannot parse.
            mol = Chem.MolFromSmiles(raw_smiles)
            canonical = Chem.MolToSmiles(mol) if mol is not None else "None"
        convert_data[component + "_SMILES"].append(canonical)

        # Solvents and the product carry no concentration.
        if component in ("Solvent1", "Solvent2", "Product"):
            convert_data[component + " charge"].append("None")
        else:
            convert_data[component + " charge"].append(f"{row[component + ' Conc (M)']}")

    convert_data["Pd/IS"].append(row["Output Value"])
    # Plate position is the row label followed by the column number.
    # (Alternative source considered in the original notebook: row["1536loc"].)
    convert_data["Plate Position"].append(row["Row"] + str(row["Column"]))

    convert_data["Reaction_type"].append(row["Notebook"])
    convert_data["Temperature"].append(row["Temperature"])

out_data = pd.DataFrame(convert_data)

# Build one ORD Reaction message per row of out_data.
#
# Start from a fresh list: appending to a list left over from an earlier run
# of this cell would store every reaction twice in the exported dataset.
reactions = []

# Input slots that are written as compounds with an amount.
# NOTE(review): every slot, including catalysts and ligands, is recorded with
# the "reactant" role - confirm this is intended.
reactant_slots = [
    "Electrophile", "Nucleophile", "Catalyst1", "Ligand1", "Ligand2",
    "Catalyst2", "BaseAcid", "ReductantOxidant", "Additive",
]

for _, row in out_data.iterrows():
    reaction = reaction_pb2.Reaction()

    for rxt in reactant_slots:
        name = row[rxt]
        # Skip empty slots, whether marked with the "None" sentinel or left
        # as a missing value (NaN/None).
        if name == "None" or pd.isna(name):
            continue
        smiles = row[rxt + "_SMILES"]
        solute = reaction.inputs[rxt].components.add()
        solute.CopyFrom(
            message_helpers.build_compound(
                name=str(name),
                # "None" is a sentinel, not a structure; do not record it as
                # a SMILES identifier.
                smiles=None if smiles == "None" else smiles,
                role="reactant",
                # Concentration (M) x 0.1 -> mmol; presumably 0.1 mL, matching
                # the 100 uL solvent volume below.
                amount=f"{float(row[rxt + ' charge']) * .1} mmol",
                prep=None,
                is_limiting=False,
                prep_details=None,
            )
        )

    solvent = reaction.inputs["Solvent1"].components.add()
    solvent.CopyFrom(
        message_helpers.build_compound(
            name=row["Solvent1"],
            # NOTE(review): the solvent structure is still a literal placeholder.
            smiles="placeholder",
            role="solvent",
            amount="100 uL",
            prep=None,
            is_limiting=False,
            prep_details=None,
        )
    )
    solvent.amount.volume_includes_solutes = True

    outcome = reaction.outcomes.add()
    product = outcome.products.add(is_desired_product=True)
    product_smiles = row["Product_SMILES"]
    if product_smiles != "None":
        product.identifiers.add(type="SMILES", value=product_smiles)
    product.identifiers.add(type="NAME", value=row["Product"])
    product.reaction_role = reaction_pb2.ReactionRole.PRODUCT
    # NOTE(review): the "Pd/IS" column is stored as a YIELD percentage, and no
    # analysis named "UPLC-MS Integration" is added to outcome.analyses -
    # confirm both against the ORD schema validation rules.
    product.measurements.add(
        type="YIELD",
        analysis_key="UPLC-MS Integration",
        percentage=dict(value=row["Pd/IS"], precision=5),
        uses_internal_standard=True,
    )

    # NOTE(review): the Temperature column is written to the temperature
    # control *type* field - confirm it holds control-type names and not
    # numeric setpoints.
    reaction.conditions.temperature.CopyFrom(
        reaction_pb2.TemperatureConditions(
            control=dict(type=row["Temperature"])
        )
    )
    reactions.append(reaction)
    


In [51]:
# Sanity check: how many Reaction messages were collected.
print(f"{len(reactions)}")

5136


In [52]:
# Bundle every collected reaction into a single ORD Dataset message and write
# it to disk (the .pbtxt suffix presumably selects the text protobuf format).
dataset = dataset_pb2.Dataset()
dataset.name = "all_phactor_screens"
dataset.description = "Screens from figure2, figure3 and figure4 in the phactor paper"
dataset.reactions.extend(reactions)
message_helpers.write_message(dataset, "ord-phactor-screens.pbtxt")