In [1]:
import pandas as pd

In [2]:
# Load the transformation spreadsheet (path is relative to the notebook's
# working directory) into a DataFrame.
data = pd.read_excel(
    io="../all_transformations_ch_specialized_20240702_updated.xlsx",
)

In [3]:
# Display the column index of the loaded table so the names used in the
# later cells (e.g. "bond_formed", "synth_class_specific_a") can be checked.
data.columns

Index(['input_target_molecule_smiles', 'mapped_input_target_molecule_smiles',
       'target_substructure_smarts', 'substructure_atom_map',
       'disconnection_map', 'synthon_a', 'synthon_a_label', 'synthon_b',
       'synthon_b_label', 'synthon_a_building_block_class',
       'synthon_a_building_block_label', 'synthon_b_building_block_class',
       'synthon_b_building_block_label', 'synthon_a_building_block_smiles',
       'synthon_a_building_block_cas', 'synthon_a_building_block_price',
       'synthon_a_building_block_material', 'synthon_b_building_block_smiles',
       'synthon_b_building_block_cas', 'synthon_b_building_block_price',
       'synthon_b_building_block_material', 'benzylic_a', 'alpha_het_a',
       'alpha_carbonyl_a', 'benzylic_b', 'alpha_het_b', 'alpha_carbonyl_b',
       'bond_formed', 'synth_class_specific_a', 'synth_class_specific_b',
       'synth_class_specific_a_updated', 'synth_class_specific_b_updated',
       'Unnamed: 32', 'benzylic_a_updated', 'benzylic

In [4]:
# Group the rows by the value of the "bond_formed" column.
all_groups = data.groupby(by="bond_formed")

In [5]:
# Print each distinct "bond_formed" group key; the grouped rows themselves
# are not needed here.
for bond_label, _group_rows in all_groups:
    print(bond_label)

alkyl-alkyl-CC
alkyl-alkyl-CN
alkyl-alkyl-CO
alkyl-aryl-Cn
aryl-alkyl-cC
aryl-alkyl-cN
aryl-alkyl-cO
aryl-aryl-cc
aryl-aryl-cn


In [6]:
# Every distinct value found in either specific synthon-class column,
# de-duplicated through a set; only its size is reported here.
labels_side_a = data["synth_class_specific_a"].unique().tolist()
labels_side_b = data["synth_class_specific_b"].unique().tolist()
all_lab = list(set(labels_side_a + labels_side_b))
print(len(all_lab))

# Building-block kinds that get a row in the summary table built later;
# each is combined with an "alkyl"/"aryl" prefix to form a class label.
all_labs = ["alcohol", "boronate", "acid", "amine", "iodide", "bromide", "chloride"]

18


In [7]:
from rdkit import Chem
import tabulate

def remove_atom_mapping(x):
    """Return the SMILES for ``x`` with every atom-map number cleared.

    Args:
        x: SMILES string, possibly carrying atom-map numbers.

    Returns:
        The string produced by ``Chem.MolToSmiles`` after the map number of
        each atom has been set to 0.

    Raises:
        TypeError: if ``x`` is not a string (for example a NaN coming from an
            empty spreadsheet cell).
        ValueError: if RDKit cannot parse ``x`` as SMILES.
    """
    if not isinstance(x, str):
        raise TypeError(f"expected a SMILES string, got {type(x).__name__}: {x!r}")
    mol = Chem.MolFromSmiles(x)
    # MolFromSmiles signals a parse failure by returning None rather than
    # raising, so fail here with the offending input instead of an opaque
    # AttributeError on the next line.
    if mol is None:
        raise ValueError(f"RDKit could not parse SMILES: {x!r}")
    for atom in mol.GetAtoms():
        atom.SetAtomMapNum(0)
    return Chem.MolToSmiles(mol)


# Summary table, stored as column header -> list of cell values. One value is
# appended to every list for each (coupling context, building-block kind)
# combination processed below.
#   "unique drugs formed": distinct input_target_molecule_smiles in the subset
#   "comm. synthon a":     distinct unmapped SMILES on the side matching the class
#   "comm. synthon b":     distinct unmapped SMILES on the opposite side
#   "total reactions":     number of rows in the de-duplicated subset
table = {
    "building block": [],
    "label": [],
    "unique drugs formed": [],
    "comm. synthon a": [],
    "comm. synthon b": [],
    "total reactions": [],
}

# (class prefix of the counted building block, prefix used only to pick the
# row label, value of "bond_formed" to filter on).
coupling_contexts = [
    ("alkyl", "alkyl", "alkyl-alkyl-CC"),
    ("alkyl", "aryl", "aryl-alkyl-cC"),
    ("aryl", "alkyl", "aryl-alkyl-cC"),
    ("aryl", "aryl", "aryl-aryl-cc"),
]

# Raw SMILES -> result of remove_atom_mapping. The same building block shows
# up in many rows and in several filter passes, so each string is parsed once.
unmapped_smiles_cache = {}


def _unmapped(smiles):
    """Memoised wrapper around remove_atom_mapping."""
    if smiles not in unmapped_smiles_cache:
        unmapped_smiles_cache[smiles] = remove_atom_mapping(smiles)
    return unmapped_smiles_cache[smiles]


# NOTE(review): the sheet also exposes "synth_class_specific_a_updated" /
# "synth_class_specific_b_updated" columns; confirm that the non-updated
# columns used here are the intended ones.
for own_prefix, partner_prefix, bond_formed in coupling_contexts:
    # Both values depend only on the coupling context, so compute them once
    # rather than once per building-block kind.
    in_bond = data["bond_formed"] == bond_formed
    if own_prefix == "alkyl" and partner_prefix == "aryl":
        row_label = "alkyl-aryl-Cc"
    else:
        row_label = bond_formed

    for block_kind in all_labs:
        class_label = own_prefix + " " + block_kind
        on_side_a = data["synth_class_specific_a"] == class_label
        on_side_b = data["synth_class_specific_b"] == class_label
        reactions = data[(on_side_a | on_side_b) & in_bond].drop_duplicates()

        own_blocks, partner_blocks = set(), set()
        row_fields = zip(
            reactions["synth_class_specific_a"],
            reactions["synth_class_specific_b"],
            reactions["synthon_a_building_block_smiles"],
            reactions["synthon_b_building_block_smiles"],
        )
        for class_a, class_b, smiles_a, smiles_b in row_fields:
            # Side a wins when both sides carry the class label.
            if class_a == class_label:
                own_smiles, partner_smiles = smiles_a, smiles_b
            else:
                # The filter above guarantees side b matches when side a does not.
                own_smiles, partner_smiles = smiles_b, smiles_a
            own_blocks.add(_unmapped(own_smiles))
            partner_blocks.add(_unmapped(partner_smiles))

        table["building block"].append(block_kind)
        table["label"].append(row_label)
        table["unique drugs formed"].append(
            len(reactions["input_target_molecule_smiles"].unique())
        )
        table["comm. synthon a"].append(len(own_blocks))
        table["comm. synthon b"].append(len(partner_blocks))
        table["total reactions"].append(len(reactions))

print(tabulate.tabulate(table, headers="keys", tablefmt="grid"))


+------------------+----------------+-----------------------+-------------------+-------------------+-------------------+
| building block   | label          |   unique drugs formed |   comm. synthon a |   comm. synthon b |   total reactions |
| alcohol          | alkyl-alkyl-CC |                   865 |               469 |              1552 |              6989 |
+------------------+----------------+-----------------------+-------------------+-------------------+-------------------+
| boronate         | alkyl-alkyl-CC |                   466 |                41 |               996 |              2426 |
+------------------+----------------+-----------------------+-------------------+-------------------+-------------------+
| acid             | alkyl-alkyl-CC |                  1191 |               636 |              2218 |              9973 |
+------------------+----------------+-----------------------+-------------------+-------------------+-------------------+
| amine            | alk

In [118]:
# Persist the summary table as an Excel workbook in the working directory.
pd.DataFrame(data=table).to_excel(
    excel_writer="cross_coupling_table_20250713.xlsx",
)