### Get molecular properties of the late-stage diversified systems

In [1]:
from rdkit import Chem
import pandas as pd
import numpy as np

from rdkit.Chem import AllChem, rdMolDescriptors, Descriptors
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions
from tqdm.notebook import tqdm

from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

import multiprocessing as mp
import molprops_utils

### physicochemical properties - for joint distribution plots.

In [2]:
def get_df_props(input_filename,output_filename,smiles_column):
    """Compute molecular properties for a CSV of SMILES and save the result.

    Reads ``input_filename``, calls ``molprops_utils.getallprops`` on every
    entry of ``smiles_column`` and assigns the returned values, in order, to
    the columns LogP, MW, HBD, HBA, PSA, ROTB, FSP3, SSSR and QED. The
    augmented table is written to ``output_filename`` without the index.

    Parameters
    ----------
    input_filename : str
        Path of the CSV file to read.
    output_filename : str
        Path of the CSV file to write.
    smiles_column : str
        Name of the column that holds the SMILES strings.

    Returns
    -------
    None
        The result is only written to disk.
    """
    prop_columns = ["LogP","MW","HBD","HBA","PSA","ROTB","FSP3","SSSR","QED"]
    data = pd.read_csv(input_filename)

    # Assign the property rows straight to their target columns. Going through
    # a scratch "allprops" column (and dropping it afterwards) would silently
    # destroy an input column that happens to carry that name.
    data[prop_columns] = [
        molprops_utils.getallprops(s) for s in tqdm(data[smiles_column])
    ]
    data.to_csv(output_filename,index=False)
    
    

### for sp3 system

In [3]:
# Paths and SMILES column for the property calculation on the sp3 full molecules.
# NOTE(review): the input file is written by the ring-count filtering cell
# further down (In [12]); on a fresh checkout that cell has to run first.
input_filename = "./ac3drug_am3small/ac3drug_am3small_full_4rings.csv"

output_filename = "./ac3drug_am3small/ac3drug_am3small_props.csv"
smiles_column = "largest_cleaned"

In [4]:
# Compute the properties for the configured input and write them to output_filename.
get_df_props(input_filename,output_filename,smiles_column)

  0%|          | 0/14153 [00:00<?, ?it/s]

In [5]:
# Paths and SMILES column for the property calculation on the sp3 reaction centers.
# NOTE(review): the input file is written by the ring-count filtering cell
# further down (In [12]); on a fresh checkout that cell has to run first.
input_filename = "./ac3drug_am3small/ac3drug_am3small_rxnctr_4rings.csv"

output_filename = "./ac3drug_am3small/ac3drug_am3small_rxn_centers_props.csv"
smiles_column = "pdt_cleaned"

In [6]:
# Compute the properties for the configured input and write them to output_filename.
get_df_props(input_filename,output_filename,smiles_column)

  0%|          | 0/14153 [00:00<?, ?it/s]

### for sp2 system

In [7]:
# Paths and SMILES column for the property calculation on the sp2 full molecules.
# NOTE(review): the input file is written by the sp2 ring-count filtering cell
# further down (In [15]); on a fresh checkout that cell has to run first.
input_filename = "./ac2drug_am2small/ac2drug_am2small_full_4rings.csv"

output_filename = "./ac2drug_am2small/ac2drug_am2small_aromatic_props.csv"
smiles_column = "largest_cleaned"

In [8]:
# Compute the properties for the configured input and write them to output_filename.
get_df_props(input_filename,output_filename,smiles_column)

  0%|          | 0/9839 [00:00<?, ?it/s]

In [9]:
# Paths and SMILES column for the property calculation on the sp2 reaction centers.
# NOTE(review): the input file is written by the sp2 ring-count filtering cell
# further down (In [15]); on a fresh checkout that cell has to run first.
input_filename = "./ac2drug_am2small/ac2drug_am2small_rxnctr_4rings.csv"

output_filename = "./ac2drug_am2small/ac2drug_am2small_rxn_centers_props.csv"
smiles_column = "pdt_cleaned"

In [10]:
# Compute the properties for the configured input and write them to output_filename.
get_df_props(input_filename,output_filename,smiles_column)

  0%|          | 0/9839 [00:00<?, ?it/s]

## PMI

## Enumerate the stereoisomers

In [11]:
# Stereoisomer enumeration settings: unique results, also re-enumerate centers
# that are already assigned (onlyUnassigned=False), no embedding check.
# NOTE(review): `opts` is not referenced by any other visible cell -- the
# stereoisomer cells call molprops_utils.get_stereoisomers without passing it.
# Confirm whether it is still needed.
opts = StereoEnumerationOptions(unique=True,onlyUnassigned=False,tryEmbedding=False)

## sp3 system

In [12]:
# Load the sp3 full-molecule table and the matching reaction-center table.
full_system = pd.read_csv("./ac3drug_am3small/ac3drug_am3small_smiles.csv")
rxn_center = pd.read_csv("./ac3drug_am3small/ac3drug_am3small_rxn_centers.csv")

# Ring count of every reaction-center product: number of rings in RDKit's
# symmetrized SSSR of the parsed SMILES.
SSSR = [
    len(Chem.GetSymmSSSR(Chem.MolFromSmiles(smi)))
    for smi in tqdm(rxn_center.pdt_cleaned)
]
rxn_center["SSSR"] = SSSR

# Keep only the reaction centers with four rings or fewer and save them.
few_rings = rxn_center[rxn_center.SSSR <= 4]
few_rings.to_csv("./ac3drug_am3small/ac3drug_am3small_rxnctr_4rings.csv",index=False)

# The full molecules are filtered through their reaction centers: a row is
# kept when its rmat_tag belongs to a reaction center that passed the filter.
valid_rmat_tags = few_rings.rmat_tag
keep_full = full_system.rmat_tag.isin(valid_rmat_tags)
full_system[keep_full].to_csv("./ac3drug_am3small/ac3drug_am3small_full_4rings.csv",index=False)

  0%|          | 0/14169 [00:00<?, ?it/s]

In [13]:
# Run the stereoisomer helper on the ring-filtered sp3 full molecules.
input_filename = "./ac3drug_am3small/ac3drug_am3small_full_4rings.csv"

output_filename = "./ac3drug_am3small/ac3drug_am3small_full_4rings_stereo.csv"
smiles_column = "largest_cleaned"

# get_stereoisomers is defined in molprops_utils (not visible here); it is
# handed the input path, the output path and the SMILES column name.
molprops_utils.get_stereoisomers(input_filename,output_filename,smiles_column)

  0%|          | 0/14153 [00:00<?, ?it/s]

In [14]:
# Run the stereoisomer helper on the ring-filtered sp3 reaction centers.
input_filename = "./ac3drug_am3small/ac3drug_am3small_rxnctr_4rings.csv"

output_filename = "./ac3drug_am3small/ac3drug_am3small_rxnctr_stereos.csv"
smiles_column = "pdt_cleaned"

# get_stereoisomers is defined in molprops_utils (not visible here); it is
# handed the input path, the output path and the SMILES column name.
molprops_utils.get_stereoisomers(input_filename,output_filename,smiles_column)

  0%|          | 0/14153 [00:00<?, ?it/s]

## sp2 system

In [15]:
# Load the sp2 full-molecule table and the matching reaction-center table.
full_system = pd.read_csv("./ac2drug_am2small/ac2drug_am2small_aromatic.csv")
rxn_center = pd.read_csv("./ac2drug_am2small/ac2drug_am2small_rxn_centers.csv")

# Ring count of every reaction-center product: number of rings in RDKit's
# symmetrized SSSR of the parsed SMILES.
SSSR = [
    len(Chem.GetSymmSSSR(Chem.MolFromSmiles(smi)))
    for smi in tqdm(rxn_center.pdt_cleaned)
]
rxn_center["SSSR"] = SSSR

# Reaction centers with four rings or fewer, and the rmat_tag values they carry.
few_rings = rxn_center[rxn_center.SSSR <= 4]
valid_rmat_tags = few_rings.rmat_tag

few_rings.to_csv("./ac2drug_am2small/ac2drug_am2small_rxnctr_4rings.csv",index=False)

# Keep a full molecule only when its rmat_tag survived the reaction-center filter.
keep_full = full_system.rmat_tag.isin(valid_rmat_tags)
full_system[keep_full].to_csv("./ac2drug_am2small/ac2drug_am2small_full_4rings.csv",index=False)

  0%|          | 0/9842 [00:00<?, ?it/s]

In [16]:
# Run the stereoisomer helper on the ring-filtered sp2 full molecules.
input_filename = "./ac2drug_am2small/ac2drug_am2small_full_4rings.csv"

output_filename = "./ac2drug_am2small/ac2drug_am2small_aromatic_stereos.csv"
smiles_column = "largest_cleaned"

# get_stereoisomers is defined in molprops_utils (not visible here); it is
# handed the input path, the output path and the SMILES column name.
molprops_utils.get_stereoisomers(input_filename,output_filename,smiles_column)

  0%|          | 0/9839 [00:00<?, ?it/s]

## Calculate PMI

### sp3 system

### full molecule

In [17]:
# PMI run on the sp3 full-molecule stereoisomer file written by the
# get_stereoisomers call in In [13].
input_filename = "./ac3drug_am3small/ac3drug_am3small_full_4rings_stereo.csv"
output_filename = "./ac3drug_am3small/ac3drug_am3small_stereo_pmi.csv"
smiles_column = "smiles"

In [18]:
# index_col=0 turns the first CSV column into the index. The result is later
# saved with to_csv(index=False), so that column does not reach the PMI file.
# NOTE(review): the other three PMI sections read their stereoisomer files
# without index_col; confirm which of the two behaviours is intended.
data = pd.read_csv(input_filename,index_col=0)

In [19]:
# Apply molprops_utils.cal_pmi to every SMILES using 32 worker processes;
# each task handed to a worker is a chunk of 1000 entries.
if __name__ == '__main__':
    with mp.Pool(32) as p:
        # Read the column selected by `smiles_column` instead of a hard-coded
        # `data.smiles`, so the configuration cell actually controls the input.
        result = p.map(molprops_utils.cal_pmi, data[smiles_column],chunksize=1000)

In [20]:
# Convert the pooled results to an array once; column 0 is stored as pmi_x
# and column 1 as pmi_y before the table is written without its index.
pmi_values = np.array(result)
data["pmi_x"] = pmi_values[:, 0]
data["pmi_y"] = pmi_values[:, 1]
data.to_csv(output_filename, index=False)

### reaction center

In [21]:
# PMI run on the sp3 reaction-center stereoisomer file written by the
# get_stereoisomers call in In [14].
input_filename = "./ac3drug_am3small/ac3drug_am3small_rxnctr_stereos.csv"
output_filename = "./ac3drug_am3small/ac3drug_am3small_rxnctr_pmi.csv"
smiles_column = "smiles"

In [22]:
# Load the stereoisomer table selected in the configuration cell above.
data = pd.read_csv(input_filename)

In [23]:
# Apply molprops_utils.cal_pmi to every SMILES using 32 worker processes;
# each task handed to a worker is a chunk of 1000 entries.
if __name__ == '__main__':
    with mp.Pool(32) as p:
        # Read the column selected by `smiles_column` instead of a hard-coded
        # `data.smiles`, so the configuration cell actually controls the input.
        result = p.map(molprops_utils.cal_pmi, data[smiles_column],chunksize=1000)

In [24]:
# Convert the pooled results to an array once; column 0 is stored as pmi_x
# and column 1 as pmi_y before the table is written without its index.
pmi_values = np.array(result)
data["pmi_x"] = pmi_values[:, 0]
data["pmi_y"] = pmi_values[:, 1]
data.to_csv(output_filename, index=False)

### sp2 system

### full molecule

In [25]:
# PMI run on the sp2 full-molecule stereoisomer file written by the
# get_stereoisomers call in In [16].
input_filename = "./ac2drug_am2small/ac2drug_am2small_aromatic_stereos.csv"
output_filename = "./ac2drug_am2small/ac2drug_am2small_aromatic_pmi.csv"
smiles_column = "smiles"

In [26]:
# Load the stereoisomer table selected in the configuration cell above.
data = pd.read_csv(input_filename)

In [27]:
# Apply molprops_utils.cal_pmi to every SMILES using 32 worker processes;
# each task handed to a worker is a chunk of 1000 entries.
if __name__ == '__main__':
    with mp.Pool(32) as p:
        # Read the column selected by `smiles_column` instead of a hard-coded
        # `data.smiles`, so the configuration cell actually controls the input.
        result = p.map(molprops_utils.cal_pmi, data[smiles_column],chunksize=1000)

In [28]:
# Convert the pooled results to an array once; column 0 is stored as pmi_x
# and column 1 as pmi_y before the table is written without its index.
pmi_values = np.array(result)
data["pmi_x"] = pmi_values[:, 0]
data["pmi_y"] = pmi_values[:, 1]
data.to_csv(output_filename, index=False)

### reaction center

In [29]:
# PMI run on the sp2 reaction-center stereoisomers.
# NOTE(review): no visible cell writes ac2drug_am2small_rxnctr_stereos.csv.
# The sp3 counterpart is produced by a get_stereoisomers call on the
# *_rxnctr_4rings.csv file (In [14]), but there is no matching sp2 call.
# Confirm the file is generated elsewhere, otherwise the read below fails.
input_filename = "./ac2drug_am2small/ac2drug_am2small_rxnctr_stereos.csv"
output_filename = "./ac2drug_am2small/ac2drug_am2small_rxnctr_pmi.csv"
smiles_column = "smiles"

In [30]:
# Load the stereoisomer table selected in the configuration cell above.
data = pd.read_csv(input_filename)

In [31]:
# Apply molprops_utils.cal_pmi to every SMILES using 32 worker processes;
# each task handed to a worker is a chunk of 1000 entries.
if __name__ == '__main__':
    with mp.Pool(32) as p:
        # Read the column selected by `smiles_column` instead of a hard-coded
        # `data.smiles`, so the configuration cell actually controls the input.
        result = p.map(molprops_utils.cal_pmi, data[smiles_column],chunksize=1000)

In [32]:
# Convert the pooled results to an array once; column 0 is stored as pmi_x
# and column 1 as pmi_y before the table is written without its index.
pmi_values = np.array(result)
data["pmi_x"] = pmi_values[:, 0]
data["pmi_y"] = pmi_values[:, 1]
data.to_csv(output_filename, index=False)