### Get molecular properties of the late-stage diversified systems

In [31]:
from rdkit import Chem
import pandas as pd
import numpy as np

from rdkit.Chem import AllChem, rdMolDescriptors, Descriptors

from tqdm.contrib.concurrent import process_map

from rdkit import RDLogger
# Set RDKit's logger level to CRITICAL; presumably this keeps lower-severity
# RDKit messages out of the notebook output during the bulk runs below.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

### physicochemical properties - for joint distribution plots.

In [32]:
def getLogP(x):
    """Return the first element of RDKit's Crippen descriptors (logP)."""
    return Chem.rdMolDescriptors.CalcCrippenDescriptors(x)[0]


def getMW(x):
    """Return the molecular weight of `x`."""
    return Chem.Descriptors.MolWt(x)


def getHBD(x):
    """Return the number of hydrogen-bond donors in `x`."""
    return Chem.rdMolDescriptors.CalcNumHBD(x)


def getHBA(x):
    """Return the number of hydrogen-bond acceptors in `x`."""
    return Chem.rdMolDescriptors.CalcNumHBA(x)


def getPSA(x):
    """Return the topological polar surface area of `x`."""
    return Chem.rdMolDescriptors.CalcTPSA(x)


def getROTB(x):
    """Return the number of rotatable bonds in `x`."""
    return Chem.rdMolDescriptors.CalcNumRotatableBonds(x)


def getAROM(x):
    """Return the number of aromatic rings in `x`."""
    return Chem.rdMolDescriptors.CalcNumAromaticRings(x)


def getFSP3(x):
    """Return the fraction of sp3 carbons in `x`."""
    return Chem.rdMolDescriptors.CalcFractionCSP3(x)


def getFC(x):
    """Return the formal charge of `x`."""
    return Chem.rdmolops.GetFormalCharge(x)


def getQED(x):
    """Return the QED score of `x`."""
    return Chem.QED.qed(x)


def getSSSR(x):
    """Return the number of rings in the smallest set of smallest rings.

    Depending on the RDKit release, `Chem.GetSSSR` returns either the ring
    count itself or a sequence of rings.  Both are reduced to a plain int
    here so the value can be sent back from worker processes and written
    to CSV as a number.
    """
    rings = Chem.GetSSSR(x)
    if isinstance(rings, int):
        return rings
    return len(rings)

def getallprops(s):
    """Compute the descriptor row for one SMILES string.

    Returns a list of nine values in the order
    [LogP, MW, HBD, HBA, PSA, ROTB, FSP3, SSSR, QED].

    If `s` is not a string (e.g. a missing cell) or RDKit cannot parse it,
    every entry is None, so that a single bad record does not abort an
    entire batch run.
    """
    n_props = 9
    if not isinstance(s, str):
        return [None] * n_props

    x = Chem.MolFromSmiles(s)
    if x is None:
        # MolFromSmiles reports an unparsable SMILES by returning None.
        return [None] * n_props

    return [
        getLogP(x),
        getMW(x),
        getHBD(x),
        getHBA(x),
        getPSA(x),
        getROTB(x),
        getFSP3(x),
        getSSSR(x),
        getQED(x),
    ]

In [11]:
def get_df_props(input_filename, output_filename, smiles_column,
                 max_workers=24, chunksize=10000):
    """Add physicochemical descriptor columns to a CSV of SMILES.

    Reads `input_filename`, runs `getallprops` on every entry of
    `smiles_column` in a process pool, stores the results in the columns
    LogP, MW, HBD, HBA, PSA, ROTB, FSP3, SSSR and QED, and writes the table
    to `output_filename` without the index.

    Args:
        input_filename: path of the CSV file to read.
        output_filename: path of the CSV file to write.
        smiles_column: name of the column holding the SMILES strings.
        max_workers: number of worker processes handed to `process_map`.
        chunksize: number of SMILES sent to a worker at a time.  With the
            default of 10000, inputs smaller than that are processed as a
            single chunk; pass a smaller value to spread them over workers.
    """
    prop_columns = ["LogP", "MW", "HBD", "HBA", "PSA", "ROTB", "FSP3", "SSSR", "QED"]

    data = pd.read_csv(input_filename)
    all_molprops = process_map(
        getallprops,
        list(data[smiles_column]),
        max_workers=max_workers,
        chunksize=chunksize,
    )
    # One nine-element row per molecule, assigned straight to the nine columns.
    data[prop_columns] = all_molprops
    data.to_csv(output_filename, index=False)
    
    

### for sp3 system

In [6]:
# Descriptor run, sp3 system: SMILES are taken from the "largest_cleaned" column.
smiles_column = "largest_cleaned"
input_filename, output_filename = (
    "./ac3drug_am3small/ac3drug_am3small_smiles.csv",
    "./ac3drug_am3small/ac3drug_am3small_props.csv",
)

In [7]:
# Compute and save the descriptor table for the files configured above.
get_df_props(
    input_filename=input_filename,
    output_filename=output_filename,
    smiles_column=smiles_column,
)

  0%|          | 0/916053 [00:00<?, ?it/s]

In [8]:
# Descriptor run, sp3 system: SMILES are taken from the "pdt_cleaned" column.
smiles_column = "pdt_cleaned"
input_filename, output_filename = (
    "./ac3drug_am3small/ac3drug_am3small_rxn_centers.csv",
    "./ac3drug_am3small/ac3drug_am3small_rxn_centers_props.csv",
)

In [9]:
# Compute and save the descriptor table for the files configured above.
get_df_props(
    input_filename=input_filename,
    output_filename=output_filename,
    smiles_column=smiles_column,
)

  0%|          | 0/916053 [00:00<?, ?it/s]

### for sp2 system

In [12]:
# Descriptor run, sp2 system: SMILES are taken from the "largest_cleaned" column.
smiles_column = "largest_cleaned"
input_filename, output_filename = (
    "./ac2drug_am2small/ac2drug_am2small_aromatic.csv",
    "./ac2drug_am2small/ac2drug_am2small_aromatic_props.csv",
)

In [13]:
# Compute and save the descriptor table for the files configured above.
get_df_props(
    input_filename=input_filename,
    output_filename=output_filename,
    smiles_column=smiles_column,
)

  0%|          | 0/7240 [00:00<?, ?it/s]

In [14]:
# Descriptor run, sp2 system: SMILES are taken from the "pdt_cleaned" column.
smiles_column = "pdt_cleaned"
input_filename, output_filename = (
    "./ac2drug_am2small/ac2drug_am2small_rxn_centers.csv",
    "./ac2drug_am2small/ac2drug_am2small_rxn_centers_props.csv",
)

In [15]:
# Compute and save the descriptor table for the files configured above.
get_df_props(
    input_filename=input_filename,
    output_filename=output_filename,
    smiles_column=smiles_column,
)

  0%|          | 0/7240 [00:00<?, ?it/s]

### PMI: normalized principal moment-of-inertia ratios (NPR1, NPR2) from an embedded 3D conformer

In [39]:
def cal_pmi(s):
    """Return the (NPR1, NPR2) shape descriptors for one SMILES string.

    The molecule is parsed, explicit hydrogens are added, one 3D conformer
    is embedded, and the two normalized principal-moment ratios are
    computed on it.

    Any per-molecule failure (non-string input, unparsable SMILES, failed
    embedding, descriptor error) yields ``(None, None)`` so that a batch run
    keeps going.

    NOTE(review): no random seed is passed to the embedding call, so the
    coordinates may differ between runs - confirm whether that matters.
    """
    if not isinstance(s, str):
        return None, None

    # Import the submodule explicitly rather than relying on
    # `Chem.Descriptors3D` having been loaded as a side effect elsewhere.
    # Kept outside the try block so that a broken installation fails loudly
    # instead of silently producing (None, None) for every molecule.
    from rdkit.Chem import Descriptors3D

    try:
        m = Chem.MolFromSmiles(s)
        if m is None:
            # Unparsable SMILES.
            return None, None
        mol = Chem.AddHs(m)
        if AllChem.EmbedMolecule(mol) < 0:
            # A negative conformer id means no 3D coordinates were generated.
            return None, None
        return Descriptors3D.NPR1(mol), Descriptors3D.NPR2(mol)
    except Exception:
        # Deliberately broad, but unlike a bare `except:` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None, None
    
def get_df_pmi(input_filename, output_filename, smiles_column,
               max_workers=32, chunksize=10000):
    """Add `pmi_x` / `pmi_y` columns to a CSV of SMILES.

    Reads `input_filename`, runs `cal_pmi` on every entry of `smiles_column`
    in a process pool, stores the first and second element of each result
    in `pmi_x` and `pmi_y`, and writes the table to `output_filename`.

    Args:
        input_filename: path of the CSV file to read.
        output_filename: path of the CSV file to write.
        smiles_column: name of the column holding the SMILES strings.
        max_workers: number of worker processes handed to `process_map`.
        chunksize: number of SMILES sent to a worker at a time.  With the
            default of 10000, inputs smaller than that are processed as a
            single chunk; pass a smaller value to spread them over workers.
    """
    data = pd.read_csv(input_filename)
    smiles = data[smiles_column]
    result = process_map(
        cal_pmi,
        smiles,
        max_workers=max_workers,
        chunksize=chunksize,
    )

    # Split the (x, y) pairs column-wise.  Unlike 2-D array slicing this
    # also works when `result` is empty.
    data["pmi_x"] = [pair[0] for pair in result]
    data["pmi_y"] = [pair[1] for pair in result]

    # The index is written as well (pandas default), as before.
    data.to_csv(output_filename)

### sp3 system

### full molecule

In [40]:
# PMI run, sp3 system: SMILES are taken from the "largest_cleaned" column.
smiles_column = "largest_cleaned"
input_filename, output_filename = (
    "./ac3drug_am3small/ac3drug_am3small_smiles.csv",
    "./ac3drug_am3small/ac3drug_am3small_pmi.csv",
)

In [41]:
# Compute and save the PMI table for the files configured above.
get_df_pmi(
    input_filename=input_filename,
    output_filename=output_filename,
    smiles_column=smiles_column,
)

  0%|          | 0/916053 [00:00<?, ?it/s]

### reaction center

In [37]:
# PMI run, sp3 system: SMILES are taken from the "pdt_cleaned" column.
smiles_column = "pdt_cleaned"
input_filename, output_filename = (
    "./ac3drug_am3small/ac3drug_am3small_rxn_centers.csv",
    "./ac3drug_am3small/ac3drug_am3small_rxn_centers_pmi.csv",
)

In [38]:
# Compute and save the PMI table for the files configured above.
get_df_pmi(
    input_filename=input_filename,
    output_filename=output_filename,
    smiles_column=smiles_column,
)

  0%|          | 0/916053 [00:00<?, ?it/s]

### sp2 system

### full molecule

In [65]:
# PMI run, sp2 system: SMILES are taken from the "largest_cleaned" column.
smiles_column = "largest_cleaned"
input_filename, output_filename = (
    "./ac2drug_am2small/ac2drug_am2small_aromatic.csv",
    "./ac2drug_am2small/ac2drug_am2small_aromatic_pmi.csv",
)

In [66]:
# Serial (single-process) PMI calculation for the files configured above.
# Only `process_map` is imported from tqdm at the top of the notebook, so the
# progress-bar wrapper itself has to be brought into scope here.
from tqdm import tqdm

data = pd.read_csv(input_filename)
smiles = data[smiles_column]

# One (x, y) pair per SMILES; cal_pmi returns (None, None) on failure.
result = [cal_pmi(s) for s in tqdm(smiles)]

result = np.array(result)

data["pmi_x"] = result[:,0]
data["pmi_y"] = result[:,1]

data.to_csv(output_filename)

  0%|          | 0/3890386 [00:00<?, ?it/s]

### reaction center

In [67]:
# PMI run, sp2 system: SMILES are taken from the "pdt_cleaned" column.
smiles_column = "pdt_cleaned"
input_filename, output_filename = (
    "./ac2drug_am2small/ac2drug_am2small_rxn_centers.csv",
    "./ac2drug_am2small/ac2drug_am2small_rxn_centers_pmi.csv",
)

In [68]:
# Serial (single-process) PMI calculation for the files configured above.
# Only `process_map` is imported from tqdm at the top of the notebook, so the
# progress-bar wrapper itself has to be brought into scope here.
from tqdm import tqdm

data = pd.read_csv(input_filename)
smiles = data[smiles_column]

# One (x, y) pair per SMILES; cal_pmi returns (None, None) on failure.
result = [cal_pmi(s) for s in tqdm(smiles)]

result = np.array(result)

data["pmi_x"] = result[:,0]
data["pmi_y"] = result[:,1]

data.to_csv(output_filename)

  0%|          | 0/7240 [00:00<?, ?it/s]