### get molecular properties of the late-stage diversified systems

In [1]:
from rdkit import Chem
import pandas as pd
import numpy as np

from rdkit.Chem import AllChem, rdMolDescriptors, Descriptors

from tqdm.notebook import tqdm

from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

### physicochemical properties - for joint distribution plots.

In [2]:
# Thin single-descriptor wrappers; every one takes an RDKit molecule ``x``.

def getLogP(x):
    """Return the first element of RDKit's Crippen descriptors (the logP term)."""
    return rdMolDescriptors.CalcCrippenDescriptors(x)[0]


def getMW(x):
    """Return the molecular weight reported by ``Descriptors.MolWt``."""
    return Descriptors.MolWt(x)


def getHBD(x):
    """Return the hydrogen-bond donor count (``CalcNumHBD``)."""
    return rdMolDescriptors.CalcNumHBD(x)


def getHBA(x):
    """Return the hydrogen-bond acceptor count (``CalcNumHBA``)."""
    return rdMolDescriptors.CalcNumHBA(x)


def getPSA(x):
    """Return the topological polar surface area (``CalcTPSA``)."""
    return rdMolDescriptors.CalcTPSA(x)


def getROTB(x):
    """Return the rotatable-bond count (``CalcNumRotatableBonds``)."""
    return rdMolDescriptors.CalcNumRotatableBonds(x)


def getAROM(x):
    """Return the aromatic-ring count (``CalcNumAromaticRings``)."""
    return rdMolDescriptors.CalcNumAromaticRings(x)


def getFSP3(x):
    """Return the fraction of sp3 carbons (``CalcFractionCSP3``)."""
    return rdMolDescriptors.CalcFractionCSP3(x)


def getFC(x):
    """Return the formal charge of the molecule (``GetFormalCharge``)."""
    return Chem.rdmolops.GetFormalCharge(x)


def getQED(x):
    """Return the QED score of the molecule.

    NOTE(review): ``Chem.QED`` is reached as an attribute, which only works if
    the ``rdkit.Chem.QED`` submodule has already been loaded - confirm.
    """
    return Chem.QED.qed(x)


def getSSSR(x):
    """Return the number of SSSR rings in the molecule.

    ``Chem.GetSSSR`` returns a plain ring count in older RDKit releases and a
    sequence of rings in newer ones; both are normalised to an integer count
    so the value is always a number.
    """
    rings = Chem.GetSSSR(x)
    if isinstance(rings, int):
        return rings
    return len(rings)

def getallprops(s):
    """Compute the physicochemical property vector for one SMILES string.

    Parameters
    ----------
    s : str
        SMILES of the molecule.

    Returns
    -------
    list
        ``[LogP, MW, HBD, HBA, PSA, ROTB, FSP3, SSSR, QED]`` in that order.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``s``.
    """
    x = Chem.MolFromSmiles(s)
    if x is None:
        # MolFromSmiles signals a parse failure with None; fail here with the
        # offending SMILES instead of an opaque error inside the first descriptor.
        raise ValueError(f"could not parse SMILES: {s!r}")
    return [
        getLogP(x),
        getMW(x),
        getHBD(x),
        getHBA(x),
        getPSA(x),
        getROTB(x),
        getFSP3(x),
        getSSSR(x),
        getQED(x),
    ]

In [3]:
def get_df_props(input_filename, output_filename, smiles_column):
    """Add physicochemical property columns to a CSV of SMILES.

    Reads ``input_filename``, computes ``getallprops`` for every entry of
    ``smiles_column``, stores the results in the columns LogP, MW, HBD, HBA,
    PSA, ROTB, FSP3, SSSR and QED, and writes the table to ``output_filename``
    without the index.

    Parameters
    ----------
    input_filename : str
        Path of the CSV file to read.
    output_filename : str
        Path of the CSV file to write.
    smiles_column : str
        Name of the column holding the SMILES strings.
    """
    prop_columns = ["LogP", "MW", "HBD", "HBA", "PSA", "ROTB", "FSP3", "SSSR", "QED"]

    data = pd.read_csv(input_filename)

    all_molprops = [getallprops(s) for s in tqdm(data[smiles_column])]

    # Assign the property columns directly; going through a temporary
    # "allprops" column would overwrite and then delete any input column that
    # happens to carry that name.
    data[prop_columns] = all_molprops
    data.to_csv(output_filename, index=False)
    
    

### for sp3 system

In [None]:
# sp3 system: SMILES column to read, source CSV, and destination CSV for the
# property table.
smiles_column = "largest_cleaned"
input_filename = "./ac3drug_am3small/ac3drug_am3small_smiles.csv"
output_filename = "./ac3drug_am3small/ac3drug_am3small_props.csv"

In [None]:
# Compute and write the property table for the currently configured files.
get_df_props(
    input_filename=input_filename,
    output_filename=output_filename,
    smiles_column=smiles_column,
)

In [None]:
# sp3 system, reaction-center file: SMILES column to read, source CSV, and
# destination CSV for the property table.
smiles_column = "pdt_cleaned"
input_filename = "./ac3drug_am3small/ac3drug_am3small_rxn_centers.csv"
output_filename = "./ac3drug_am3small/ac3drug_am3small_rxn_centers_props.csv"

In [None]:
# Compute and write the property table for the currently configured files.
get_df_props(
    input_filename=input_filename,
    output_filename=output_filename,
    smiles_column=smiles_column,
)

### for sp2 system

In [None]:
# sp2 system: SMILES column to read, source CSV, and destination CSV for the
# property table.
smiles_column = "largest_cleaned"
input_filename = "./ac2drug_am2small/ac2drug_am2small_smiles.csv"
output_filename = "./ac2drug_am2small/ac2drug_am2small_props.csv"

In [None]:
# Compute and write the property table for the currently configured files.
get_df_props(
    input_filename=input_filename,
    output_filename=output_filename,
    smiles_column=smiles_column,
)

In [None]:
# sp2 system, reaction-center file: SMILES column to read, source CSV, and
# destination CSV for the property table.
smiles_column = "pdt_cleaned"
input_filename = "./ac2drug_am2small/ac2drug_am2small_rxn_centers.csv"
output_filename = "./ac2drug_am2small/ac2drug_am2small_rxn_centers_props.csv"

In [None]:
# Compute and write the property table for the currently configured files.
get_df_props(
    input_filename=input_filename,
    output_filename=output_filename,
    smiles_column=smiles_column,
)

### PMI (principal moments of inertia: normalized ratios NPR1 and NPR2)

In [4]:
def cal_pmi(s):
    """Return the (NPR1, NPR2) pair for one SMILES string.

    The molecule is parsed, hydrogens are added, a single 3D conformer is
    embedded, and the two descriptors are computed on that conformer.

    Parameters
    ----------
    s : str
        SMILES of the molecule.

    Returns
    -------
    tuple
        ``(NPR1, NPR2)`` on success, ``(None, None)`` if the SMILES cannot be
        parsed, the embedding fails, or RDKit raises during the calculation.
    """
    # Imported explicitly, outside the try: ``Chem.Descriptors3D`` only resolves
    # when the submodule has already been loaded, and the resulting
    # AttributeError would otherwise be swallowed below, silently turning every
    # result into (None, None).
    from rdkit.Chem import Descriptors3D

    try:
        m = Chem.MolFromSmiles(s)
        if m is None:
            return None, None
        mol = Chem.AddHs(m)
        # EmbedMolecule reports failure with a negative conformer id.
        if AllChem.EmbedMolecule(mol) < 0:
            return None, None
        return Descriptors3D.NPR1(mol), Descriptors3D.NPR2(mol)
    except Exception:
        # Best-effort per molecule; unlike a bare ``except`` this lets
        # KeyboardInterrupt through, so a long batch can still be stopped.
        return None, None
    
def get_df_pmi(smiles):
    """Compute ``cal_pmi`` for every SMILES and stack the results.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to process; progress is shown with ``tqdm``.

    Returns
    -------
    numpy.ndarray
        Array with one row per input and two columns (the pair returned by
        ``cal_pmi``). Rows for failed molecules hold ``None``.
    """
    result = [cal_pmi(s) for s in tqdm(smiles)]

    # Force two columns so an empty input yields shape (0, 2) rather than (0,),
    # which would break callers indexing ``[:, 0]`` / ``[:, 1]``.
    return np.array(result).reshape(-1, 2)

### sp3 system

### full molecule

In [None]:
# sp3 system, full molecules. Original note: "5 minutes" (presumably the runtime).
smiles_column = "largest_cleaned"
reduction_factor = 20  # stride of the row subsample taken below
input_filename = "./ac3drug_am3small/ac3drug_am3small_smiles.csv"
output_filename = "./ac3drug_am3small/ac3drug_am3small_pmi.csv"

data = pd.read_csv(input_filename)
# Keep every reduction_factor-th row; copy so the new columns are added to an
# independent frame.
short_data = data.iloc[::reduction_factor].copy()

result = get_df_pmi(short_data[smiles_column])

# Column 0 of the result goes to pmi_x, column 1 to pmi_y.
short_data["pmi_x"], short_data["pmi_y"] = result[:, 0], result[:, 1]

short_data.to_csv(output_filename)

### reaction center

In [None]:
# sp3 system, reaction-center file: SMILES column to read, source CSV, and
# destination CSV for the PMI table.
smiles_column = "pdt_cleaned"
input_filename = "./ac3drug_am3small/ac3drug_am3small_rxn_centers.csv"
output_filename = "./ac3drug_am3small/ac3drug_am3small_rxn_centers_pmi.csv"

In [None]:
reduction_factor = 20  # stride of the row subsample taken below

data = pd.read_csv(input_filename)
# Keep every reduction_factor-th row; copy so the new columns are added to an
# independent frame.
short_data = data.iloc[::reduction_factor].copy()

result = get_df_pmi(short_data[smiles_column])

# Column 0 of the result goes to pmi_x, column 1 to pmi_y.
short_data["pmi_x"], short_data["pmi_y"] = result[:, 0], result[:, 1]

short_data.to_csv(output_filename)

### sp2 system

### full molecule

In [None]:
# sp2 system, full molecules. Original note: "10 minutes" (presumably the runtime).
smiles_column = "largest_cleaned"
reduction_factor = 100  # stride of the row subsample taken below
input_filename = "./ac2drug_am2small/ac2drug_am2small_smiles.csv"
output_filename = "./ac2drug_am2small/ac2drug_am2small_pmi.csv"

data = pd.read_csv(input_filename)
# Keep every reduction_factor-th row; copy so later column assignments act on
# an independent frame.
short_data = data.iloc[::reduction_factor].copy()

result = get_df_pmi(short_data[smiles_column])

In [None]:


# Attach the two result columns (0 -> pmi_x, 1 -> pmi_y) and write the table.
short_data["pmi_x"], short_data["pmi_y"] = result[:, 0], result[:, 1]

short_data.to_csv(output_filename)

### reaction center

In [5]:
# sp2 system, reaction-center file: SMILES column to read, source CSV, and
# destination CSV for the PMI table.
smiles_column = "pdt_cleaned"
input_filename = "./ac2drug_am2small/ac2drug_am2small_rxn_centers.csv"
output_filename = "./ac2drug_am2small/ac2drug_am2small_rxn_centers_pmi.csv"

In [6]:
reduction_factor = 100  # stride of the row subsample taken below

data = pd.read_csv(input_filename)
# Keep every reduction_factor-th row; copy so the new columns are added to an
# independent frame.
short_data = data.iloc[::reduction_factor].copy()

result = get_df_pmi(short_data[smiles_column])

# Column 0 of the result goes to pmi_x, column 1 to pmi_y.
short_data["pmi_x"], short_data["pmi_y"] = result[:, 0], result[:, 1]

short_data.to_csv(output_filename)

  0%|          | 0/721 [00:00<?, ?it/s]