In [1]:
from rdkit import Chem
import pandas as pd
import numpy as np

from rdkit.Chem import rdMolDescriptors, Descriptors

from tqdm.contrib.concurrent import process_map



In [2]:
def getLogP(x):
    """Return the first entry of the Crippen descriptor result for mol ``x``."""
    crippen = rdMolDescriptors.CalcCrippenDescriptors(x)
    return crippen[0]


def getMW(x):
    """Return ``Descriptors.MolWt`` (molecular weight) for mol ``x``."""
    return Descriptors.MolWt(x)


def getHBD(x):
    """Return the H-bond donor count (``CalcNumHBD``) for mol ``x``."""
    return rdMolDescriptors.CalcNumHBD(x)


def getHBA(x):
    """Return the H-bond acceptor count (``CalcNumHBA``) for mol ``x``."""
    return rdMolDescriptors.CalcNumHBA(x)


def getPSA(x):
    """Return the topological polar surface area (``CalcTPSA``) for mol ``x``."""
    return rdMolDescriptors.CalcTPSA(x)


def getROTB(x):
    """Return the rotatable-bond count (``CalcNumRotatableBonds``) for mol ``x``."""
    return rdMolDescriptors.CalcNumRotatableBonds(x)


def getAROM(x):
    """Return the aromatic-ring count (``CalcNumAromaticRings``) for mol ``x``."""
    return rdMolDescriptors.CalcNumAromaticRings(x)


def getFSP3(x):
    """Return the fraction of sp3 carbons (``CalcFractionCSP3``) for mol ``x``."""
    return rdMolDescriptors.CalcFractionCSP3(x)


def getFC(x):
    """Return the formal charge (``rdmolops.GetFormalCharge``) of mol ``x``."""
    return Chem.rdmolops.GetFormalCharge(x)
def getQED(x):
    """Return the QED drug-likeness score (``QED.qed``) for mol ``x``."""
    # Import the submodule explicitly: the attribute ``Chem.QED`` only exists
    # once something has imported ``rdkit.Chem.QED``, which this notebook never
    # does directly. The import is local so the function is self-contained
    # (repeat calls just hit the ``sys.modules`` cache).
    from rdkit.Chem import QED

    return QED.qed(x)
def getSSSR(x):
    """Return the number of SSSR rings found in mol ``x`` as an int.

    ``Chem.GetSSSR`` returned the ring *count* in older RDKit releases and
    returns the ring sequence itself in newer ones; both forms are normalised
    to a count so the "SSSR" column stays numeric across RDKit versions.
    """
    rings = Chem.GetSSSR(x)
    if isinstance(rings, int):
        # Older RDKit: already the count.
        return rings
    # Newer RDKit: a sequence with one entry per ring.
    return len(rings)

def getallprops(s):
    """Compute one descriptor row for a SMILES string.

    Parameters
    ----------
    s : str
        SMILES string, parsed with ``Chem.MolFromSmiles``.

    Returns
    -------
    list
        Nine values in the order
        ``[LogP, MW, HBD, HBA, PSA, ROTB, FSP3, SSSR, QED]``.
        If ``s`` is not a string (e.g. a NaN/None cell from ``read_csv``) or
        RDKit cannot parse it, every entry is NaN, so one bad record does not
        abort an entire ``process_map`` run.

    Notes
    -----
    ``getAROM`` and ``getFC`` are defined alongside the other helpers but are
    not part of the returned row.
    """
    missing_row = [float("nan")] * 9

    # Missing CSV cells arrive as float NaN / None rather than str.
    if not isinstance(s, str):
        return missing_row

    x = Chem.MolFromSmiles(s)
    # MolFromSmiles signals an unparsable SMILES by returning None, not raising.
    if x is None:
        return missing_row

    return [
        getLogP(x),
        getMW(x),
        getHBD(x),
        getHBA(x),
        getPSA(x),
        getROTB(x),
        getFSP3(x),
        getSSSR(x),
        getQED(x),
    ]

In [5]:
# Spot-check getallprops on a single SMILES string before running it over a whole table.
getallprops("O=C(Nc1cc(Cl)cc(Cl)c1)c1ccc2c(C3CCCCC3)c(-c3ccccc3)n(CC(=O)N3CCOCC3)c2c1")

[7.773900000000009,
 590.5510000000003,
 1,
 4,
 63.57000000000001,
 6,
 0.3333333333333333,
 6,
 0.24961135541957985]

In [21]:
# Load the first input table; its pdt_cleaned column is what the descriptor step below consumes.
data = pd.read_csv("ac2drug_am2small_small.csv")

In [22]:
# Display the loaded frame to check its columns.
data

Unnamed: 0,rmat_tag,pdt_smiles,frag_in_large,pdt_cleaned
0,36909943,[15C]=[16C][17O][17C]1=[14C][13C]=[15N][18O]1,[15C]=[16C][17O][17C]1=[14C][13C]=[15N][18O]1,C=COC1=CC=NO1
1,36909947,[15C]=[16C][17O][18O][17C]1=[14C][13C]=[15N]1,[15C]=[16C][17O][18O][17C]1=[14C][13C]=[15N]1,C=COOC1=CC=N1
2,36909948,[15C]=[16C][17O][17C]1=[14C][13C]=[15N]1.[18O],[15C]=[16C][17O][17C]1=[14C][13C]=[15N]1,C=COC1=CC=N1
3,36910003,[15C]=[16C][17C]1=[14C][13C]=[15N][18O]1.[17O],[15C]=[16C][17C]1=[14C][13C]=[15N][18O]1,C=CC1=CC=NO1
4,36910005,[15C]=[16C][17C]1=[14C][13C]=[15N][17O][18O]1,[15C]=[16C][17C]1=[14C][13C]=[15N][17O][18O]1,C=CC1=CC=NOO1
...,...,...,...,...
7235,53886546,[18O][15N]1[17O][17C]12[14C]=[13C][15C]=[16C]2,[18O][15N]1[17O][17C]12[14C]=[13C][15C]=[16C]2,ON1OC12C=CC=C2
7236,53886547,[15N]=[17C]1[14C]=[13C][15C]=[16C]1.[17O].[18O],[15N]=[17C]1[14C]=[13C][15C]=[16C]1,N=C1C=CC=C1
7237,53886550,[17O].[18O][15N]=[17C]1[14C]=[13C][15C]=[16C]1,[18O][15N]=[17C]1[14C]=[13C][15C]=[16C]1,ON=C1C=CC=C1
7238,53886551,[17O][18O][15N]=[17C]1[14C]=[13C][15C]=[16C]1,[17O][18O][15N]=[17C]1[14C]=[13C][15C]=[16C]1,OON=C1C=CC=C1


In [23]:
# Run getallprops over every pdt_cleaned entry via tqdm's process_map
# (up to 16 workers, inputs handed out in chunks of 1000).
all_molprops = process_map(
    getallprops,
    list(data["pdt_cleaned"]),
    max_workers=16,
    chunksize=1000,
)

  0%|          | 0/7240 [00:00<?, ?it/s]

In [24]:
# Stage the per-molecule result lists in a temporary column, fan them out into
# one column per descriptor (same order as getallprops returns them), then
# remove the temporary column again.
data["allprops"] = all_molprops
data[
    ["LogP", "MW", "HBD", "HBA", "PSA", "ROTB", "FSP3", "SSSR", "QED"]
] = data["allprops"].tolist()
data.drop(columns="allprops", inplace=True)

In [25]:
# Write the frame, now including the descriptor columns; index=False leaves the row index out of the file.
data.to_csv("props_ac2drug_am2small_small.csv",index=False)

In [9]:
# Load the second input table. NOTE: this rebinds `data`, replacing the frame loaded earlier in the notebook.
data = pd.read_csv("enamine_small.csv")

In [10]:
# Display the loaded frame to check its columns.
data

Unnamed: 0,rmat_tag,pdt_smiles,frag_in_large,pdt_cleaned
0,40932,[13C].[14C][18O][16C].[15C].[15N].[17C]=[17O],[14C][18O][16C],COC
1,40955,[13C].[14C][18O][17C][17O][16C].[15C].[15N],[14C][18O][17C][17O][16C],COCOC
2,40986,[13C].[14C][18O][16C][17O][15N]=[17C].[15C],[14C][18O][16C][17O][15N]=[17C],COCON=C
3,41039,[13C].[14C][18O][15N]([16C])[17C]=[17O].[15C],[14C][18O][15N]([16C])[17C]=[17O],CON(C)C=O
4,41040,[13C].[14C][18O][17C](=[17O])[15N][16C].[15C],[14C][18O][17C](=[17O])[15N][16C],COC(=O)NC
...,...,...,...,...
2749744,55323663,[13C]#[14C][15C]=[16C][17C](=[15N])[18O].[17O],[13C]#[14C][15C]=[16C][17C](=[15N])[18O],C#CC=CC(=N)O
2749745,55323664,[13C]#[14C][15C]=[16C][17C](=[15N])[18O][17O],[13C]#[14C][15C]=[16C][17C](=[15N])[18O][17O],C#CC=CC(=N)OO
2749746,55323666,[13C]#[14C][15C]=[16C][17C]([18O])=[15N][17O],[13C]#[14C][15C]=[16C][17C]([18O])=[15N][17O],C#CC=CC(O)=NO
2749747,55323667,[13C]#[14C][15C]=[16C][17C]1=[15N][17O][18O]1,[13C]#[14C][15C]=[16C][17C]1=[15N][17O][18O]1,C#CC=CC1=NOO1


In [12]:
# Run getallprops over every pdt_cleaned entry via tqdm's process_map
# (up to 32 workers, inputs handed out in chunks of 100).
all_molprops = process_map(
    getallprops,
    list(data["pdt_cleaned"]),
    max_workers=32,
    chunksize=100,
)

# Stage the result lists in a temporary column, fan them out into one column
# per descriptor (same order as getallprops returns them), then remove the
# temporary column again.
data["allprops"] = all_molprops
data[
    ["LogP", "MW", "HBD", "HBA", "PSA", "ROTB", "FSP3", "SSSR", "QED"]
] = data["allprops"].tolist()

data.drop(columns="allprops", inplace=True)

  0%|          | 0/2749749 [00:00<?, ?it/s]

In [15]:
# Write the frame, now including the descriptor columns; index=False leaves the row index out of the file.
data.to_csv("enamine_small_props.csv",index=False)