In [14]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
# from rdkit.Chem.Draw import MolDrawing, DrawingOptions, MolToImage

from rdkit import RDLogger
# Raise the RDKit logger threshold to CRITICAL so lower-severity messages
# are not printed into the notebook output.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

import numpy as np

from tqdm.notebook import tqdm

from tqdm.contrib.concurrent import process_map

import pandas as pd

In [15]:
from rdkit.Chem import rdMolDescriptors, Descriptors

In [16]:
def cal_pmi(s, random_seed=-1):
    """Compute the RDKit NPR1 / NPR2 shape descriptors for a SMILES string.

    The SMILES is parsed, explicit hydrogens are added, one 3D conformer is
    generated with ``AllChem.EmbedMolecule`` and the two descriptors are
    evaluated on that conformer.

    Parameters
    ----------
    s : str
        SMILES string of the molecule.
    random_seed : int, optional
        Seed forwarded to ``EmbedMolecule``. The default ``-1`` keeps RDKit's
        own (non-deterministic) seeding, i.e. the previous behaviour; pass a
        fixed non-negative integer for reproducible coordinates.

    Returns
    -------
    tuple
        ``(NPR1, NPR2)``, or ``(None, None)`` when the SMILES cannot be
        parsed, no conformer could be embedded, or RDKit raises while
        processing the molecule.
    """
    # Import the submodule explicitly: ``Chem.Descriptors3D`` only resolves as
    # an attribute when some other module happens to have imported it already.
    # Kept outside the ``try`` so a broken installation fails loudly instead
    # of silently yielding (None, None) for every molecule.
    from rdkit.Chem import Descriptors3D

    try:
        m = Chem.MolFromSmiles(s)
        if m is None:
            # MolFromSmiles signals an invalid SMILES by returning None.
            return None, None

        mol = Chem.AddHs(m)

        # EmbedMolecule returns the new conformer id, or -1 on failure.
        if AllChem.EmbedMolecule(mol, randomSeed=random_seed) < 0:
            return None, None

        x = Descriptors3D.NPR1(mol)
        y = Descriptors3D.NPR2(mol)
        return x, y
    except Exception:
        # Best-effort: one problematic molecule must not abort a batch run.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate so long runs can still be cancelled.
        return None, None

In [17]:
# Load the fragment table; later cells read its smiles, pmi_x and pmi_y columns.
fragment_data = pd.read_csv("pmi_withH.csv")

In [18]:
# Preview the first rows to check the columns that were loaded.
fragment_data.head()

Unnamed: 0,tag,smiles,energy,natoms,matches,min_dist_all,pmi_x,pmi_y,QED
0,47175,C#CNOC=C=C=O,27.053258,8,0,5,0.08534,0.956318,0.099648
1,16719,C#CN=COOC=C,66.764029,8,0,8,0.08928,0.969234,0.101996
2,52793,C=C=COOC=NC,55.738542,8,0,8,0.184249,0.84852,0.103434
3,109307,C#CNOC=CC=O,72.520258,8,0,4,0.29327,0.904591,0.104276
4,212268,C=C=COOC#CN,24.060621,8,0,6,0.072374,0.969175,0.104546


In [19]:
# Lookup table: fragment SMILES -> stored (pmi_x, pmi_y) pair.
# If a SMILES occurs more than once, the last row wins.
pmi_dict = {
    row.smiles: (row.pmi_x, row.pmi_y)
    for row in fragment_data.itertuples()
}

In [20]:
# Load the product table; the pdt_cleaned column is used below for the PMI lookup.
data = pd.read_csv("ac2drug_am2small_small.csv")

In [21]:
# Preview the first rows of the product table.
data.head()

Unnamed: 0,rmat_tag,pdt_smiles,frag_in_large,pdt_cleaned
0,36909943,[15C]=[16C][17O][17C]1=[14C][13C]=[15N][18O]1,[15C]=[16C][17O][17C]1=[14C][13C]=[15N][18O]1,C=COC1=CC=NO1
1,36909947,[15C]=[16C][17O][18O][17C]1=[14C][13C]=[15N]1,[15C]=[16C][17O][18O][17C]1=[14C][13C]=[15N]1,C=COOC1=CC=N1
2,36909948,[15C]=[16C][17O][17C]1=[14C][13C]=[15N]1.[18O],[15C]=[16C][17O][17C]1=[14C][13C]=[15N]1,C=COC1=CC=N1
3,36910003,[15C]=[16C][17C]1=[14C][13C]=[15N][18O]1.[17O],[15C]=[16C][17C]1=[14C][13C]=[15N][18O]1,C=CC1=CC=NO1
4,36910005,[15C]=[16C][17C]1=[14C][13C]=[15N][17O][18O]1,[15C]=[16C][17C]1=[14C][13C]=[15N][17O][18O]1,C=CC1=CC=NOO1


In [22]:
# Cleaned product SMILES as a plain Python list.
# NOTE(review): `smiles` is reassigned further down before any visible cell
# reads this value — confirm whether this cell is still needed.
smiles = data["pdt_cleaned"].tolist()

In [23]:
# Collect one (pmi_x, pmi_y) pair per product: reuse the value stored in
# pmi_dict when the canonical SMILES is known, otherwise compute it.
all_pmi = []
for s in tqdm(data.pdt_cleaned):
    # Missing values (NaN) and unparseable SMILES get the same placeholder
    # cal_pmi returns on failure, so one bad row no longer aborts the loop.
    parsed = Chem.MolFromSmiles(s) if isinstance(s, str) else None
    if parsed is None:
        all_pmi.append((None, None))
        continue

    # Canonicalise before the lookup so equivalent SMILES spellings match.
    s2 = Chem.MolToSmiles(parsed)
    if s2 in pmi_dict:
        all_pmi.append(pmi_dict[s2])
    else:
        all_pmi.append(cal_pmi(s2))

  0%|          | 0/7240 [00:00<?, ?it/s]

In [24]:
# Build the array once. dtype=float turns the (None, None) failure
# placeholders into NaN, so both columns stay numeric instead of becoming
# object-dtype; reshape(-1, 2) keeps the column slicing valid for empty input.
pmi_values = np.array(all_pmi, dtype=float).reshape(-1, 2)
data["pmi_x"] = pmi_values[:, 0]
data["pmi_y"] = pmi_values[:, 1]

In [22]:
# Write the product table to CSV.
# NOTE(review): to_csv writes the row index by default, and `data` already
# holds an "Unnamed: 0" column from the input file, so the saved file ends up
# with two index-like columns — consider index=False if that is unintended.
data.to_csv("pmi_ac2drug_am2small_small.csv")

In [26]:
# Load the second table; its largest_frag column is the SMILES source used below.
data2 = pd.read_csv("coupled_aromatic_ac2drug_am2small.csv")

In [27]:
# Display the loaded table to inspect its columns and size.
data2

Unnamed: 0,rmat_tag,distance,largest_frag,largest_cleaned
0,50231461,1.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]([15N][1...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc(Nc3cc(Cl)cc...
1,53844353,1.0,[15N][14c]1cc(Cl)cc(Cl)[13c]1-[15c]1cc2c(C3CCC...,Nc1cc(Cl)cc(Cl)c1-c1cc2c(C3CCCCC3)c(-c3ccccc3)...
2,50626702,1.0,[15N][14c]1cc(Cl)cc(Cl)[13c]1[18O][17C](=[17O]...,Nc1cc(Cl)cc(Cl)c1OC(=O)c1cc2c(cc1)c(C1CCCCC1)c...
3,50230203,1.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c][16c]([1...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2ccc(C(=O)ONc3c...
4,53844354,2.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)C(=O...
...,...,...,...,...
7235,39341506,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)=c1c...
7236,39341510,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)=c1c...
7237,43275542,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)=c1c...
7238,43275556,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)=c1c...


In [28]:
# SMILES inputs for the parallel PMI run, taken from the isotope-labelled
# largest_frag column.
# NOTE(review): the first dataset was processed from its cleaned column
# (pdt_cleaned); confirm that the labelled SMILES, rather than
# largest_cleaned, are the intended input here.
smiles = data2["largest_frag"].tolist()

In [30]:
# Apply cal_pmi to every SMILES via tqdm's process_map, using up to 16 worker
# processes and handing inputs to the workers in chunks of 500.
# NOTE(review): max_workers=16 is hard-coded — confirm it suits the machine.
result = process_map(cal_pmi,smiles,max_workers=16,chunksize=500)

In [14]:
# Convert the list of (x, y) pairs to an (n, 2) float array. dtype=float maps
# the (None, None) placeholders of failed molecules to NaN instead of yielding
# an object array; reshape(-1, 2) keeps the shape two-dimensional for empty input.
result = np.array(result, dtype=float).reshape(-1, 2)

In [15]:
# Column 0 of `result` holds the x values, column 1 the y values.
data2["pmi_x"], data2["pmi_y"] = result[:, 0], result[:, 1]

In [16]:
# Display the table again to check the pmi_x / pmi_y columns.
data2

Unnamed: 0,rmat_tag,distance,largest_frag,largest_cleaned,pmi_x,pmi_y
0,50231461,3.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]([15N][1...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc(Nc3cc(Cl)cc...,0.32488,0.791645
1,53844353,3.0,[15N][14c]1cc(Cl)cc(Cl)[13c]1-[15c]1cc2c(C3CCC...,Nc1cc(Cl)cc(Cl)c1-c1cc2c(C3CCCCC3)c(-c3ccccc3)...,0.388706,0.733463
2,50626702,3.0,[15N][14c]1cc(Cl)cc(Cl)[13c]1[18O][17C](=[17O]...,Nc1cc(Cl)cc(Cl)c1OC(=O)c1cc2c(cc1)c(C1CCCCC1)c...,0.298142,0.794647
3,50230203,3.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c][16c]([1...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2ccc(C(=O)ONc3c...,0.301793,0.797943
4,50231462,4.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)C(=O...,0.314681,0.78479
...,...,...,...,...,...,...
7235,50290366,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)OON=...,0.312885,0.727153
7236,51733479,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]([17C]3=...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc(C3=NOOc4cc(...,0.317694,0.732366
7237,52733260,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)-c1c...,0.415783,0.627639
7238,51718333,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)Oc1c...,0.370812,0.685895


In [None]:
# Save the second table. The previous name had no ".csv" extension and reused
# the first dataset's stem; name the file after its own input instead
# ("pmi_" + source file name, matching the first save).
data2.to_csv("pmi_coupled_aromatic_ac2drug_am2small.csv")