In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
# from rdkit.Chem.Draw import MolDrawing, DrawingOptions, MolToImage

from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

import numpy as np

from tqdm.notebook import tqdm

from tqdm.contrib.concurrent import process_map

import pandas as pd



In [2]:
from rdkit.Chem import rdMolDescriptors, Descriptors

In [3]:
def cal_pmi(s):
    """Compute the normalized principal-moment ratios (NPR1, NPR2) of a molecule.

    The SMILES is parsed, explicit hydrogens are added, one 3D conformer is
    embedded with RDKit's default settings, and both ratios are evaluated on
    that conformer.

    Parameters
    ----------
    s : str
        SMILES string of the molecule.

    Returns
    -------
    tuple
        ``(npr1, npr2)``, or ``(None, None)`` when the SMILES cannot be parsed,
        no conformer could be embedded, or RDKit raises during the calculation.
    """
    # Import the submodule explicitly: ``Chem.Descriptors3D`` only resolves as
    # an attribute if something else has already imported it, and a failure
    # here should be loud rather than turned into (None, None) for every input.
    from rdkit.Chem import Descriptors3D

    try:
        m = Chem.MolFromSmiles(s)
        if m is None:
            # Unparsable SMILES.
            return None, None
        mol = Chem.AddHs(m)
        # EmbedMolecule returns the new conformer id, or -1 when embedding fails.
        if AllChem.EmbedMolecule(mol) < 0:
            return None, None
        x = Descriptors3D.NPR1(mol)
        y = Descriptors3D.NPR2(mol)
        return x, y
    except Exception:
        # Best-effort per molecule. Catching Exception (not a bare except)
        # lets KeyboardInterrupt / SystemExit stop a long-running batch.
        return None, None

In [6]:
def run_cal_pmi(i, chunk_size=100000):
    """Compute PMI ratios for the ``i``-th chunk of ``smiles`` and write them to CSV.

    Reads the module-level ``smiles`` list and ``use_filename`` string, and
    calls ``cal_pmi`` on every SMILES in the selected chunk.

    Parameters
    ----------
    i : int
        Zero-based chunk index; the chunk covers
        ``smiles[i * chunk_size:(i + 1) * chunk_size]``.
    chunk_size : int, optional
        Number of SMILES per chunk (default 100000).

    Returns
    -------
    None
        The result is written to ``pmi_<stem of use_filename>_<i>.csv`` with
        columns ``smiles``, ``pmi_x`` and ``pmi_y``.
    """
    ind_start = i * chunk_size
    ind_end = ind_start + chunk_size
    smiles_to_run = smiles[ind_start:ind_end]

    all_pmi = [cal_pmi(s) for s in tqdm(smiles_to_run)]

    out_df = pd.DataFrame(
        {
            # Use the chunk itself, not the full global list: every column
            # must have the same length as the computed results.
            "smiles": smiles_to_run,
            "pmi_x": [pmi[0] for pmi in all_pmi],
            "pmi_y": [pmi[1] for pmi in all_pmi],
        }
    )
    out_df.to_csv(f"pmi_{use_filename.split('.')[0]}_{i}.csv")
    

In [7]:
# Input CSV for this run; the output file names below are derived from it.
use_filename = "stereo_sp3_large.csv"

In [8]:
# Load the input table; later cells read its `smiles` column.
data = pd.read_csv(use_filename)

In [9]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,smiles
0,C[C@H](N[C@H](C)c1ccccc1)[C@@H](C(=O)O)c1ccc(N...
1,C[C@@H](N[C@H](C)c1ccccc1)[C@@H](C(=O)O)c1ccc(...
2,CC[C@@H](C(=O)OC[C@@H](N)c1ccccc1)c1ccc(N2Cc3c...
3,C[C@H](C[C@@H](N)c1ccccc1)[C@@H](C(=O)O)c1ccc(...
4,C[C@@H](C[C@@H](N)c1ccccc1)[C@@H](C(=O)O)c1ccc...


In [10]:
# Pull the SMILES column out of the frame as a plain Python list.
smiles = data["smiles"].tolist()

In [11]:
# Drop duplicate SMILES while keeping first-occurrence order. A plain set()
# orders strings differently in each kernel session (string hashing is
# randomized), which would make the fixed-size chunks taken by run_cal_pmi
# irreproducible between runs.
smiles = list(dict.fromkeys(smiles))

In [12]:
# Number of unique SMILES left after de-duplication.
len(smiles)

3178433

In [None]:
# Process chunk index 3 of `smiles` serially and write its CSV
# (run_cal_pmi slices the list in blocks of 100000).
run_cal_pmi(3)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [None]:
# Run cal_pmi over every SMILES with up to 48 worker processes, handing the
# inputs to the workers in chunks of 1000. Each entry of `result` is the
# (pmi_x, pmi_y) tuple for the SMILES at the same position in `smiles`
# (process_map is expected to preserve input order — confirm against tqdm docs).
result = process_map(cal_pmi,smiles,max_workers=48,chunksize=1000)

  0%|          | 0/3178433 [00:00<?, ?it/s]

In [21]:
# `result` is ordered like the de-duplicated `smiles` list, not like the rows
# of `data`, so a positional assignment would attach values to the wrong rows
# (or fail on a length mismatch). Look each row up by its SMILES instead.
pmi_by_smiles = dict(zip(smiles, result))
data["pmi_x"] = [pmi_by_smiles.get(s, (None, None))[0] for s in data["smiles"]]
data["pmi_y"] = [pmi_by_smiles.get(s, (None, None))[1] for s in data["smiles"]]

In [22]:
# Preview the table with the new pmi_x / pmi_y columns.
data.head()

Unnamed: 0,smiles,pmi_x,pmi_y
0,C=COc1ccno1,0.163239,0.88916
1,C=COOC1=CC=N1,0.117137,0.962207
2,C=COC1=CC=N1,0.197921,0.820305
3,C=Cc1ccno1,0.209901,0.790099
4,C=CC1=CC=NOO1,0.234393,0.765607


In [23]:
# Save the annotated table next to the input, prefixing the input file name with "pmi_".
data.to_csv(f"pmi_{use_filename}")

Observed throughput: roughly 1,000 molecules per minute on 8 cores,
and roughly 6,000 molecules per minute on 48 cores.

In [22]:
# Write the same `data` frame again under a hard-coded file name.
# NOTE(review): this name does not follow the pmi_{use_filename} pattern used
# for the previous save; presumably left over from a run on a different input
# file — confirm before re-running.
data.to_csv("pmi_ac2drug_am2small_small.csv")

In [26]:
# Load the second table; later cells read its `largest_frag` column as SMILES input.
data2 = pd.read_csv("coupled_aromatic_ac2drug_am2small.csv")

In [27]:
# Display the loaded table.
data2

Unnamed: 0,rmat_tag,distance,largest_frag,largest_cleaned
0,50231461,1.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]([15N][1...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc(Nc3cc(Cl)cc...
1,53844353,1.0,[15N][14c]1cc(Cl)cc(Cl)[13c]1-[15c]1cc2c(C3CCC...,Nc1cc(Cl)cc(Cl)c1-c1cc2c(C3CCCCC3)c(-c3ccccc3)...
2,50626702,1.0,[15N][14c]1cc(Cl)cc(Cl)[13c]1[18O][17C](=[17O]...,Nc1cc(Cl)cc(Cl)c1OC(=O)c1cc2c(cc1)c(C1CCCCC1)c...
3,50230203,1.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c][16c]([1...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2ccc(C(=O)ONc3c...
4,53844354,2.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)C(=O...
...,...,...,...,...
7235,39341506,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)=c1c...
7236,39341510,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)=c1c...
7237,43275542,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)=c1c...
7238,43275556,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)=c1c...


In [28]:
# Rebind `smiles` to the largest_frag column of data2, as a plain Python list.
smiles = data2["largest_frag"].tolist()

In [30]:
# Run cal_pmi over the largest_frag SMILES with up to 16 worker processes,
# handing the inputs to the workers in chunks of 500.
result = process_map(cal_pmi,smiles,max_workers=16,chunksize=500)

In [14]:
# Convert the list of (pmi_x, pmi_y) tuples to an (n, 2) float array.
# Forcing dtype=float turns the (None, None) entries of failed molecules into
# NaN instead of silently producing an object-dtype array (and object columns).
result = np.array(result, dtype=float)

In [15]:
# Column 0 of `result` holds pmi_x and column 1 holds pmi_y; attach both to data2.
data2["pmi_x"], data2["pmi_y"] = result[:, 0], result[:, 1]

In [16]:
# Display data2 with the added pmi_x / pmi_y columns.
data2

Unnamed: 0,rmat_tag,distance,largest_frag,largest_cleaned,pmi_x,pmi_y
0,50231461,3.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]([15N][1...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc(Nc3cc(Cl)cc...,0.32488,0.791645
1,53844353,3.0,[15N][14c]1cc(Cl)cc(Cl)[13c]1-[15c]1cc2c(C3CCC...,Nc1cc(Cl)cc(Cl)c1-c1cc2c(C3CCCCC3)c(-c3ccccc3)...,0.388706,0.733463
2,50626702,3.0,[15N][14c]1cc(Cl)cc(Cl)[13c]1[18O][17C](=[17O]...,Nc1cc(Cl)cc(Cl)c1OC(=O)c1cc2c(cc1)c(C1CCCCC1)c...,0.298142,0.794647
3,50230203,3.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c][16c]([1...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2ccc(C(=O)ONc3c...,0.301793,0.797943
4,50231462,4.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)C(=O...,0.314681,0.78479
...,...,...,...,...,...,...
7235,50290366,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)OON=...,0.312885,0.727153
7236,51733479,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]([17C]3=...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc(C3=NOOc4cc(...,0.317694,0.732366
7237,52733260,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)-c1c...,0.415783,0.627639
7238,51718333,14.0,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2c[15c]3[16c](c...,O=C(Cn1c(-c2ccccc2)c(C2CCCCC2)c2cc3c(cc21)Oc1c...,0.370812,0.685895


In [None]:
# Save data2, including the pmi_x / pmi_y columns.
# NOTE(review): the target name has no ".csv" extension, unlike the other
# outputs in this notebook; adding one would collide with the file written
# from `data` in an earlier cell — confirm the intended file name.
data2.to_csv("pmi_ac2drug_am2small_small")