In [1]:
import pandas as pd
import datamol as dm
from tqdm.auto import tqdm
from rdkit.Chem import rdDepictor
import mols2grid

In [2]:
# Read table_3.csv, using its 'Unnamed: 0' column as the row index.
# NOTE(review): this is an absolute, machine-specific Windows path — adjust it before running elsewhere.
table_3_path = 'E:\\Machine Learning\\Cheminformatics\\CompoundDataAcquisition\\table_3.csv'
df_2 = pd.read_csv(table_3_path, index_col='Unnamed: 0')

In [3]:
# Show the loaded table as this cell's output (pandas abbreviates the middle rows of a long frame).
df_2

Unnamed: 0,pIC50,preprocessed_smiles,molecular_weight,n_hba,n_hbd,logp,Lipinski_pass
0,9.522879,OCCN1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1,403.148511,5,1,3.94270,True
1,8.958607,CC(C)(C)[C@]1(O)CCN2C[C@H]3c4ccccc4CCc4cccc(c4...,361.240565,2,1,4.84480,True
2,8.958607,CC(C)(C)[C@]1(O)CCN2C[C@@H]3c4ccccc4CCc4cccc(c...,361.240565,2,1,4.84480,True
3,8.896196,O=C(CCCN1CCC2(CC1)C(=O)NCN2c1ccccc1)c1ccc(F)cc1,395.200905,4,1,3.21710,True
4,8.831503,CC(C)[C@@]1(NC(=O)C2C[C@@H]3c4cccc5[nH]cc(c45)...,611.310769,6,3,2.71720,True
...,...,...,...,...,...,...,...
288,4.775726,Cc1c(C)c2c(c(C)c1O)CCC(C)(CN1CCN(c3cc(N4CCCC4)...,520.352575,8,1,4.21376,True
289,4.700014,CCCCc1oc2ccccc2c1C(=O)c1cc(I)c(OCCN(CC)CC)c(I)c1,645.023690,4,0,6.93620,False
290,4.637932,CCN(CC)C(=S)SSC(=S)N(CC)CC,296.050933,4,0,3.62120,True
291,4.619789,CNC[C@@H]1OCCc2ccsc21,183.071785,3,1,1.58130,True


In [6]:
# Work on a copy so the originally loaded frame (df_2) is left untouched.
df = df_2.copy()

# Keep only the rows flagged as passing Lipinski's rule. reset_index() renumbers the
# rows from 0 and keeps the previous row labels in a new "index" column.
passes_lipinski = df["Lipinski_pass"] == True
df = df.loc[passes_lipinski].reset_index()

# Sanity check: only True should remain in the flag column.
df["Lipinski_pass"].value_counts()

True    260
Name: Lipinski_pass, dtype: int64

The following part of the notebook was inspired by a useful post by Pat Walters.

In [17]:
# Build RDKit mol objects from the preprocessed SMILES column.
df['romol'] = dm.from_df(df, smiles_column="preprocessed_smiles")

# Cluster the molecules. Element 0 of the result is iterated below as one collection
# of row positions per cluster.
cluster_list = dm.cluster_mols(df.romol)
cluster_members = cluster_list[0]

# -1 marks "not assigned to any cluster"; every position visited below is overwritten.
cluster_idx = [-1] * len(df)
for cluster_number, member_positions in enumerate(tqdm(cluster_members)):
    # Align the depictions of the molecules that share a cluster.
    # NOTE(review): partition_method='cluster' selects datamol's clustering-based
    # partitioning (cutoff 0.70) rather than a scaffold-based one — confirm against the datamol docs.
    # NOTE(review): copy=False presumably updates the mol objects stored in df['romol'] in place — confirm.
    dm.align.auto_align_many(
        [df.romol.values[position] for position in member_positions],
        copy=False,
        partition_method='cluster',
        cluster_cutoff=0.70,
    )
    # Record which cluster each row belongs to.
    for position in member_positions:
        cluster_idx[position] = cluster_number

df['cluster_idx'] = cluster_idx

  0%|          | 0/242 [00:00<?, ?it/s]

In [16]:
# One representative row per cluster: order the rows by cluster id, then keep the
# first row encountered for each id. The .copy() makes the result an independent frame.
cluster_sample_df = (
    df
    .sort_values("cluster_idx")
    .drop_duplicates("cluster_idx")
    .copy()
)
cluster_sample_df

Unnamed: 0,index,pIC50,preprocessed_smiles,molecular_weight,n_hba,n_hbd,logp,Lipinski_pass,romol,cluster_idx
107,112,6.853872,CCOc1ccccc1N1CCN(CCCC(=O)NCc2nc3ccc(F)cc3c(=O)...,573.275133,8,1,4.15070,True,<rdkit.Chem.rdchem.Mol object at 0x000001ABF30...,0
84,88,7.136677,Cc1ccc(N2CCN(C[C@@H]3C[C@H]3c3ccccc3)CC2)c(C)c1,320.225249,2,0,4.22914,True,<rdkit.Chem.rdchem.Mol object at 0x000001ABF30...,1
2,2,8.958607,CC(C)(C)[C@]1(O)CCN2C[C@@H]3c4ccccc4CCc4cccc(c...,361.240565,2,1,4.84480,True,<rdkit.Chem.rdchem.Mol object at 0x000001ABF30...,2
201,215,5.823909,Cc1cccc(N2CCN(CCCC(=O)NCc3nc4cc(F)ccc4c(=O)n3-...,527.269654,6,1,4.36024,True,<rdkit.Chem.rdchem.Mol object at 0x000001ABF30...,3
166,176,6.146971,CCCN(CCC)[C@@H]1Cc2cccc3c2n(c(=O)n3CCCCNC(=O)c...,487.294725,5,2,4.54120,True,<rdkit.Chem.rdchem.Mol object at 0x000001ABF30...,4
...,...,...,...,...,...,...,...,...,...,...
7,7,8.769551,CN(C)C(=O)N[C@H]1CC[C@H](CCN2CCN(c3cccc(Cl)c3C...,426.195317,3,1,4.33550,True,<rdkit.Chem.rdchem.Mol object at 0x000001ABF30...,237
6,6,8.791559,OCCN1CCN(CCCN2c3ccccc3Sc3ccc(C(F)(F)F)cc32)CC1,437.174868,5,1,4.30810,True,<rdkit.Chem.rdchem.Mol object at 0x000001ABF30...,238
5,5,8.800794,CC(C)CC1C(=O)N2CCC[C@H]2[C@]2(O)O[C@](NC(=O)C3...,653.221281,6,3,3.19280,True,<rdkit.Chem.rdchem.Mol object at 0x000001ABF30...,239
3,3,8.896196,O=C(CCCN1CCC2(CC1)C(=O)NCN2c1ccccc1)c1ccc(F)cc1,395.200905,4,1,3.21710,True,<rdkit.Chem.rdchem.Mol object at 0x000001ABF30...,240
