Data Preprocessing with RDKit

1. Import necessary libraries

In [1]:
import math 

import datamol as dm
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.patches as mpatches
from rdkit import Chem
from rdkit.Chem import Descriptors, Draw, PandasTools
from rdkit.Chem.MolStandardize import rdMolStandardize

2. Load a dataset 

In [2]:
df = pd.read_csv('D:\\MachineLearning\\table_1.csv', index_col='Unnamed: 0')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 293 entries, 0 to 292
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   canonical_smiles    293 non-null    object 
 1   molecule_chembl_id  293 non-null    object 
 2   standard_units      293 non-null    object 
 3   standard_value      293 non-null    float64
 4   pIC50               293 non-null    float64
 5   ROMol               293 non-null    object 
dtypes: float64(2), object(4)
memory usage: 16.0+ KB


In [4]:
df.head(20)

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_units,standard_value,pIC50,ROMol
0,OCCN1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1,CHEMBL567,nM,0.3,9.522879,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...
1,CC(C)(C)[C@]1(O)CCN2C[C@H]3c4ccccc4CCc4cccc(c4...,CHEMBL8514,nM,1.1,8.958607,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...
2,CC(C)(C)[C@]1(O)CCN2C[C@@H]3c4ccccc4CCc4cccc(c...,CHEMBL3885419,nM,1.1,8.958607,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...
3,O=C(CCCN1CCC2(CC1)C(=O)NCN2c1ccccc1)c1ccc(F)cc1,CHEMBL267930,nM,1.27,8.896196,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...
4,CC(C)[C@@]1(NC(=O)[C@@H]2C[C@@H]3c4cccc5[nH]cc...,CHEMBL1255837,nM,1.474,8.831503,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...
5,CC(C)C[C@H]1C(=O)N2CCC[C@H]2[C@]2(O)O[C@](NC(=...,CHEMBL493,nM,1.582,8.800794,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...
6,OCCN1CCN(CCCN2c3ccccc3Sc3ccc(C(F)(F)F)cc32)CC1,CHEMBL726,nM,1.616,8.791559,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...
7,CN(C)C(=O)N[C@H]1CC[C@H](CCN2CCN(c3cccc(Cl)c3C...,CHEMBL2028019,nM,1.7,8.769551,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...
8,CC(C)(C)NS(=O)(=O)c1ccc(-c2sc(C(=O)N[C@H]3C[C@...,CHEMBL4128926,nM,1.8,8.744727,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...
9,CCN1CCC[C@H]1CNC(=O)c1c(O)c(Cl)cc(Cl)c1OC,CHEMBL8809,nM,1.83,8.737549,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...


Now it is time to preprocess the SMILES notation of loaded molecules. Let;s break down this snippet of code: 

rd.MolStandardize 

In [5]:
smiles_column = "canonical_smiles"

def preprocessSmiles(row):
    
    
    """
    Applied from: https://bitsilla.com/blog/2021/06/standardizing-a-molecule-using-rdkit/
    
    """
    mol = Chem.MolFromSmiles(row[smiles_column])
    clean_mol = rdMolStandardize.Cleanup(mol)
    parent_mol = rdMolStandardize.FragmentParent(clean_mol)
    neut_charge = rdMolStandardize.Uncharger()
    neut_mols = neut_charge.uncharge(parent_mol)
    
    taut_en = rdMolStandardize.TautomerEnumerator()
    taut_uncharged_parent_clean_mol = taut_en.Canonicalize(neut_mols)
    
    standardized_smiles= Chem.MolToSmiles(taut_uncharged_parent_clean_mol,isomericSmiles=True)
    
    return standardized_smiles
    

In [6]:
df['preprocessed_smiles'] = df.apply(preprocessSmiles, axis=1)

[09:12:43] Initializing MetalDisconnector
[09:12:43] Running MetalDisconnector
[09:12:43] Initializing Normalizer
[09:12:43] Running Normalizer
[09:12:43] Initializing MetalDisconnector
[09:12:43] Running MetalDisconnector
[09:12:43] Initializing Normalizer
[09:12:43] Running Normalizer
[09:12:43] Running LargestFragmentChooser
[09:12:43] Fragment: OCCN1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1
[09:12:43] New largest fragment: OCCN1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1 (53)
[09:12:43] Running Uncharger
[09:12:43] Initializing MetalDisconnector
[09:12:43] Running MetalDisconnector
[09:12:43] Initializing Normalizer
[09:12:43] Running Normalizer
[09:12:43] Initializing MetalDisconnector
[09:12:43] Running MetalDisconnector
[09:12:43] Initializing Normalizer
[09:12:43] Running Normalizer
[09:12:43] Running LargestFragmentChooser
[09:12:43] Fragment: CC(C)(C)[C@]1(O)CCN2C[C@H]3c4ccccc4CCc4cccc(c43)[C@@H]2C1
[09:12:43] New largest fragment: CC(C)(C)[C@]1(O)CCN2C[C@H]3c4ccccc4CCc4cccc(c43)[C@@H]2C1 

[09:12:44] Running MetalDisconnector
[09:12:44] Initializing Normalizer
[09:12:44] Running Normalizer
[09:12:44] Initializing MetalDisconnector
[09:12:44] Running MetalDisconnector
[09:12:44] Initializing Normalizer
[09:12:44] Running Normalizer
[09:12:44] Running LargestFragmentChooser
[09:12:44] Fragment: O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1
[09:12:44] New largest fragment: O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1 (49)
[09:12:44] Running Uncharger
[09:12:44] Initializing MetalDisconnector
[09:12:44] Running MetalDisconnector
[09:12:44] Initializing Normalizer
[09:12:44] Running Normalizer
[09:12:44] Initializing MetalDisconnector
[09:12:44] Running MetalDisconnector
[09:12:44] Initializing Normalizer
[09:12:44] Running Normalizer
[09:12:44] Running LargestFragmentChooser
[09:12:44] Fragment: NC(=O)C1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1
[09:12:44] New largest fragment: NC(=O)C1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1 (51)
[09:12:44] Running Uncharger
[09:12:44] Initializing Meta

[09:12:45] Initializing MetalDisconnector
[09:12:45] Running MetalDisconnector
[09:12:45] Initializing Normalizer
[09:12:45] Running Normalizer
[09:12:45] Initializing MetalDisconnector
[09:12:45] Running MetalDisconnector
[09:12:45] Initializing Normalizer
[09:12:45] Running Normalizer
[09:12:45] Running LargestFragmentChooser
[09:12:45] Fragment: Br
[09:12:45] New largest fragment: Br (2)
[09:12:45] Fragment: Fc1ccc([C@H]2C[C@@H]2CN2CCN(c3ccccc3Cl)CC2)cc1
[09:12:45] New largest fragment: Fc1ccc([C@H]2C[C@@H]2CN2CCN(c3ccccc3Cl)CC2)cc1 (46)
[09:12:45] Running Uncharger
[09:12:45] Initializing MetalDisconnector
[09:12:45] Running MetalDisconnector
[09:12:45] Initializing Normalizer
[09:12:45] Running Normalizer
[09:12:45] Initializing MetalDisconnector
[09:12:45] Running MetalDisconnector
[09:12:45] Initializing Normalizer
[09:12:45] Running Normalizer
[09:12:45] Running LargestFragmentChooser
[09:12:45] Fragment: COc1cccc(C(=O)CCCN2CCN(c3ccccc3OC)CC2)c1
[09:12:45] New largest fragment:

[09:12:59] Running MetalDisconnector
[09:12:59] Initializing Normalizer
[09:12:59] Running Normalizer
[09:12:59] Running LargestFragmentChooser
[09:12:59] Fragment: COc1ccc2c(c1OC)-c1c(Br)ccc3c1[C@@H](C2)N(C)CC3
[09:12:59] New largest fragment: COc1ccc2c(c1OC)-c1c(Br)ccc3c1[C@@H](C2)N(C)CC3 (43)
[09:12:59] Running Uncharger
[09:12:59] Initializing MetalDisconnector
[09:12:59] Running MetalDisconnector
[09:12:59] Initializing Normalizer
[09:12:59] Running Normalizer
[09:12:59] Initializing MetalDisconnector
[09:12:59] Running MetalDisconnector
[09:12:59] Initializing Normalizer
[09:12:59] Running Normalizer
[09:12:59] Running LargestFragmentChooser
[09:12:59] Fragment: Br
[09:12:59] New largest fragment: Br (2)
[09:12:59] Fragment: Clc1cc(Cl)cc(N2CCN(C[C@H]3C[C@@H]3c3ccccc3)CC2)c1
[09:12:59] New largest fragment: Clc1cc(Cl)cc(N2CCN(C[C@H]3C[C@@H]3c3ccccc3)CC2)c1 (46)
[09:12:59] Running Uncharger
[09:12:59] Initializing MetalDisconnector
[09:12:59] Running MetalDisconnector
[09:12:59] In

[09:13:00] Initializing MetalDisconnector
[09:13:00] Running MetalDisconnector
[09:13:00] Initializing Normalizer
[09:13:00] Running Normalizer
[09:13:00] Running LargestFragmentChooser
[09:13:00] Fragment: CN1CCN(C2=Nc3ccccc3Oc3ccc(Cl)cc32)CC1
[09:13:00] New largest fragment: CN1CCN(C2=Nc3ccccc3Oc3ccc(Cl)cc32)CC1 (41)
[09:13:00] Running Uncharger
[09:13:00] Initializing MetalDisconnector
[09:13:00] Running MetalDisconnector
[09:13:00] Initializing Normalizer
[09:13:00] Running Normalizer
[09:13:00] Initializing MetalDisconnector
[09:13:00] Running MetalDisconnector
[09:13:00] Initializing Normalizer
[09:13:00] Running Normalizer
[09:13:00] Running LargestFragmentChooser
[09:13:00] Fragment: CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21
[09:13:00] New largest fragment: CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21 (40)
[09:13:00] Running Uncharger
[09:13:00] Initializing MetalDisconnector
[09:13:00] Running MetalDisconnector
[09:13:00] Initializing Normalizer
[09:13:00] Running Normalizer
[09:13:00] Initializin

[09:13:00] Initializing MetalDisconnector
[09:13:00] Running MetalDisconnector
[09:13:00] Initializing Normalizer
[09:13:00] Running Normalizer
[09:13:00] Initializing MetalDisconnector
[09:13:00] Running MetalDisconnector
[09:13:00] Initializing Normalizer
[09:13:00] Running Normalizer
[09:13:00] Running LargestFragmentChooser
[09:13:00] Fragment: Cl
[09:13:00] New largest fragment: Cl (2)
[09:13:00] Fragment: Clc1cccc(Nc2nc(Cc3ccccc3)nc3c2CCNCC3)c1
[09:13:00] New largest fragment: Clc1cccc(Nc2nc(Cc3ccccc3)nc3c2CCNCC3)c1 (47)
[09:13:00] Running Uncharger
[09:13:00] Initializing MetalDisconnector
[09:13:00] Running MetalDisconnector
[09:13:00] Initializing Normalizer
[09:13:00] Running Normalizer
[09:13:00] Initializing MetalDisconnector
[09:13:00] Running MetalDisconnector
[09:13:00] Initializing Normalizer
[09:13:00] Running Normalizer
[09:13:00] Running LargestFragmentChooser
[09:13:00] Fragment: O=C(NCCCCN1CCN(c2cccc3ccccc23)CC1)c1cc2ccccc2[nH]1
[09:13:00] New largest fragment: O=C

[09:13:01] Running MetalDisconnector
[09:13:01] Initializing Normalizer
[09:13:01] Running Normalizer
[09:13:01] Running LargestFragmentChooser
[09:13:01] Fragment: CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1
[09:13:01] New largest fragment: CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1 (50)
[09:13:01] Running Uncharger
[09:13:01] Initializing MetalDisconnector
[09:13:01] Running MetalDisconnector
[09:13:01] Initializing Normalizer
[09:13:01] Running Normalizer
[09:13:01] Initializing MetalDisconnector
[09:13:01] Running MetalDisconnector
[09:13:01] Initializing Normalizer
[09:13:01] Running Normalizer
[09:13:01] Running LargestFragmentChooser
[09:13:01] Fragment: CC1CC(=O)c2ccc(N3CCN(C)CC3)cc2N1S(=O)(=O)c1ccc2ccccc2c1
[09:13:01] New largest fragment: CC1CC(=O)c2ccc(N3CCN(C)CC3)cc2N1S(=O)(=O)c1ccc2ccccc2c1 (59)
[09:13:01] Running Uncharger
[09:13:01] Initializing MetalDisconnector
[09:13:01] Running MetalDisconnector
[09:13:01] Initializing Normalizer
[09:13:01] Running Normalizer
[0

[09:13:02] Initializing MetalDisconnector
[09:13:02] Running MetalDisconnector
[09:13:02] Initializing Normalizer
[09:13:02] Running Normalizer
[09:13:02] Initializing MetalDisconnector
[09:13:02] Running MetalDisconnector
[09:13:02] Initializing Normalizer
[09:13:02] Running Normalizer
[09:13:02] Running LargestFragmentChooser
[09:13:02] Fragment: Clc1ccc2c(c1)C(N1CCNCC1)=Nc1ccccc1O2
[09:13:02] New largest fragment: Clc1ccc2c(c1)C(N1CCNCC1)=Nc1ccccc1O2 (38)
[09:13:02] Running Uncharger
[09:13:02] Initializing MetalDisconnector
[09:13:02] Running MetalDisconnector
[09:13:02] Initializing Normalizer
[09:13:02] Running Normalizer
[09:13:02] Initializing MetalDisconnector
[09:13:02] Running MetalDisconnector
[09:13:02] Initializing Normalizer
[09:13:02] Running Normalizer
[09:13:02] Running LargestFragmentChooser
[09:13:02] Fragment: Br
[09:13:02] New largest fragment: Br (2)
[09:13:02] Fragment: Fc1ccc([C@H]2C[C@@H]2CN2CCN(c3ccc(Cl)cc3)CC2)cc1
[09:13:02] New largest fragment: Fc1ccc([C@H

[09:13:09] Initializing MetalDisconnector
[09:13:09] Running MetalDisconnector
[09:13:09] Initializing Normalizer
[09:13:09] Running Normalizer
[09:13:09] Running LargestFragmentChooser
[09:13:09] Fragment: O=Cc1cnn2ccc(OCCCCN3CCN(c4cccc(Cl)c4Cl)CC3)cc12
[09:13:09] New largest fragment: O=Cc1cnn2ccc(OCCCCN3CCN(c4cccc(Cl)c4Cl)CC3)cc12 (54)
[09:13:09] Running Uncharger
[09:13:09] Initializing MetalDisconnector
[09:13:09] Running MetalDisconnector
[09:13:09] Initializing Normalizer
[09:13:09] Running Normalizer
[09:13:09] Initializing MetalDisconnector
[09:13:09] Running MetalDisconnector
[09:13:09] Initializing Normalizer
[09:13:09] Running Normalizer
[09:13:09] Running LargestFragmentChooser
[09:13:09] Fragment: COc1ccccc1-c1cccc(CN2CCN(c3ncccn3)CC2)c1
[09:13:09] New largest fragment: COc1ccccc1-c1cccc(CN2CCN(c3ncccn3)CC2)c1 (51)
[09:13:09] Running Uncharger
[09:13:09] Initializing MetalDisconnector
[09:13:09] Running MetalDisconnector
[09:13:09] Initializing Normalizer
[09:13:09] Runni

[09:13:09] Running LargestFragmentChooser
[09:13:09] Fragment: CC(C)Oc1cccnc1N(C)C1CCN(Cc2ccccc2)CC1
[09:13:09] New largest fragment: CC(C)Oc1cccnc1N(C)C1CCN(Cc2ccccc2)CC1 (54)
[09:13:09] Fragment: O=C(O)/C=C\C(=O)O
[09:13:09] Running Uncharger
[09:13:09] Initializing MetalDisconnector
[09:13:09] Running MetalDisconnector
[09:13:09] Initializing Normalizer
[09:13:09] Running Normalizer
[09:13:09] Initializing MetalDisconnector
[09:13:09] Running MetalDisconnector
[09:13:09] Initializing Normalizer
[09:13:09] Running Normalizer
[09:13:09] Running LargestFragmentChooser
[09:13:09] Fragment: CC(=O)c1sc(NC(=O)N[C@@H]2CN(C(C)=O)CC[C@H]2CN2CCC[C@@H](Cc3ccc(F)cc3)C2)nc1C
[09:13:09] New largest fragment: CC(=O)c1sc(NC(=O)N[C@@H]2CN(C(C)=O)CC[C@H]2CN2CCC[C@@H](Cc3ccc(F)cc3)C2)nc1C (73)
[09:13:09] Running Uncharger
[09:13:10] Initializing MetalDisconnector
[09:13:10] Running MetalDisconnector
[09:13:10] Initializing Normalizer
[09:13:10] Running Normalizer
[09:13:10] Initializing MetalDisconnect

[09:13:10] Running Uncharger
[09:13:10] Initializing MetalDisconnector
[09:13:10] Running MetalDisconnector
[09:13:10] Initializing Normalizer
[09:13:10] Running Normalizer
[09:13:10] Initializing MetalDisconnector
[09:13:10] Running MetalDisconnector
[09:13:10] Initializing Normalizer
[09:13:10] Running Normalizer
[09:13:10] Running LargestFragmentChooser
[09:13:10] Fragment: COc1ccc2c(c1)C[C@H]1c3cc(O)c(OC)cc3CCN1C2
[09:13:10] New largest fragment: COc1ccc2c(c1)C[C@H]1c3cc(O)c(OC)cc3CCN1C2 (44)
[09:13:10] Running Uncharger
[09:13:10] Initializing MetalDisconnector
[09:13:10] Running MetalDisconnector
[09:13:10] Initializing Normalizer
[09:13:10] Running Normalizer
[09:13:10] Initializing MetalDisconnector
[09:13:10] Running MetalDisconnector
[09:13:10] Initializing Normalizer
[09:13:10] Running Normalizer
[09:13:10] Running LargestFragmentChooser
[09:13:10] Fragment: CCN1CCC[C@H]1CNC(=O)c1c(OC)ccc(Br)c1OC
[09:13:10] New largest fragment: CCN1CCC[C@H]1CNC(=O)c1c(OC)ccc(Br)c1OC (45)
[0

[09:13:11] Initializing MetalDisconnector
[09:13:11] Running MetalDisconnector
[09:13:11] Initializing Normalizer
[09:13:11] Running Normalizer
[09:13:11] Initializing MetalDisconnector
[09:13:11] Running MetalDisconnector
[09:13:11] Initializing Normalizer
[09:13:11] Running Normalizer
[09:13:11] Running LargestFragmentChooser
[09:13:11] Fragment: c1c2c(cc3c1OCO3)CN1CCc3cc4c(cc3[C@@H]1C2)OCO4
[09:13:11] New largest fragment: c1c2c(cc3c1OCO3)CN1CCc3cc4c(cc3[C@@H]1C2)OCO4 (41)
[09:13:11] Running Uncharger
[09:13:11] Initializing MetalDisconnector
[09:13:11] Running MetalDisconnector
[09:13:11] Initializing Normalizer
[09:13:11] Running Normalizer
[09:13:11] Initializing MetalDisconnector
[09:13:11] Running MetalDisconnector
[09:13:11] Initializing Normalizer
[09:13:11] Running Normalizer
[09:13:11] Running LargestFragmentChooser
[09:13:11] Fragment: Cc1cccc(-c2ccc(C(=O)NC/C=C/CN3CCN(c4cccc(Cl)c4Cl)CC3)cc2)n1
[09:13:11] New largest fragment: Cc1cccc(-c2ccc(C(=O)NC/C=C/CN3CCN(c4cccc(Cl)c4

[09:13:12] Fragment: CN1CCN(C2=Nc3cc(Cl)ccc3N(NC(=O)CCCCC(=O)NN3c4ccc(Cl)cc4N=C(N4CCN(C)CC4)c4ccccc43)c3ccccc32)CC1
[09:13:12] New largest fragment: CN1CCN(C2=Nc3cc(Cl)ccc3N(NC(=O)CCCCC(=O)NN3c4ccc(Cl)cc4N=C(N4CCN(C)CC4)c4ccccc43)c3ccccc32)CC1 (102)
[09:13:12] Running Uncharger
[09:13:12] Initializing MetalDisconnector
[09:13:12] Running MetalDisconnector
[09:13:12] Initializing Normalizer
[09:13:12] Running Normalizer
[09:13:12] Initializing MetalDisconnector
[09:13:12] Running MetalDisconnector
[09:13:12] Initializing Normalizer
[09:13:12] Running Normalizer
[09:13:12] Running LargestFragmentChooser
[09:13:12] Fragment: CNC(=O)CN1CN(c2ccccc2)C2(CCN(Cc3cc4c(cc3Cl)OCO4)CC2)C1=O
[09:13:12] New largest fragment: CNC(=O)CN1CN(c2ccccc2)C2(CCN(Cc3cc4c(cc3Cl)OCO4)CC2)C1=O (60)
[09:13:12] Running Uncharger
[09:13:12] Initializing MetalDisconnector
[09:13:12] Running MetalDisconnector
[09:13:12] Initializing Normalizer
[09:13:12] Running Normalizer
[09:13:12] Initializing MetalDisconnector
[09

[09:13:14] Fragment: CCCN(CCCCNC(=O)c1cc2ccccc2[nH]1)[C@@H]1Cc2cccc3c2n(c(=O)n3CCC)C1
[09:13:14] New largest fragment: CCCN(CCCCNC(=O)c1cc2ccccc2[nH]1)[C@@H]1Cc2cccc3c2n(c(=O)n3CCC)C1 (73)
[09:13:14] Running Uncharger
[09:13:14] Initializing MetalDisconnector
[09:13:14] Running MetalDisconnector
[09:13:14] Initializing Normalizer
[09:13:14] Running Normalizer
[09:13:14] Initializing MetalDisconnector
[09:13:14] Running MetalDisconnector
[09:13:14] Initializing Normalizer
[09:13:14] Running Normalizer
[09:13:14] Running LargestFragmentChooser
[09:13:14] Fragment: CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccc(N(C)C)cc2)cc1
[09:13:14] New largest fragment: CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccc(N(C)C)cc2)cc1 (58)
[09:13:14] Fragment: [Cl-]
[09:13:14] Running Uncharger
[09:13:14] Initializing MetalDisconnector
[09:13:14] Running MetalDisconnector
[09:13:14] Initializing Normalizer
[09:13:14] Running Normalizer
[09:13:14] Initializing MetalDisconnector
[09:13:14] Running MetalDisconnector
[0

[09:13:15] Initializing MetalDisconnector
[09:13:15] Running MetalDisconnector
[09:13:15] Initializing Normalizer
[09:13:15] Running Normalizer
[09:13:15] Initializing MetalDisconnector
[09:13:15] Running MetalDisconnector
[09:13:15] Initializing Normalizer
[09:13:15] Running Normalizer
[09:13:15] Running LargestFragmentChooser
[09:13:15] Fragment: CCOc1cc(N)c(Cl)cc1C(=O)NC[C@@H]1CCN2CCC[C@@H]12
[09:13:15] New largest fragment: CCOc1cc(N)c(Cl)cc1C(=O)NC[C@@H]1CCN2CCC[C@@H]12 (47)
[09:13:15] Running Uncharger
[09:13:15] Initializing MetalDisconnector
[09:13:15] Running MetalDisconnector
[09:13:15] Initializing Normalizer
[09:13:15] Running Normalizer
[09:13:15] Initializing MetalDisconnector
[09:13:15] Running MetalDisconnector
[09:13:15] Initializing Normalizer
[09:13:15] Running Normalizer
[09:13:15] Running LargestFragmentChooser
[09:13:15] Fragment: CN1CCc2cccc3c2[C@H]1Cc1ccc(OCCCNC(=O)CCCCC(=O)NCCCOc2ccc4c(c2O)-c2cccc5c2[C@@H](C4)N(C)CC5)c(O)c1-3
[09:13:15] New largest fragment: CN

[09:13:16] Initializing MetalDisconnector
[09:13:16] Running MetalDisconnector
[09:13:16] Initializing Normalizer
[09:13:16] Running Normalizer
[09:13:16] Initializing MetalDisconnector
[09:13:16] Running MetalDisconnector
[09:13:16] Initializing Normalizer
[09:13:16] Running Normalizer
[09:13:16] Running LargestFragmentChooser
[09:13:16] Fragment: Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
[09:13:16] New largest fragment: Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1 (42)
[09:13:16] Running Uncharger
[09:13:16] Initializing MetalDisconnector
[09:13:16] Running MetalDisconnector
[09:13:16] Initializing Normalizer
[09:13:16] Running Normalizer
[09:13:16] Initializing MetalDisconnector
[09:13:16] Running MetalDisconnector
[09:13:16] Initializing Normalizer
[09:13:16] Running Normalizer
[09:13:16] Running LargestFragmentChooser
[09:13:16] Fragment: Clc1ccc(COC(Cn2ccnc2)c2ccc(Cl)cc2Cl)c(Cl)c1
[09:13:16] New largest fragment: Clc1ccc(COC(Cn2ccnc2)c2ccc(Cl)cc2Cl)c(Cl)c1 (39)
[09:13:16] Running Uncharge

In [7]:
df.head(20)

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_units,standard_value,pIC50,ROMol,preprocessed_smiles
0,OCCN1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1,CHEMBL567,nM,0.3,9.522879,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...,OCCN1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1
1,CC(C)(C)[C@]1(O)CCN2C[C@H]3c4ccccc4CCc4cccc(c4...,CHEMBL8514,nM,1.1,8.958607,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...,CC(C)(C)[C@]1(O)CCN2C[C@H]3c4ccccc4CCc4cccc(c4...
2,CC(C)(C)[C@]1(O)CCN2C[C@@H]3c4ccccc4CCc4cccc(c...,CHEMBL3885419,nM,1.1,8.958607,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...,CC(C)(C)[C@]1(O)CCN2C[C@@H]3c4ccccc4CCc4cccc(c...
3,O=C(CCCN1CCC2(CC1)C(=O)NCN2c1ccccc1)c1ccc(F)cc1,CHEMBL267930,nM,1.27,8.896196,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...,O=C(CCCN1CCC2(CC1)C(=O)NCN2c1ccccc1)c1ccc(F)cc1
4,CC(C)[C@@]1(NC(=O)[C@@H]2C[C@@H]3c4cccc5[nH]cc...,CHEMBL1255837,nM,1.474,8.831503,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...,CC(C)[C@@]1(NC(=O)C2C[C@@H]3c4cccc5[nH]cc(c45)...
5,CC(C)C[C@H]1C(=O)N2CCC[C@H]2[C@]2(O)O[C@](NC(=...,CHEMBL493,nM,1.582,8.800794,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...,CC(C)CC1C(=O)N2CCC[C@H]2[C@]2(O)O[C@](NC(=O)C3...
6,OCCN1CCN(CCCN2c3ccccc3Sc3ccc(C(F)(F)F)cc32)CC1,CHEMBL726,nM,1.616,8.791559,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...,OCCN1CCN(CCCN2c3ccccc3Sc3ccc(C(F)(F)F)cc32)CC1
7,CN(C)C(=O)N[C@H]1CC[C@H](CCN2CCN(c3cccc(Cl)c3C...,CHEMBL2028019,nM,1.7,8.769551,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...,CN(C)C(=O)N[C@H]1CC[C@H](CCN2CCN(c3cccc(Cl)c3C...
8,CC(C)(C)NS(=O)(=O)c1ccc(-c2sc(C(=O)N[C@H]3C[C@...,CHEMBL4128926,nM,1.8,8.744727,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...,CC(C)(C)NS(=O)(=O)c1ccc(-c2sc(C(=O)NC3CC(C(=O)...
9,CCN1CCC[C@H]1CNC(=O)c1c(O)c(Cl)cc(Cl)c1OC,CHEMBL8809,nM,1.83,8.737549,<rdkit.Chem.rdchem.Mol object at 0x000001186AC...,CCN1CCC[C@H]1CNC(=O)c1c(O)c(Cl)cc(Cl)c1OC


In [8]:
df.to_csv('D:\\MachineLearning\\table_2.csv')

In [9]:
PandasTools.AddMoleculeColumnToFrame(df, "preprocessed_smiles")