## Importing libraries reading data

In [10]:
import sys
import pandas as pd
sys.path.append('../../utils/')
from smiles_cleaner import SmilesCleaner

In [11]:
pdb_nondrugs = pd.read_csv('../../../data/negative_datasets/RAW_datasets/RAW_negative_dataset_pdb_below_tanimoto0.85_to_fda.csv')
print(pdb_nondrugs.shape)
pdb_nondrugs.head(10)

(12333, 3)


Unnamed: 0,index,SMILES,new_SMILES
0,0,CC1(C)O[C@H]2[C@@H]3OS(=O)(=O)O[C@@H]3CO[C@@]2...,CC1(C)OC2C3OS(=O)(=O)OC3COC2(COS(N)(=O)=O)O1
1,1,CNC(=O)c1cccc(C)c1Nc1nc(N2CCN(c3ccccc3Cl)CC2)n...,CNC(=O)c1cccc(C)c1Nc1nc(N2CCN(c3ccccc3Cl)CC2)n...
2,2,CCNC(=O)Nc1cc2c(-c3ccnc(C)c3)ccc(C)c2cn1,CCNC(=O)Nc1cc2c(-c3ccnc(C)c3)ccc(C)c2cn1
3,3,Clc1ccccc1Nc1ccnc(Nc2ccc(-c3nnn[nH]3)cc2)n1,Clc1ccccc1Nc1ccnc(Nc2ccc(-c3nnn[nH]3)cc2)n1
4,4,Cc1c(CCC(=O)O)c2cc3[nH]c(cc4nc(cc5[nH]c(cc1n2)...,CC1=C(CCC(=O)O)c2cc3[nH]c(cc4nc(cc5[nH]c(cc1n2...
5,5,COc1cc2c(c(OC)c1)[C@@H]1[C@H]3CCC[C@@H](C(=O)N...,COc1cc2c(c(OC)c1)C1C3CCCC(C(=O)N1CC2)N3C(=O)C(...
6,6,C[C@@H]1O[C@@H](OC2=C(O)c3c(O)cc(O)cc3O[C@@H]2...,CC1OC(OC2=C(O)c3c(O)cc(O)cc3OC2c2ccc(O)cc2)C(O...
7,7,COC(=O)Nc1ccc2c(c1)NC(=O)C[C@@H](C)/C=C/C[C@H]...,COC(=O)Nc1ccc2c(c1)NC(=O)CC(C)C=CCC(NC(=O)c1c(...
8,8,Nc1ncnc2c1ccn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ccn2C1OC(COP(=O)(O)OP(=O)(O)O[PH](=O...
9,9,NCCNC(=O)c1cncc(-c2cnc(Nc3cc(N4CCOCC4)ccn3)s2)c1,NCCNC(=O)c1cncc(-c2cnc(Nc3cc(N4CCOCC4)ccn3)s2)c1


## Cleaning process

### General Workflow:

1. **Remove Salts**
- Identify and remove compounds with salts.



2. **Neutralize Structures**
- Verify and correct structures without salts.
- Flag and remove structures with errors.

3. **Flag and Remove Duplicates**
- Identify and eliminate duplicate compounds.

4. **Save the Dataset**
- Store the cleaned dataset for future use.


In [12]:
# Working with SMILES since new_SMILES do not have optical isomerism
pdb_nondrugs = pdb_nondrugs[['SMILES']]

In [13]:
# Instatiating SmilesCleaner object

workflow = SmilesCleaner(pdb_nondrugs)

# 1) Strip salt
pdb_stripped_salt = workflow.strip_salt(
    smiles_col='SMILES',
    output_col='stripped_salt_smiles'
)

# Checking to see if there is any salt left and getting rid of
remaining_salts_df = pdb_stripped_salt.df[pdb_stripped_salt.df["stripped_salt_smiles"].str.contains('\.')]
print(f'There is still {remaining_salts_df.shape[0]} salts left')

pdb_stripped_salt.df[~pdb_stripped_salt.df['stripped_salt_smiles'].str.contains('\.')].reset_index(drop=True)

There is still 10 salts left


Unnamed: 0,SMILES,stripped_salt_smiles
0,CC1(C)O[C@H]2[C@@H]3OS(=O)(=O)O[C@@H]3CO[C@@]2...,CC1(C)O[C@H]2[C@@H]3OS(=O)(=O)O[C@@H]3CO[C@@]2...
1,CNC(=O)c1cccc(C)c1Nc1nc(N2CCN(c3ccccc3Cl)CC2)n...,CNC(=O)c1cccc(C)c1Nc1nc(N2CCN(c3ccccc3Cl)CC2)n...
2,CCNC(=O)Nc1cc2c(-c3ccnc(C)c3)ccc(C)c2cn1,CCNC(=O)Nc1cc2c(-c3ccnc(C)c3)ccc(C)c2cn1
3,Clc1ccccc1Nc1ccnc(Nc2ccc(-c3nnn[nH]3)cc2)n1,Clc1ccccc1Nc1ccnc(Nc2ccc(-c3nnn[nH]3)cc2)n1
4,Cc1c(CCC(=O)O)c2cc3[nH]c(cc4nc(cc5[nH]c(cc1n2)...,CC1=C(CCC(=O)O)c2cc3[nH]c(cc4nc(cc5[nH]c(cc1n2...
...,...,...
12318,NS(=O)(=O)OC[C@H]1C[C@@H](Nc2ccnc3cc(-c4cccc(S...,NS(=O)(=O)OC[C@H]1C[C@@H](Nc2ccnc3cc(-c4cccc(S...
12319,NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=...,NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=...
12320,CCOc1ccc(Br)c2c1[B-]1(OC2CN)OC2C(COP(=O)(O)O)O...,CCOc1ccc(Br)c2c1[B-]1(OC2CN)OC2C(COP(=O)(O)O)O...
12321,COCCOc1cnc2ccc([C@H](C)c3nnc4c(F)cc(-c5cc(C)no...,COCCOc1cnc2ccc([C@H](C)c3nnc4c(F)cc(-c5cc(C)no...


In [14]:
# neutralize the salts
pdb_neutralized = pdb_stripped_salt.neutralize(
    smiles_col='stripped_salt_smiles',
    output_col='clean_smiles'
)

# Checking if there is any error
pdb_neutralized.df[pdb_neutralized.df['clean_smiles'] == 'cant be neutralized']

Unnamed: 0,SMILES,stripped_salt_smiles,clean_smiles
757,N[C@@H](CCCC[B-](O)(O)O)C(=O)O,N[C@@H](CCCC[B-](O)(O)O)C(=O)O,cant be neutralized
1070,N[C@](CCCC[B-](O)(O)O)(CC[NH+]1CCCCC1)C(=O)O,N[C@](CCCC[B-](O)(O)O)(CC[NH+]1CCCCC1)C(=O)O,cant be neutralized
2399,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C2O[B-]3(OCc4cc...,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C2O[B-]3(OCc4cc...,cant be neutralized
3319,NC(CCCC[B-](O)(O)O)(C(=O)O)C1CC2CCC(C1)[NH+]2C...,NC(CCCC[B-](O)(O)O)(C(=O)O)C1CC2CCC(C1)[NH+]2C...,cant be neutralized
3469,NC(CCCC[B-](O)(O)O)(CC[NH+]1CCC(O)CC1)C(=O)O,NC(CCCC[B-](O)(O)O)(CC[NH+]1CCC(O)CC1)C(=O)O,cant be neutralized
3832,CCOc1ccc(Cl)c2c1[B-]1(OC2CN)OC2C(COP(=O)(O)O)O...,CCOc1ccc(Cl)c2c1[B-]1(OC2CN)OC2C(COP(=O)(O)O)O...,cant be neutralized
5784,C[C@](N)(CCCC[B-](O)(O)O)C(=O)O,C[C@](N)(CCCC[B-](O)(O)O)C(=O)O,cant be neutralized
5999,NC(CCCC[B-](O)(O)O)(C(=O)O)C1CC[NH+](Cc2ccc(Cl...,NC(CCCC[B-](O)(O)O)(C(=O)O)C1CC[NH+](Cc2ccc(Cl...,cant be neutralized
6380,NC(CCCC[B-](O)(O)O)(C(=O)O)C1CC[NH+](Cc2ccc(Cl...,NC(CCCC[B-](O)(O)O)(C(=O)O)C1CC[NH+](Cc2ccc(Cl...,cant be neutralized
6748,N[C@@H](CSCC[B-](O)(O)O)C(=O)O,N[C@@H](CSCC[B-](O)(O)O)C(=O)O,cant be neutralized


In [15]:
# This chunk genrates a lot of warning messages so we disable them using the RDLogger
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# Dropping the structures that cant be neutralized
pdb_neutralized = pdb_neutralized.df[pdb_neutralized.df['clean_smiles'] != 'cant be neutralized'].reset_index(drop=True)
pdb_neutralized = SmilesCleaner(pdb_neutralized)

# Search duplicate
pdb_flag_duplicate = pdb_neutralized.search_duplicate(
    smiles_col='clean_smiles',
    keep_inchi=True
)

In [16]:
pdb_cleaned = pdb_flag_duplicate.df.drop_duplicates(subset=['inchi'])
print(f"Before dropping duplicates: {pdb_flag_duplicate.df['duplicated'].sum()} After dropping duplicates: {pdb_cleaned['duplicated'].sum()}")

Before dropping duplicates: 134 After dropping duplicates: 65


## Saving

In [8]:
print(pdb_cleaned.shape)
print(pdb_cleaned.columns)
pdb_cleaned.head(10)

(12246, 5)
Index(['SMILES', 'stripped_salt_smiles', 'clean_smiles', 'inchi',
       'duplicated'],
      dtype='object')


Unnamed: 0,SMILES,stripped_salt_smiles,clean_smiles,inchi,duplicated
0,CC1(C)O[C@H]2[C@@H]3OS(=O)(=O)O[C@@H]3CO[C@@]2...,CC1(C)O[C@H]2[C@@H]3OS(=O)(=O)O[C@@H]3CO[C@@]2...,CC1(C)O[C@H]2[C@@H]3OS(=O)(=O)O[C@@H]3CO[C@@]2...,"InChI=1S/C9H15NO10S2/c1-8(2)17-7-6-5(18-22(13,...",False
1,CNC(=O)c1cccc(C)c1Nc1nc(N2CCN(c3ccccc3Cl)CC2)n...,CNC(=O)c1cccc(C)c1Nc1nc(N2CCN(c3ccccc3Cl)CC2)n...,CNC(=O)c1cccc(C)c1Nc1nc(N2CCN(c3ccccc3Cl)CC2)n...,InChI=1S/C28H33ClN10O4S/c1-17-6-5-7-18(24(41)3...,False
2,CCNC(=O)Nc1cc2c(-c3ccnc(C)c3)ccc(C)c2cn1,CCNC(=O)Nc1cc2c(-c3ccnc(C)c3)ccc(C)c2cn1,CCNC(=O)Nc1cc2c(-c3ccnc(C)c3)ccc(C)c2cn1,InChI=1S/C19H20N4O/c1-4-20-19(24)23-18-10-16-1...,False
3,Clc1ccccc1Nc1ccnc(Nc2ccc(-c3nnn[nH]3)cc2)n1,Clc1ccccc1Nc1ccnc(Nc2ccc(-c3nnn[nH]3)cc2)n1,Clc1ccccc1Nc1ccnc(Nc2ccc(-c3nnn[nH]3)cc2)n1,InChI=1S/C17H13ClN8/c18-13-3-1-2-4-14(13)21-15...,False
4,Cc1c(CCC(=O)O)c2cc3[nH]c(cc4nc(cc5[nH]c(cc1n2)...,CC1=C(CCC(=O)O)c2cc3[nH]c(cc4nc(cc5[nH]c(cc1n2...,CC1=C(CCC(=O)O)c2cc3[nH]c(cc4nc(cc5[nH]c(cc1n2...,InChI=1S/C30H30N4O10S2/c1-13-17(5-7-27(35)36)2...,False
5,COc1cc2c(c(OC)c1)[C@@H]1[C@H]3CCC[C@@H](C(=O)N...,COc1cc2c(c(OC)c1)[C@@H]1[C@H]3CCC[C@@H](C(=O)N...,COc1cc2c(c(OC)c1)[C@@H]1[C@H]3CCC[C@@H](C(=O)N...,InChI=1S/C28H32N2O8/c1-34-17-11-15-9-10-29-24(...,False
6,C[C@@H]1O[C@@H](OC2=C(O)c3c(O)cc(O)cc3O[C@@H]2...,C[C@@H]1O[C@@H](OC2=C(O)c3c(O)cc(O)cc3O[C@@H]2...,C[C@@H]1O[C@@H](OC2=C(O)c3c(O)cc(O)cc3O[C@@H]2...,InChI=1S/C21H22O10/c1-8-15(25)17(27)18(28)21(2...,False
7,COC(=O)Nc1ccc2c(c1)NC(=O)C[C@@H](C)/C=C/C[C@H]...,COC(=O)Nc1ccc2c(c1)NC(=O)C[C@@H](C)/C=C/C[C@H]...,COC(=O)Nc1ccc2c(c1)NC(=O)C[C@@H](C)/C=C/C[C@H]...,InChI=1S/C27H26ClF2N5O4/c1-13-5-4-6-19(33-26(3...,False
8,Nc1ncnc2c1ccn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ccn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ccn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,InChI=1S/C11H16N4O13P3/c12-9-5-1-2-15(10(5)14-...,False
9,NCCNC(=O)c1cncc(-c2cnc(Nc3cc(N4CCOCC4)ccn3)s2)c1,NCCNC(=O)c1cncc(-c2cnc(Nc3cc(N4CCOCC4)ccn3)s2)c1,NCCNC(=O)c1cncc(-c2cnc(Nc3cc(N4CCOCC4)ccn3)s2)c1,InChI=1S/C20H23N7O2S/c21-2-4-24-19(28)15-9-14(...,False


In [9]:
# Saving
pdb_cleaned[['clean_smiles']].to_csv('../../../data/negative_datasets/cleaned_datasets/pdb_cleaned.csv', index=False)