## Importing libraries

In [1]:
import sys
import pandas as pd
sys.path.append('../../utils/')
from smiles_cleaner import SmilesCleaner
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

## Dataframe from SMILES list

Joining the two datasets *[...]sample_1* and *[...]sample_2* into only one *dataframe*

In [2]:
def read_smiles_file(file_path):
    with open(file_path, "r") as file:
        smiles = file.read().splitlines()
    return list(smiles)

In [3]:
zinc15_1 = read_smiles_file("../../../data/negative_datasets/RAW_datasets/RAW_zinc15_nondrugs_sample_1.smiles")
zinc15_2 = read_smiles_file("../../../data/negative_datasets/RAW_datasets/RAW_zinc15_nondrugs_sample_2.smiles")

In [4]:
zinc15_smiles = zinc15_1 + zinc15_2
print(f"{len(zinc15_1)} + {len(zinc15_2)} = {len(zinc15_smiles)}")

4987 + 4984 = 9971


In [5]:
zinc15 = pd.DataFrame(zinc15_smiles, columns=["smiles"])
zinc15.head()

Unnamed: 0,smiles
0,Cc1c(C(=O)Nc2ccccc2C(N)=O)nnn1-c1ccc2ncccc2c1
1,C[C@H]1CCCC[C@H]1NC(=O)NC(=O)CN1CCc2ccc([N+](=...
2,Brc1ccccc1-c1nc2c3ccccc3ncn2n1
3,CC[C@H](C)NC(=O)Nc1ccnn1C1CCCC1
4,CCn1c(SCc2cc(C(=O)OC)c(C)o2)nnc1-c1ccccc1OC


In [6]:
zinc15.to_csv('../../../data/negative_datasets/RAW_datasets/RAW_zinc15.csv', index=False)

## Cleaning

### Workflow:

- Remove the salts
  - Check if there are still salts present
  - Drop rows with persistent salt
- Neutralize the structures without salts
  - Verify and drop the structures that had errors (with a flag)
- Flag the duplicates
  - Remove the duplicates
- Save the *dataset*


In [7]:
print(zinc15.shape)
zinc15.head(5)

(9971, 1)


Unnamed: 0,smiles
0,Cc1c(C(=O)Nc2ccccc2C(N)=O)nnn1-c1ccc2ncccc2c1
1,C[C@H]1CCCC[C@H]1NC(=O)NC(=O)CN1CCc2ccc([N+](=...
2,Brc1ccccc1-c1nc2c3ccccc3ncn2n1
3,CC[C@H](C)NC(=O)Nc1ccnn1C1CCCC1
4,CCn1c(SCc2cc(C(=O)OC)c(C)o2)nnc1-c1ccccc1OC


In [8]:
workflow = SmilesCleaner(zinc15)

zinc15_stripped_salt = workflow.strip_salt(
    smiles_col='smiles',
    output_col='stripped_salt_smiles'
)
# Checking if there is still any salt
zinc15_stripped_salt.df[zinc15_stripped_salt.df['stripped_salt_smiles'].str.contains('\.')]

Unnamed: 0,smiles,stripped_salt_smiles


In [9]:
zinc15_neutralized = zinc15_stripped_salt.neutralize(
    smiles_col = 'stripped_salt_smiles',
    output_col = 'clean_smiles'
)
# Checking if there is any error
zinc15_neutralized.df[zinc15_neutralized.df['clean_smiles'] == 'cant be neutralized']

Unnamed: 0,smiles,stripped_salt_smiles,clean_smiles


In [10]:
zinc15_flag_duplicate = zinc15_neutralized.search_duplicate(
    smiles_col='clean_smiles',
    keep_inchi=True
)



























































































































































































































































































































































































































































































































In [11]:
# Showing how many duplicates there is still left
print(f'Before dropping duplicates: {zinc15_flag_duplicate.df["duplicated"].sum()}')
zinc15_cleaned = zinc15_flag_duplicate.df.drop_duplicates(subset=['inchi'])
print(f"After dropping duplicates: {zinc15_cleaned['duplicated'].sum()}")

Before dropping duplicates: 2
After dropping duplicates: 1


In [12]:
# Final dataset
print(zinc15_cleaned.shape)
print(zinc15_cleaned.columns)
zinc15_cleaned.head(10)

(9970, 5)
Index(['smiles', 'stripped_salt_smiles', 'clean_smiles', 'inchi',
       'duplicated'],
      dtype='object')


Unnamed: 0,smiles,stripped_salt_smiles,clean_smiles,inchi,duplicated
0,Cc1c(C(=O)Nc2ccccc2C(N)=O)nnn1-c1ccc2ncccc2c1,Cc1c(C(=O)Nc2ccccc2C(N)=O)nnn1-c1ccc2ncccc2c1,Cc1c(C(=O)Nc2ccccc2C(N)=O)nnn1-c1ccc2ncccc2c1,InChI=1S/C20H16N6O2/c1-12-18(20(28)23-17-7-3-2...,False
1,C[C@H]1CCCC[C@H]1NC(=O)NC(=O)CN1CCc2ccc([N+](=...,C[C@H]1CCCC[C@H]1NC(=O)NC(=O)CN1CCc2ccc([N+](=...,C[C@H]1CCCC[C@H]1NC(=O)NC(=O)CN1CCc2ccc([N+](=...,InChI=1S/C19H26N4O4/c1-13-4-2-3-5-17(13)20-19(...,False
2,Brc1ccccc1-c1nc2c3ccccc3ncn2n1,Brc1ccccc1-c1nc2c3ccccc3ncn2n1,Brc1ccccc1-c1nc2c3ccccc3ncn2n1,InChI=1S/C15H9BrN4/c16-12-7-3-1-5-10(12)14-18-...,False
3,CC[C@H](C)NC(=O)Nc1ccnn1C1CCCC1,CC[C@H](C)NC(=O)Nc1ccnn1C1CCCC1,CC[C@H](C)NC(=O)Nc1ccnn1C1CCCC1,InChI=1S/C13H22N4O/c1-3-10(2)15-13(18)16-12-8-...,False
4,CCn1c(SCc2cc(C(=O)OC)c(C)o2)nnc1-c1ccccc1OC,CCn1c(SCc2cc(C(=O)OC)c(C)o2)nnc1-c1ccccc1OC,CCn1c(SCc2cc(C(=O)OC)c(C)o2)nnc1-c1ccccc1OC,InChI=1S/C19H21N3O4S/c1-5-22-17(14-8-6-7-9-16(...,False
5,CC[C@H](C)c1[nH+]c2cc(C(=O)O)ccc2n1C[C@@H](C)O,CC[C@H](C)c1[nH+]c2cc(C(=O)O)ccc2n1C[C@@H](C)O,CC[C@H](C)c1nc2cc(C(=O)O)ccc2n1C[C@@H](C)O,InChI=1S/C15H20N2O3/c1-4-9(2)14-16-12-7-11(15(...,False
6,COc1nn(C)cc1C(=O)N1CCN(CCNC(=O)COc2ccc(F)cc2)CC1,COc1nn(C)cc1C(=O)N1CCN(CCNC(=O)COc2ccc(F)cc2)CC1,COc1nn(C)cc1C(=O)N1CCN(CCNC(=O)COc2ccc(F)cc2)CC1,InChI=1S/C20H26FN5O4/c1-24-13-17(19(23-24)29-2...,False
7,O=C(CNc1ccc(C(F)(F)F)cc1)Nc1ccc(Cl)cc1F,O=C(CNc1ccc(C(F)(F)F)cc1)Nc1ccc(Cl)cc1F,O=C(CNc1ccc(C(F)(F)F)cc1)Nc1ccc(Cl)cc1F,InChI=1S/C15H11ClF4N2O/c16-10-3-6-13(12(17)7-1...,False
8,C[C@H]1CCCC[N@@H+]1CCCNC(=O)CN(C)c1nn2c(NC(C)(...,C[C@H]1CCCC[N@@H+]1CCCNC(=O)CN(C)c1nn2c(NC(C)(...,C[C@H]1CCCC[N@]1CCCNC(=O)CN(C)c1nn2c(NC(C)(C)C...,InChI=1S/C26H38FN7OS/c1-18-10-6-7-14-33(18)15-...,False
9,COC(=O)c1c(C)sc(-c2c(-c3ccccc3)[nH]c3c2c(O)nc(...,COC(=O)c1c(C)sc(-c2c(-c3ccccc3)[nH]c3c2c(O)nc(...,COC(=O)c1c(C)sc(-c2c(-c3ccccc3)[nH]c3c2c(O)nc(...,InChI=1S/C20H17N3O5S/c1-9-11(19(26)28-3)15(24)...,False


In [13]:
# Saving
zinc15[['clean_smiles']].to_csv('../../../data/negative_datasets/cleaned_datasets/zinc15_cleaned.csv', index=False)