## Importing libraries and reading data

In [1]:
import sys
import pandas as pd
# Extend the module search path so the project-local `smiles_cleaner` module
# can be imported on the next line (presumably it lives in ../../utils/; the
# path is relative to the kernel's working directory — confirm when running
# the notebook from another location).
sys.path.append('../../utils/')
from smiles_cleaner import SmilesCleaner

In [2]:
# Load the raw Tox21 table, report its dimensions and preview the first rows.
raw_tox21_csv = '../../../data/negative_datasets/RAW_datasets/RAW_tox21.csv'
tox21_df = pd.read_csv(raw_tox21_csv)
print(tox21_df.shape)
tox21_df.head()

(7831, 14)


Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


## Cleaning process

### General Workflow:

1. **Remove Salts**
- Strip salts from each structure, then remove the compounds whose SMILES still contain a salt (more than one fragment).

2. **Neutralize Structures**
- Verify and correct structures without salts.
- Flag and remove structures with errors.

3. **Flag and Remove Duplicates**
- Identify and eliminate duplicate compounds.

4. **Save the Dataset**
- Store the cleaned dataset for future use.


In [3]:
# Instantiate the SmilesCleaner workflow on the raw Tox21 frame
workflow = SmilesCleaner(tox21_df)

# 1) Strip salts: takes the 'smiles' column as input and names the output
#    column 'stripped_salt_smiles'
tox21_df_stripped_salt = workflow.strip_salt(
    smiles_col='smiles',
    output_col='stripped_salt_smiles'
)

# Check whether any salt is left. A '.' in a SMILES string separates
# disconnected fragments, so it is matched literally (regex=False) instead of
# through the non-raw pattern '\.', which is an invalid escape sequence in a
# normal Python string literal.
still_has_salt = tox21_df_stripped_salt.df['stripped_salt_smiles'].str.contains('.', regex=False)
remaining_salts_df = tox21_df_stripped_salt.df[still_has_salt]
print(f'There is still {remaining_salts_df.shape[0]} salts left. Before dropping the salts we have: {tox21_df_stripped_salt.df.shape[0]}')



There is still 231 salts left. Before dropping the salts we have: 7831


In [4]:
# Remove the compounds whose stripped SMILES still contain a '.' (a second,
# disconnected fragment).
# The index is reset on the filtered frame rather than on the boolean mask:
# this gives a contiguous 0..n-1 index and a new DataFrame (not a slice of the
# previous one) for SmilesCleaner to work on.
stripped_df = tox21_df_stripped_salt.df
has_salt_left = stripped_df['stripped_salt_smiles'].str.contains('.', regex=False)
salt_free_df = stripped_df[~has_salt_left].reset_index(drop=True)
print(f'After dropping the salts {salt_free_df.shape}')

# Wrap the filtered frame again so the next workflow step can be called on it
tox21_df_stripped_salt = SmilesCleaner(salt_free_df)

After dropping the salts (7600, 15)


In [5]:
# 2) Neutralize the salt-stripped structures: 'stripped_salt_smiles' is the
#    input column and 'neutralized_smiles' the name of the output column.
tox21_df_neutralized = tox21_df_stripped_salt.neutralize(smiles_col='stripped_salt_smiles', output_col='neutralized_smiles')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame[molCol] = frame[smilesCol].map(Chem.MolFromSmiles)
[19:11:08] Explicit valence for atom # 1 B, 5, is greater than permitted
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[output_col] = self.df['stripped_salt_mol'].apply(neutralize_standardize_atoms)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.drop(columns=['stripped_salt_mol'], a

In [7]:
# Count, then discard, the rows whose 'neutralized_smiles' holds the
# 'cant be neutralized' placeholder (presumably written by neutralize for
# structures it could not process — confirm in smiles_cleaner).
neutralized_df = tox21_df_neutralized.df
not_neutralizable = neutralized_df['neutralized_smiles'] == 'cant be neutralized'
print(f"There is {neutralized_df[not_neutralizable].shape[0]} structures that cant be neutralized")

# Re-wrap the remaining rows, with a fresh index, for the duplicate search
tox21_df_neutralized = SmilesCleaner(neutralized_df[~not_neutralizable].reset_index(drop = True))

There is 0 structures that cant be neutralized


In [8]:
# This step generates a lot of warning messages, so the RDKit loggers matching
# 'rdApp.*' are disabled through RDLogger before running it.
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# 3) Search for duplicates on the neutralized SMILES. keep_inchi=True
#    presumably retains the 'inchi' column that the following cell
#    de-duplicates on — confirm in smiles_cleaner.
tox21_flag_duplicate = tox21_df_neutralized.search_duplicate(smiles_col='neutralized_smiles', keep_inchi=True)

In [9]:
# Keep a single row per InChI (drop_duplicates keeps the first occurrence by
# default).
# NOTE(review): the 'duplicated' flag column is not recomputed here, so the
# "after" figure counts flagged rows that were retained — presumably the
# surviving representative of each duplicate group; confirm.
# NOTE(review): drop_duplicates treats missing values as equal, so rows with a
# missing InChI (if any) would collapse into a single row — confirm none exist.
flagged_df = tox21_flag_duplicate.df
tox21_cleaned = flagged_df.drop_duplicates(subset=['inchi'])
print(f"Before dropping duplicates: {flagged_df['duplicated'].sum()} .After dropping duplicates: {tox21_cleaned['duplicated'].sum()}")

Before dropping duplicates: 180 .After dropping duplicates: 87


In [10]:
# Overview of the cleaned table: dimensions, column names, first ten rows.
print(tox21_cleaned.shape, tox21_cleaned.columns, sep='\n')
tox21_cleaned.head(10)

(7506, 18)
Index(['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
       'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53',
       'mol_id', 'smiles', 'stripped_salt_smiles', 'neutralized_smiles',
       'inchi', 'duplicated'],
      dtype='object')


Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles,stripped_salt_smiles,neutralized_smiles,inchi,duplicated
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,InChI=1S/C9H10N2O3S2/c1-2-14-6-3-4-7-8(5-6)15-...,False
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,CCN1C(=O)NC(c2ccccc2)C1=O,CCN1C(=O)NC(c2ccccc2)C1=O,InChI=1S/C11H12N2O2/c1-2-13-10(14)9(12-11(13)1...,False
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,InChI=1S/C20H32O/c1-3-20(21)13-11-18-17-9-8-14...,False
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,InChI=1S/C17H28N2O/c1-6-12-19(8-3)15(7-2)17(20...,False
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,CC(O)(P(=O)(O)O)P(=O)(O)O,CC(O)(P(=O)(O)O)P(=O)(O)O,"InChI=1S/C2H8O7P2/c1-2(3,10(4,5)6)11(7,8)9/h3H...",True
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,TOX5110,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C,"InChI=1S/C16H34O4/c1-13(2,3)17-19-15(7,8)11-12...",False
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX6619,O=S(=O)(Cl)c1ccccc1,O=S(=O)(Cl)c1ccccc1,O=S(=O)(Cl)c1ccccc1,"InChI=1S/C6H5ClO2S/c7-10(8,9)6-4-2-1-3-5-6/h1-5H",False
7,0.0,,0.0,,1.0,,,1.0,0.0,1.0,0.0,1.0,TOX25232,O=C(O)Cc1cc(I)c(Oc2ccc(O)c(I)c2)c(I)c1,O=C(O)Cc1cc(I)c(Oc2ccc(O)c(I)c2)c(I)c1,O=C(O)Cc1cc(I)c(Oc2ccc(O)c(I)c2)c(I)c1,InChI=1S/C14H9I3O4/c15-9-6-8(1-2-12(9)18)21-14...,False
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,TOX22514,OC[C@H](O)[C@@H](O)[C@H](O)CO,OC[C@@H](O)[C@H](O)[C@@H](O)CO,OC[C@@H](O)[C@H](O)[C@@H](O)CO,"InChI=1S/C5H12O5/c6-1-3(8)5(10)4(9)2-7/h3-10H,...",False
9,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,,,0.0,TOX25236,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(...,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(...,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP(=O)...,InChI=1S/C21H27N7O14P2/c22-17-12-19(25-7-24-17...,False


In [11]:
# Sanity check: list any cleaned rows whose neutralized SMILES still contain a
# '.' (more than one fragment); an empty frame is the expected result.
# The dot is matched literally (regex=False) rather than through '\.', which
# is an invalid escape sequence in a non-raw Python string literal.
tox21_cleaned[tox21_cleaned['neutralized_smiles'].str.contains('.', regex=False)]

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles,stripped_salt_smiles,neutralized_smiles,inchi,duplicated


In [12]:
# 4) Save the cleaned dataset; the DataFrame index is not written to the file.
cleaned_tox21_csv = '../../../data/negative_datasets/cleaned_datasets/tox21_cleaned.csv'
tox21_cleaned.to_csv(cleaned_tox21_csv, index=False)