**Table of contents**<a id='toc0_'></a>    
- [Importing and Reading Data](#toc1_1_)    
  - [Fragmenting Toxic Compounds](#toc1_2_)    
  - [Saving](#toc1_3_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

## <a id='toc1_1_'></a>[Importing and Reading Data](#toc0_)

In [1]:
import sys
# Extend the module search path BEFORE the local import below; the relative path is
# resolved against the kernel's working directory.
# NOTE(review): presumably ring_fragmenter.py lives in ../utils/ — confirm.
sys.path.append('../utils/')
import pandas as pd
from rdkit.Chem.PandasTools import AddMoleculeColumnToFrame
from rdkit.Chem.Draw import IPythonConsole
from IPython.display import HTML
from ring_fragmenter import get_ring_systems, get_ring_adjacent, get_ring_fragments

# Notebook-wide RDKit drawing option: (width, height) used when molecules are rendered
# inline — presumably in pixels; see the RDKit IPythonConsole documentation.
IPythonConsole.molSize = (600,300)

def show_df(df):
    """Return *df* rendered as an HTML table wrapped in an IPython ``HTML`` object.

    The frame is serialised with ``DataFrame.to_html(notebook=True)`` and handed to
    ``IPython.display.HTML`` so the notebook displays the markup as a table.
    Per the original author's note, this is used to show frames that hold
    ``rdkit.Chem.Mol`` objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to render.

    Returns
    -------
    IPython.display.HTML
        Display object wrapping the generated HTML markup.
    """
    table_markup = df.to_html(notebook=True)
    return HTML(table_markup)

In [2]:
# Load the cleaned Tox21 table and pull out the neutralized SMILES column as a plain
# Python list; that list is the input handed to the ring fragmenter further down.
tox21_df = pd.read_csv(
    '../../data/negative_datasets/cleaned_datasets/tox21_cleaned.csv'
)
list_of_smiles = tox21_df['neutralized_smiles'].tolist()

print(f'The toxic compounds dataframe has the shape: {tox21_df.shape}')
tox21_df.head()

The toxic compounds dataframe has the shape: (7506, 18)


Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles,stripped_salt_smiles,neutralized_smiles,inchi,duplicated
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,InChI=1S/C9H10N2O3S2/c1-2-14-6-3-4-7-8(5-6)15-...,False
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,CCN1C(=O)NC(c2ccccc2)C1=O,CCN1C(=O)NC(c2ccccc2)C1=O,InChI=1S/C11H12N2O2/c1-2-13-10(14)9(12-11(13)1...,False
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,InChI=1S/C20H32O/c1-3-20(21)13-11-18-17-9-8-14...,False
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,InChI=1S/C17H28N2O/c1-6-12-19(8-3)15(7-2)17(20...,False
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,CC(O)(P(=O)(O)O)P(=O)(O)O,CC(O)(P(=O)(O)O)P(=O)(O)O,"InChI=1S/C2H8O7P2/c1-2(3,10(4,5)6)11(7,8)9/h3H...",True


## <a id='toc1_2_'></a>[Fragmenting Toxic Compounds](#toc0_)

In [3]:
# Fragment every parent SMILES into its ring fragments. The result is displayed below as a
# table with one row per (parent_smiles, ring_fragment) pair.
# NOTE(review): no_rings_list=False presumably skips returning the molecules that have no
# rings — confirm against ring_fragmenter.get_ring_fragments.
ring_fragments_tox21 = get_ring_fragments(
    list_of_smiles=list_of_smiles,
    no_rings_list=False,
)
ring_fragments_tox21

Unnamed: 0,parent_smiles,ring_fragment
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,Oc1ccc2nc(S)sc2c1
1,CCN1C(=O)NC(c2ccccc2)C1=O,CC1NC(=O)N(C)C1=O
2,CCN1C(=O)NC(c2ccccc2)C1=O,Cc1ccccc1
3,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]3...
4,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,Cc1cccc(C)c1N
...,...,...
9551,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,Cc1nnn[nH]1
9552,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,C[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(C)[C@...
9553,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...
9554,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...


In [4]:
# Attach each parent's mol_id to its ring fragments so the assay information in tox21_df
# can be retrieved later.
#
# The lookup is built with rename() on a two-column slice instead of adding a
# 'parent_smiles' column to tox21_df, so the frame displayed earlier is not mutated.
# drop_duplicates() removes repeated (mol_id, parent_smiles) pairs, which would otherwise
# multiply fragment rows during the merge without adding any information.
mol_id_lookup = (
    tox21_df[['mol_id', 'neutralized_smiles']]
    .rename(columns={'neutralized_smiles': 'parent_smiles'})
    .drop_duplicates()
)

# Guard against re-running this cell: merging a second time would produce
# mol_id_x / mol_id_y columns and repeat any row fan-out.
if 'mol_id' not in ring_fragments_tox21.columns:
    n_rows_before = len(ring_fragments_tox21)
    ring_fragments_tox21 = ring_fragments_tox21.merge(
        mol_id_lookup,
        on='parent_smiles',
        how='left',
    )
    # A parent SMILES that maps to several mol_ids still yields one row per mol_id.
    # Report it instead of letting the table grow silently.
    n_rows_added = len(ring_fragments_tox21) - n_rows_before
    if n_rows_added:
        print(f'Merge added {n_rows_added} row(s): some parent_smiles map to more than one mol_id')

ring_fragments_tox21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9556 entries, 0 to 9555
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   parent_smiles  9556 non-null   object
 1   ring_fragment  9556 non-null   object
 2   mol_id         9556 non-null   object
dtypes: object(3)
memory usage: 224.1+ KB


In [5]:
# Final dataframe: each (parent_smiles, ring_fragment) row carries the parent's mol_id,
# which is the key for retrieving further information about the parent compound later.
ring_fragments_tox21

Unnamed: 0,parent_smiles,ring_fragment,mol_id
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,Oc1ccc2nc(S)sc2c1,TOX3021
1,CCN1C(=O)NC(c2ccccc2)C1=O,CC1NC(=O)N(C)C1=O,TOX3020
2,CCN1C(=O)NC(c2ccccc2)C1=O,Cc1ccccc1,TOX3020
3,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]3...,TOX3024
4,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,Cc1cccc(C)c1N,TOX3027
...,...,...,...
9551,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,Cc1nnn[nH]1,TOX2725
9552,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,C[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(C)[C@...,TOX2370
9553,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,TOX2371
9554,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,TOX2377


Keep in mind that if a compound contains the same ring twice, it yields two identical ring fragments from the same parent structure, which is why I'm calling this dataset 'non-unique'.

## <a id='toc1_3_'></a>[Saving](#toc0_)

Saving the non-unique relation (parent x ring fragment):

In [None]:
# Persist the full parent/fragment table (repeated fragments included), without the index.
ring_fragments_tox21.to_csv(
    path_or_buf='../../data/fragments/non_unique/tox21_fragments_non_unique.csv',
    index=False,
)

Saving the de-duplicated relation, in which repeated rows are dropped so that each parent keeps only its unique ring fragments:

In [11]:
# Keep the first occurrence of every fully repeated row, then renumber the index from 0.
ring_fragments_tox21_unique = (
    ring_fragments_tox21
    .drop_duplicates()
    .reset_index(drop=True)
)
ring_fragments_tox21_unique

Unnamed: 0,parent_smiles,ring_fragment,mol_id
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,Oc1ccc2nc(S)sc2c1,TOX3021
1,CCN1C(=O)NC(c2ccccc2)C1=O,CC1NC(=O)N(C)C1=O,TOX3020
2,CCN1C(=O)NC(c2ccccc2)C1=O,Cc1ccccc1,TOX3020
3,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]3...,TOX3024
4,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,Cc1cccc(C)c1N,TOX3027
...,...,...,...
8891,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,Cc1nnn[nH]1,TOX2725
8892,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,C[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(C)[C@...,TOX2370
8893,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,TOX2371
8894,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,TOX2377


In [12]:
# Persist the de-duplicated parent/fragment table, without the index.
ring_fragments_tox21_unique.to_csv(
    path_or_buf='../../data/fragments/unique/tox21_fragments_no_duplicated.csv',
    index=False,
)