## Importing and Reading Data

In [3]:
import sys
sys.path.append('../utils/')
import pandas as pd
from rdkit.Chem.PandasTools import AddMoleculeColumnToFrame
from rdkit.Chem.Draw import IPythonConsole
from IPython.display import HTML
from ring_fragmenter import get_ring_fragments

# Notebook-wide size for RDKit molecule depictions.
# NOTE(review): presumably (width, height) in pixels — confirm against the RDKit docs.
IPythonConsole.molSize = (600,300)

def show_df(df):
    """Wrap a DataFrame's notebook-style HTML table in a displayable object.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to render.

    Returns
    -------
    IPython.display.HTML
        The markup produced by ``df.to_html(notebook=True)``, wrapped so the
        notebook renders it as a table when it is the cell's last expression.
    """
    table_markup = df.to_html(notebook=True)
    return HTML(table_markup)

In [4]:
# Load the cleaned ZINC15 table; the path is relative to the notebook's working directory.
zinc_non_drugs_df = pd.read_csv('../../data/negative_datasets/cleaned_datasets/zinc15_cleaned.csv')
# Sanity check: print the (rows, columns) shape, then show the column index as the cell output.
# The recorded run reports 9971 rows and a single `clean_smiles` column.
print(zinc_non_drugs_df.shape)
zinc_non_drugs_df.columns

(9971, 1)


Index(['clean_smiles'], dtype='object')

In [5]:
# Extract the cleaned SMILES column as a plain Python list for the fragmenter.
list_of_smiles = zinc_non_drugs_df['clean_smiles'].tolist()

## Fragmenting ZINC non-drugs

In [6]:
# Fragment every SMILES into its ring fragments using the project helper.
# The recorded output is a table with `parent_smiles` and `ring_fragment` columns,
# one row per (parent, fragment) occurrence.
# NOTE(review): `no_rings_list=False` presumably means the helper does not also return
# the molecules that have no rings — confirm in ring_fragmenter.
ring_fragments_zinc = get_ring_fragments(list_of_smiles=list_of_smiles, no_rings_list=False)
ring_fragments_zinc

Unnamed: 0,parent_smiles,ring_fragment
0,Cc1c(C(=O)Nc2ccccc2C(N)=O)nnn1-c1ccc2ncccc2c1,Cc1nnn(C)c1C
1,Cc1c(C(=O)Nc2ccccc2C(N)=O)nnn1-c1ccc2ncccc2c1,Cc1ccccc1N
2,Cc1c(C(=O)Nc2ccccc2C(N)=O)nnn1-c1ccc2ncccc2c1,Nc1ccc2ncccc2c1
3,C[C@H]1CCCC[C@H]1NC(=O)NC(=O)CN1CCc2ccc([N+](=...,C[C@H]1CCCC[C@H]1N
4,C[C@H]1CCCC[C@H]1NC(=O)NC(=O)CN1CCc2ccc([N+](=...,CN1CCc2ccc([N+])cc2C1
...,...,...
25493,Cc1ccc(C[N@]2CC[C@H]3O[C@H](C(=O)NCc4nccn4C)C[...,Cc1ccc(C)o1
25494,Cc1ccc(C[N@]2CC[C@H]3O[C@H](C(=O)NCc4nccn4C)C[...,C[C@@H]1C[C@@H]2[C@@H](CCN2C)O1
25495,Cc1ccc(C[N@]2CC[C@H]3O[C@H](C(=O)NCc4nccn4C)C[...,Cc1nccn1C
25496,CCc1c(C)nc(C)nc1N1CCN(CC(=O)O)CC1,Cc1nc(C)c(C)c(N)n1


Keep in mind that if a molecule contains the same ring twice, that ring fragment is listed twice for the same parent structure, so this table can contain repeated (parent, ring fragment) rows.

## Saving

Saving the non-unique relation (parent × ring fragment), i.e. keeping repeated fragments that come from the same parent.

In [7]:
# Persist the full parent/fragment table, repeated rows included.
# index=False keeps the DataFrame's row index out of the CSV.
ring_fragments_zinc.to_csv('../../data/fragments/non_unique/zinc_fragments_non_unique.csv', index=False)

Saving the de-duplicated relation (each parent paired with its unique ring fragments)

In [8]:
# Remove rows that exactly repeat an earlier row (the first occurrence is kept),
# then renumber the index from 0 so it is contiguous again.
ring_fragments_zinc_unique = (
    ring_fragments_zinc
    .drop_duplicates()
    .reset_index(drop=True)
)
ring_fragments_zinc_unique

Unnamed: 0,parent_smiles,ring_fragment
0,Cc1c(C(=O)Nc2ccccc2C(N)=O)nnn1-c1ccc2ncccc2c1,Cc1nnn(C)c1C
1,Cc1c(C(=O)Nc2ccccc2C(N)=O)nnn1-c1ccc2ncccc2c1,Cc1ccccc1N
2,Cc1c(C(=O)Nc2ccccc2C(N)=O)nnn1-c1ccc2ncccc2c1,Nc1ccc2ncccc2c1
3,C[C@H]1CCCC[C@H]1NC(=O)NC(=O)CN1CCc2ccc([N+](=...,C[C@H]1CCCC[C@H]1N
4,C[C@H]1CCCC[C@H]1NC(=O)NC(=O)CN1CCc2ccc([N+](=...,CN1CCc2ccc([N+])cc2C1
...,...,...
25209,Cc1ccc(C[N@]2CC[C@H]3O[C@H](C(=O)NCc4nccn4C)C[...,Cc1ccc(C)o1
25210,Cc1ccc(C[N@]2CC[C@H]3O[C@H](C(=O)NCc4nccn4C)C[...,C[C@@H]1C[C@@H]2[C@@H](CCN2C)O1
25211,Cc1ccc(C[N@]2CC[C@H]3O[C@H](C(=O)NCc4nccn4C)C[...,Cc1nccn1C
25212,CCc1c(C)nc(C)nc1N1CCN(CC(=O)O)CC1,Cc1nc(C)c(C)c(N)n1


In [9]:
# Persist the de-duplicated parent/fragment table.
# index=False keeps the DataFrame's row index out of the CSV.
ring_fragments_zinc_unique.to_csv('../../data/fragments/unique/zinc_fragments_no_duplicated.csv', index=False)