## Import libraries and read data 

In [5]:
# OS path
from pathlib import Path

# Data and plot
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

# RDKit
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, rdFingerprintGenerator, rdMolDescriptors, AllChem 
from rdkit.Chem import PandasTools, rdDepictor, rdFMCS
from rdkit.ML.Cluster import Butina
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import rdMolDraw2D

In [9]:
# Project paths, resolved from the most recent entry of IPython's
# directory history (`_dh`); the data folder sits two levels above it
HERE = Path(_dh[-1])
DATA_FOLDER = HERE.parent.parent.joinpath('data')

# Ring fragments table; preview the first three rows
fda_ring_fragments = pd.read_csv(
    DATA_FOLDER.joinpath('fragments/unique/drug_fragments_no_duplicated.csv')
)
fda_ring_fragments.head(3)

Unnamed: 0,parent_smiles,ring_fragment,chembl_id
0,Cc1cn[nH]c1,Cc1cn[nH]c1,CHEMBL1308
1,C1CNCCN1,C1CNCCN1,CHEMBL1412
2,Nc1ccncc1,Nc1ccncc1,CHEMBL284348


In [13]:
# FDA-approved drugs table; preview only the id and the `oral` flag,
# which are the two columns merged onto the fragments later on
fda_drugs = pd.read_csv(
    DATA_FOLDER.joinpath('fda_approved_datasets/fda_approved_drugs.csv')
)
fda_drugs.loc[:, ['chembl_id', 'oral']].head(3)

Unnamed: 0,chembl_id,oral
0,CHEMBL1200728,True
1,CHEMBL734,True
2,CHEMBL467,True


In [14]:
# Attach the `oral` flag of each drug to its ring fragments.
# Left join on `chembl_id`, so every fragment row is kept.
# NOTE(review): a fragment whose chembl_id is absent from fda_drugs would get
# NaN in `oral`, and a repeated chembl_id in fda_drugs would duplicate
# fragment rows - confirm neither happens for this dataset.
ring_fragments_dosage_form = fda_ring_fragments.merge(
    fda_drugs.loc[:, ['chembl_id', 'oral']],
    how='left',
    on='chembl_id',
)
print(ring_fragments_dosage_form.shape)
ring_fragments_dosage_form.head(3)

(3590, 4)


Unnamed: 0,parent_smiles,ring_fragment,chembl_id,oral
0,Cc1cn[nH]c1,Cc1cn[nH]c1,CHEMBL1308,False
1,C1CNCCN1,C1CNCCN1,CHEMBL1412,True
2,Nc1ccncc1,Nc1ccncc1,CHEMBL284348,True


### Separate oral/non-oral fragments

In [17]:
# Fragments whose `oral` flag is True.
# The explicit comparison (rather than using the column itself as the mask)
# means any missing value in `oral` is simply treated as "not True".
ring_fragments_oral = ring_fragments_dosage_form.loc[
    ring_fragments_dosage_form['oral'].eq(True)
]
ring_fragments_oral

Unnamed: 0,parent_smiles,ring_fragment,chembl_id,oral
1,C1CNCCN1,C1CNCCN1,CHEMBL1412,True
2,Nc1ccncc1,Nc1ccncc1,CHEMBL284348,True
3,N[C@@H]1CONC1=O,N[C@@H]1CONC1=O,CHEMBL771,True
4,Nc1ccncc1N,Nc1ccncc1N,CHEMBL354077,True
8,Cn1ccnc1S,Cn1ccnc1S,CHEMBL1515,True
...,...,...,...,...
3562,CN1Cc2c(Cl)cc(Cl)cc2[C@H](c2cccc(S(=O)(=O)NCCO...,C[C@@H]1CN(C)Cc2c(Cl)cc(Cl)cc21,CHEMBL3304485,True
3563,CN1Cc2c(Cl)cc(Cl)cc2[C@H](c2cccc(S(=O)(=O)NCCO...,Cc1cccc(S)c1,CHEMBL3304485,True
3582,CN[C@H](CC(C)C)C(=O)N[C@H]1C(=O)N[C@@H](CC(N)=...,C[C@H]1O[C@@H](O)[C@H](O)[C@@H](O)[C@@H]1O,CHEMBL262777,True
3583,CN[C@H](CC(C)C)C(=O)N[C@H]1C(=O)N[C@@H](CC(N)=...,C[C@@H]1O[C@@H](O)C[C@](C)(N)[C@@H]1O,CHEMBL262777,True


In [18]:
# Fragments whose `oral` flag is False.
# The explicit comparison means any missing value in `oral` is excluded
# here as well (it is neither True nor False).
ring_fragments_non_oral = ring_fragments_dosage_form.loc[
    ring_fragments_dosage_form['oral'].eq(False)
]
ring_fragments_non_oral

Unnamed: 0,parent_smiles,ring_fragment,chembl_id,oral
0,Cc1cn[nH]c1,Cc1cn[nH]c1,CHEMBL1308,False
5,Oc1cccc(O)c1,Oc1cccc(O)c1,CHEMBL24147,False
6,NCCc1cc[nH]n1,Cc1cc[nH]n1,CHEMBL1201323,False
7,NCCc1c[nH]cn1,Cc1c[nH]cn1,CHEMBL90,False
10,COc1ccc(O)cc1,Oc1ccc(O)cc1,CHEMBL544,False
...,...,...,...,...
3585,CCC(C)CCCCC(=O)NC(CCNCS(=O)(=O)O)C(=O)NC(C(=O)...,CC1NC(=O)C(N)CCNC(=O)C(C)NC(=O)C(C)NC(=O)C(C)N...,CHEMBL1201441,False
3586,CCCCCCCCCCNCCN[C@@]1(C)C[C@H](O[C@H]2[C@H](Oc3...,C[C@@H]1O[C@@H](O)C[C@](C)(N)[C@@H]1O,CHEMBL507870,False
3587,CCCCCCCCCCNCCN[C@@]1(C)C[C@H](O[C@H]2[C@H](Oc3...,C[C@H]1O[C@@H](O)[C@H](O)[C@@H](O)[C@@H]1O,CHEMBL507870,False
3588,CCCCCCCCCCNCCN[C@@]1(C)C[C@H](O[C@H]2[C@H](Oc3...,Cc1c(O)cc2c(c1O)-c1cc(ccc1O)[C@H]1NC(=O)[C@@H]...,CHEMBL507870,False
