## Update! We only need to gather the small molecules

We retrieved a CSV from DrugBank containing only the DrugBank IDs of small molecules! So we do not need to worry about manually curating the biological ones.

First we import the libraries

In [1]:
import pandas as pd

We load the small_molecules dataset from github

In [2]:
# Raw DrugBank export listing small-molecule drugs, hosted on GitHub.
SMALL_MOLECULES_URL = "https://raw.githubusercontent.com/arturcgs/shared-side-projects/main/_Lipinski/data/RAW_datasets/RAW_drug_bank_small_molecules.csv"

small_molecules = pd.read_csv(SMALL_MOLECULES_URL)

In [3]:
# Preview the first rows (head() defaults to 5).
small_molecules.head()

Unnamed: 0,DrugBank ID,Name,Drug Type
0,DB00006,Bivalirudin,SmallMoleculeDrug
1,DB00007,Leuprolide,SmallMoleculeDrug
2,DB00014,Goserelin,SmallMoleculeDrug
3,DB00027,Gramicidin D,SmallMoleculeDrug
4,DB00035,Desmopressin,SmallMoleculeDrug


Then we load our manually curated dataset

In [24]:
# Manually curated FDA approvals workbook, hosted on GitHub, and the sheet to read from it.
FDA_CURATED_URL = "https://github.com/arturcgs/shared-side-projects/blob/main/_Lipinski/data/manually_curated_datasets/fda_approved_1997_2021_with_all_smiles.xlsx?raw=true"
FDA_CURATED_SHEET = "fda_approved_97_21"

final_fda = pd.read_excel(FDA_CURATED_URL, sheet_name = FDA_CURATED_SHEET)

In [25]:
# Drop the leftover "index" column from the spreadsheet.
# errors="ignore" keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
final_fda = final_fda.drop(columns = "index", errors = "ignore")
print(final_fda.shape)
final_fda.head(3)

(593, 7)


Unnamed: 0,active_ingredient_moiety,nda_bla,approval_year,active,DrugBank ID,Drug Groups,SMILES
0,troglitazone,NDA,1997,troglitazone,DB00197,approved; investigational; withdrawn,CC1=C(C)C2=C(CCC(C)(COC3=CC=C(CC4SC(=O)NC4=O)C...
1,imiquimod,NDA,1997,imiquimod,DB00724,approved; investigational,CC(C)CN1C=NC2=C1C1=C(C=CC=C1)N=C2N
2,anagrelide hydrochloride,NDA,1997,anagrelide,DB00261,approved,ClC1=CC=C2N=C3NC(=O)CN3CC2=C1Cl


First we check which entries in our dataset do not have a DrugBank ID

In [16]:
# Show the rows whose "DrugBank ID" is missing.
missing_drugbank_id = final_fda["DrugBank ID"].isna()
final_fda.loc[missing_drugbank_id]

Unnamed: 0,active_ingredient_moiety,nda_bla,approval_year,active,DrugBank ID,Drug Groups,SMILES
578,fish oil triglycerides,NDA,2018,fish oil triglycerides,,approved,Not found
582,air polymer-type A,NDA,2019,air polymer-type A,,approved,Not found


Merging the two datasets based on DrugBank IDs so we're left only with the small_molecules

In [17]:
# Left join: every curated FDA row is kept; "Name" and "Drug Type" are filled in
# only where the DrugBank ID appears in the small-molecule list.
final_fda_only_small = final_fda.merge(
    small_molecules,
    how = "left",
    on = "DrugBank ID",
)

In [26]:
# Preview the first ten merged rows.
final_fda_only_small.head(n = 10)

Unnamed: 0,active_ingredient_moiety,nda_bla,approval_year,active,DrugBank ID,Drug Groups,SMILES,Name,Drug Type
0,troglitazone,NDA,1997,troglitazone,DB00197,approved; investigational; withdrawn,CC1=C(C)C2=C(CCC(C)(COC3=CC=C(CC4SC(=O)NC4=O)C...,Troglitazone,SmallMoleculeDrug
1,imiquimod,NDA,1997,imiquimod,DB00724,approved; investigational,CC(C)CN1C=NC2=C1C1=C(C=CC=C1)N=C2N,Imiquimod,SmallMoleculeDrug
2,anagrelide hydrochloride,NDA,1997,anagrelide,DB00261,approved,ClC1=CC=C2N=C3NC(=O)CN3CC2=C1Cl,Anagrelide,SmallMoleculeDrug
3,nelfinavir mesylate,NDA,1997,nelfinavir,DB00220,approved,[H][C@@]12CCCC[C@]1([H])CN(C[C@@H](O)[C@H](CSC...,Nelfinavir,SmallMoleculeDrug
4,delavirdine mesylate,NDA,1997,delavirdine,DB00705,approved,CC(C)NC1=C(N=CC=C1)N1CCN(CC1)C(=O)C1=CC2=C(N1)...,Delavirdine,SmallMoleculeDrug
5,tamsulosin hydrochloride,NDA,1997,tamsulosin,DB00706,approved; investigational,CCOC1=CC=CC=C1OCCN[C@H](C)CC1=CC(=C(OC)C=C1)S(...,Tamsulosin,SmallMoleculeDrug
6,toremifene citrate,NDA,1997,toremifene,DB00539,approved; investigational,CN(C)CCOC1=CC=C(C=C1)C(=C(\CCCl)C1=CC=CC=C1)\C...,Toremifene,SmallMoleculeDrug
7,tazarotene,NDA,1997,tazarotene,DB00799,approved; investigational,CCOC(=O)C1=CN=C(C=C1)C#CC1=CC2=C(SCCC2(C)C)C=C1,Tazarotene,SmallMoleculeDrug
8,cerivastatin sodium,NDA,1997,cerivastatin,DB00439,approved; withdrawn,COCC1=C(C2=CC=C(F)C=C2)C(\C=C\[C@@H](O)C[C@@H]...,Cerivastatin,SmallMoleculeDrug
9,pramipexole dihydrochloride,NDA,1997,pramipexole,DB00413,approved; investigational,CCCN[C@H]1CCC2=C(C1)SC(N)=N2,Pramipexole,SmallMoleculeDrug


Checking the rows where "Drug Type" was not found, i.e. rows with no match in the small molecules dataset (about 30 structures):

In [27]:
# Show the merged rows that received no "Drug Type" from the small-molecule list.
missing_drug_type = final_fda_only_small["Drug Type"].isna()
final_fda_only_small.loc[missing_drug_type]

Unnamed: 0,active_ingredient_moiety,nda_bla,approval_year,active,DrugBank ID,Drug Groups,SMILES,Name,Drug Type
530,lepirudin,NDA,1998,lepirudin,DB00001,approved; withdrawn; biological,CC[C@H](C)[C@H](NC(=O)[C@H](CS)NC(=O)[C@H](CCC...,,
532,eptifibatide,NDA,1998,eptifibatide,DB00063,approved; investigational,NC(N)=NCCCCC1NC(=O)CCSSCC(NC(=O)C2CCCN2C(=O)C(...,,
533,fomivirsen sodium,NDA,1998,fomivirsen,DB06759,approved; investigational; withdrawn,CC1=CN([C@H]2C[C@H](O[P](O)(=S)OC[C@H]3O[C@H](...,,
536,gemtuzumab ozogamicin,NDA,2000,gemtuzumab ozogamicin,DB00056,biological,Not found,,
540,unoprostone isopropyl,NDA,2000,unoprostone isopropyl,DBSALT001760,approved,CCCCCCCC(=O)CC[C@H]1[C@@H](C[C@@H]([C@@H]1C/C=...,,
542,nesiritide,NDA,2001,nesiritide,DB04899,biological,Not found,,
543,cefditoren pivoxil,NDA,2001,cefditoren pivoxil,DBSALT001811,approved,[H][C@]12SCC(\C=C/C3=C(C)N=CS3)=C(N1C(=O)[C@H]...,,
547,enfuvirtide,NDA,2003,enfuvirtide,DB00109,approved; investigational,CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H]...,,
548,human secretin,NDA,2004,human secretin,DB09532,biological,Not found,,
549,pentetate calcium trisodium,NDA,2004,pentetate,DB06806,approved,[O-]C(=O)CN(CCN(CC([O-])=O)CC([O-])=O)CCN(CC([...,,


Dropping the non small molecules of our final dataset:

In [29]:
# Keep only the rows that have a "Drug Type", then renumber the index from 0.
has_drug_type = final_fda_only_small["Drug Type"].notna()
fda_small_molecules_smiles = (
    final_fda_only_small
    .loc[has_drug_type]
    .reset_index(drop = True)
)

In [32]:
# info() prints its summary directly; head(3) is placed last so the notebook
# displays it. Previously head(3) came first and its result was discarded.
fda_small_molecules_smiles.info()
fda_small_molecules_smiles.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   active_ingredient_moiety  560 non-null    object
 1   nda_bla                   560 non-null    object
 2   approval_year             560 non-null    int64 
 3   active                    560 non-null    object
 4   DrugBank ID               560 non-null    object
 5   Drug Groups               560 non-null    object
 6   SMILES                    560 non-null    object
 7   Name                      560 non-null    object
 8   Drug Type                 560 non-null    object
dtypes: int64(1), object(8)
memory usage: 39.5+ KB


Finally, we filter out the entries whose "Drug Groups" are tagged as biological or polymer:

In [33]:
# Flag rows whose "Drug Groups" text contains "biological" or "polymer".
# na=False treats a missing "Drug Groups" value as "not tagged", so the mask stays boolean.
filters = fda_small_molecules_smiles["Drug Groups"].str.contains(pat = "biological|polymer", na = False)

# Store the filtered frame under the same name: the export cell writes
# `fda_small_molecules_smiles`, so without this assignment the flagged rows
# were only hidden in the displayed output and still ended up in the CSV.
fda_small_molecules_smiles = fda_small_molecules_smiles[~filters].reset_index(drop = True)
fda_small_molecules_smiles

Unnamed: 0,active_ingredient_moiety,nda_bla,approval_year,active,DrugBank ID,Drug Groups,SMILES,Name,Drug Type
0,troglitazone,NDA,1997,troglitazone,DB00197,approved; investigational; withdrawn,CC1=C(C)C2=C(CCC(C)(COC3=CC=C(CC4SC(=O)NC4=O)C...,Troglitazone,SmallMoleculeDrug
1,imiquimod,NDA,1997,imiquimod,DB00724,approved; investigational,CC(C)CN1C=NC2=C1C1=C(C=CC=C1)N=C2N,Imiquimod,SmallMoleculeDrug
2,anagrelide hydrochloride,NDA,1997,anagrelide,DB00261,approved,ClC1=CC=C2N=C3NC(=O)CN3CC2=C1Cl,Anagrelide,SmallMoleculeDrug
3,nelfinavir mesylate,NDA,1997,nelfinavir,DB00220,approved,[H][C@@]12CCCC[C@]1([H])CN(C[C@@H](O)[C@H](CSC...,Nelfinavir,SmallMoleculeDrug
4,delavirdine mesylate,NDA,1997,delavirdine,DB00705,approved,CC(C)NC1=C(N=CC=C1)N1CCN(CC1)C(=O)C1=CC2=C(N1)...,Delavirdine,SmallMoleculeDrug
...,...,...,...,...,...,...,...,...,...
552,dalbavancin,NDA,2014,dalbavancin,DB06219,approved; investigational,CN[C@H]1C(=O)N[C@@H]2Cc3ccc(Oc4cc5cc(Oc6ccc(cc...,Dalbavancin,SmallMoleculeDrug
553,sulfur hexafluoride lipid-type A microspheres,NDA,2014,sulfur hexafluoride lipid-type A microspheres,DB11104,approved,Not found,Sulfur hexafluoride,SmallMoleculeDrug
554,sodium zirconium cyclosilicate,NDA,2018,zirconium cyclosilicate,DB14048,approved,Not found,Sodium zirconium cyclosilicate,SmallMoleculeDrug
555,tafamidis meglumine,NDA,2019,tafamidis meglumine,DB11644,approved; investigational,OC(=O)C1=CC=C2N=C(OC2=C1)C1=CC(Cl)=CC(Cl)=C1,Tafamidis,SmallMoleculeDrug


In [34]:
# Write the table to a CSV file in the current working directory.
fda_small_molecules_smiles.to_csv(path_or_buf = "fda_small_molecules_smiles.csv")