# Joining the 2022 approved drugs
The objective of this notebook is to append the FDA-approved small-molecule drugs of 2022 to our existing dataset of approved drugs (updating the dataset).

In [3]:
from rdkit import Chem
from rdkit.Chem import Descriptors, PandasTools
from rdkit.Chem.SaltRemover import SaltRemover
import pandas as pd
import session_info

In [4]:
# Load the dataset that does not yet contain the 2022 drugs.
# The molecule columns and the descriptor columns are removed right away;
# they are rebuilt for the merged dataset in the cleaning and descriptor
# sections of this notebook.
drugs_db_route_merged = pd.read_csv('../data/approved_drugs_final.csv').drop(
    columns=["ROMol", "Mol_Clean", "mw", "n_hba", "n_hbd", "logp"]
)
drugs_db_route_merged

Unnamed: 0,active_ingredient_moiety,nda_bla,approval_year,active,DrugBank ID,Drug Groups,SMILES,Name,Drug Type,route
0,troglitazone,NDA,1997,troglitazone,DB00197,approved; investigational; withdrawn,CC1=C(C)C2=C(CCC(C)(COC3=CC=C(CC4SC(=O)NC4=O)C...,Troglitazone,SmallMoleculeDrug,oral
1,imiquimod,NDA,1997,imiquimod,DB00724,approved; investigational,CC(C)CN1C=NC2=C1C1=C(C=CC=C1)N=C2N,Imiquimod,SmallMoleculeDrug,topical
2,anagrelide hydrochloride,NDA,1997,anagrelide,DB00261,approved,ClC1=CC=C2N=C3NC(=O)CN3CC2=C1Cl,Anagrelide,SmallMoleculeDrug,oral
3,nelfinavir mesylate,NDA,1997,nelfinavir,DB00220,approved,[H][C@@]12CCCC[C@]1([H])CN(C[C@@H](O)[C@H](CSC...,Nelfinavir,SmallMoleculeDrug,oral
4,delavirdine mesylate,NDA,1997,delavirdine,DB00705,approved,CC(C)NC1=C(N=CC=C1)N1CCN(CC1)C(=O)C1=CC2=C(N1)...,Delavirdine,SmallMoleculeDrug,oral
...,...,...,...,...,...,...,...,...,...,...
544,gadoterate meglumine,NDA,2013,gadoterate meglumine,DB09132,approved,[Gd+3].CNC[C@H](O)[C@@H](O)[C@H](O)[C@H](O)CO....,Gadoteric acid,SmallMoleculeDrug,intravenous
545,dimethyl fumarate,NDA,2013,dimethyl fumarate,DB08908,approved; investigational,[H]\C(=C(\[H])C(=O)OC)C(=O)OC,Dimethyl fumarate,SmallMoleculeDrug,oral
546,dalbavancin,NDA,2014,dalbavancin,DB06219,approved; investigational,CN[C@H]1C(=O)N[C@@H]2Cc3ccc(Oc4cc5cc(Oc6ccc(cc...,Dalbavancin,SmallMoleculeDrug,intravenous
547,tafamidis meglumine,NDA,2019,tafamidis meglumine,DB11644,approved; investigational,OC(=O)C1=CC=C2N=C(OC2=C1)C1=CC(Cl)=CC(Cl)=C1,Tafamidis,SmallMoleculeDrug,oral


In [5]:
# Load the drugs approved in 2022, retrieved from the same sources.
# The SMILES were added manually, and the CSV file was generated afterwards.
drugs_2022 = pd.read_csv(filepath_or_buffer="../data/drugs_2022.csv")
drugs_2022

Unnamed: 0,active_ingredient_moiety,nda_bla,approval_year,active,DrugBank ID,Drug Groups,SMILES,Name,Drug Type,route
0,daridorexant,NDA,2022,daridorexant,DB15031,approved,COC1=CC(C(=O)N2CCC[C@@]2(C)C2=NC3=C(N2)C=CC(Cl...,Daridorexant,SmallMoleculeDrug,oral
1,abrocitinib,NDA,2022,abrocitinib,DB14973,approved,CCCS(=O)(=O)N[C@H]1C[C@H](C1)N(C)C1=C2C=CNC2=N...,Abrocitinib,SmallMoleculeDrug,oral
2,mitapivat,NDA,2022,mitapivat,DB16236,approved,O=C(N1CCN(CC2CC2)CC1)C1=CC=C(NS(=O)(=O)C2=CC=C...,Mitapivat,SmallMoleculeDrug,oral
3,pacritinib,NDA,2022,pacritinib,DB11697,approved,C(CN1CCCC1)OC1=CC=C2NC3=NC=CC(=N3)C3=CC(COC\C=...,Pacritinib,SmallMoleculeDrug,oral
4,ganaxolone,NDA,2022,ganaxolone,DB05087,approved,[H][C@@]12CC[C@H](C(C)=O)[C@@]1(C)CC[C@@]1([H]...,Ganaxolone,SmallMoleculeDrug,oral
5,oteseconazole,NDA,2022,oteseconazole,DB13055,approved,O[C@@](CN1C=NN=N1)(C1=CC=C(F)C=C1F)C(F)(F)C1=C...,Oteseconazole,SmallMoleculeDrug,oral
6,mavacamten,NDA,2022,mavacamten,DB14921,approved,CC(C)N1C(=O)NC(N[C@@H](C)C2=CC=CC=C2)=CC1=O,Mavacamten,SmallMoleculeDrug,oral
7,tapinarof,NDA,2022,tapinarof,DB06083,approved,CC(C)C1=C(O)C=C(\C=C\C2=CC=CC=C2)C=C1O,Tapinarof,SmallMoleculeDrug,topical
8,deucravacitinib,NDA,2022,deucravacitinib,DB16650,approved,[2H]C([2H])([2H])NC(=O)C1=C(NC2=CC=CC(C3=NN(C)...,Deucravacitinib,SmallMoleculeDrug,oral
9,omidenepag,NDA,2022,omidenepag,DB15071,approved,CC(C)OC(=O)CNC1=CC=CC(CN(CC2=CC=C(C=C2)N2C=CC=...,Omidenepag,SmallMoleculeDrug,ophthalmic


## Merging the two datasets

In [6]:
# Stack the pre-2022 table and the 2022 table row-wise (axis=0) and renumber
# the rows so that the merged frame gets one continuous index.
concat_object = [drugs_db_route_merged, drugs_2022]
drugs_final = pd.concat(objs=concat_object, axis=0, ignore_index=True)
drugs_final

Unnamed: 0,active_ingredient_moiety,nda_bla,approval_year,active,DrugBank ID,Drug Groups,SMILES,Name,Drug Type,route
0,troglitazone,NDA,1997,troglitazone,DB00197,approved; investigational; withdrawn,CC1=C(C)C2=C(CCC(C)(COC3=CC=C(CC4SC(=O)NC4=O)C...,Troglitazone,SmallMoleculeDrug,oral
1,imiquimod,NDA,1997,imiquimod,DB00724,approved; investigational,CC(C)CN1C=NC2=C1C1=C(C=CC=C1)N=C2N,Imiquimod,SmallMoleculeDrug,topical
2,anagrelide hydrochloride,NDA,1997,anagrelide,DB00261,approved,ClC1=CC=C2N=C3NC(=O)CN3CC2=C1Cl,Anagrelide,SmallMoleculeDrug,oral
3,nelfinavir mesylate,NDA,1997,nelfinavir,DB00220,approved,[H][C@@]12CCCC[C@]1([H])CN(C[C@@H](O)[C@H](CSC...,Nelfinavir,SmallMoleculeDrug,oral
4,delavirdine mesylate,NDA,1997,delavirdine,DB00705,approved,CC(C)NC1=C(N=CC=C1)N1CCN(CC1)C(=O)C1=CC2=C(N1)...,Delavirdine,SmallMoleculeDrug,oral
...,...,...,...,...,...,...,...,...,...,...
558,omidenepag,NDA,2022,omidenepag,DB15071,approved,CC(C)OC(=O)CNC1=CC=CC(CN(CC2=CC=C(C=C2)N2C=CC=...,Omidenepag,SmallMoleculeDrug,ophthalmic
559,futibatinib,NDA,2022,futibatinib,DB15149,approved,COC1=CC(=CC(OC)=C1)C#CC1=NN([C@H]2CCN(C2)C(=O)...,Futibatinib,SmallMoleculeDrug,oral
560,olutasidenib,NDA,2022,olutasidenib,DB16267,approved,C[C@H](NC1=CC=C(C#N)N(C)C1=O)C1=CC2=C(NC1=O)C=...,Olutasidenib,SmallMoleculeDrug,oral
561,adagrasib,NDA,2022,adagrasib,DB15568,approved,[H][C@@]1(COC2=NC3=C(CCN(C3)C3=CC=CC4=C3C(Cl)=...,Adagrasib,SmallMoleculeDrug,oral


## Cleaning (again)

In [7]:
# Build an RDKit molecule from every entry of the "SMILES" column and store it
# in a new "ROMol" column of `drugs_final` (the frame is modified in place).
# These molecules are not cleaned yet.
PandasTools.AddMoleculeColumnToFrame(drugs_final, smilesCol="SMILES", molCol="ROMol")

In [8]:
# Helpers used to clean the data

# Salt / counter-ion definition handed to RDKit's SaltRemover: the fragments
# we want to strip from the structures in the database.
SALT_DEFINITION = "[Cl,Br,Na,K,Gd]"
remover = SaltRemover(defnData=SALT_DEFINITION)

# Neutralize the charged atoms of organic molecules
def neutralize_atoms(mol):
    """Neutralize charged atoms of ``mol`` in place and return the molecule.

    Every atom matched by the SMARTS query below has its formal charge reset
    to zero, and its explicit hydrogen count is set to the previous total
    hydrogen count minus the previous charge (so a -1 atom gains one hydrogen
    and a +1 atom loses one). The query selects +1 atoms with a non-zero ``h``
    count that have no negatively charged neighbour, and -1 atoms that have no
    positively charged neighbour.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to neutralize; it is modified in place.

    Returns
    -------
    rdkit.Chem.Mol
        The same ``mol`` object that was passed in.
    """
    charged_atom_query = Chem.MolFromSmarts(
        "[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]"
    )
    for match in mol.GetSubstructMatches(charged_atom_query):
        charged_atom = mol.GetAtomWithIdx(match[0])
        # Read charge and hydrogen count *before* touching the atom: both
        # values are needed to compute the new explicit hydrogen count.
        old_charge = charged_atom.GetFormalCharge()
        old_h_total = charged_atom.GetTotalNumHs()
        charged_atom.SetFormalCharge(0)
        charged_atom.SetNumExplicitHs(old_h_total - old_charge)
        charged_atom.UpdatePropertyCache()
    return mol

In [9]:
mols = []  # molecules that were parsed and salt-stripped successfully
## CHECK FOR NON-READABLE SMILES:
# Loop over every SMILES to 1) convert it to an RDKit molecule and 2) apply
# `remover.StripMol()` to the structure. Every SMILES that cannot be processed
# is printed together with its position in the column.
for i, smi in enumerate(drugs_final.SMILES):
    try:
        mol = Chem.MolFromSmiles(smi)  # None when RDKit cannot parse the SMILES
        if mol is None:
            # Report the unreadable SMILES explicitly instead of relying on a
            # later call failing on `None`.
            print(smi, i)
            continue
        mol = remover.StripMol(mol)  # stripping the salts
        mols.append(mol)
    except Exception:
        # Anything else that goes wrong for this entry (e.g. a non-string
        # value in the column). `Exception` rather than a bare `except:` so
        # that KeyboardInterrupt / SystemExit are not swallowed.
        print(smi, i)

In [10]:
# Same readability check once more, this time neutralizing the charges before
# stripping the salts (a shared helper function would avoid the repetition).
# Every SMILES that cannot be processed is printed together with its position.
mols = []
for i, smi in enumerate(drugs_final.SMILES):
    try:
        mol = Chem.MolFromSmiles(smi)  # None when RDKit cannot parse the SMILES
        if mol is None:
            # Report the unreadable SMILES explicitly instead of relying on
            # `neutralize_atoms(None)` raising.
            print(smi, i)
            continue
        mol = remover.StripMol(neutralize_atoms(mol))
        mols.append(mol)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit are not swallowed while the loop is running.
        print(smi, i)

In [11]:
# Apply the cleaning helpers created before to every molecule:
#   1. remove the salts from the raw "ROMol" structures;
#   2. neutralize the charged atoms (O- goes to OH, and so on).
drugs_final["Mol_Clean"] = (
    drugs_final["ROMol"]
    .apply(remover.StripMol)
    .apply(neutralize_atoms)
)

### Generating descriptors

In [12]:
# Descriptor column -> RDKit function that computes it from a cleaned molecule.
# Note that "mw" is computed with `Descriptors.ExactMolWt`.
descriptor_functions = {
    "mw": Descriptors.ExactMolWt,
    "n_hba": Descriptors.NumHAcceptors,
    "n_hbd": Descriptors.NumHDonors,
    "logp": Descriptors.MolLogP,
}
for column_name, descriptor_function in descriptor_functions.items():
    drugs_final[column_name] = drugs_final["Mol_Clean"].apply(descriptor_function)

## Saving the file

In [13]:
# Remove both molecule columns before saving; only the tabular data and the
# computed descriptors are kept.
drugs_final = drugs_final.drop(columns=['ROMol', 'Mol_Clean'])
drugs_final

Unnamed: 0,active_ingredient_moiety,nda_bla,approval_year,active,DrugBank ID,Drug Groups,SMILES,Name,Drug Type,route,mw,n_hba,n_hbd,logp
0,troglitazone,NDA,1997,troglitazone,DB00197,approved; investigational; withdrawn,CC1=C(C)C2=C(CCC(C)(COC3=CC=C(CC4SC(=O)NC4=O)C...,Troglitazone,SmallMoleculeDrug,oral,441.160994,6,2,4.37426
1,imiquimod,NDA,1997,imiquimod,DB00724,approved; investigational,CC(C)CN1C=NC2=C1C1=C(C=CC=C1)N=C2N,Imiquimod,SmallMoleculeDrug,topical,240.137497,4,1,2.82270
2,anagrelide hydrochloride,NDA,1997,anagrelide,DB00261,approved,ClC1=CC=C2N=C3NC(=O)CN3CC2=C1Cl,Anagrelide,SmallMoleculeDrug,oral,254.996617,3,1,1.92630
3,nelfinavir mesylate,NDA,1997,nelfinavir,DB00220,approved,[H][C@@]12CCCC[C@]1([H])CN(C[C@@H](O)[C@H](CSC...,Nelfinavir,SmallMoleculeDrug,oral,567.313078,6,4,4.74762
4,delavirdine mesylate,NDA,1997,delavirdine,DB00705,approved,CC(C)NC1=C(N=CC=C1)N1CCN(CC1)C(=O)C1=CC2=C(N1)...,Delavirdine,SmallMoleculeDrug,oral,456.194360,6,3,2.71710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558,omidenepag,NDA,2022,omidenepag,DB15071,approved,CC(C)OC(=O)CNC1=CC=CC(CN(CC2=CC=C(C=C2)N2C=CC=...,Omidenepag,SmallMoleculeDrug,ophthalmic,520.189274,9,1,3.41690
559,futibatinib,NDA,2022,futibatinib,DB15149,approved,COC1=CC(=CC(OC)=C1)C#CC1=NN([C@H]2CCN(C2)C(=O)...,Futibatinib,SmallMoleculeDrug,oral,418.175339,8,1,1.78490
560,olutasidenib,NDA,2022,olutasidenib,DB16267,approved,C[C@H](NC1=CC=C(C#N)N(C)C1=O)C1=CC2=C(NC1=O)C=...,Olutasidenib,SmallMoleculeDrug,oral,354.088353,5,2,2.92498
561,adagrasib,NDA,2022,adagrasib,DB15568,approved,[H][C@@]1(COC2=NC3=C(CCN(C3)C3=CC=CC4=C3C(Cl)=...,Adagrasib,SmallMoleculeDrug,oral,603.252479,8,0,4.73298


In [14]:
# Write the merged dataset (now including the 2022 drugs) to a new CSV file,
# without the row index.
drugs_final.to_csv(path_or_buf="../data/approved_drugs_final_2022.csv", index=False)

In [15]:
# Show the session information report produced by the `session_info` package.
session_info.show()