# defreeze drugbank data
# download from https://go.drugbank.com/releases/latest

In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import zipfile
import pubchempy as pcp
import os
import sys
import time
current_dir = os.getcwd()
parent_parent_dir = os.path.dirname(os.path.dirname(current_dir))
src_dir = os.path.join(parent_parent_dir, 'src')
sys.path.append(src_dir)

from util import *

In [2]:
zip_path = '../../data/raw/drugbank/drugbank_all_full_database.xml.zip'
extract_path = '../../data/defreezed/drugbank'

# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall(extract_path)
# print(f"Extracted to {extract_path}")

In [None]:
filein = f'{extract_path}/full database.xml'

### re parse
tree = ET.parse(filein)
root = tree.getroot()
del tree

# ID
drugbank_id = []
ap = drugbank_id.append
for v in root:
    ele = v.find('{http://www.drugbank.ca}drugbank-id')
    if ele is None:
        ap('')
    else:
        ap(ele.text)

# Drugname
drugbank_names = []
ap = drugbank_names.append
for v in root:
    ele = v.find('{http://www.drugbank.ca}name')
    if ele is None:
        ap([])
    else:
        ap(ele.text)

# Cas number
casnumber = []
ap = casnumber.append
for v in root:
    ele = v.find('{http://www.drugbank.ca}cas-number')
    if ele is None:
        ap([])
    else:
        ap(ele.text)

# approved dates
appr_dates = []
ap = appr_dates.append
for i in root:
    dates = set()
    ap2 = dates.add
    v = i.find('{http://www.drugbank.ca}products')
    for x in v:
        ele = x.find("{http://www.drugbank.ca}started-marketing-on")
        if ele is not None:
            if ele.text != None:
                ap2(ele.text)
    if len(dates)==0:
        dates=None
    ap(dates)

# drug category
category = []
ap = category.append
for v in root:
    col = []
    ele = v.find('{http://www.drugbank.ca}categories')
    for x in ele:
        e = x.find("{http://www.drugbank.ca}category")
        if e is not None:
            col.append(e.text)
    ap(col)

smile = []
ap = smile.append
for v in root:
    col = []
    ele = v.find('{http://www.drugbank.ca}calculated-properties')
    if ele != None:
        for x in ele:
            smiles = False
            e = x.find("{http://www.drugbank.ca}kind")
            if e.text == "SMILES":
                smiles = True
            e = x.find("{http://www.drugbank.ca}value")
            if e != None and smiles == True:
                col.append(e.text)
    ap(col)

df = pd.DataFrame([drugbank_id, drugbank_names, casnumber, appr_dates, category, smile],index=["ID","name","cas","dates","category","smiles"]).T

In [35]:
print(len(df))

17430


In [41]:
df.to_csv("../../data/processed/drugbank/250106_drugbank.csv", index=False)

In [6]:
df = pd.read_csv("../../data/processed/drugbank/250106_drugbank.csv")

In [13]:
from tqdm import tqdm
for_pcp = []
for i in tqdm(range(len(df))):
    if df["smiles"][i] != []:
        smiles = df["smiles"][i][0]
    else:
        smiles = ""
    if pd.isna(df["cas"][i]):
        cas = ""
    else:
        cas = df["cas"][i]
    for_pcp.append([df["name"][i], cas, smiles])

100%|██████████| 17430/17430 [00:01<00:00, 11602.89it/s]


In [9]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [16]:
smiles = []
for i in tqdm(range(len(for_pcp))):
    name = for_pcp[i][0]
    time.sleep(0.2)
    compound = pcp.get_compounds(name, namespace='name')
    if not compound:
        cas = for_pcp[i][1]
        if cas != "":
            time.sleep(0.2)
            compound = pcp.get_compounds(cas, namespace='name')
        else:
            compound = None
    if compound:
        isomeric_smiles = compound[0].isomeric_smiles
        if isomeric_smiles:
            smiles.append(isomeric_smiles)
        else:
            canonical_smiles = compound[0].canonical_smiles
            if canonical_smiles:
                smiles.append(canonical_smiles)
            else:
                if for_pcp[i][2] != "":
                    smiles.append(for_pcp[i][2])
                else:
                    smiles.append("")
    else:
        if for_pcp[i][2] != "":
            smiles.append(for_pcp[i][2])
        else:
            smiles.append("")

100%|██████████| 17430/17430 [5:04:13<00:00,  1.05s/it]  


In [40]:
rdkit_smiles = []
for i in tqdm(smiles):
    if i == "[":
        rdkit_smiles.append(None)
        continue
    mol = Chem.MolFromSmiles(i)
    if mol is None:
        rdkit_smiles.append(i)
        continue
    new = Chem.MolToSmiles(mol)
    rdkit_smiles.append(new)

100%|██████████| 17430/17430 [00:08<00:00, 1983.67it/s]


In [46]:
len(rdkit_smiles)

17430

In [48]:
filtered_length = len(list(filter(lambda x: x is not None, rdkit_smiles)))
print("Length without None:", filtered_length)

Length without None: 12924


In [42]:
formatted_dates = []
for i in range(len(df)):
    date = df["dates"][i]
    if date != "[]":
        d = min(eval(date)).replace("-", "")
        date = convert_date_to_decimal(d)
        formatted_dates.append(date)
    else:
        formatted_dates.append(None)

In [43]:
formatted = [[rdkit_smiles[i], formatted_dates[i]] for i in range(len(rdkit_smiles))]
df_formatted = pd.DataFrame(formatted)

In [47]:
df_formatted = df_formatted.dropna()
df_formatted.to_csv("250106_drugbank_formatted.csv", index=False)

In [49]:
pd.DataFrame(list(filter(lambda x: x is not None, rdkit_smiles))).to_csv("250106_drugbank_onlysmiles.csv", index=False)

In [52]:
mol_date = []
smiles_date = dict()

for i in range(len(df_formatted)):
    smiles = df_formatted.iloc[i,0]
    date = df_formatted.iloc[i,1]
    mol = Chem.MolFromSmiles(smiles)
    mol_date.append([mol, date])
    smiles_date[smiles] = date

In [53]:
pickle_dump(mol_date, "../../data/processed/drugbank/250106_drugbank_mol_date.pickle")
pickle_dump(smiles_date, "../../data/processed/drugbank/250106_drugbank_smiles_date.pickle")