In [65]:
import pandas as pd
import numpy as np
import requests
import io
from datetime import datetime
import time
import lxml
from bs4 import BeautifulSoup
from zipfile import ZipFile
from sqlalchemy import create_engine

In [66]:
# Fetch the DrugBank releases page; the release date and download URL are scraped from it
# in the following cell.
# A timeout keeps the notebook from hanging indefinitely if the server does not answer.
page_response = requests.get('https://go.drugbank.com/releases/latest#open-data', timeout=60)
# Stop here on an HTTP error rather than parsing an error page and failing later.
page_response.raise_for_status()
source = page_response.text
# Name the parser explicitly (lxml is imported above) so parsing does not depend on
# which parsers happen to be installed and no "guessed parser" warning is emitted.
soup = BeautifulSoup(source, 'lxml')

In [67]:
# Find the table cell labelled "DrugBank Vocabulary"; the remaining cells of that row
# are its following <td> siblings.
element = soup.find("td", string="DrugBank Vocabulary")
if element is None:
    raise ValueError(
        'No "DrugBank Vocabulary" cell found on the releases page; '
        'the page layout may have changed or the request was blocked'
    )

nextSiblings = element.find_next_siblings("td")
# The code reads sibling 1 as the release date and sibling 6 as the cell holding the
# download link, so at least 7 sibling cells are required.
if len(nextSiblings) < 7:
    raise ValueError(
        f'Expected at least 7 cells after "DrugBank Vocabulary", found {len(nextSiblings)}'
    )

# Strip surrounding whitespace so the value can be parsed with a strict '%Y-%m-%d' format.
release_date = nextSiblings[1].get_text().strip()

download_link = nextSiblings[6].find('a')
if download_link is None:
    raise ValueError('No download link found in the "DrugBank Vocabulary" row')
url_redirect = 'https://go.drugbank.com' + download_link['href']

In [71]:
df_db = None

# String with URL:
# url_redirect = 'https://go.drugbank.com/releases/5-1-8/downloads/all-drugbank-vocabulary'
# Request the download link without following redirects, then read the real file
# location from the Location header of the response.
r = requests.get(url_redirect, allow_redirects=False, timeout=60)
r.raise_for_status()
url = r.headers.get('Location')
if url is None:
    raise RuntimeError(
        f'Expected a redirect from {url_redirect} but got HTTP {r.status_code} '
        'without a Location header'
    )

response = requests.get(url, timeout=300)
response.raise_for_status()

# The archive is unpacked in memory. Every member is parsed as CSV and the last one
# parsed is what ends up in df_db.
with ZipFile(io.BytesIO(response.content)) as thezip:
    for zipinfo in thezip.infolist():
        # Context manager closes the member handle as soon as it has been read.
        with thezip.open(zipinfo) as thefile:
            df_db = pd.read_csv(thefile)

# Fail with a clear message instead of passing None on to the next cell.
if df_db is None:
    raise ValueError('The downloaded DrugBank archive contained no files')

In [72]:
# Parse the release date first so a malformed date fails before df_db is modified.
# Source_Date and Download_Date are stored as YYYYMMDD integers.
dt_release = datetime.strptime(release_date.strip(), '%Y-%m-%d')
str_date = int(dt_release.strftime("%Y%m%d"))  # an int, despite the name

# Drop a few columns, rename the rest, and add Source, Source_Date, and Download_Date.
# Written as one chain that builds a new frame; errors='ignore' on drop lets this cell
# be re-run after the columns have already been removed (rename already ignores
# missing labels by default).
df_db = (
    df_db
    .drop(columns=['CAS', 'UNII', 'Standard InChI Key'], errors='ignore')
    .rename(columns={
        'DrugBank ID': 'DrugBank_ID',
        'Common name': 'Common_name',
        'Accession Numbers': 'Accession_Numbers',
    })
    .assign(
        Source="DrugBank",
        Source_Date=str_date,
        # Today's date according to the local clock of the machine running the notebook.
        Download_Date=int(time.strftime("%Y%m%d")),
    )
)

In [73]:
# Write the file as tab-separated text.
# NOTE(review): to_csv is called without index=False, so the DataFrame index is written
# as an unnamed first column, while the database copy below omits it — confirm this
# difference is intended.
df_db.to_csv(r"data files/DRUGBANK_drug_info.tsv", sep='\t')

# Store it in the database, replacing any existing table of the same name.
engine = create_engine('sqlite:///TargetLink.db', echo=False)
sqlite_table = "DRUGBANK_drug_info"
# The with-block closes the connection even if to_sql raises.
with engine.connect() as sqlite_connection:
    df_db.to_sql(sqlite_table, sqlite_connection, index=False, if_exists='replace')