In [None]:
import requests
import time
import lxml
import pandas as pd
from bs4 import BeautifulSoup

# CSS selectors used by get_target_info() when parsing a DrugBank drug page.
# drug_name_selector is applied to the whole page; the remaining selectors are
# applied to each individual target card (a div with class "bond card").
drug_name_selector = '.drug-card h1'
# Preferred target-name lookup: the link inside the card header.
target_name_selector_1 = '.card-header strong a'
# Fallback when the link selector does not match exactly one element: the
# header text itself, from which the first space-delimited token is dropped.
target_name_selector_2 = '.card-header strong'
actions_selector = '.card-body .badge-action'
# These two select a <dt> label; the value is read from the label's next sibling.
uniprot_selector = '.card-body dt#uniprot-id'
gene_selector = '.card-body dt#gene-name'

def _select_single(node, selector):
  """Return the element under `node` matching `selector`, or None unless exactly one matches."""
  matches = node.select(selector)
  return matches[0] if len(matches) == 1 else None


def _sibling_text(label):
  """Return the text of the element following `label`, or '' when either is missing."""
  if label is None:
    return ''
  value = label.find_next_sibling()
  return value.text if value is not None else ''


def get_target_info(dbid, timeout=30):
  """Scrape the targets listed on a DrugBank drug page.

  Parameters
  ----------
  dbid : str
    DrugBank accession; appended to 'https://go.drugbank.com/drugs/'.
  timeout : float, optional
    Seconds to wait for the HTTP request before giving up (default 30).

  Returns
  -------
  list of tuple
    One (drug_name, target_name, actions, uniprot_id, gene_name) tuple per
    div with class "bond card" on the page. A field that cannot be located
    is returned as ''. The list is empty when the page has no such cards.

  Raises
  ------
  requests.RequestException
    On network failure, timeout, or a non-success HTTP status. Without the
    status check an error page would be parsed and silently yield no targets.
  """
  response = requests.get('https://go.drugbank.com/drugs/' + dbid, timeout=timeout)
  response.raise_for_status()
  soup = BeautifulSoup(response.text, 'lxml')

  drug_targets = soup.find_all('div', class_="bond card")
  if not drug_targets:
    return []

  # The drug name is a property of the page, not of a target: look it up once.
  title_matches = soup.select(drug_name_selector)
  drug_name = title_matches[0].text if title_matches else ''

  target_list = []
  for target in drug_targets:
    # Target name: prefer the header link; otherwise fall back to the header
    # text with its first space-delimited token removed.
    name_link = _select_single(target, target_name_selector_1)
    if name_link is not None:
      target_name = name_link.text
    else:
      headers = target.select(target_name_selector_2)
      header_text = headers[0].text if headers else ''
      # Keep the whole text when there is no space to split on instead of
      # raising and losing every target of this drug.
      target_name = header_text.split(" ", 1)[-1]

    # Actions: only recorded when exactly one action badge is present.
    # NOTE(review): cards with several badges yield '' here - confirm intended.
    action_badge = _select_single(target, actions_selector)
    actions = action_badge.text if action_badge is not None else ''

    # UniProt ID and gene name live in the element following their <dt> label.
    uniprot_id = _sibling_text(_select_single(target, uniprot_selector))
    gene_name = _sibling_text(_select_single(target, gene_selector))

    target_list.append((drug_name, target_name, actions, uniprot_id, gene_name))

  return target_list

In [None]:
in_file = r'../data files/DRUGBANK_drug_info.tsv'
out_filename = r'../data files/DRUGBANK_Targets.tsv'

# Output schema. Used both for the header row and for every appended chunk so
# the column order of the data can never drift from the header.
columns = ['DrugBank_ID','HGNC_ID','Drug_Name','Target','Actions','UniProt_ID', 'Gene_symbol','Source', 'Source_Date', 'Download_Date']


def _append_rows(rows):
  """Append `rows` (dicts keyed by `columns`) to the output file; no-op when empty."""
  if rows:
    pd.DataFrame(rows, columns=columns).to_csv(out_filename, sep = '\t', index=False, mode='a', header=False)


# Create an empty file except for headers
df = pd.DataFrame(columns=columns)
df.to_csv(out_filename, sep = '\t', index=False)

csv_data = pd.read_csv(in_file, sep='\t')
data = []
row_num = 0  # number of drugs scraped successfully
today = int(time.strftime("%Y%m%d"))
for index, row in csv_data.iterrows():
  # Read the ID outside the try so the failure message always names this row.
  dbid = row["DrugBank_ID"]

  try:
    target_tuples = get_target_info(dbid)
  except Exception as exc:
    # Best effort: report and move on to the next drug. Catching Exception
    # (not a bare except) keeps Ctrl-C / kernel interrupt working.
    print('Failure on row ', dbid, '-', repr(exc))
    continue

  for drug_name, target_name, actions, uniprot_id, gene_name in target_tuples:
    data.append({'DrugBank_ID': dbid, 'HGNC_ID': '', 'Drug_Name': drug_name, 'Target': target_name, 'Actions': actions, 'UniProt_ID': uniprot_id, 'Gene_symbol': gene_name,
      'Source': 'DrugBank', 'Source_Date': today, 'Download_Date': today})

  row_num += 1
  # Flush every 10 successful drugs so an interrupted run keeps its progress.
  # This check sits after the increment so failed rows cannot re-trigger it.
  if row_num % 10 == 0:
    print("Rows processed: ", row_num)
    _append_rows(data)
    data = []

# Add any last data
_append_rows(data)
data = []

In [13]:
# The last step is to open the generated file and fill in the HGNC_ID column based on the Gene_symbol
df = pd.read_csv(out_filename, sep='\t')

# Load the HGNC gene list and the table of previous (retired) symbols.
df_gene_info = pd.read_csv(r"../data files/HGNC_gene_info.tsv", sep='\t')
df_prev_sym = pd.read_csv(r"../data files/HGNC_previous_symbols.tsv", sep='\t')

# Build symbol -> HGNC_ID lookups. Rows without a symbol are dropped so that a
# missing Gene_symbol never matches anything, and duplicated symbols keep their
# first occurrence so the lookup is one-to-one. A merge would instead add a row
# per duplicate and shift every following value onto the wrong target.
current_ids = (
  df_gene_info
  .dropna(subset=['Gene_symbol'])
  .drop_duplicates(subset='Gene_symbol')
  .set_index('Gene_symbol')['HGNC_ID']
)
previous_ids = (
  df_prev_sym
  .dropna(subset=['Previous_symbol'])
  .drop_duplicates(subset='Previous_symbol')
  .set_index('Previous_symbol')['HGNC_ID']
)

# Current symbols take priority; previous symbols only fill what is still missing.
df["HGNC_ID"] = df["Gene_symbol"].map(current_ids)
df["HGNC_ID"] = df["HGNC_ID"].fillna(df["Gene_symbol"].map(previous_ids))

# Write the file. index=False keeps the same columns as the file that was read;
# otherwise an unnamed index column is added on every run of this cell.
df.to_csv(out_filename, sep='\t', index=False)