In [1]:
import pandas as pd
import numpy as np
import json
import xml.etree.ElementTree as ET

In [2]:
# Path to the DrugBank XML export that is parsed in the next cell.
file_path = 'database-3Jan2021.xml'
# XML namespace in ElementTree's "{uri}" form; it is prepended to every tag name
# used when searching the parsed tree.
name_space = '{http://www.drugbank.ca}'

In [3]:
# Parse the whole XML file and keep a handle on its root element.
tree = ET.parse(file_path)
root = tree.getroot()
# Number of direct children of the root element.
print(len(root))

14315


In [4]:
# Accumulates one dict per drug (drugbank_id, name, type, groups); filled by the loop at the end of this cell.
rows = list()

def set_drug_type(drug):
    """Return the type label for a drug element.

    Drugs whose ``type`` attribute is not 'biotech' keep that attribute value
    unchanged. Biotech drugs are relabelled as 'antibody' when one of their
    category names is 'Antibodies', and as 'other' otherwise.
    """
    declared_type = drug.get('type')
    if declared_type != 'biotech':
        return declared_type

    # Category names sit at categories/category/category, each tag namespaced.
    category_path = "/".join(name_space + tag for tag in ("categories", "category", "category"))
    category_names = [node.text for node in drug.findall(category_path)]
    if 'Antibodies' in category_names:
        return 'antibody'
    return 'other'

# Build one record per child element of the XML root and append it to `rows`.
for drug in root:
    # Every direct child of the root is expected to be a drug element;
    # report the offending tag if that assumption is ever violated.
    assert drug.tag == name_space + 'drug', 'unexpected element under root: ' + str(drug.tag)
    rows.append({
        # Only the drugbank-id element flagged primary='true' is used as the key.
        'drugbank_id': drug.findtext(name_space + "drugbank-id[@primary='true']"),
        'name': drug.findtext(name_space + "name"),
        # Biotech drugs are split into 'antibody' / 'other' by set_drug_type.
        'type': set_drug_type(drug),
        'groups': [group.text for group in drug.findall("{name_space}groups/{name_space}group".format(name_space = name_space))],
    })

In [None]:
# Sanity check: number of drug records collected from the XML.
print(len(rows))

# Disabled debug helper: look up a single record by its DrugBank ID.
# test_drug_entry = next((drug for drug in rows if drug['drugbank_id'] == 'DB15685'), None)
# print(test_drug_entry)

In [None]:
# Tabulate the drug records (one row per drug) and preview the first ten.
df = pd.DataFrame(rows)
df.head(10)

In [None]:
# How many drugs fall under each type label assigned by set_drug_type.
df['type'].value_counts()

In [None]:
# Load the comma-separated targets subset file.
df2 = pd.read_csv("drugbank_targets_subset.csv", sep=",")

In [None]:
# Preview the first rows of the targets subset.
df2.head()

In [None]:
import csv

# with open("drugbank_targets_subset.csv") as file:
#     raw_csv_file = csv.reader(file, delimiter=',')
#     next(raw_csv_file) #skip header
#     for row in raw_csv_file:

# Index the parsed drugs by DrugBank ID once, so each lookup below is a dict
# access instead of a scan over every record. setdefault keeps the first record
# seen for an ID, which matches what a first-match linear search would return.
drugs_by_id = {}
for drug in rows:
    drugs_by_id.setdefault(drug['drugbank_id'], drug)

# Drug groups that qualify a drug for the tractability flags.
tractable_groups = ('approved', 'investigational', 'withdrawn')

# Drug type -> flag that is switched on when a qualifying drug of that type is found.
flag_for_type = {
    'small molecule': 'small_molecule_tractable',
    'antibody': 'antibody_tractable',
    'other': 'other_tractable',
}

all_data = list()

# Read the CSV inside a context manager so the file handle is closed once the
# rows are consumed (the handle was previously opened inline and never closed).
with open('drugbank_pharmacologically_active_targets.csv', newline='') as csv_file:
    raw_csv_file = csv.DictReader(csv_file)

    for row in raw_csv_file:
        entry = {
            'small_molecule_tractable': False,
            'antibody_tractable': False,
            'other_tractable': False,
            'symbol': row['Gene Name'],
            'accession': row['UniProt ID'],
        }
        qualifying_drugs = list()
        # The 'Drug IDs' cell holds DrugBank IDs separated by '; '.
        for drug_id in row['Drug IDs'].split('; '):
            drug_obj = drugs_by_id.get(drug_id)
            if drug_obj is None:
                # ID not present in the parsed XML: nothing to report for it.
                continue
            if not any(group in drug_obj['groups'] for group in tractable_groups):
                continue
            flag = flag_for_type.get(drug_obj['type'])
            if flag is not None:
                entry[flag] = True
            # Every qualifying drug is kept, whatever its type.
            qualifying_drugs.append(drug_obj)
        entry['drugs'] = qualifying_drugs
        all_data.append(entry)

In [None]:
# Tabulate the per-target entries built in the previous cell.
df3 = pd.DataFrame(all_data)

In [None]:
# Number of target rows in the first-attempt table.
print(len(df3))

In [None]:
# Write the first-attempt table to disk (to_csv is called with its defaults).
df3.to_csv("diff-types-only-run.csv")

In [None]:
# Preview the first rows of the first-attempt table.
df3.head()

### Second attempt with pandas DataFrames

In [13]:
# Load the pharmacologically active targets export (comma-separated).
drugbank_all_targets_df = pd.read_csv('drugbank_pharmacologically_active_targets_3Jan2021.csv', sep=',')

In [14]:
# Total number of target rows, followed by the breakdown of the 'Species' column.
print(len(drugbank_all_targets_df))
drugbank_all_targets_df['Species'].value_counts()

1317


Humans                                                                                           1095
Escherichia coli (strain K12)                                                                      40
Escherichia coli                                                                                    9
Streptococcus pneumoniae serotype 4 (strain ATCC BAA-334 / TIGR4)                                   8
Mycobacterium tuberculosis                                                                          7
Klebsiella pneumoniae                                                                               7
Gram positive and gram negative bacteria                                                            7
Clostridium botulinum                                                                               7
Yeast                                                                                               6
Staphylococcus aureus                                                             

In [15]:
# Keep only the targets whose 'Species' is exactly 'Humans'; the copy detaches
# the result from the full table.
drugbank_human_targets_df = drugbank_all_targets_df.loc[
    drugbank_all_targets_df['Species'].eq('Humans')
].copy()
# Row count of the human subset, then a check that only 'Humans' remains.
print(drugbank_human_targets_df.shape[0])
drugbank_human_targets_df['Species'].value_counts()

1095


Humans    1095
Name: Species, dtype: int64

In [None]:
# all_rows = list()

# drug_groups = [
#     'approved',
#     'investigational',
#     'withdrawn'
# ]

# def process_drug_info(list_of_drugs, modality):
#     my_list = list()
#     for drug in list_of_drugs:
#         if isinstance(drug, str):
#             pass
#         else:
# #             if drug['type'] == modality and any(group in drug_groups for group in drug['groups']):
#             if drug['type'] == modality:
#                 my_list.append(drug)
#     return my_list

# for index, row in drugbank_human_targets_df.iterrows():
#     entry = {}
#     entry['symbol'] = row['Gene Name']
#     entry['accession'] = row['UniProt ID']
#     drug_ids = row['Drug IDs'].split('; ')
#     all_drugs = [(next((drug for drug in rows if drug['drugbank_id'] == x), "Entry N/A for " + x)) for x in drug_ids]
#     entry['all_drugs'] = all_drugs
#     entry['small_molecule_drugs_in_clinic'] = process_drug_info(all_drugs, 'small molecule') 
#     entry['antibody_drugs_in_clinic'] = process_drug_info(all_drugs, 'antibody')
#     entry['other_drugs_in_clinic'] = process_drug_info(all_drugs, 'other')
#     entry['is_small_molecule_in_clinic'] = True if len(entry['small_molecule_drugs_in_clinic']) > 0 else False
#     entry['is_antibody_in_clinic'] = True if len(entry['antibody_drugs_in_clinic']) > 0 else False
#     entry['is_other_in_clinic'] = True if len(entry['other_drugs_in_clinic']) > 0 else False
#     all_rows.append(entry)

In [16]:
# Output records, one dict per human target; filled by the loop at the end of this cell.
all_rows = list()

# Drug group labels that filter_drug_list treats as "in clinic".
clinic_drug_groups = [
    'approved',
    'investigational',
    'withdrawn'
]

def filter_drug_list(list_of_drugs, modality, clinic_groups=None):
    """Return the drugs of one modality that belong to a clinic group.

    Args:
        list_of_drugs: iterable of drug dicts with 'type' and 'groups' keys.
        modality: value a drug's 'type' must equal to be kept
            (e.g. 'small molecule', 'antibody', 'other').
        clinic_groups: group labels that qualify a drug. Defaults to the
            module-level ``clinic_drug_groups``.

    Returns:
        A new list holding the matching drug dicts, in input order.
    """
    if clinic_groups is None:
        clinic_groups = clinic_drug_groups
    selected = list()
    for drug in list_of_drugs:
        drug_groups = drug['groups']
        # Placeholder records carry 'groups' as a plain string. Wrap it so the
        # membership test compares whole labels instead of matching substrings.
        if isinstance(drug_groups, str):
            drug_groups = [drug_groups]
        if any(group in drug_groups for group in clinic_groups) and drug['type'] == modality:
            selected.append(drug)
    return selected

def handle_no_drug_data(drug_id):
    """Build a placeholder record for a drug ID that has no parsed entry.

    The given ID is kept as 'drugbank_id'; 'name', 'type' and 'groups' are all
    set to the string 'unknown'.
    """
    unknown_fields = dict.fromkeys(('name', 'type', 'groups'), 'unknown')
    return {'drugbank_id': drug_id, **unknown_fields}
    
# Index the parsed drugs by DrugBank ID once, so each lookup below is a dict
# access instead of a scan over every record. setdefault keeps the first record
# seen for an ID, which matches what a first-match linear search would return.
drugs_by_id = {}
for drug in rows:
    drugs_by_id.setdefault(drug['drugbank_id'], drug)

# Build one output record per human target.
for symbol, accession, raw_drug_ids in zip(
    drugbank_human_targets_df['Gene Name'],
    drugbank_human_targets_df['UniProt ID'],
    drugbank_human_targets_df['Drug IDs'],
):
    # The 'Drug IDs' cell holds DrugBank IDs separated by '; '. An empty cell is
    # read by pandas as NaN (a float, no .split), so treat it as "no drugs".
    drug_ids = raw_drug_ids.split('; ') if isinstance(raw_drug_ids, str) else []

    # IDs missing from the parsed XML get an 'unknown' placeholder record.
    all_drugs = [
        drugs_by_id[drug_id] if drug_id in drugs_by_id else handle_no_drug_data(drug_id)
        for drug_id in drug_ids
    ]

    small_molecule_drugs = filter_drug_list(all_drugs, 'small molecule')
    antibody_drugs = filter_drug_list(all_drugs, 'antibody')
    other_drugs = filter_drug_list(all_drugs, 'other')

    all_rows.append({
        'db_symbol': symbol,
        'accession': accession,
        # NOTE(review): despite the "_in_clinic" suffix this holds every drug
        # listed for the target, including placeholders and drugs outside the
        # clinic groups. Confirm whether it should be filtered as well.
        'db_all_drugs_in_clinic': all_drugs,
        'db_small_molecule_drugs_in_clinic': small_molecule_drugs,
        'db_antibody_drugs_in_clinic': antibody_drugs,
        'db_other_drugs_in_clinic': other_drugs,
        # Flags are True exactly when the matching list is non-empty.
        'db_is_small_molecule_in_clinic': bool(small_molecule_drugs),
        'db_is_antibody_in_clinic': bool(antibody_drugs),
        'db_is_other_in_clinic': bool(other_drugs),
    })

In [17]:
# Tabulate the per-target records.
# NOTE: this rebinds df2, which an earlier cell used for the targets-subset CSV.
df2 = pd.DataFrame(all_rows)

In [18]:
# Column order for the exported table: identifiers first, then the boolean
# flags, then the per-modality drug lists, then the full drug list.
reordered_columns = [
    'accession',
    'db_symbol',
    'db_is_small_molecule_in_clinic',
    'db_is_antibody_in_clinic',
    'db_is_other_in_clinic',
    'db_small_molecule_drugs_in_clinic',
    'db_antibody_drugs_in_clinic',
    'db_other_drugs_in_clinic',
    'db_all_drugs_in_clinic',
]

In [19]:
# Select the columns in export order (as an independent copy) and write them to CSV.
df_for_csv_export = df2[reordered_columns].copy()
df_for_csv_export.to_csv('drugbank-clinical-targets-drugs-3Jan2021-data.csv')