In [1]:
# import relevant libraries
import pandas as pd
import xml.etree.ElementTree as ET
import time

### Step 1: Parse DrugBank XML file

In [2]:
# path to the DrugBank full-database XML export parsed in Step 1
# (a relative path, so it is resolved against the current working directory)
drugbank_xml_file_path = 'input/drugbank_full_database_5-1-8.xml'

# XML namespace in ElementTree's "{uri}" form; it is prepended to every tag
# name used in the find/findall/findtext lookups on the parsed tree
name_space = '{http://www.drugbank.ca}'

In [3]:
# time the load with a monotonic clock: time.perf_counter() is meant for
# measuring elapsed intervals and is not affected by system clock adjustments
start = time.perf_counter()

# parse the whole XML file into memory and keep its root element; the root's
# direct children are what gets counted as "records" below
tree = ET.parse(drugbank_xml_file_path)
root = tree.getroot()

end = time.perf_counter()

print('Number of records in DrugBank XML file: %i' % len(root))
# %i truncates the elapsed time to whole seconds
print('Time taken to load DrugBank XML file: %i seconds' % (end - start))

Number of records in DrugBank XML file: 14315
Time taken to load DrugBank XML file: 50 seconds


In [4]:
# accumulator for the per-drug dicts extracted from the XML records
# (one dict is appended per record by the extraction loop)
all_drugs_list = []

def set_drug_type(drug):
    """Return the modality label for a DrugBank ``drug`` element.

    The element's ``type`` attribute is returned unchanged unless it is
    ``'biotech'``. A biotech drug is labelled ``'antibody'`` when the text of
    one of its ``categories/category/category`` elements is exactly
    ``'Antibodies'``, and ``'other'`` otherwise.
    """
    raw_type = drug.get('type')
    if raw_type != 'biotech':
        return raw_type

    # every step of the path must carry the namespace prefix
    category_path = '/'.join(name_space + tag for tag in ('categories', 'category', 'category'))
    category_names = {node.text for node in drug.findall(category_path)}
    if 'Antibodies' in category_names:
        return 'antibody'
    return 'other'

# Build one summary dict per record under the XML root. The loop index was
# never used, so the root's children are iterated directly.
for drug in root:
    # every direct child of the root is expected to be a namespaced <drug>
    # element; fail with the offending tag if the file is not shaped like that
    assert drug.tag == name_space + 'drug', 'unexpected element under root: %s' % drug.tag
    drug_entry = {
        # text of the drugbank-id child flagged primary="true" (None if there is none)
        'drugbank_id': drug.findtext(name_space + "drugbank-id[@primary='true']"),
        'name': drug.findtext(name_space + 'name'),
        # type attribute, with 'biotech' split into 'antibody' / 'other' by set_drug_type
        'type': set_drug_type(drug),
        # text of every groups/group element, in document order
        'groups': [group.text for group in drug.findall(name_space + 'groups/' + name_space + 'group')],
    }
    all_drugs_list.append(drug_entry)

print('Number of records found in all_drugs_list: %i' % len(all_drugs_list))

Number of records found in all_drugs_list: 14315


In [5]:
# tabulate the per-drug dicts: one row per drug, one column per dict key
all_drugs_df = pd.DataFrame(data=all_drugs_list)

# preview the first five rows (five is head()'s default row count)
all_drugs_df.head()

Unnamed: 0,drugbank_id,groups,name,type
0,DB00001,[approved],Lepirudin,other
1,DB00002,[approved],Cetuximab,antibody
2,DB00003,[approved],Dornase alfa,other
3,DB00004,"[approved, investigational]",Denileukin diftitox,other
4,DB00005,"[approved, investigational]",Etanercept,antibody


In [6]:
# write the slim drug table to disk; index=False leaves the row index out of the file
all_drugs_df.to_csv(path_or_buf='output/drugbank_database_slim.csv', index=False)

### Step 2: Parse DrugBank pharmacologically active targets CSV file 

In [7]:
# path to the DrugBank pharmacologically-active-targets CSV loaded in Step 2
# (a relative path, so it is resolved against the current working directory)
drugbank_target_file_path = 'input/drugbank_pharmacologically_active_targets_5-1-8.csv'

In [8]:
# load the targets CSV (a comma is read_csv's default separator) and report its row count
drugbank_targets_df = pd.read_csv(drugbank_target_file_path)
print('Number of records found in DrugBank pharmacologically active CSV file: %i' % drugbank_targets_df.shape[0])

Number of records found in DrugBank pharmacologically active CSV file: 1317


In [9]:
# preview the first five target records (five is head()'s default row count)
drugbank_targets_df.head()

Unnamed: 0,ID,Name,Gene Name,GenBank Protein ID,GenBank Gene ID,UniProt ID,Uniprot Title,PDB ID,GeneCard ID,GenAtlas ID,HGNC ID,Species,Drug IDs
0,1,Peptidoglycan synthase FtsI,ftsI,1574687.0,L42023,P45059,FTSI_HAEIN,,,,,Haemophilus influenzae (strain ATCC 51907 / DS...,DB00303
1,4,Coagulation factor XIII A chain,F13A1,182309.0,M22001,P00488,F13A_HUMAN,1EVU; 1EX0; 1F13; 1FIE; 1GGT; 1GGU; 1GGY; 1QRK...,,F13A1,HGNC:3531,Humans,DB11300; DB11311; DB11571; DB13151
2,5,"Nitric oxide synthase, inducible",NOS2,292242.0,L09210,P35228,NOS2_HUMAN,1NSI; 2LL6; 2NSI; 3E7G; 3EJ8; 3HR4; 4CX7; 4NOS,,NOS2A,HGNC:7873,Humans,DB08814
3,8,"NAD(P) transhydrogenase, mitochondrial",NNT,1110520.0,U40490,Q13423,NNTM_HUMAN,1DJL; 1PT9; 1U31,,NNT,HGNC:7863,Humans,DB09092
4,11,"Isocitrate dehydrogenase [NAD] subunit alpha, ...",IDH3A,706839.0,U07681,P50213,IDH3A_HUMAN,,,IDH3A,HGNC:5384,Humans,DB09092


In [10]:
# count the target records per distinct 'Species' value
# (value_counts lists the most frequent value first)
drugbank_targets_df['Species'].value_counts()

Humans                                                                                           1095
Escherichia coli (strain K12)                                                                      40
Escherichia coli                                                                                    9
Streptococcus pneumoniae serotype 4 (strain ATCC BAA-334 / TIGR4)                                   8
Mycobacterium tuberculosis                                                                          7
Klebsiella pneumoniae                                                                               7
Gram positive and gram negative bacteria                                                            7
Clostridium botulinum                                                                               7
Staphylococcus aureus                                                                               6
Yeast                                                                             

In [11]:
# keep only the targets whose Species is exactly 'Humans'; .copy() makes the
# result an independent frame instead of a slice of drugbank_targets_df
drugbank_human_targets_df = drugbank_targets_df[drugbank_targets_df['Species'] == 'Humans'].copy()
# the message quotes the exact value matched by the filter above
print('Number of records where Species == "Humans": %i' % len(drugbank_human_targets_df))

Number of records where Species == "Human": 1095


### Step 3: Create DrugBank clinical targets CSV export

In [12]:
# one dict per human target row is collected here by the export loop
all_rows = []

# DrugBank group labels that qualify a drug as "in clinic" for the export
clinic_drug_groups = ['approved', 'investigational', 'withdrawn']

def filter_drug_list(list_of_drugs, modality):
    """Return the drugs of type ``modality`` that belong to a clinical group.

    A drug dict is kept when at least one entry of ``clinic_drug_groups`` is
    found in its ``'groups'`` value and its ``'type'`` equals ``modality``.
    Input order is preserved and a new list holding the same dict objects is
    returned.
    """
    def in_clinic(candidate):
        # true when any clinical group label appears in the drug's groups
        groups = candidate['groups']
        return any(label in groups for label in clinic_drug_groups)

    return [
        candidate
        for candidate in list_of_drugs
        if in_clinic(candidate) and candidate['type'] == modality
    ]

def handle_no_drug_data(drug_id):
    """Build a placeholder record for a DrugBank ID that has no drug entry.

    The result carries ``drug_id`` under ``'drugbank_id'`` and the string
    ``'unknown'`` under ``'name'``, ``'type'`` and ``'groups'``. Note that
    ``'groups'`` is a plain string here, not a list.
    """
    return dict(drugbank_id=drug_id, name='unknown', type='unknown', groups='unknown')
    
# Index the parsed drugs by DrugBank ID once, so every ID below is resolved
# with a dict lookup instead of a fresh scan of all_drugs_list.
# setdefault keeps the first record seen for an ID, as a first-match scan would.
drugs_by_id = {}
for drug in all_drugs_list:
    drugs_by_id.setdefault(drug['drugbank_id'], drug)

# Build one export row per human target.
for _, row in drugbank_human_targets_df.iterrows():
    # 'Drug IDs' is split on '; ' into the individual DrugBank IDs
    drug_ids = row['Drug IDs'].split('; ')
    # IDs with no parsed drug record get an 'unknown' placeholder record
    all_drugs = [
        drugs_by_id[drug_id] if drug_id in drugs_by_id else handle_no_drug_data(drug_id)
        for drug_id in drug_ids
    ]

    entry = {}
    entry['db_symbol'] = row['Gene Name']
    entry['accession'] = row['UniProt ID']
    # unfiltered: every drug listed for the target, whatever its groups or type
    entry['db_all_drugs_in_clinic'] = all_drugs
    # per-modality lists keep only drugs in a clinical group (see filter_drug_list)
    entry['db_small_molecule_drugs_in_clinic'] = filter_drug_list(all_drugs, 'small molecule')
    entry['db_antibody_drugs_in_clinic'] = filter_drug_list(all_drugs, 'antibody')
    entry['db_other_drugs_in_clinic'] = filter_drug_list(all_drugs, 'other')
    # flags are True exactly when the matching list is non-empty
    entry['db_is_small_molecule_in_clinic'] = bool(entry['db_small_molecule_drugs_in_clinic'])
    entry['db_is_antibody_in_clinic'] = bool(entry['db_antibody_drugs_in_clinic'])
    entry['db_is_other_in_clinic'] = bool(entry['db_other_drugs_in_clinic'])
    all_rows.append(entry)

In [13]:
# tabulate the per-target dicts: one row per human target, one column per dict key
drugbank_clinical_targets_df = pd.DataFrame(data=all_rows)

In [14]:
# column order for the exported clinical-targets table: the two identifiers,
# then the per-modality boolean flags, then the per-modality drug lists, and
# finally the unfiltered list of all drugs for the target
reordered_columns = [
    'accession',
    'db_symbol',
    'db_is_small_molecule_in_clinic',
    'db_is_antibody_in_clinic',
    'db_is_other_in_clinic',
    'db_small_molecule_drugs_in_clinic',
    'db_antibody_drugs_in_clinic',
    'db_other_drugs_in_clinic',
    'db_all_drugs_in_clinic',
]

In [15]:
# put the columns in their export order, then write the CSV without the row index
drugbank_clinical_targets_df = drugbank_clinical_targets_df.loc[:, reordered_columns]
drugbank_clinical_targets_df.to_csv(path_or_buf='output/drugbank_clinical_targets.csv', index=False)