Notebook that parses the DrugBank XML database export to extract drug–target information

In [1]:
from collections import defaultdict
from pathlib import Path
import xml.etree.ElementTree as ET

import pandas as pd

In [2]:
def parse_drugbank_xml(xml_file):
    """Extract one row per (drug, target) pair from a DrugBank XML export.

    Parameters
    ----------
    xml_file : str or file object
        Path or open file accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        One row per ``<target>`` element found under each top-level ``<drug>``,
        with the columns ``drug_name``, ``target_name``, ``target_DrugBank_ID``,
        ``GenBank_Protein_ID``, ``GenBank_Gene_ID``, ``UniProtKB_ID``,
        ``GenAtlas_ID`` and ``HGNC_ID``. Values missing from the XML are
        ``None``. The columns are present even when no pair is found.

    Notes
    -----
    Only the first ``<polypeptide>`` of a target is inspected. The target name
    is taken from that polypeptide; targets that carry no polypeptide name
    fall back to the ``<name>`` of the ``<target>`` element itself.
    """
    # Namespace used by the DrugBank XML export.
    ns = {'db': 'http://www.drugbank.ca'}

    # External-identifier resource label -> output column. Resources that are
    # not listed here are ignored.
    resource_to_column = {
        'GenBank Protein Database': 'GenBank_Protein_ID',
        'GenBank Gene Database': 'GenBank_Gene_ID',
        'UniProtKB': 'UniProtKB_ID',
        'GenAtlas': 'GenAtlas_ID',
        'HUGO Gene Nomenclature Committee (HGNC)': 'HGNC_ID',
    }
    columns = [
        'drug_name',
        'target_name',
        'target_DrugBank_ID',
        *resource_to_column.values(),
    ]

    def _text(parent, path):
        """Return the text of the first element matching ``path``, or None."""
        node = parent.find(path, ns)
        return node.text if node is not None else None

    root = ET.parse(xml_file).getroot()

    rows = []
    # Only direct children of the root are drugs of interest; nested <drug>
    # elements elsewhere in the tree are deliberately not visited.
    for drug in root.findall('db:drug', ns):
        drug_name = _text(drug, 'db:name')

        for target in drug.findall('.//db:target', ns):
            row = dict.fromkeys(columns)
            row['drug_name'] = drug_name
            row['target_DrugBank_ID'] = _text(target, './/db:id')

            polypeptide = target.find('.//db:polypeptide', ns)
            if polypeptide is not None:
                row['target_name'] = _text(polypeptide, 'db:name')

                for ext_id in polypeptide.findall('.//db:external-identifier', ns):
                    column = resource_to_column.get(_text(ext_id, 'db:resource'))
                    if column is not None:
                        row[column] = _text(ext_id, 'db:identifier')

            # Targets without a (named) polypeptide would otherwise end up
            # with an empty name even though the target itself is named.
            if row['target_name'] is None:
                row['target_name'] = _text(target, 'db:name')

            rows.append(row)

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(rows, columns=columns)

In [3]:
# Location of the DrugBank full-database XML export (relative to the working directory).
xml_file_path = 'data/DrugBank/full database.xml'

# Build the drug-target table, then print a preview of its first five rows.
drugs_to_targets_df = parse_drugbank_xml(xml_file=xml_file_path)
print(drugs_to_targets_df.head(n=5))

   drug_name                                        target_name  \
0  Lepirudin                                        Prothrombin   
1  Cetuximab                   Epidermal growth factor receptor   
2  Cetuximab  Low affinity immunoglobulin gamma Fc region re...   
3  Cetuximab              Complement C1q subcomponent subunit A   
4  Cetuximab              Complement C1q subcomponent subunit B   

  target_DrugBank_ID GenBank_Protein_ID GenBank_Gene_ID UniProtKB_ID  \
0          BE0000048             339641          M17262       P00734   
1          BE0000767             757924          X00588       P00533   
2          BE0000901              31322          X16863       O75015   
3          BE0002094            4894854        AF135157       P02745   
4          BE0002095             573114          X03084       P02746   

  GenAtlas_ID    HGNC_ID  
0          F2  HGNC:3535  
1        EGFR  HGNC:3236  
2      FCGR3B  HGNC:3620  
3        C1QA  HGNC:1241  
4        C1QB  HGNC:1242  


In [4]:
# Persist the drug-target table as CSV. DataFrame.to_csv does not create
# missing parent directories, so make sure the output folder exists first.
output_csv_path = Path('data_processed/drugbank_drug_targets.csv')
output_csv_path.parent.mkdir(parents=True, exist_ok=True)
drugs_to_targets_df.to_csv(output_csv_path, index=False)