Script to parse the DrugBank XML database and extract drug-drug interactions

In [2]:
from collections import defaultdict
import pandas as pd
import xml.etree.ElementTree as ET

In [3]:
def _child_text(element, tag, ns):
    """Return the text of the first direct child matching `tag`, or None if that child is absent or empty."""
    child = element.find(tag, ns)
    if child is None:
        return None
    return child.text


def _infer_severity(description):
    """Map an interaction description to 'Severe', 'Moderate', 'Mild' or 'Unknown' by keyword lookup."""
    if not description:
        return 'Unknown'

    # Plain substring matching on the lower-cased text; the first matching tier wins,
    # checked from most to least severe.
    description_lower = description.lower()
    if 'severe' in description_lower or 'major' in description_lower:
        return 'Severe'
    if 'moderate' in description_lower:
        return 'Moderate'
    if 'mild' in description_lower or 'minor' in description_lower:
        return 'Mild'
    return 'Unknown'


def parse_drugbank_xml(xml_file_path):
    """
    Parse DrugBank XML file to extract drug interactions and drug information.

    The whole document is loaded into memory with ``ET.parse`` before it is walked.
    Only ``<drug>`` elements that are direct children of the root are visited.

    Parameters:
    xml_file_path (str): Path to the DrugBank XML file (anything ``ET.parse``
        accepts, i.e. a path or an open file object)

    Returns:
    tuple: (interactions_df, all_drugs)
        - interactions_df: DataFrame with columns [drug1, drug2, severity, description],
          one row per <drug-interaction> element. The columns are present even when
          no interaction was found. ``severity`` is a keyword-based guess derived from
          the description text ('Severe', 'Moderate', 'Mild' or 'Unknown').
          ``drug2`` / ``description`` are None when the corresponding child element
          is missing or empty.
        - all_drugs: Set of all drug names in the database. Entries without a
          usable <name> are skipped.

    Raises:
    xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    OSError: If the file cannot be opened.
    """
    # Prefix-to-URI mapping used by every find()/findall() call below
    ns = {'db': 'http://www.drugbank.ca'}
    columns = ['drug1', 'drug2', 'severity', 'description']

    # Parse the XML file
    root = ET.parse(xml_file_path).getroot()

    # Accumulate plain dicts and build the DataFrame once at the end
    interactions = []
    all_drugs = set()

    # Iterate through all top-level drug entries
    for drug in root.findall('db:drug', ns):
        drug_name = _child_text(drug, 'db:name', ns)
        if drug_name is None:
            # A nameless entry cannot be keyed in either output; skipping it keeps
            # one malformed record from aborting the whole parse.
            continue
        all_drugs.add(drug_name)

        # Interactions live under a single <drug-interactions> container
        drug_interactions = drug.find('db:drug-interactions', ns)
        if drug_interactions is None:
            continue

        for interaction in drug_interactions.findall('db:drug-interaction', ns):
            interacting_drug = _child_text(interaction, 'db:name', ns)
            description = _child_text(interaction, 'db:description', ns)

            interactions.append({
                'drug1': drug_name,
                'drug2': interacting_drug,
                # No explicit severity field is read from the XML here;
                # this is a heuristic based on the description wording.
                'severity': _infer_severity(description),
                'description': description
            })

    # Passing `columns` guarantees the documented schema even for an empty result
    interactions_df = pd.DataFrame(interactions, columns=columns)

    return interactions_df, all_drugs

In [4]:
# Location of the DrugBank XML export, relative to the working directory.
xml_file_path = 'data/DrugBank/full database.xml'

# Parse the file into the interactions table and the set of drug names.
interactions_df, all_drugs = parse_drugbank_xml(xml_file_path=xml_file_path)

# Preview the first five interaction rows as plain text.
print(interactions_df.head(n=5))

       drug1                 drug2 severity  \
0  Lepirudin              Apixaban  Unknown   
1  Lepirudin  Dabigatran etexilate  Unknown   
2  Lepirudin             Dasatinib  Unknown   
3  Lepirudin           Deferasirox  Unknown   
4  Lepirudin  Ursodeoxycholic acid  Unknown   

                                         description  
0  Apixaban may increase the anticoagulant activi...  
1  Dabigatran etexilate may increase the anticoag...  
2  The risk or severity of bleeding and hemorrhag...  
3  The risk or severity of gastrointestinal bleed...  
4  The risk or severity of bleeding and bruising ...  
