Parsing the DrugBank full database XML file.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup 

In [None]:
# Parse the DrugBank XML dump into a BeautifulSoup tree.
# The file is opened in binary mode so the parser, not the platform's default
# text codec, decides how to decode it; the `with` block closes the handle
# once BeautifulSoup has consumed it.
with open(r"...path_to\RDsqr-KG\Datasets\DrugBank\drugbank_all_full_database.xml\full database.xml", "rb") as xml_file:
    soup = BeautifulSoup(xml_file, "xml")

In [3]:
# Accumulators filled by the extraction loop: one tuple per
# drug-drug interaction, drug-pathway link and drug-ATC code.
drug_interaction, drug_pathway, drug_atc = [], [], []

In [4]:
# One record per retained drug describing which status groups
# (approved, investigational, ...) qualified it for inclusion.
drug_status = list()

In [None]:
# Walk every <drug> element and, for drugs whose groups include
# approved/investigational and none of the excluded groups, record:
#   - a status row in `drug_status`
#   - (drug_id, partner_id) pairs in `drug_interaction`
#   - (drug_id, smpdb_id, pathway_name) triples in `drug_pathway`
#   - (drug_id, atc_code) pairs in `drug_atc`
# NOTE(review): find_all("drug") searches recursively, so it presumably also
# visits <drug> stubs nested inside other elements; any without matching
# <group> children are dropped by the filter below — confirm against the schema.

# Group filters are loop-invariant, so build them once.
INCLUDED_GROUPS = {'approved', 'investigational'}
EXCLUDED_GROUPS = {'withdrawn', 'illicit', 'nutraceutical', 'experimental', 'vet_approved'}

for drug in soup.find_all("drug"):
    groups = {group.get_text(strip=True).lower() for group in drug.find_all("group")}

    is_include = groups & INCLUDED_GROUPS
    is_exclude = groups & EXCLUDED_GROUPS
    if not is_include or is_exclude:
        continue

    # Skip an entry that has no ID instead of raising AttributeError on None.
    drug_id_tag = drug.find("drugbank-id")
    if drug_id_tag is None:
        continue
    # Strip whitespace so the ID matches the (stripped) interaction partner IDs.
    drug_id = drug_id_tag.get_text(strip=True)

    drug_status.append({
        'drug_name': drug_id,
        # Always True here: only drugs that passed the inclusion filter reach this point.
        'include': True,
        # Sorted so the joined string is identical from run to run
        # (set iteration order is not stable across interpreter sessions).
        'inclusion_types': ', '.join(sorted(is_include)),
    })

    # Extract drug interactions
    for interaction in drug.find_all("drug-interaction"):
        interaction_id_tag = interaction.find("drugbank-id")
        if interaction_id_tag is not None:
            interaction_id = interaction_id_tag.get_text(strip=True)
            drug_interaction.append((drug_id, interaction_id))

    # Extract drug pathways; missing fields are recorded as 'N/A'
    for pathway in drug.find_all("pathway"):
        smpdb_id_tag = pathway.find("smpdb-id")
        pathway_name_tag = pathway.find("name")
        smpdb_id = smpdb_id_tag.get_text(strip=True) if smpdb_id_tag is not None else 'N/A'
        pathway_name = pathway_name_tag.get_text(strip=True) if pathway_name_tag is not None else 'N/A'
        drug_pathway.append((drug_id, smpdb_id, pathway_name))

    # Extract drug ATC codes from the `code` attribute of each <atc-code>
    for atc in drug.find_all("atc-code"):
        atc_code = atc.get('code')
        if atc_code:
            drug_atc.append((drug_id, atc_code))

In [7]:
# Turn the collected records into DataFrames with explicit column names.
df_ddi = pd.DataFrame(drug_interaction, columns=['drug1_id', 'drug2_id'])
df_pathways = pd.DataFrame(drug_pathway, columns=['drugbank_id', 'smpdb_id', 'pathway_name'])
df_drug_atc = pd.DataFrame(drug_atc, columns=['drugbank_id', 'atc_code'])
# Columns are named explicitly (matching the dict keys appended to
# `drug_status`) so the frame keeps its schema even when no drug qualified.
df_drug_groups = pd.DataFrame(drug_status, columns=['drug_name', 'include', 'inclusion_types'])

In [8]:
# Display (rows, columns) of the drug-pathway table as a sanity check.
df_pathways.shape

(2467, 3)

In [9]:
# Display (rows, columns) of the drug-drug interaction table as a sanity check.
df_ddi.shape

(2298533, 2)

In [8]:
# Display (rows, columns) of the drug-ATC code table as a sanity check.
df_drug_atc.shape

(4346, 2)

In [9]:
# Display (rows, columns) of the drug status table as a sanity check.
df_drug_groups.shape

(15564, 3)

In [None]:
# Write each table to the preprocessed-datasets folder.
# index=False is passed on every export so no file carries the
# DataFrame's positional index as an extra unnamed column.
df_pathways.to_csv(r"...path_to\RDsqr-KG\Preprocessed_datasets\drug_pathway.csv", index=False)
df_ddi.to_csv(r"...path_to\RDsqr-KG\Preprocessed_datasets\drug_drug.csv", index=False)
df_drug_atc.to_csv(r"...path_to\RDsqr-KG\Preprocessed_datasets\drug_atc_drugbank.csv", index=False)
df_drug_groups.to_csv(r"...path_to\RDsqr-KG\Preprocessed_datasets\drug_status.csv", index=False)