In [None]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm.notebook import tqdm
from collections import defaultdict

# Parsing Drug Interactions into a single file

In [None]:
PREFIX = os.path.expanduser("~/Projects/deep-learning-26wi/datasets")
MEDDRA_NAME = f"{PREFIX}/MEDDRA.xlsx"
FILENAME = f"{PREFIX}/drugbank.xml"
OUT_CSV_NAME = f"{PREFIX}/interactions.csv"
OUT_TSV_NAME = f"{PREFIX}/interactions.tsv"
OUT_AGNOSTIC_DESC_NAME = f"{PREFIX}/agnostic_interaction_descriptions.tsv"
NAMESPACE = {"db": "http://www.drugbank.ca"}
tree = ET.parse(FILENAME)
root = tree.getroot()

In [None]:
agnostic_descriptions = {}
agnostic_description_counts = defaultdict(int)
with open(OUT_TSV_NAME, "w") as f:
    f.write(f"drug_primary_id\tdrug_name\tother_drug_id\tother_drug_name\tinteraction_description\tagnostic_interaction_id\tagnostic_interaction_description\n")
    for drug in tqdm(root.findall("db:drug", NAMESPACE)):
        drug_name = drug.find("db:name", NAMESPACE).text
        drug_id = drug.find("db:drugbank-id[@primary='true']", NAMESPACE).text
        drug_interactions = drug.find("db:drug-interactions", NAMESPACE)
        for interaction in drug_interactions.findall("db:drug-interaction", NAMESPACE):
            other_drug_name = interaction.find("db:name", NAMESPACE).text
            other_drug_id = interaction.find("db:drugbank-id", NAMESPACE).text
            interaction_description = interaction.find("db:description", NAMESPACE).text
            agnostic_description = interaction_description.replace(drug_name, "").replace(other_drug_name, "").replace(".", "").strip()
            if agnostic_description not in agnostic_descriptions:
                agnostic_description_id = len(agnostic_descriptions)
                agnostic_descriptions[agnostic_description] = agnostic_description_id
            else:
                agnostic_description_id = agnostic_descriptions[agnostic_description]
            agnostic_description_counts[agnostic_description_id] += 1
            f.write(f"{drug_id}\t{drug_name}\t{other_drug_id}\t{other_drug_name}\t" +
                    f"{interaction_description}\t{agnostic_description_id}\t{agnostic_description}\n")

with open(OUT_AGNOSTIC_DESC_NAME, "w") as f:
    f.write("id,description,count\n")
    for desc, desc_id in sorted(agnostic_descriptions.items(), key=lambda p: p[1]):
        f.write(f"{desc_id},{desc},{agnostic_description_counts[desc_id]}\n")

  0%|          | 0/19830 [00:00<?, ?it/s]

In [39]:
# total interaction strings vs unique interaction strings
sum(agnostic_description_counts.values()), len(agnostic_descriptions)

(2910010, 683)

# Finding unique interaction strings

In [38]:
# Top interaction descriptions
for desc, desc_id in sorted(agnostic_descriptions.items(), key=lambda p: agnostic_description_counts[agnostic_descriptions[p[0]]], reverse=True)[:10]:
    print(desc, agnostic_description_counts[desc_id])

may decrease the excretion rate of  which could result in a higher serum level 360050
The risk or severity of adverse effects can be increased when  is combined with 339194
The metabolism of  can be decreased when combined with 304446
The risk or severity of CNS depression can be increased when  is combined with 288870
The therapeutic efficacy of  can be decreased when used in combination with 236965
The serum concentration of  can be increased when it is combined with 137039
The metabolism of  can be increased when combined with 110119
The risk or severity of QTc prolongation can be increased when  is combined with 101499
The risk or severity of hypertension can be increased when  is combined with 77457
may decrease the antihypertensive activities of 71294


# Extracting classifications

In [43]:
hierarchy = pd.read_excel(MEDDRA_NAME, sheet_name="_ID2HIERARCHY")
name = pd.read_excel(MEDDRA_NAME, sheet_name="_ID2NAME")
broader = pd.read_excel(MEDDRA_NAME, sheet_name="_BROADER")

In [49]:
hierarchy["HIERARCHY"].unique()

<StringArray>
['LLT', 'PT', 'HLT', 'HGLT', 'SOC']
Length: 5, dtype: str

In [50]:
EXPECTED_HLGT = 337
high_level_group_terms = hierarchy[hierarchy["HIERARCHY"] == "HGLT"].copy()
num_found_hlgt = len(high_level_group_terms)
assert num_found_hlgt == EXPECTED_HLGT, f"Expected {EXPECTED_HLGT}, was {num_found_hlgt}"

In [51]:
EXPECTED_SOC = 27
system_organ_class = hierarchy[hierarchy["HIERARCHY"] == "SOC"].copy()
num_found_soc = len(system_organ_class)
assert num_found_soc == EXPECTED_SOC, f"Expected {EXPECTED_SOC}, was {num_found_soc}"

In [None]:
high_level_group_terms.merge(name, on="id")