In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
xml_path = "/content/drive/MyDrive/ALL-ADDITIONAL-TRAINING/full database.xml"

with open(xml_path, "rb") as f:
    print(f.read(200))



b'<?xml version="1.0" encoding="UTF-8"?>\n<drugbank xmlns="http://www.drugbank.ca" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.drugbank.ca http://www.drugbank.ca/'


In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

xml_path = "/content/drive/MyDrive/ALL-ADDITIONAL-TRAINING/full database.xml"

tree = ET.parse(xml_path)
root = tree.getroot()

ns = {"db": "http://www.drugbank.ca"}



In [None]:
import pandas as pd
ns = {"db": "http://www.drugbank.ca"}
rows = []

for drug in root.findall("db:drug", ns):

    # Source drug info
    src_id = drug.find("db:drugbank-id[@primary='true']", ns)
    src_name = drug.find("db:name", ns)

    src_id = src_id.text if src_id is not None else None
    src_name = src_name.text if src_name is not None else None

    interactions = drug.find("db:drug-interactions", ns)
    if interactions is None:
        continue

    for interaction in interactions.findall("db:drug-interaction", ns):
        tgt_id = interaction.find("db:drugbank-id", ns)
        tgt_name = interaction.find("db:name", ns)
        description = interaction.find("db:description", ns)

        rows.append({
            "drugbank_id_1": src_id,
            "drug_name_1": src_name,
            "drugbank_id_2": tgt_id.text if tgt_id is not None else None,
            "drug_name_2": tgt_name.text if tgt_name is not None else None,
            "interaction_description": description.text if description is not None else None
        })

ddi_df = pd.DataFrame(rows)



In [None]:
ddi_df

Unnamed: 0,drugbank_id_1,drug_name_1,drugbank_id_2,drug_name_2,interaction_description
0,DB00001,Lepirudin,DB06605,Apixaban,Apixaban may increase the anticoagulant activi...
1,DB00001,Lepirudin,DB06695,Dabigatran etexilate,Dabigatran etexilate may increase the anticoag...
2,DB00001,Lepirudin,DB01254,Dasatinib,The risk or severity of bleeding and hemorrhag...
3,DB00001,Lepirudin,DB01609,Deferasirox,The risk or severity of gastrointestinal bleed...
4,DB00001,Lepirudin,DB01586,Ursodeoxycholic acid,The risk or severity of bleeding and bruising ...
...,...,...,...,...,...
2910005,DB31654,Influenza A virus A/Perth/722/2024 (H3N2) live...,DB13509,Aloxiprin,The risk or severity of Reye's syndrome can be...
2910006,DB31654,Influenza A virus A/Perth/722/2024 (H3N2) live...,DB13538,Guacetisal,The risk or severity of Reye's syndrome can be...
2910007,DB31654,Influenza A virus A/Perth/722/2024 (H3N2) live...,DB13612,Carbaspirin calcium,The risk or severity of Reye's syndrome can be...
2910008,DB31654,Influenza A virus A/Perth/722/2024 (H3N2) live...,DB14006,Choline salicylate,The risk or severity of Reye's syndrome can be...


In [None]:
import pandas as pd
from lxml import etree

# -----------------------------
# File path
# -----------------------------
xml_file = "/content/drive/MyDrive/ALL-ADDITIONAL-TRAINING/full database.xml"

# -----------------------------
# Load XML
# -----------------------------
tree = etree.parse(xml_file)
root = tree.getroot()

# DrugBank namespace
ns = {"db": "http://www.drugbank.ca"}

# -----------------------------
# Step 1: Build ATC lookup dictionary
# drugbank_id -> list of ATC codes
# -----------------------------
drug_atc_lookup = {}

for drug in root.findall("db:drug", ns):

    drug_id_elem = drug.find("db:drugbank-id[@primary='true']", ns)
    if drug_id_elem is None:
        continue

    drug_id = drug_id_elem.text

    atc_list = [
        atc_elem.get("code")
        for atc_elem in drug.findall(".//db:atc-code", ns)
        if atc_elem.get("code")
    ]

    drug_atc_lookup[drug_id] = atc_list


# -----------------------------
# Helper function to classify ATC
# -----------------------------
def classify_atc(atc_list):
    if not atc_list:
        return False, False

    is_cardiovascular = any(code.startswith("C") for code in atc_list)
    is_antithrombotic = any(code.startswith("B01") for code in atc_list)

    return is_cardiovascular, is_antithrombotic


# -----------------------------
# Step 2: Extract interactions
# -----------------------------
rows = []

for drug in root.findall("db:drug", ns):

    # Source drug
    src_id_elem = drug.find("db:drugbank-id[@primary='true']", ns)
    src_name_elem = drug.find("db:name", ns)

    if src_id_elem is None:
        continue

    src_id = src_id_elem.text
    src_name = src_name_elem.text if src_name_elem is not None else None

    # Get ATC from lookup
    atc_list_1 = drug_atc_lookup.get(src_id, [])
    is_cardio_1, is_antithrombotic_1 = classify_atc(atc_list_1)

    interactions = drug.find("db:drug-interactions", ns)
    if interactions is None:
        continue

    for interaction in interactions.findall("db:drug-interaction", ns):

        tgt_id_elem = interaction.find("db:drugbank-id", ns)
        tgt_name_elem = interaction.find("db:name", ns)
        description_elem = interaction.find("db:description", ns)

        if tgt_id_elem is None:
            continue

        tgt_id = tgt_id_elem.text
        tgt_name = tgt_name_elem.text if tgt_name_elem is not None else None

        # Get ATC from lookup
        atc_list_2 = drug_atc_lookup.get(tgt_id, [])
        is_cardio_2, is_antithrombotic_2 = classify_atc(atc_list_2)

        rows.append({
            "drugbank_id_1": src_id,
            "drug_name_1": src_name,
            "atc_1": atc_list_1,
            "is_cardiovascular_1": is_cardio_1,
            "is_antithrombotic_1": is_antithrombotic_1,
            "drugbank_id_2": tgt_id,
            "drug_name_2": tgt_name,
            "atc_2": atc_list_2,
            "is_cardiovascular_2": is_cardio_2,
            "is_antithrombotic_2": is_antithrombotic_2,
            "interaction_description": description_elem.text if description_elem is not None else None
        })


# -----------------------------
# Convert to DataFrame
# -----------------------------
ddi_df = pd.DataFrame(rows)

ddi_df.head()


Unnamed: 0,drugbank_id_1,drug_name_1,atc_1,is_cardiovascular_1,is_antithrombotic_1,drugbank_id_2,drug_name_2,atc_2,is_cardiovascular_2,is_antithrombotic_2,interaction_description
0,DB00001,Lepirudin,[B01AE02],False,True,DB06605,Apixaban,[B01AF02],False,True,Apixaban may increase the anticoagulant activi...
1,DB00001,Lepirudin,[B01AE02],False,True,DB06695,Dabigatran etexilate,[B01AE07],False,True,Dabigatran etexilate may increase the anticoag...
2,DB00001,Lepirudin,[B01AE02],False,True,DB01254,Dasatinib,[L01EA02],False,False,The risk or severity of bleeding and hemorrhag...
3,DB00001,Lepirudin,[B01AE02],False,True,DB01609,Deferasirox,[V03AC03],False,False,The risk or severity of gastrointestinal bleed...
4,DB00001,Lepirudin,[B01AE02],False,True,DB01586,Ursodeoxycholic acid,[A05AA02],False,False,The risk or severity of bleeding and bruising ...


In [None]:
import pandas as pd
from lxml import etree

# -----------------------------
# File path
# -----------------------------
xml_file = "/content/drive/MyDrive/ALL-ADDITIONAL-TRAINING/full database.xml"

# -----------------------------
# Load XML
# -----------------------------
tree = etree.parse(xml_file)
root = tree.getroot()

# DrugBank namespace
ns = {"db": "http://www.drugbank.ca"}

# -----------------------------
# Step 1: Build ATC lookup dictionary
# -----------------------------
drug_atc_lookup = {}

for drug in root.findall("db:drug", ns):

    drug_id_elem = drug.find("db:drugbank-id[@primary='true']", ns)
    if drug_id_elem is None:
        continue

    drug_id = drug_id_elem.text

    atc_list = [
        atc_elem.get("code")
        for atc_elem in drug.findall(".//db:atc-code", ns)
        if atc_elem.get("code")
    ]

    drug_atc_lookup[drug_id] = atc_list


# -----------------------------
# Helper function to classify ATC
# -----------------------------
def classify_atc(atc_list):
    if not atc_list:
        return False, False

    is_cardiovascular = any(code.startswith("C") for code in atc_list)
    is_antithrombotic = any(code.startswith("B01") for code in atc_list)

    return is_cardiovascular, is_antithrombotic


# -----------------------------
# Step 2: Extract interactions
# -----------------------------
rows = []

for drug in root.findall("db:drug", ns):

    src_id_elem = drug.find("db:drugbank-id[@primary='true']", ns)
    src_name_elem = drug.find("db:name", ns)

    if src_id_elem is None:
        continue

    src_id = src_id_elem.text
    src_name = src_name_elem.text if src_name_elem is not None else None

    atc_list_1 = drug_atc_lookup.get(src_id, [])
    is_cardio_1, is_antithrombotic_1 = classify_atc(atc_list_1)

    interactions = drug.find("db:drug-interactions", ns)
    if interactions is None:
        continue

    for interaction in interactions.findall("db:drug-interaction", ns):

        tgt_id_elem = interaction.find("db:drugbank-id", ns)
        tgt_name_elem = interaction.find("db:name", ns)
        description_elem = interaction.find("db:description", ns)

        if tgt_id_elem is None:
            continue

        tgt_id = tgt_id_elem.text
        tgt_name = tgt_name_elem.text if tgt_name_elem is not None else None

        atc_list_2 = drug_atc_lookup.get(tgt_id, [])
        is_cardio_2, is_antithrombotic_2 = classify_atc(atc_list_2)

        rows.append({
            "drugbank_id_1": src_id,
            "drug_name_1": src_name,
            "atc_1": atc_list_1,
            "is_cardiovascular_1": is_cardio_1,
            "is_antithrombotic_1": is_antithrombotic_1,
            "drugbank_id_2": tgt_id,
            "drug_name_2": tgt_name,
            "atc_2": atc_list_2,
            "is_cardiovascular_2": is_cardio_2,
            "is_antithrombotic_2": is_antithrombotic_2,
            "interaction_description": description_elem.text if description_elem is not None else None
        })


# -----------------------------
# Create full DataFrame
# -----------------------------
ddi_df = pd.DataFrame(rows)

print("Full interaction dataframe shape:", ddi_df.shape)


# -----------------------------
# Step 3: Filter interactions
# At least one drug is cardiovascular OR antithrombotic
# -----------------------------
ddi_cardio_or_antithrombotic = ddi_df[
    (
        ddi_df["is_cardiovascular_1"] |
        ddi_df["is_antithrombotic_1"] |
        ddi_df["is_cardiovascular_2"] |
        ddi_df["is_antithrombotic_2"]
    )
].copy()




Full interaction dataframe shape: (2910010, 11)


In [None]:
ddi_cardio_or_antithrombotic

Unnamed: 0,drugbank_id_1,drug_name_1,atc_1,is_cardiovascular_1,is_antithrombotic_1,drugbank_id_2,drug_name_2,atc_2,is_cardiovascular_2,is_antithrombotic_2,interaction_description
0,DB00001,Lepirudin,[B01AE02],False,True,DB06605,Apixaban,[B01AF02],False,True,Apixaban may increase the anticoagulant activi...
1,DB00001,Lepirudin,[B01AE02],False,True,DB06695,Dabigatran etexilate,[B01AE07],False,True,Dabigatran etexilate may increase the anticoag...
2,DB00001,Lepirudin,[B01AE02],False,True,DB01254,Dasatinib,[L01EA02],False,False,The risk or severity of bleeding and hemorrhag...
3,DB00001,Lepirudin,[B01AE02],False,True,DB01609,Deferasirox,[V03AC03],False,False,The risk or severity of gastrointestinal bleed...
4,DB00001,Lepirudin,[B01AE02],False,True,DB01586,Ursodeoxycholic acid,[A05AA02],False,False,The risk or severity of bleeding and bruising ...
...,...,...,...,...,...,...,...,...,...,...,...
2909947,DB22790,Berahyaluronidase alfa,[],False,False,DB13378,Norfenefrine,[C01CA05],True,False,The risk or severity of adverse effects can be...
2909988,DB22790,Berahyaluronidase alfa,[],False,False,DB00695,Furosemide,"[C03EB01, C03CA01, G01AE10, C03CB01]",True,False,The therapeutic efficacy of Furosemide can be ...
2909996,DB31654,Influenza A virus A/Perth/722/2024 (H3N2) live...,[],False,False,DB00945,Acetylsalicylic acid,"[B01AC06, C07FX04, C10BX04, M01BA03, C10BX02, ...",True,True,The risk or severity of Reye's syndrome can be...
2910005,DB31654,Influenza A virus A/Perth/722/2024 (H3N2) live...,[],False,False,DB13509,Aloxiprin,"[N02BA02, B01AC15]",False,True,The risk or severity of Reye's syndrome can be...


In [None]:
import torch
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# --------------------------------------------
# 1. Clean descriptions
# --------------------------------------------
ddi_cardio_or_antithrombotic["interaction_description"] = (
    ddi_cardio_or_antithrombotic["interaction_description"]
    .fillna("")
)

# Filter out empty descriptions to save computation
ddi_nonempty = ddi_cardio_or_antithrombotic[
    ddi_cardio_or_antithrombotic["interaction_description"].str.strip() != ""
].copy()

# --------------------------------------------
# 2. Select device (GPU if available)
# --------------------------------------------
device = 0 if torch.cuda.is_available() else -1

# --------------------------------------------
# 3. Load zero-shot classifier (efficient)
# --------------------------------------------
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device
)

# --------------------------------------------
# 4. Define severity labels
# --------------------------------------------
severity_labels = [
    "Minor interaction",
    "Moderate interaction",
    "Major interaction",
    "Contraindicated interaction"
]

# --------------------------------------------
# 5. Batched classification (FAST)
# --------------------------------------------
def classify_batches(texts, batch_size=16):
    """
    Efficiently classify a list of texts in batches.
    Returns predicted label and confidence for each text.
    """
    labels, scores = [], []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]

        results = classifier(
            batch,
            candidate_labels=severity_labels,
            hypothesis_template="This drug interaction is {}.",
            batch_size=batch_size
        )

        # Handle single-text batch output (dict) vs list
        if isinstance(results, dict):
            results = [results]

        for res in results:
            labels.append(res["labels"][0])
            scores.append(res["scores"][0])

    return labels, scores

# --------------------------------------------
# 6. Run classification on the dataset
# --------------------------------------------
texts = ddi_nonempty["interaction_description"].tolist()

pred_labels, pred_scores = classify_batches(texts, batch_size=16)

ddi_nonempty["severity_label"] = pred_labels
ddi_nonempty["severity_confidence"] = pred_scores

# Numeric severity mapping
severity_mapping = {
    "Minor interaction": 1,
    "Moderate interaction": 2,
    "Major interaction": 3,
    "Contraindicated interaction": 4
}

ddi_nonempty["severity_numeric"] = ddi_nonempty["severity_label"].map(severity_mapping)

# --------------------------------------------
# 7. Merge back rows with empty descriptions (optional)
# --------------------------------------------
ddi_empty = ddi_cardio_or_antithrombotic[
    ddi_cardio_or_antithrombotic["interaction_description"].str.strip() == ""
].copy()

ddi_empty["severity_label"] = None
ddi_empty["severity_confidence"] = None
ddi_empty["severity_numeric"] = None

ddi_final = pd.concat([ddi_nonempty, ddi_empty], ignore_index=True)

# --------------------------------------------
# 8. Save to CSV
# --------------------------------------------
output_path = "ddi_cardio_or_antithrombotic_labeled.csv"
ddi_final.to_csv(output_path, index=False)
print("Saved CSV:", output_path)

# --------------------------------------------
# 9. Download in Colab
# --------------------------------------------
try:
    from google.colab import files
    files.download(output_path)
except:
    print("Download skipped (not in Colab).")


Loading weights:   0%|          | 0/515 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 47486/47486 [2:18:52<00:00,  5.70it/s]
  ddi_final = pd.concat([ddi_nonempty, ddi_empty], ignore_index=True)


Saved CSV: ddi_cardio_or_antithrombotic_labeled.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
!pip install -q networkx python-louvain pyvis pandas numpy matplotlib seaborn plotly

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/756.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m756.0/756.0 kB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.6/1.6 MB[0m [31m92.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from google.colab import files

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter, defaultdict
from google.colab import files
import community as community_louvain
from pyvis.network import Network
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

print("‚úì All libraries imported successfully!")

‚úì All libraries imported successfully!


In [4]:
from google.colab import files

In [5]:
# Upload your CSV file
print("üì§ Please upload your DDI CSV file (ddi_cardio_or_antithrombotic_labeled.csv)")
uploaded = files.upload()

# Get filename
filename = list(uploaded.keys())[0]
print(f"\n‚úì Uploaded: {filename}")

üì§ Please upload your DDI CSV file (ddi_cardio_or_antithrombotic_labeled.csv)


Saving ddi_cardio_or_antithrombotic_labeled (1).csv to ddi_cardio_or_antithrombotic_labeled (1).csv

‚úì Uploaded: ddi_cardio_or_antithrombotic_labeled (1).csv


In [6]:
# Load the dataset
df = pd.read_csv(filename)

print(f"üìä Dataset Shape: {df.shape}")
print(f"\nüìã Columns:\n{df.columns.tolist()}")
print(f"\nüîç First few rows:")
df.head()

üìä Dataset Shape: (759774, 14)

üìã Columns:
['drugbank_id_1', 'drug_name_1', 'atc_1', 'is_cardiovascular_1', 'is_antithrombotic_1', 'drugbank_id_2', 'drug_name_2', 'atc_2', 'is_cardiovascular_2', 'is_antithrombotic_2', 'interaction_description', 'severity_label', 'severity_confidence', 'severity_numeric']

üîç First few rows:


Unnamed: 0,drugbank_id_1,drug_name_1,atc_1,is_cardiovascular_1,is_antithrombotic_1,drugbank_id_2,drug_name_2,atc_2,is_cardiovascular_2,is_antithrombotic_2,interaction_description,severity_label,severity_confidence,severity_numeric
0,DB00001,Lepirudin,['B01AE02'],False,True,DB06605,Apixaban,['B01AF02'],False,True,Apixaban may increase the anticoagulant activi...,Major interaction,0.478405,3
1,DB00001,Lepirudin,['B01AE02'],False,True,DB06695,Dabigatran etexilate,['B01AE07'],False,True,Dabigatran etexilate may increase the anticoag...,Major interaction,0.456309,3
2,DB00001,Lepirudin,['B01AE02'],False,True,DB01254,Dasatinib,['L01EA02'],False,False,The risk or severity of bleeding and hemorrhag...,Contraindicated interaction,0.711817,4
3,DB00001,Lepirudin,['B01AE02'],False,True,DB01609,Deferasirox,['V03AC03'],False,False,The risk or severity of gastrointestinal bleed...,Contraindicated interaction,0.733675,4
4,DB00001,Lepirudin,['B01AE02'],False,True,DB01586,Ursodeoxycholic acid,['A05AA02'],False,False,The risk or severity of bleeding and bruising ...,Contraindicated interaction,0.717433,4


In [7]:
# Basic statistics
print("üìà Dataset Statistics:")
print(f"   Total interactions: {len(df):,}")
print(f"   Unique Drug 1: {df['drug_name_1'].nunique():,}")
print(f"   Unique Drug 2: {df['drug_name_2'].nunique():,}")

# Severity distribution
print(f"\nüè∑Ô∏è Severity Distribution:")
print(df['severity_label'].value_counts())

üìà Dataset Statistics:
   Total interactions: 759,774
   Unique Drug 1: 4,313
   Unique Drug 2: 4,314

üè∑Ô∏è Severity Distribution:
severity_label
Contraindicated interaction    432226
Major interaction              326716
Minor interaction                 808
Moderate interaction               24
Name: count, dtype: int64


In [8]:
# Create network graph
print("üî® Building drug interaction network...")

# Map severity to numeric weight (higher = more severe)
severity_weights = {
    'Minor interaction': 1,
    'Moderate interaction': 2,
    'Major interaction': 3,
    'Contraindicated interaction': 4
}

# Use severity_numeric if available, otherwise map from label
if 'severity_numeric' in df.columns:
    df['weight'] = df['severity_numeric']
else:
    df['weight'] = df['severity_label'].map(severity_weights).fillna(2)

# Create the graph
G = nx.Graph()

# Add edges with attributes
for idx, row in df.iterrows():
    drug1 = row['drug_name_1']
    drug2 = row['drug_name_2']
    weight = row['weight']
    severity = row['severity_label']

    # Add or update edge
    if G.has_edge(drug1, drug2):
        # Keep the higher severity
        if weight > G[drug1][drug2]['weight']:
            G[drug1][drug2]['weight'] = weight
            G[drug1][drug2]['severity'] = severity
    else:
        G.add_edge(drug1, drug2, weight=weight, severity=severity)

# Add node attributes
for node in G.nodes():
    # Check if cardiovascular
    is_cardio = df[df['drug_name_1'] == node]['is_cardiovascular_1'].any() or \
                df[df['drug_name_2'] == node]['is_cardiovascular_2'].any()

    # Check if antithrombotic
    is_antithrom = df[df['drug_name_1'] == node]['is_antithrombotic_1'].any() or \
                   df[df['drug_name_2'] == node]['is_antithrombotic_2'].any()

    G.nodes[node]['is_cardiovascular'] = is_cardio
    G.nodes[node]['is_antithrombotic'] = is_antithrom

print(f"\n‚úì Network built successfully!")
print(f"   Nodes (drugs): {G.number_of_nodes():,}")
print(f"   Edges (interactions): {G.number_of_edges():,}")
print(f"   Network density: {nx.density(G):.4f}")

üî® Building drug interaction network...

‚úì Network built successfully!
   Nodes (drugs): 4,314
   Edges (interactions): 379,917
   Network density: 0.0408


In [None]:
# Calculate centrality metrics
print("üìä Calculating centrality metrics...")

# Degree centrality (number of interactions)
degree_centrality = nx.degree_centrality(G)

# Weighted degree (sum of severity weights)
weighted_degree = dict(G.degree(weight='weight'))

# Betweenness centrality (drugs that bridge different clusters)
print("   Computing betweenness centrality (may take a moment)...")
betweenness = nx.betweenness_centrality(G, k=min(500, G.number_of_nodes()))

# Eigenvector centrality (connected to important drugs)
try:
    eigenvector = nx.eigenvector_centrality(G, max_iter=1000)
except:
    eigenvector = degree_centrality  # Fallback

# Create centrality dataframe
centrality_df = pd.DataFrame({
    'drug': list(G.nodes()),
    'degree': [G.degree(n) for n in G.nodes()],
    'weighted_degree': [weighted_degree[n] for n in G.nodes()],
    'degree_centrality': [degree_centrality[n] for n in G.nodes()],
    'betweenness': [betweenness[n] for n in G.nodes()],
    'eigenvector': [eigenvector[n] for n in G.nodes()],
    'is_cardiovascular': [G.nodes[n].get('is_cardiovascular', False) for n in G.nodes()],
    'is_antithrombotic': [G.nodes[n].get('is_antithrombotic', False) for n in G.nodes()]
})

# Calculate risk score (composite)
centrality_df['risk_score'] = (
    centrality_df['weighted_degree'] / centrality_df['weighted_degree'].max() * 0.4 +
    centrality_df['degree_centrality'] / centrality_df['degree_centrality'].max() * 0.3 +
    centrality_df['betweenness'] / centrality_df['betweenness'].max() * 0.3
)

centrality_df = centrality_df.sort_values('risk_score', ascending=False)

print("\n‚úì Centrality analysis complete!")

üìä Calculating centrality metrics...
   Computing betweenness centrality (may take a moment)...

‚úì Centrality analysis complete!


In [None]:
# Top 20 highest-risk drugs
print("üî¥ TOP 20 HIGHEST-RISK DRUGS (by interaction network position):")
print("="*80)

top_20 = centrality_df.head(20)[['drug', 'degree', 'weighted_degree', 'risk_score', 'is_cardiovascular', 'is_antithrombotic']]
top_20 = top_20.reset_index(drop=True)
top_20.index = top_20.index + 1
top_20

In [None]:
# Visualize top drugs by different metrics
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Top 15 by degree
top_degree = centrality_df.nlargest(15, 'degree')
axes[0, 0].barh(top_degree['drug'], top_degree['degree'], color='steelblue')
axes[0, 0].set_xlabel('Number of Interactions')
axes[0, 0].set_title('üîó Top 15 Drugs by Number of Interactions', fontsize=12, fontweight='bold')
axes[0, 0].invert_yaxis()

# Top 15 by weighted degree (severity)
top_weighted = centrality_df.nlargest(15, 'weighted_degree')
axes[0, 1].barh(top_weighted['drug'], top_weighted['weighted_degree'], color='crimson')
axes[0, 1].set_xlabel('Weighted Degree (Severity Sum)')
axes[0, 1].set_title('‚ö†Ô∏è Top 15 Drugs by Severity-Weighted Interactions', fontsize=12, fontweight='bold')
axes[0, 1].invert_yaxis()

# Top 15 by betweenness
top_between = centrality_df.nlargest(15, 'betweenness')
axes[1, 0].barh(top_between['drug'], top_between['betweenness'], color='forestgreen')
axes[1, 0].set_xlabel('Betweenness Centrality')
axes[1, 0].set_title('üåâ Top 15 Bridge Drugs (Connect Different Groups)', fontsize=12, fontweight='bold')
axes[1, 0].invert_yaxis()

# Top 15 by risk score
top_risk = centrality_df.nlargest(15, 'risk_score')
colors = ['red' if r > 0.7 else 'orange' if r > 0.4 else 'gold' for r in top_risk['risk_score']]
axes[1, 1].barh(top_risk['drug'], top_risk['risk_score'], color=colors)
axes[1, 1].set_xlabel('Composite Risk Score')
axes[1, 1].set_title('üéØ Top 15 Drugs by Overall Risk Score', fontsize=12, fontweight='bold')
axes[1, 1].invert_yaxis()

plt.tight_layout()
plt.savefig('drug_centrality_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úì Saved: drug_centrality_analysis.png")

In [None]:
# Detect communities using Louvain algorithm
print("üîç Detecting drug communities...")

partition = community_louvain.best_partition(G, weight='weight', resolution=1.0)

# Add community to nodes
for node, comm in partition.items():
    G.nodes[node]['community'] = comm

# Add to centrality dataframe
centrality_df['community'] = centrality_df['drug'].map(partition)

# Community statistics
n_communities = len(set(partition.values()))
community_sizes = Counter(partition.values())

print(f"\n‚úì Found {n_communities} drug communities")
print(f"\nüìä Community Sizes:")
for comm, size in sorted(community_sizes.items(), key=lambda x: -x[1])[:10]:
    print(f"   Community {comm}: {size} drugs")

In [None]:
# Analyze each community
print("\nüìã TOP DRUGS IN EACH MAJOR COMMUNITY:")
print("="*80)

for comm in sorted(community_sizes.keys(), key=lambda x: -community_sizes[x])[:5]:
    comm_drugs = centrality_df[centrality_df['community'] == comm]
    top_in_comm = comm_drugs.nlargest(5, 'risk_score')

    # Check drug types in community
    n_cardio = comm_drugs['is_cardiovascular'].sum()
    n_antithrom = comm_drugs['is_antithrombotic'].sum()

    print(f"\nüîπ Community {comm} ({community_sizes[comm]} drugs)")
    print(f"   Cardiovascular: {n_cardio}, Antithrombotic: {n_antithrom}")
    print(f"   Top drugs: {', '.join(top_in_comm['drug'].tolist())}")

In [None]:
# Visualize community distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Community size distribution
comm_sizes = pd.Series(community_sizes).sort_values(ascending=False)
axes[0].bar(range(len(comm_sizes)), comm_sizes.values, color='steelblue')
axes[0].set_xlabel('Community ID')
axes[0].set_ylabel('Number of Drugs')
axes[0].set_title('üìä Drug Community Sizes', fontsize=12, fontweight='bold')

# Risk distribution by community
top_communities = comm_sizes.head(10).index.tolist()
comm_risk_data = centrality_df[centrality_df['community'].isin(top_communities)]
comm_risk_data.boxplot(column='risk_score', by='community', ax=axes[1])
axes[1].set_xlabel('Community ID')
axes[1].set_ylabel('Risk Score')
axes[1].set_title('‚ö†Ô∏è Risk Score Distribution by Community', fontsize=12, fontweight='bold')
plt.suptitle('')

plt.tight_layout()
plt.savefig('community_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úì Saved: community_analysis.png")

In [None]:
# Analyze severity distribution
print("üìä SEVERITY ANALYSIS")
print("="*80)

severity_counts = df['severity_label'].value_counts()
print("\nInteraction Severity Distribution:")
for sev, count in severity_counts.items():
    pct = count / len(df) * 100
    print(f"   {sev}: {count:,} ({pct:.1f}%)")

In [None]:
# Visualize severity
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Severity pie chart
colors_sev = {'Contraindicated interaction': '#d62728',
              'Major interaction': '#ff7f0e',
              'Moderate interaction': '#ffbb78',
              'Minor interaction': '#98df8a'}
severity_counts.plot(kind='pie', ax=axes[0], autopct='%1.1f%%',
                     colors=[colors_sev.get(s, 'gray') for s in severity_counts.index])
axes[0].set_ylabel('')
axes[0].set_title('üéØ Interaction Severity Distribution', fontsize=12, fontweight='bold')

# Severity by cardiovascular drugs
cardio_df = df[df['is_cardiovascular_1'] | df['is_cardiovascular_2']]
cardio_sev = cardio_df['severity_label'].value_counts()
cardio_sev.plot(kind='bar', ax=axes[1], color='steelblue')
axes[1].set_xlabel('Severity')
axes[1].set_ylabel('Count')
axes[1].set_title('üíä Cardiovascular Drug Interactions', fontsize=12, fontweight='bold')
axes[1].tick_params(axis='x', rotation=45)

# Severity by antithrombotic drugs
antithrom_df = df[df['is_antithrombotic_1'] | df['is_antithrombotic_2']]
antithrom_sev = antithrom_df['severity_label'].value_counts()
antithrom_sev.plot(kind='bar', ax=axes[2], color='crimson')
axes[2].set_xlabel('Severity')
axes[2].set_ylabel('Count')
axes[2].set_title('ü©∏ Antithrombotic Drug Interactions', fontsize=12, fontweight='bold')
axes[2].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('severity_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úì Saved: severity_analysis.png")

In [None]:
# Drugs with most contraindicated interactions
print("\nüî¥ DRUGS WITH MOST CONTRAINDICATED INTERACTIONS:")
print("="*80)

contraindicated = df[df['severity_label'] == 'Contraindicated interaction']

# Count for each drug
drug1_contra = contraindicated['drug_name_1'].value_counts()
drug2_contra = contraindicated['drug_name_2'].value_counts()
total_contra = drug1_contra.add(drug2_contra, fill_value=0).sort_values(ascending=False)

print("\nTop 15 drugs with most contraindicated interactions:")
for i, (drug, count) in enumerate(total_contra.head(15).items(), 1):
    print(f"   {i:2}. {drug}: {int(count)} contraindicated interactions")

In [None]:
# Create subgraph of high-risk drugs for visualization
print("üé® Creating interactive network visualization...")

# Get top 100 drugs by risk score
top_drugs = centrality_df.nlargest(100, 'risk_score')['drug'].tolist()

# Create subgraph
G_sub = G.subgraph(top_drugs).copy()

print(f"   Visualizing top {len(top_drugs)} high-risk drugs")
print(f"   Edges in subgraph: {G_sub.number_of_edges()}")

In [None]:
# Create PyVis network
net = Network(height='700px', width='100%', bgcolor='#222222', font_color='white')
net.barnes_hut(gravity=-3000, central_gravity=0.3, spring_length=200)

# Color mapping
community_colors = plt.cm.tab20(np.linspace(0, 1, 20))

# Add nodes
for node in G_sub.nodes():
    risk = centrality_df[centrality_df['drug'] == node]['risk_score'].values[0]
    comm = G_sub.nodes[node].get('community', 0)
    is_cardio = G_sub.nodes[node].get('is_cardiovascular', False)
    is_antithrom = G_sub.nodes[node].get('is_antithrombotic', False)

    # Size based on risk
    size = 10 + risk * 40

    # Color based on type
    if is_antithrom:
        color = '#e74c3c'  # Red for antithrombotic
    elif is_cardio:
        color = '#3498db'  # Blue for cardiovascular
    else:
        color = '#95a5a6'  # Gray for others

    title = f"{node}\nRisk Score: {risk:.3f}\nDegree: {G_sub.degree(node)}\nCommunity: {comm}"
    if is_cardio:
        title += "\nüíä Cardiovascular"
    if is_antithrom:
        title += "\nü©∏ Antithrombotic"

    net.add_node(node, label=node, size=size, color=color, title=title)

# Add edges
for edge in G_sub.edges(data=True):
    weight = edge[2].get('weight', 1)
    severity = edge[2].get('severity', 'Unknown')

    # Edge color based on severity
    if weight >= 4:
        edge_color = '#e74c3c'  # Red for contraindicated
    elif weight >= 3:
        edge_color = '#f39c12'  # Orange for major
    else:
        edge_color = '#7f8c8d'  # Gray for others

    net.add_edge(edge[0], edge[1], color=edge_color,
                 title=f"{severity}", width=weight)

# Save interactive visualization
net.save_graph('drug_risk_network.html')
print("\n‚úì Saved: drug_risk_network.html")
print("\nüé® Legend:")
print("   üî¥ Red nodes = Antithrombotic drugs")
print("   üîµ Blue nodes = Cardiovascular drugs")
print("   ‚ö™ Gray nodes = Other drugs")
print("   Node size = Risk score")
print("   Red edges = Contraindicated")
print("   Orange edges = Major interaction")

In [None]:
# Display the interactive network
from IPython.display import HTML
HTML(filename='drug_risk_network.html')

In [None]:
# Create a smaller subgraph for static visualization
top_30 = centrality_df.nlargest(30, 'risk_score')['drug'].tolist()
G_small = G.subgraph(top_30).copy()

# Create layout
pos = nx.spring_layout(G_small, k=2, iterations=50, seed=42)

# Figure
fig, ax = plt.subplots(figsize=(16, 12))

# Node colors
node_colors = []
for node in G_small.nodes():
    if G_small.nodes[node].get('is_antithrombotic', False):
        node_colors.append('#e74c3c')
    elif G_small.nodes[node].get('is_cardiovascular', False):
        node_colors.append('#3498db')
    else:
        node_colors.append('#95a5a6')

# Node sizes based on risk
node_sizes = [centrality_df[centrality_df['drug'] == n]['risk_score'].values[0] * 3000 + 500
              for n in G_small.nodes()]

# Edge colors based on severity
edge_colors = []
edge_widths = []
for u, v, data in G_small.edges(data=True):
    w = data.get('weight', 1)
    if w >= 4:
        edge_colors.append('#e74c3c')
    elif w >= 3:
        edge_colors.append('#f39c12')
    else:
        edge_colors.append('#bdc3c7')
    edge_widths.append(w)

# Draw
nx.draw_networkx_edges(G_small, pos, edge_color=edge_colors, width=edge_widths, alpha=0.6, ax=ax)
nx.draw_networkx_nodes(G_small, pos, node_color=node_colors, node_size=node_sizes, alpha=0.9, ax=ax)
nx.draw_networkx_labels(G_small, pos, font_size=8, font_weight='bold', ax=ax)

# Legend
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor='#e74c3c', label='Antithrombotic'),
    Patch(facecolor='#3498db', label='Cardiovascular'),
    Patch(facecolor='#95a5a6', label='Other'),
]
ax.legend(handles=legend_elements, loc='upper left', fontsize=10)

ax.set_title('üî¨ Drug-Drug Interaction Risk Network (Top 30 High-Risk Drugs)',
             fontsize=14, fontweight='bold')
ax.axis('off')

plt.tight_layout()
plt.savefig('drug_network_static.png', dpi=200, bbox_inches='tight', facecolor='white')
plt.show()

print("\n‚úì Saved: drug_network_static.png")

In [None]:
# Create interaction heatmap for top drugs
top_20_drugs = centrality_df.nlargest(20, 'risk_score')['drug'].tolist()

# Create adjacency matrix
adj_matrix = pd.DataFrame(0, index=top_20_drugs, columns=top_20_drugs)

for u, v, data in G.subgraph(top_20_drugs).edges(data=True):
    weight = data.get('weight', 1)
    adj_matrix.loc[u, v] = weight
    adj_matrix.loc[v, u] = weight

# Plot heatmap
fig, ax = plt.subplots(figsize=(14, 12))

mask = adj_matrix == 0
cmap = sns.color_palette("YlOrRd", as_cmap=True)

sns.heatmap(adj_matrix, mask=mask, cmap=cmap, annot=True, fmt='g',
            linewidths=0.5, ax=ax, cbar_kws={'label': 'Severity (1-4)'})

ax.set_title('üî• Drug-Drug Interaction Severity Heatmap (Top 20 High-Risk Drugs)',
             fontsize=14, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

plt.tight_layout()
plt.savefig('drug_interaction_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úì Saved: drug_interaction_heatmap.png")