In [4]:
import os
import requests

def download_file(url, filename, timeout=30):
    """Download a file from a URL and save it locally, replacing any existing copy.

    The response body is streamed to a temporary ``<filename>.part`` file and
    only moved over ``filename`` once the whole download has succeeded, so a
    failed or interrupted request never destroys a previously downloaded file
    and never leaves a truncated file at ``filename``.

    Args:
        url: Address to fetch with an HTTP GET request.
        filename: Local path the downloaded content is saved under.
        timeout: Seconds to wait for the server (connect / read) before giving
            up; passed straight to ``requests.get``.

    Returns:
        None. Request failures are reported on stdout rather than raised.
    """
    partial_path = f"{filename}.part"
    downloaded = False

    try:
        # The context manager releases the streamed connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an error for bad status codes

            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        downloaded = True
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}. Error: {e}")
    finally:
        # Never leave a half-written temporary file behind, whatever went wrong.
        if not downloaded and os.path.exists(partial_path):
            os.remove(partial_path)

    if not downloaded:
        return

    had_previous_copy = os.path.exists(filename)
    # os.replace overwrites the destination in a single step.
    os.replace(partial_path, filename)
    if had_previous_copy:
        print(f"Existing file {filename} removed.")
    print(f"Downloaded: {filename}")

# URLs and corresponding filenames: each key is the address to fetch and each
# value is the local filename the response is saved under.
datasets = {
    # KEGG htext "br08332", requested in JSON format.
    "https://www.genome.jp/kegg-bin/download_htext?htext=br08332&format=json&filedir=": "drug_interaction.json",
    # DDInter CSV download ("code_A" file).
    "https://ddinter.scbdd.com/static/media/download/ddinter_downloads_code_A.csv": "ddinter_data.csv"
}

# Download each file
for url, filename in datasets.items():
    download_file(url, filename)


Existing file drug_interaction.json removed.
Downloaded: drug_interaction.json
Existing file ddinter_data.csv removed.
Downloaded: ddinter_data.csv


In [12]:
import json
import os
import re
import pandas as pd

def clean_drug_name(name):
    """Normalise a drug name so names from different sources can be matched.

    Steps, in order:
      1. Split off a leading code separated by two spaces
         (e.g. 'D01251  Ampicillin hydrate' -> 'Ampicillin hydrate').
      2. Remove common suffix words (e.g. 'sodium', 'hydrate', 'phosphate'),
         case-insensitively and only as whole words.
      3. Collapse runs of whitespace and trim both ends.

    Args:
        name: Raw drug name. Non-string values (e.g. None or the NaN pandas
            uses for an empty cell) are returned unchanged instead of raising.

    Returns:
        The cleaned name, or ``name`` itself when it is not a string.
    """
    # Missing cells arrive as float NaN / None when this is applied to a
    # DataFrame column; they have no text to clean, so pass them through.
    if not isinstance(name, str):
        return name

    # The actual drug name is the part after the code if a code is present,
    # otherwise the whole name.
    drug_name = name.split('  ', 1)[-1]

    # Remove common suffixes that may cause mismatches (case-insensitive)
    drug_name = re.sub(
        r'\b(sodium|hydrate|phosphate|sulfate|chloride|mesylate|acetate|hydrochloride|anhydrous|glubionate)\b',
        '',
        drug_name,
        flags=re.IGNORECASE
    )

    # Removing a suffix can leave doubled or trailing spaces behind.
    return re.sub(r'\s+', ' ', drug_name).strip()

def extract_drug_classes(data, parent_class=None):
    """Walk a nested 'children' tree and map each leaf drug to its parent class.

    A node that has a 'children' key is treated as a class/group; a node
    without one is treated as a drug.

    Args:
        data: Mapping that may contain a 'children' list of nodes, each with a
            'name' and optionally its own 'children'.
        parent_class: Label of the group that directly contains ``data``'s
            children. Any code prefix separated by two spaces is dropped.

    Returns:
        dict mapping cleaned drug name -> class label, or None for drugs with
        no (or an empty) parent class. If the same cleaned name occurs more
        than once, the entry visited last wins.
    """
    mapping = {}
    if 'children' not in data:
        return mapping

    for node in data['children']:
        if 'children' not in node:
            # Leaf: a drug entry, filed under the enclosing group's label.
            key = clean_drug_name(node['name'])
            # The label is split on the first double space again here, so a
            # caller may pass one that still carries its code prefix.
            mapping[key] = parent_class.split('  ', 1)[-1].strip() if parent_class else None
            continue

        # Group: drop the code prefix (e.g. "DG01480  Penicillin" -> "Penicillin")
        # and descend with that label as the parent class.
        label = node['name'].split('  ', 1)[-1]
        mapping.update(extract_drug_classes(node, label))

    return mapping

def main():
    """Annotate the interaction CSV with drug classes and export it to Excel.

    Reads the class hierarchy from 'drug_interaction.json' and the interaction
    table from 'ddinter_data.csv', adds cleaned-name and class columns for
    both 'Drug_A' and 'Drug_B', and writes the result to
    'merged_drug_data.xlsx', deleting any existing file of that name first.
    """
    json_file = "drug_interaction.json"
    csv_file = "ddinter_data.csv"
    output_file = "merged_drug_data.xlsx"

    # Build the cleaned-name -> class lookup from the JSON hierarchy.
    with open(json_file, 'r', encoding='utf-8') as handle:
        hierarchy = json.load(handle)
    class_lookup = extract_drug_classes(hierarchy)

    interactions = pd.read_csv(csv_file)

    # Add both cleaned-name columns first, then both class columns, so the
    # new columns appear in that order in the output.
    name_columns = (
        ('Drug_A', 'Cleaned_Drug_A'),
        ('Drug_B', 'Cleaned_Drug_B'),
    )
    for raw_col, cleaned_col in name_columns:
        interactions[cleaned_col] = interactions[raw_col].apply(clean_drug_name)

    class_columns = (
        ('Cleaned_Drug_A', 'Class_A'),
        ('Cleaned_Drug_B', 'Class_B'),
    )
    for cleaned_col, class_col in class_columns:
        interactions[class_col] = interactions[cleaned_col].map(class_lookup)

    # Drop any earlier export before writing the new one.
    if os.path.exists(output_file):
        os.remove(output_file)
        print(f"Existing file '{output_file}' has been deleted.")

    interactions.to_excel(output_file, index=False)
    print(f"Merged data has been written to '{output_file}'.")

# Run the merge when this code is executed directly (as a script or in a
# notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()


Existing file 'merged_drug_data.xlsx' has been deleted.
Merged data has been written to 'merged_drug_data.xlsx'.
