In [1]:
import csv
import os
import re

Drug-Disease Pairs

In [None]:
# Locations for the drug–disease section. The leading `...path_to` looks like
# a placeholder; point it at the local checkout of RDsqr-KG before running.

# Input: TTD drug-to-disease text file read by the parsing cell.
input_file = (
    r"...path_to\RDsqr-KG\Datasets\TTD\P1-05-Drug_disease.txt"
)

# Output: CSV produced by the writing cell of this section.
output_file = (
    r"...path_to\RDsqr-KG\Preprocessed_datasets\drug_disease_ttd.csv"
)

In [3]:
# State shared with the parsing cell of the drug–disease section.

# Per-record context: the TTD ID and drug name most recently seen in the file.
current_ttd_id = current_drug_name = None

# Accumulator of output rows; each row is a list
# [TTD ID, drug name, disease name, ICD code, status].
data = list()

# Counter initialised to zero; not referenced by the cells visible here.
line_count = 0

In [4]:
# Parse the TTD drug–disease text file into `data`, one row per indication:
# [TTD ID, drug name, disease name, ICD code, status].
#
# The accumulator and the per-record context are (re)initialised here so the
# cell is idempotent: they are notebook-level variables, and re-running this
# cell without first re-running the initialisation cell would otherwise
# append a second copy of every row and could start with a stale ID/name.
data = []
current_ttd_id = None
current_drug_name = None

with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        # Strip surrounding whitespace (including the trailing newline).
        line = line.strip()

        # A blank line separates one drug's group of lines from the next,
        # so the remembered ID and name are cleared.
        if line == "":
            current_ttd_id = None
            current_drug_name = None
            continue

        # TTD ID: second tab-separated field of a TTDDRUID line.
        if line.startswith("TTDDRUID"):
            current_ttd_id = line.split('\t')[1].strip()

        # Drug name: second tab-separated field of a DRUGNAME line.
        elif line.startswith("DRUGNAME"):
            current_drug_name = line.split('\t')[1].strip()

        # Indication line: disease name, ICD code and status in fields 1-3.
        elif line.startswith("INDICATI"):
            parts = line.split('\t')
            # Lines with fewer than four tab-separated fields are skipped.
            if len(parts) >= 4:
                disease_name = parts[1].strip()
                # Drop the "ICD-11:" label so only the code itself is kept.
                icd_code = parts[2].replace("ICD-11:", "").strip()
                status = parts[3].strip()

                # One output row per indication, tagged with the drug that
                # was most recently seen (None if none has been seen since
                # the last blank line).
                data.append([current_ttd_id, current_drug_name, disease_name, icd_code, status])

In [5]:
# Write the collected drug–disease rows to the output CSV (UTF-8; newline=''
# lets the csv module control line endings itself).
with open(output_file, mode='w', encoding='utf-8', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    column_titles = ['TTD_ID', 'drug_name', 'disease_name', 'ICD_11', 'status']
    # Header line first, then every collected row except the first one.
    # NOTE(review): data[0] is skipped on purpose here; presumably it comes
    # from the field legend at the top of the TTD file rather than from a
    # real drug record. Confirm against the input before changing this slice.
    csv_writer.writerows([column_titles, *data[1:]])

Drug Crosslinks

In [None]:
# Locations for the drug cross-links section. The leading `...path_to` looks
# like a placeholder; point it at the local checkout of RDsqr-KG before running.

# Input: TTD cross-matching text file read by the conversion cell.
input_file2 = (
    r"...path_to\RDsqr-KG\Datasets\TTD\P1-03-TTD_crossmatching.txt"
)

# Output: CSV with the TTD ID, drug name and CAS number of each drug.
output_file2 = (
    r"...path_to\RDsqr-KG\Preprocessed_datasets\drug_crosslinks_ttd.csv"
)

In [11]:
# Convert the TTD cross-matching text file into a CSV with one row per drug
# that has a CAS number: (TTD_ID, drug_name, cas_id).
#
# Both files are opened explicitly as UTF-8, matching the drug–disease section
# of this notebook. Without it, open() falls back to the platform's locale
# encoding, which can garble or reject non-ASCII drug names (e.g. on Windows).
with open(input_file2, 'r', encoding='utf-8') as infile, \
        open(output_file2, 'w', newline='', encoding='utf-8') as csvfile:

    fieldnames = ['TTD_ID', 'drug_name', 'cas_id']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Fields of the record currently being assembled.
    current_id = None
    current_name = None
    current_cas = None

    for line in infile:
        # TTD ID line: must start with "D" and mention TTDDRUID; the ID is
        # taken as the third whitespace-separated token.
        if line.startswith("D") and "TTDDRUID" in line:
            current_id = line.split()[2]

        # Drug name: last tab-separated field of a DRUGNAME line.
        elif "DRUGNAME" in line:
            current_name = line.split("\t")[-1].strip()

        # CAS number: last tab-separated field, minus a leading "CAS " label.
        elif "CASNUMBE" in line:
            current_cas = re.sub(r'^CAS ', '', line.split("\t")[-1].strip())

            # A row is emitted only here, so drugs without a CASNUMBE line
            # produce no output row. All three fields must be non-empty.
            if current_id and current_name and current_cas:
                writer.writerow({'TTD_ID': current_id, 'drug_name': current_name, 'cas_id': current_cas})

                # Clear the record so nothing carries over to the next drug.
                current_id, current_name, current_cas = None, None, None
