In [1]:
import pandas as pd
import json
from pathlib import Path

In [None]:
# Load the CTD chemical-disease table.
# Text/identifier columns are read explicitly as strings: with pandas' default
# type inference a column such as PubMedIDs may come back holding numeric
# values for rows with a single ID, which would later be stringified as
# e.g. "26432044.0" instead of "26432044". Missing fields still load as NaN.
df = pd.read_csv(
    "CTD_chemicals_diseases.csv",
    dtype={
        "ChemicalName": str,
        "ChemicalID": str,
        "CasRN": str,
        "DiseaseName": str,
        "DiseaseID": str,
        "DirectEvidence": str,
        "InferenceGeneSymbol": str,
        "OmimIDs": str,
        "PubMedIDs": str,
    },
)

In [3]:
# Report (rows, columns) of the raw table as a baseline for the cleaning steps.
raw_shape = df.shape
print("Original shape:", raw_shape)

Original shape: (1048546, 10)


In [4]:
# Preview the first rows of the raw table (rendered as the cell's output).
df.head()

Unnamed: 0,ChemicalName,ChemicalID,CasRN,DiseaseName,DiseaseID,DirectEvidence,InferenceGeneSymbol,InferenceScore,OmimIDs,PubMedIDs
0,10074-G5,C534883,,Adenocarcinoma,MESH:D000230,,MYC,4.06,,26432044
1,10074-G5,C534883,,Adenocarcinoma of Lung,MESH:D000077192,,MYC,4.3,,26656844|27602772
2,10074-G5,C534883,,Alopecia,MESH:D000505,,AR,4.51,,15902657
3,10074-G5,C534883,,Androgen-Insensitivity Syndrome,MESH:D013734,,AR,6.88,300068|312300,1303262|8281139
4,10074-G5,C534883,,Astrocytoma,MESH:D001254,,AR,4.95,,24680642


In [5]:
# Keep only rows that name both a chemical and a disease
has_chemical = df["ChemicalName"].notna()
has_disease = df["DiseaseName"].notna()
df = df[has_chemical & has_disease]

In [6]:
# Discard self-loops: rows whose chemical and disease names are the same
# once surrounding whitespace and letter case are ignored.
chemical_key = df["ChemicalName"].str.strip().str.lower()
disease_key = df["DiseaseName"].str.strip().str.lower()
df = df[chemical_key.ne(disease_key)]

In [7]:
# Report the table size after the filtering steps, then preview the result.
filtered_shape = df.shape
print("After dropping invalid rows:", filtered_shape)
df.head()


After dropping invalid rows: (1048546, 10)


Unnamed: 0,ChemicalName,ChemicalID,CasRN,DiseaseName,DiseaseID,DirectEvidence,InferenceGeneSymbol,InferenceScore,OmimIDs,PubMedIDs
0,10074-G5,C534883,,Adenocarcinoma,MESH:D000230,,MYC,4.06,,26432044
1,10074-G5,C534883,,Adenocarcinoma of Lung,MESH:D000077192,,MYC,4.3,,26656844|27602772
2,10074-G5,C534883,,Alopecia,MESH:D000505,,AR,4.51,,15902657
3,10074-G5,C534883,,Androgen-Insensitivity Syndrome,MESH:D013734,,AR,6.88,300068|312300,1303262|8281139
4,10074-G5,C534883,,Astrocytoma,MESH:D001254,,AR,4.95,,24680642


In [8]:
# Relation label for a row: its DirectEvidence value, or "associated_with" when absent
def resolve_relation(row):
    """Return the relation label for a single row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide a "DirectEvidence" entry.

    Returns
    -------
    The row's "DirectEvidence" value when it is not missing (NaN/None),
    otherwise the string "associated_with".
    """
    evidence = row["DirectEvidence"]
    if pd.isna(evidence):
        return "associated_with"
    return evidence

# Build the Relation column in one vectorised step: DirectEvidence where it is
# present, "associated_with" where it is missing. This yields the same values
# as applying resolve_relation to every row, without a Python-level call per
# row. `assign` returns a new frame, so nothing is written into the filtered
# frame from the earlier cells (which can emit SettingWithCopyWarning).
df = df.assign(Relation=df["DirectEvidence"].fillna("associated_with"))

# Peek at the new column
df[["ChemicalName", "Relation", "DiseaseName"]].head()


Unnamed: 0,ChemicalName,Relation,DiseaseName
0,10074-G5,associated_with,Adenocarcinoma
1,10074-G5,associated_with,Adenocarcinoma of Lung
2,10074-G5,associated_with,Alopecia
3,10074-G5,associated_with,Androgen-Insensitivity Syndrome
4,10074-G5,associated_with,Astrocytoma


In [9]:
# Keep only relevant fields for triplet extraction
triplet_columns = ["ChemicalName", "Relation", "DiseaseName", "ChemicalID", "DiseaseID", "PubMedIDs"]
triplet_key = ["ChemicalName", "Relation", "DiseaseName"]
df = df[triplet_columns].copy()

# Strip surrounding whitespace from the key columns before deduplicating.
# The triplets are emitted with stripped head/relation/tail values, so
# deduplicating on the raw text could let whitespace-only variants through
# as duplicate triplets.
for key_column in triplet_key:
    df[key_column] = df[key_column].str.strip()

# Drop duplicates. keep="first" (the pandas default) retains the first row of
# each (chemical, relation, disease) group; the ChemicalID / DiseaseID /
# PubMedIDs values of the later duplicate rows are discarded.
df = df.drop_duplicates(subset=triplet_key, keep="first")

print("After deduplication:", df.shape)


After deduplication: (470725, 6)


In [10]:
# Convert each row to a structured dictionary
def _split_pubmed_ids(value):
    """Return the '|'-separated PubMed IDs in `value` as a list of strings.

    A missing value (NaN/None) gives an empty list. An integral float such as
    26432044.0 is rendered as "26432044": the column may hold numeric values
    if it was loaded with inferred dtypes, and a plain str() would otherwise
    append ".0" to the ID.
    """
    if pd.isna(value):
        return []
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).split("|")


# Iterate the needed columns in lockstep instead of using DataFrame.iterrows,
# which builds a new Series for every row and is slow on a frame this size.
triplets = [
    {
        "head": chemical_name.strip(),
        "relation": relation.strip(),
        "tail": disease_name.strip(),
        "source": "CTD_chemicals_diseases",
        "pubmed_ids": _split_pubmed_ids(pubmed_ids),
        "chemical_id": chemical_id if pd.notna(chemical_id) else None,
        "disease_id": disease_id if pd.notna(disease_id) else None,
    }
    for chemical_name, relation, disease_name, chemical_id, disease_id, pubmed_ids in zip(
        df["ChemicalName"],
        df["Relation"],
        df["DiseaseName"],
        df["ChemicalID"],
        df["DiseaseID"],
        df["PubMedIDs"],
    )
]

print("Sample triplet:")
# Guarded so that an empty table shows nothing instead of raising IndexError.
triplets[0] if triplets else None


Sample triplet:


{'head': '10074-G5',
 'relation': 'associated_with',
 'tail': 'Adenocarcinoma',
 'source': 'CTD_chemicals_diseases',
 'pubmed_ids': ['26432044'],
 'chemical_id': 'C534883',
 'disease_id': 'MESH:D000230'}

In [None]:
# Write the triplets to disk as indented JSON (UTF-8 text file)
output_path = Path("triplets_chem_dis_cleaned.json")
with output_path.open("w", encoding="utf-8") as out_file:
    json.dump(triplets, out_file, indent=2)

# Confirm how many triplets were written and where.
print(f"Saved {len(triplets)} cleaned triplets to {output_path}")


✅ Saved 470725 cleaned triplets to triplets_chem_dis_cleaned.json
