In [24]:
import json
import pandas as pd
import numpy as np

### Load Creeds Dataset
The diseases where organism is Homo Sapines are selected

In [25]:
with open('data/disease_signatures-v1.0.json', 'r') as file:
    data = json.load(file)

In [26]:
dis = []
cell=[]
dgenes = []
dscore=[]
ugenes=[]
uscore=[]

In [27]:
k=20 # top k genes according to score

In [28]:
for record in data:
    disease = record.get('do_id')
    organism = record.get('organism')
    if organism == 'human' and disease is not None:
        down_genes = record.get('down_genes', [])
        down_genes.sort(key=lambda x: x[1])
        genes = np.array(down_genes)[:k%len(down_genes),0]
        scores = np.array(down_genes)[:k%len(down_genes),1]
        dgenes.append(genes)
        dscore.append(scores)
        up_genes = record.get('up_genes', [])
        up_genes.sort(key=lambda x: x[1], reverse=True)
        genes = np.array(up_genes)[:k%len(down_genes),0]
        scores = np.array(up_genes)[:k%len(down_genes),1]
        ugenes.append(genes)
        uscore.append(scores)
        dis.append(disease)
        cell.append(record.get('cell_type', 'Unknown'))
        
df = pd.DataFrame({
    'disease': dis,
    'cell_type': cell,
    'down_genes': dgenes,
    'down_reg_scores': dscore,
    'up_genes': ugenes,
    'up_reg_scores': uscore
})

In [29]:
df.head()

Unnamed: 0,disease,cell_type,down_genes,down_reg_scores,up_genes,up_reg_scores
0,DOID:12930,Myocardial tissue,"[CKM, SMPX, COX6A2, XIRP2, MYL3, APOD, LINC003...","[-0.20369680225849152, -0.17748942971229553, -...","[MB, MYL2, TNNI3, MYZAP, MPP4, HSPB8, ASAH1, M...","[0.3557915985584259, 0.21216897666454315, 0.20..."
1,DOID:104,Peripheral blood mononuclear cell,"[RPS27, CD74, HLA-A, RPL36A, EEF1A1, IL32, RPS...","[-0.20674751698970795, -0.1724962443113327, -0...","[HLA-DRB5, IGHA2, RPL18AP3, IFI30, TMSB4X, RPS...","[0.26361608505249023, 0.16105417907238007, 0.1..."
2,DOID:2797,Lung Tissue,"[FOS, FOSB, ZFP36, RGS2, CXCL9, CEBPB, ACTB, I...","[-0.23119544982910156, -0.2288919985294342, -0...","[IGHA2, SFTPC, CYAT1, IGHV4-31, NAPSA, DMBT1, ...","[0.26548025012016296, 0.15447232127189636, 0.1..."
3,DOID:1240,B Cell Lymphocyte,"[MIRNLET7D, MIRNLET7G, MIRNLET7C, MIRNLET7B, M...","[-0.6032065749168396, -0.49166399240493774, -0...","[MIRN18A, MIRN25, MIRN148A, MIRN93, MIRN328, M...","[0.02107345126569271, 0.017946461215615273, 0...."
4,DOID:11612,Cumulus cells cultured (Lean patients),"[HLA-A, SNHG5, PKD1P1, SRPX, ASS1, ACTG1, DCAF...","[-0.11198557913303375, -0.10037083923816681, -...","[HSD3B2, BEX1, STAR, TXNDC5, MRO, LDHB, LDHA, ...","[0.1968005746603012, 0.14767995476722717, 0.13..."


#### Analysing unique disease IDs

In [30]:
tot_dis = len(df)
print(f'Total number of disease signatures processed: {tot_dis}')
if df['disease'].is_unique:
    print('All disease identifiers are unique.')
else:
    print(df['disease'].unique().shape[0], 'unique disease identifiers found.')


Total number of disease signatures processed: 493
178 unique disease identifiers found.


In [31]:
duplicate_counts = df['disease'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates.shape[0], "disease identifiers have duplicates.")
for disease_id, count in duplicates.items():
    print(f"{disease_id}: {count}")

83 disease identifiers have duplicates.
DOID:8577: 21
DOID:0060041: 21
DOID:2841: 18
DOID:5419: 16
DOID:11612: 12
DOID:1380: 12
DOID:1612: 11
DOID:9074: 10
DOID:3312: 10
DOID:10283: 10
DOID:12858: 8
DOID:0050156: 8
DOID:848: 7
DOID:10652: 7
DOID:684: 7
DOID:8893: 7
DOID:9261: 7
DOID:14250: 7
DOID:1883: 6
DOID:14330: 6
DOID:289: 6
DOID:1909: 6
DOID:11722: 6
DOID:2377: 5
DOID:9352: 5
DOID:10923: 5
DOID:0050908: 5
DOID:1969: 5
DOID:8778: 4
DOID:11727: 4
DOID:13810: 4
DOID:12930: 4
DOID:9206: 4
DOID:3969: 4
DOID:1793: 4
DOID:7148: 4
DOID:3042: 4
DOID:10534: 3
DOID:13223: 3
DOID:13809: 3
DOID:9119: 3
DOID:10223: 3
DOID:5842: 3
DOID:8552: 3
DOID:3181: 3
DOID:9538: 3
DOID:12217: 3
DOID:12849: 3
DOID:3310: 3
DOID:2355: 3
DOID:526: 3
DOID:10588: 3
DOID:3083: 3
DOID:1936: 2
DOID:9884: 2
DOID:3008: 2
DOID:3907: 2
DOID:3069: 2
DOID:1040: 2
DOID:11335: 2
DOID:3068: 2
DOID:1070: 2
DOID:1059: 2
DOID:0050700: 2
DOID:3748: 2
DOID:4362: 2
DOID:6543: 2
DOID:9970: 2
DOID:0050211: 2
DOID:8866: 2
DOID:8398:

In [32]:
ex_dis = 'DOID:11722'
ex_records = df[df['disease'] == ex_dis]
print(len(ex_records), f"records found for disease ID {ex_dis}")
for cell_type, group in ex_records.groupby('cell_type'):
    print(f"Cell Type: {cell_type}")
    print(group[['down_genes', 'up_genes', 'up_reg_scores']])

6 records found for disease ID DOID:11722
Cell Type: embryonic stem cells
                                            down_genes  \
298  [RPS4Y1, LDHA, YBX1, COX2, NLRP2, DDIT4, DCAF6...   
335  [RPS4Y1, RPL3, RPL12, RPL13A, RAN, RPL34, EEF1...   

                                              up_genes  \
298  [TMSB4XP8, RPL36A, APELA, BEX1, RPS4X, LEFTY1,...   
335  [TMSB4XP8, APELA, RPL39, RPL36A, RPS4X, PRDX4,...   

                                         up_reg_scores  
298  [0.25904086232185364, 0.16791898012161255, 0.1...  
335  [0.26566487550735474, 0.18700337409973145, 0.1...  
Cell Type: mesodermal precursor cells
                                            down_genes  \
296  [RPS4Y1, CTGF, THBS2, GREM1, TMSB10, MMP1, OXT...   
300  [GREM1, COX1, CTGF, RPS4Y1, RPL4, RPL3, AMIGO2...   

                                              up_genes  \
296  [TMSB4XP8, GJA1, AMIGO2, RPL39, PTX3, DYNLT3, ...   
300  [LUM, PRDX4, BEX1, MAGED1, NGFRAP1, RPS4X, RPL...   

                 

Filter out only the unique diseases

In [33]:
unique_diseases = df['disease'].value_counts()
unique_disease_ids = unique_diseases[unique_diseases == 1].index
df2 = df[df['disease'].isin(unique_disease_ids)].copy()
df2.head()

Unnamed: 0,disease,cell_type,down_genes,down_reg_scores,up_genes,up_reg_scores
3,DOID:1240,B Cell Lymphocyte,"[MIRNLET7D, MIRNLET7G, MIRNLET7C, MIRNLET7B, M...","[-0.6032065749168396, -0.49166399240493774, -0...","[MIRN18A, MIRN25, MIRN148A, MIRN93, MIRN328, M...","[0.02107345126569271, 0.017946461215615273, 0...."
24,DOID:10591,Placenta,"[OR7A10, DNAI2, DEFA3, ABCB8, IL20, INTS6-AS1,...","[-0.10038889199495316, -0.07733052223920822, -...","[LEP, CRH, INHBA, FSTL3, HTRA4, CXCL8, ANKRD37...","[0.16935595870018005, 0.09644939750432968, 0.0..."
26,DOID:5603,Blood monocyte,"[RGS2, PLEK, GPR183, HSPA1B, CEBPD, DNAJB1, SA...","[-0.3419916331768036, -0.21757015585899353, -0...","[DUSP6, MAFB, CD14, NAPSB, CD52, TSC22D3, FOS,...","[0.23078623414039612, 0.20958639681339264, 0.1..."
27,DOID:2801,Lung Tissue,"[C13orf15, PPP1R14A, PRX, DUOXA1, SDPR, IFI27,...","[-0.23005644977092743, -0.19364526867866516, -...","[RRAD, CXCL14, COL1A2, CD52, HLA-DPB1, LST1, A...","[0.3728449046611786, 0.28406837582588196, 0.21..."
31,DOID:11723,quadriceps,"[ENO3, PYGM, ALDOA, HBA2, MYH7, CRYAB, UBC, HB...","[-0.14721274375915527, -0.12653785943984985, -...","[ACTC1, B2M, EEF1A1, SLN, MYL6B, MYH8, C3, COL...","[0.22241078317165375, 0.18766309320926666, 0.1..."


In [34]:
df2.to_csv("data/unique_disease_signatures.csv",index=False)

### Match the diseases with external Data

In [35]:
import os
import urllib.request
import pandas as pd

# -----------------------------
# 1. DOWNLOAD doid.obo
# -----------------------------
DOID_URL = "https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo"
DOID_OBO = "doid.obo"

if not os.path.exists(DOID_OBO):
    print("Downloading doid.obo ...")
    urllib.request.urlretrieve(DOID_URL, DOID_OBO)
    print("Downloaded doid.obo")
else:
    print("doid.obo already exists")



doid.obo already exists


In [36]:

# -----------------------------
# 2. Load doid -> umls mapping
# -----------------------------
def load_doid_to_umls(obo_file):
    umls_to_doid = {}
    current_doid = None

    with open(obo_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            if line.startswith("id: DOID:"):
                current_doid = line.split("id: ")[1]

            elif line.startswith("xref: UMLS_CUI:") and current_doid:
                umls = line.split("UMLS_CUI:")[1]
                umls_to_doid.setdefault(umls, set()).add(current_doid)

    return umls_to_doid


umls_to_doid = load_doid_to_umls(DOID_OBO)

print("Loaded DOID→umls mappings:", len(umls_to_doid))


Loaded DOID→umls mappings: 7019


In [38]:
# -----------------------------
# 3. LOAD YOUR DATA
# -----------------------------
doids = (
    pd.read_csv(r"data\unique_disease_signatures.csv")["disease"]
    .dropna()
    .astype(str)
    .str.replace("DOID_", "DOID:", regex=False)
)

umls = (
    pd.read_csv(r"data\external_data.csv")["ind_id"]
    .dropna()
    .astype(str)
)


doid_set = set(doids)
umls_set = set(umls)


In [39]:

# -----------------------------
# 4. MATCH
# -----------------------------
matches = []
df2 = pd.read_csv(r"data\external_data.csv")

for umls in umls_set:
    doids_for_umls = umls_to_doid.get(umls, set())
    for doid in doids_for_umls:
        if doid in doid_set:
            drug_name=df2["drug_name"][df2['ind_id']==umls].values
            matches.append((umls, doid,set(drug_name)))

df = pd.DataFrame(matches, columns=["UMLS", "DOID",'drugs'])
print(df)
print("Total matches:", len(df))


        UMLS          DOID                                              drugs
0   C0345958     DOID:4556                          {Paclitaxel, Carboplatin}
1   C0011644      DOID:419                                         {Imatinib}
2   C0149925     DOID:5409              {Doxorubicin, Carboplatin, Cisplatin}
3   C0282488    DOID:13949                                {Mycophenolic acid}
4   C1140680     DOID:2394  {Carboplatin, Methotrexate, Celecoxib, Doxorub...
5   C0039503      DOID:971                                   {Hydrocortisone}
6   C0020542     DOID:6432                           {Imatinib, Nitric Oxide}
7   C0157749      DOID:381                                    {Ciprofloxacin}
8   C0152013     DOID:3910                          {Paclitaxel, Carboplatin}
9   C0023890     DOID:5082                                    {Dexamethasone}
10  C0919267     DOID:2394             {Doxorubicin, Paclitaxel, Carboplatin}
11  C0023418     DOID:1240  {Dexamethasone, Hydrocortisone, Imat

In [None]:
df.to_csv("data/testing_data.csv",index=False)