In [1]:
from disease import Disease
import requests
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import pandas as pd
import json
import numpy as np
import random
import obonet
import networkx as nx


In [5]:
# Scratch check of the Disease helper with a placeholder name; the recorded
# output below reports that no genes were found for this entry.
disease = Disease('something', 'EFO_0000178')
disease.quantify_clustering()

something EFO_0000178
No genes found for something.
Genes: 0



In [None]:
def compute_clustering(diseases, filename):
    """Run ``Disease.quantify_clustering`` for every disease and save the scores.

    Args:
        diseases: iterable of dicts with ``'name'`` and ``'efoID'`` keys.
        filename: path of the text file that receives one line per scored disease.

    Returns:
        A ``(output, not_included)`` tuple. ``output`` is a list of dicts with the
        keys ``name``, ``efoID``, ``num_edges`` and ``z_score``. ``not_included``
        is a list of ``{'name', 'efoID'}`` dicts for diseases whose result was
        ``None``, was not a pair, or contained ``None``.

    Exceptions raised while scoring a disease are printed and that disease is
    skipped (it appears in neither list).
    """
    output = []
    not_included = []
    for d in diseases:
        disease = Disease(d['name'], d['efoID'])
        try:
            result = disease.quantify_clustering()
            if result is None or len(result) != 2 or any(r is None for r in result):
                not_included.append({'name' : d['name'], 'efoID' : d['efoID']})
                # Skip the entry: without this an invalid result was either unpacked
                # (raising) or recorded in both lists.
                continue
            num_edges, z_score = result
            output.append({'name' : d['name'], 'efoID' : d['efoID'], 'num_edges' : num_edges, 'z_score' : z_score})
        except Exception as e:
            print(f"Error processing {d['name']}: {e}")
    with open(filename, 'w') as file:
        for entry in output:
            file.write(f"Name: {entry['name']}, EFO ID: {entry['efoID']}, Num Edges: {entry['num_edges']}, Z-Score: {entry['z_score']}\n")
    return output, not_included

def _welch_boxplot(cancer_values, non_cancer_values, ylabel, heading):
    """Welch t-test two samples, show a labelled boxplot and print the statistics."""
    t_stat, p_value = ttest_ind(cancer_values, non_cancer_values, equal_var=False)

    plt.boxplot([cancer_values, non_cancer_values], tick_labels=['Cancerous', 'Non-Cancerous'])
    plt.title(f'Clustering Comparison\np-value: {p_value:.2e}')
    plt.ylabel(ylabel)
    plt.show()

    print(f"{heading}\nT-statistic: {t_stat}, P-value: {p_value}\n")


def compare(cancerous, noncancerous, sample_size=50, seed=None):
    """Compare clustering of cancerous vs. non-cancerous diseases.

    Each group is down-sampled to at most ``sample_size`` entries, then the
    ``num_edges`` and ``z_score`` values of the two groups are compared with a
    Welch t-test (``equal_var=False``) and shown as boxplots.

    Args:
        cancerous: list of dicts with ``'num_edges'`` and ``'z_score'`` keys.
        noncancerous: list of dicts with the same keys.
        sample_size: maximum number of entries drawn from each group.
        seed: optional seed for the down-sampling. ``None`` (the default) keeps
            the previous behaviour of drawing from the global ``random`` state;
            pass an integer to make the comparison reproducible.

    Returns:
        None. The function plots and prints its results.
    """
    # A private generator keeps a seeded run from disturbing the global random state.
    rng = random.Random(seed) if seed is not None else random

    # Sampling order (non-cancerous first) matches the original implementation.
    if len(noncancerous) > sample_size:
        non_cancerous_sample = rng.sample(noncancerous, sample_size)
    else:
        non_cancerous_sample = noncancerous

    if len(cancerous) > sample_size:
        cancerous_sample = rng.sample(cancerous, sample_size)
    else:
        cancerous_sample = cancerous

    cancer_edges = [d['num_edges'] for d in cancerous_sample]
    non_cancer_edges = [d['num_edges'] for d in non_cancerous_sample]

    cancer_z = [d['z_score'] for d in cancerous_sample]
    non_cancer_z = [d['z_score'] for d in non_cancerous_sample]

    _welch_boxplot(cancer_edges, non_cancer_edges,
                   'Number of Edges', 'True number of edges')
    _welch_boxplot(cancer_z, non_cancer_z,
                   'Z scores for random numbers of edges',
                   'Z scores for random numbers of edges')


In [None]:
# Load the ontology (OBO format) from the local data directory.
dataPath = "data/"
url = dataPath + "www.ebi.ac.uk.txt"
# Use a context manager so the file handle is closed once the graph is built.
with open(url, "r", encoding="utf8") as obo_file:
    efo_graph = obonet.read_obo(obo_file)
# nx.ancestors already returns a set, so no extra set() is needed.
# NOTE(review): presumably obonet edges point from child term to parent, making
# the ancestors of a term its subterms — confirm against the obonet docs.
allDiseases = list(nx.ancestors(efo_graph, "EFO:0000408"))
allCancers = list(nx.ancestors(efo_graph, "MONDO:0004992"))

In [None]:
# Load the GWAS Catalog association table (tab-separated) from the data directory.
gwasCatalogEns = pd.read_csv(
    dataPath + "gwas_catalog_v1.0.2-associations_e113_r2024-12-19.tsv",
    sep="\t",
    low_memory=False,
)

# The trait ID is the last path segment of the mapped-trait URI with "_" swapped
# for ":"; later cells intersect it with the ontology node IDs.
trait_uri_tail = gwasCatalogEns["MAPPED_TRAIT_URI"].str.rsplit("/", n=1).str[-1]
gwasCatalogEns["efoID"] = trait_uri_tail.str.replace("_", ":")
gwasCatalogEns["name"] = gwasCatalogEns["MAPPED_TRAIT"]

In [None]:
# Keep the GWAS traits whose ID is a disease term in the ontology and save
# them as "<id>\t<name>" lines.
efoID_intersection = set(gwasCatalogEns["efoID"]) & set(allDiseases)
filtered_df = gwasCatalogEns.loc[gwasCatalogEns["efoID"].isin(efoID_intersection)]
# When an ID occurs on several rows, the name from the last row wins.
allDiseasesInGWAS = {
    trait_id: trait_name
    for trait_id, trait_name in zip(filtered_df["efoID"], filtered_df["name"])
}
try:
    with open("allDiseases.txt", "w") as out_file:
        out_file.writelines(
            f"{trait_id}\t{trait_name}\n"
            for trait_id, trait_name in allDiseasesInGWAS.items()
        )
    print("Data successfully written to allDiseases.txt")
except Exception as err:
    print(f"An error occurred while writing to the file: {err}")

print(len(allDiseasesInGWAS))

In [None]:
# Keep the GWAS traits whose ID is a cancer term in the ontology and save
# them as "<id>\t<name>" lines.
efoID_intersection = set(gwasCatalogEns["efoID"]) & set(allCancers)
filtered_df = gwasCatalogEns.loc[gwasCatalogEns["efoID"].isin(efoID_intersection)]
# When an ID occurs on several rows, the name from the last row wins.
allCancersInGWAS = {
    trait_id: trait_name
    for trait_id, trait_name in zip(filtered_df["efoID"], filtered_df["name"])
}
try:
    with open("allCancers.txt", "w") as out_file:
        out_file.writelines(
            f"{trait_id}\t{trait_name}\n"
            for trait_id, trait_name in allCancersInGWAS.items()
        )
    print("Data successfully written to allCancers.txt")
except Exception as err:
    print(f"An error occurred while writing to the file: {err}")

print(len(allCancersInGWAS))

In [None]:
# Disease traits present in the GWAS table that are NOT cancer terms, saved as
# "<id>\t<name>" lines.
efoID_intersection = (set(gwasCatalogEns["efoID"]) & set(allDiseases)) - set(allCancers)
filtered_df = gwasCatalogEns.loc[gwasCatalogEns["efoID"].isin(efoID_intersection)]
# When an ID occurs on several rows, the name from the last row wins.
allNonCancersInGWAS = {
    trait_id: trait_name
    for trait_id, trait_name in zip(filtered_df["efoID"], filtered_df["name"])
}
try:
    with open("allNonCancers.txt", "w") as out_file:
        out_file.writelines(
            f"{trait_id}\t{trait_name}\n"
            for trait_id, trait_name in allNonCancersInGWAS.items()
        )
    print("Data successfully written to allNonCancers.txt")
except Exception as err:
    print(f"An error occurred while writing to the file: {err}")

print(len(allNonCancersInGWAS))

In [None]:
# Read the "<id>\t<name>" lines back and convert the IDs from "PREFIX:123" to
# "PREFIX_123", the form passed to Disease(...) by compute_clustering.
with open("allCancers.txt", "r") as file:
    cancers = []
    for line in file:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines instead of failing on the unpack
        # partition() splits on the first tab only and copes with an empty name;
        # the loop variable is no longer called `id`, which shadowed the builtin.
        efo_id, _sep, name = line.partition('\t')
        cancers.append({'efoID' : efo_id.replace(":", "_"), 'name' : name})
print(len(cancers))

In [None]:
# Read the "<id>\t<name>" lines back and convert the IDs from "PREFIX:123" to
# "PREFIX_123", the form passed to Disease(...) by compute_clustering.
with open("allNonCancers.txt", "r") as file:
    noncancers = []
    for line in file:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines instead of failing on the unpack
        # partition() splits on the first tab only and copes with an empty name;
        # the loop variable is no longer called `id`, which shadowed the builtin.
        efo_id, _sep, name = line.partition('\t')
        noncancers.append({'efoID' : efo_id.replace(":", "_"), 'name' : name})
print(len(noncancers))

In [None]:
# compute_clustering returns (output, not_included); unpack both so `cancerous`
# is the list of scored diseases rather than the whole tuple.
cancerous, cancerous_not_included = compute_clustering(cancers, "cancerous_output.txt")

In [None]:
# Score every non-cancer disease; the scored entries are also written to
# noncancerous_output.txt, and entries without a usable result go to not_included.
noncancerous, not_included = compute_clustering(noncancers, "noncancerous_output.txt")

In [None]:
# Inspect the non-cancer diseases that produced no usable clustering result.
print(not_included)
print(len(not_included))

In [4]:
import re

def read_disease_file(filename, pattern):
    """Parse a clustering results file into a list of record dicts.

    Args:
        filename: path of the text file to read, one disease per line.
        pattern: regular expression with four capture groups, in order:
            name, EFO ID, number of edges, z-score.

    Returns:
        A list of dicts with the keys ``name``, ``efo_id``, ``num_edges`` (int)
        and ``z_score`` (float). Lines whose edge count is the string ``'0'``
        are skipped; lines that do not match are printed and skipped.
    """
    records = []
    with open(filename, "r") as handle:
        for raw_line in handle:
            found = re.search(pattern, raw_line)
            if found is None:
                print("No match:", raw_line)
                continue
            name, efo_id, edge_text, z_text = found.groups()
            if edge_text == '0':
                # Entries without any edge are left out.
                continue
            record = {
                'name' : name,
                'efo_id' : efo_id,
                'num_edges' : int(edge_text),
                'z_score' : float(z_text),
            }
            records.append(record)
    return records

In [None]:
# Line format written by compute_clustering. The z-score group accepts every
# spelling a formatted float can take: plain decimals, scientific notation
# (e.g. 1.5e-05, previously truncated to 1.5), nan, inf and -inf (previously
# reported as "No match").
pattern = r"Name:\s*(.+?),\s*EFO ID:\s*(.*?),\s*Num Edges:\s*(\d+),\s*Z-Score:\s*(-?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|-?inf|nan)"
noncancers_names = read_disease_file("noncancerous_output.txt", pattern)
print(len(noncancers_names))

In [None]:
# Parse the cancer results with the same `pattern`; this relies on the cell
# that defines `pattern` having been run first.
cancers_names = read_disease_file("cancerous_output.txt", pattern)
print(len(cancers_names))

In [None]:
# Dump every parsed non-cancer record for inspection (prints the whole list).
print(noncancers_names)

In [None]:
def remove_nones(l, max_retries=None):
    """Re-score entries with an infinite z-score and drop unusable entries.

    For every entry whose ``z_score`` is ``inf``, ``Disease.quantify_clustering``
    is called again until it yields a finite z-score; the entry's ``num_edges``
    and ``z_score`` are updated in place. An entry is dropped when a
    recomputation returns an invalid result (``None``, not a pair, or containing
    ``None``) or when its ``num_edges`` equals 1.

    Args:
        l: list of dicts with ``name``, ``efo_id``, ``num_edges`` and ``z_score``.
            The dicts are mutated in place; the list itself is not.
        max_retries: optional cap on recomputations per entry. ``None`` (the
            default) retries without limit, as before. When the cap is reached
            the entry is kept with its infinite z-score.

    Returns:
        ``(kept, none_output)``: the surviving entries in their original order,
        and one ``{'name', 'efoID'}`` dict per dropped entry.

    Exceptions raised while processing an entry are printed and the entry is kept.
    """
    none_output = []
    removed_ids = set()  # identity of dropped entries; avoids O(n^2) dict comparisons

    for d in l:
        try:
            disease = Disease(d['name'], d['efo_id'])
            attempts = 0
            invalid = False

            # Keep recomputing until the z-score is no longer infinity.
            while d['z_score'] == float('inf'):
                if max_retries is not None and attempts >= max_retries:
                    break
                attempts += 1
                result = disease.quantify_clustering()

                if result is None or len(result) != 2 or any(r is None for r in result):
                    invalid = True
                    break

                num_edges, z_score = result
                d['num_edges'] = num_edges
                d['z_score'] = z_score

            # Report each dropped entry exactly once, even when it is both
            # invalid and has a single edge.
            if invalid or d['num_edges'] == 1:
                none_output.append({'name': d['name'], 'efoID': d['efo_id']})
                removed_ids.add(id(d))

        except Exception as e:
            print(f"Error processing {d['name']}: {e}")

    kept = [d for d in l if id(d) not in removed_ids]

    return kept, none_output


In [None]:
# Clean the parsed cancer records. NOTE: this rebinds `cancers`, which an earlier
# cell filled with {'efoID', 'name'} dicts; from here on it holds the
# {'name', 'efo_id', 'num_edges', 'z_score'} records returned by remove_nones.
cancers, cancers_nones = remove_nones(cancers_names)
print(len(cancers))
print(cancers_nones)

In [2]:
# Spot check on a single disease: compute clustering, then the PPI enrichment
# value (shown as the cell's output below).
disease = Disease('multiple sclerosis', 'MONDO_0005301')
disease.quantify_clustering()
disease.get_ppi_enrichment()

multiple sclerosis MONDO_0005301
Genes mapped to multiple sclerosis: ['TSBP1', 'TSBP1-AS1', 'L3MBTL3', 'DLEU1', 'MAZ', 'MVP-DT', 'TOP3A', 'ERG', 'TNFRSF14 - PRXL2B', 'GFI1', 'CD58', 'RGS1', 'Y_RNA - CD86', 'ANKRD55', 'HLA-DQA1', 'RPL32P23 - RBM17', 'ZNF438 - LINC02664', 'TNFRSF1A', 'CLEC16A', 'CD6 - LINC02954', 'OS9', 'CDHR3 - SYPL1', 'TGFBR3', 'LINC02132 - LINC01082', 'IL2RA', 'SP140', 'RNA5SP431 - MAF', 'IL20RA - IL22RA2', 'RMI2', 'SLC2A4RG', 'GTDC1', 'LINC01967', 'TIMMDC1', 'TNFSF14', 'RGS21 - RGS1', 'JAK1', 'TYK2', 'IL7R', 'IRF1-AS1', 'IRF1', 'FAM167A', 'FAM167A-AS1', 'JAZF1', 'PHKG1', 'CYP24A1', 'MYBPC3', 'LINC01624', 'LTBR - RPL31P10', 'CD226', 'NDFIP1 - SPRY4', 'BCAS1 - CYP24A1', 'BCL6 - LINC01991', 'LINC01980', 'MCTP1', 'AFF1', 'LINC02732', 'RN7SL89P - PPA2', 'LNC-RHL1 - APOA4', 'PKIA-AS1', 'ZNF652', 'FLJ40194', 'ESPN', 'ITK', 'GARIN3', 'TNFAIP8', 'PHLDB1', 'IFITM3', 'C1orf52 - Y_RNA', 'STAT3', 'ZC3HAV1', 'SLC7A10 - CEBPA', 'CD27', 'CD27-AS1', 'HNRNPA1P41 - JAK2', 'ETV7', 'FPR1

'0.0'

In [None]:
# Clean the parsed non-cancer records and report how many survive.
# NOTE: this rebinds `noncancers` from the {'efoID', 'name'} dicts read earlier
# to the records returned by remove_nones.
print(len(noncancers_names))
noncancers, noncancers_nones = remove_nones(noncancers_names)
print(len(noncancers))
print(noncancers_nones)

In [None]:
# Persist the cleaned cancer records in the same line format that
# read_disease_file parses.
with open('cancerous_final.txt', 'w') as out_file:
    out_file.writelines(
        f"Name: {rec['name']}, EFO ID: {rec['efo_id']}, Num Edges: {rec['num_edges']}, Z-Score: {rec['z_score']}\n"
        for rec in cancers
    )

In [None]:
# Persist the cleaned non-cancer records in the same line format that
# read_disease_file parses.
with open('noncancerous_final.txt', 'w') as out_file:
    out_file.writelines(
        f"Name: {rec['name']}, EFO ID: {rec['efo_id']}, Num Edges: {rec['num_edges']}, Z-Score: {rec['z_score']}\n"
        for rec in noncancers
    )

In [None]:
# Compare the groups with sample_size = len(cancers): all cancers are used, and
# the non-cancers are randomly down-sampled to that size when there are more.
compare(cancers, noncancers, len(cancers))

In [7]:
def get_pvalue(l):
    """Attach a PPI-enrichment p-value to every record.

    Args:
        l: list of dicts with ``'name'`` and ``'efo_id'`` keys. Each dict gets a
            ``'pvalue'`` key set in place.

    Returns:
        The same list ``l``. ``'pvalue'`` holds the value returned by
        ``Disease.get_ppi_enrichment()``, or ``None`` when that call raised
        (the error is printed).
    """
    for d in l:
        d['pvalue'] = None
        disease = Disease(d['name'], d['efo_id'])
        disease.quantify_clustering()
        try:
            # Assign inside the try: on failure the entry keeps None instead of
            # an unbound name (first item) or the previous disease's p-value.
            d['pvalue'] = disease.get_ppi_enrichment()
        except Exception as e:
            print(f"Error processing {d['name']}: {e}")
    return l

In [5]:
# Reload the cleaned non-cancer records from disk. The z-score group accepts
# every spelling a formatted float can take: plain decimals, scientific notation
# (e.g. 1.5e-05, previously truncated to 1.5), nan, inf and -inf.
pattern = r"Name:\s*(.+?),\s*EFO ID:\s*(.*?),\s*Num Edges:\s*(\d+),\s*Z-Score:\s*(-?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|-?inf|nan)"
noncancers = read_disease_file("noncancerous_final.txt", pattern)
print(len(noncancers))

446


In [6]:
# Reload the cleaned cancer records from disk. The z-score group accepts every
# spelling a formatted float can take: plain decimals, scientific notation
# (e.g. 1.5e-05, previously truncated to 1.5), nan, inf and -inf.
pattern = r"Name:\s*(.+?),\s*EFO ID:\s*(.*?),\s*Num Edges:\s*(\d+),\s*Z-Score:\s*(-?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|-?inf|nan)"
cancers = read_disease_file("cancerous_final.txt", pattern)
print(len(cancers))

57


In [8]:
# Add a 'pvalue' entry to every cancer record; get_pvalue mutates the dicts in
# place and returns the same list.
cancers = get_pvalue(cancers)

carrier status, prostate carcinoma EFO_0001663
Genes mapped to carrier status, prostate carcinoma: ['GARIN3P1 - CAAP1', 'CASC17', 'CASC19', 'PRNCR1', 'PCAT1', 'NAV1', 'IPO9-AS1']
Genes: 7

Proteins: 1

Number of edges: 42

P-value: 7.05e-14
esophageal carcinoma EFO_0002916
Genes mapped to esophageal carcinoma: ['ALDH2', 'ADH1B', 'PLCE1', 'CHEK2', 'ADH4', 'PSMB3P2 - IGFBP2', 'ATP6V1G1P7 - RPL7P45', 'HECTD4', 'CSNK1A1', 'NAALADL2 - ACTG1P23', 'SEMA5B', 'CARS1P1 - ANP32A', 'PDE4D', 'ACAD10', 'RUNX1', 'LRFN2 - UNC5CL', 'LINC00160 - LINC01426', 'RASSF10 - BMAL1', 'IFT81 - ATP2A2', 'CASP8', 'FLACC1']
Genes: 21

Proteins: 13

Number of edges: 2

P-value: 0.0819
squamous cell carcinoma EFO_0000707
Genes mapped to squamous cell carcinoma: ['BRCA2', 'CHEK2', 'SLC45A2', 'ANKRD11', 'CDYL - RPP40', 'MICA - LINC01149', 'LPP', 'FLACC1', 'CASP8', 'TRPS1', 'UBAC2 - CCR12P', 'DTNBP1', 'HERC2', 'SEC16A', 'BACH2', 'BNC2 - RN7SL720P', 'HLA-DRB1 - HLA-DQA1', 'KRT5', 'TYR', 'OCA2', 'TPM3P2 - PIGPP3', 'MC1R',

In [14]:
# Add a 'pvalue' entry to every non-cancer record; get_pvalue mutates the dicts
# in place and returns the same list.
noncancers = get_pvalue(noncancers)

atopic eczema EFO_0000274
Genes mapped to atopic eczema: ['LCE1F - LCE1E', 'CRCT1 - LCE3E', 'FLG-AS1 - LCE5A', 'RN7SL335P - BLTP1', 'SMARCA4', 'CIART - MRPS21', 'LINC00299', 'PUS10', 'CD207 - LINC01143', 'IL7R - CAPSL', 'RPL13AP18 - RNU6-1213P', 'IL15RA - IL2RA', 'LINC02098 - ETS1', 'PPP2R3C', 'FAM177A1', 'STAT3', 'IL1R1', 'KIF3A', 'CLEC16A', 'IL2 - IL21', 'EMSY - LINC02757', 'IL18R1', 'AP5B1 - OVOL1', 'NFILZ', 'RTEL1-TNFRSF6B', 'RTEL1', 'IL6R', 'OVOL1', 'TH2LCRR', 'IL18R1 - SDR42E1P5', 'HLA-DRB1 - HLA-DQA1', 'LINC02929', 'DPY19L2P2', 'PRR5L', 'LCE3A - LCEP4', 'SLC9A4', 'IL13', 'TNFRSF6B', 'ZNF652-AS1', 'LINC02571 - HLA-B', 'LINC02757', 'KRT8P26 - AP5B1', 'PBX2 - GPSM3', 'TSBP1-AS1', 'TSBP1', 'OR10A3 - NLRP10', 'MIR4435-2HG', 'GLB1', 'CCDC80 - MAT2AP1', 'CARD11 - SDK1-AS1', 'LINC00824', 'ALDH7A1P4 - ADO', 'CYP24A1 - PFDN4', 'NCF4', 'NCF4-AS1', 'ATP6V1G2-DDX39B', 'DDX39B', 'FLG-AS1', 'TNXB', 'MFN2', 'GRM4', 'RN7SL474P - FGF20', 'AUTS2 - GALNT17', 'XIRP2-AS1', 'XIRP2', 'CDKN2B-AS1 - DMRT

In [9]:
# Inspect the cancer records, now including the 'pvalue' field (prints the whole list).
print(cancers)

[{'name': 'carrier status, prostate carcinoma', 'efo_id': 'EFO_0001663', 'num_edges': 42, 'z_score': 2.272683867591024, 'pvalue': '7.05e-14'}, {'name': 'esophageal carcinoma', 'efo_id': 'EFO_0002916', 'num_edges': 3, 'z_score': 4.7654566532944, 'pvalue': '0.0819'}, {'name': 'squamous cell carcinoma', 'efo_id': 'EFO_0000707', 'num_edges': 26, 'z_score': 7.869171484068971, 'pvalue': '8.39e-08'}, {'name': 'lung adenocarcinoma', 'efo_id': 'EFO_0000571', 'num_edges': 32, 'z_score': 6.520595269065408, 'pvalue': '2.11e-06'}, {'name': 'multiple myeloma', 'efo_id': 'EFO_0001378', 'num_edges': 15, 'z_score': 6.40288775120254, 'pvalue': '2.35e-05'}, {'name': 'myeloproliferative disorder', 'efo_id': 'EFO_0004251', 'num_edges': 11, 'z_score': 18.86437411412275, 'pvalue': '3.2e-05'}, {'name': 'non-small cell lung carcinoma', 'efo_id': 'EFO_0003060', 'num_edges': 5, 'z_score': 2.7720134932161673, 'pvalue': '0.032'}, {'name': 'triple-negative breast cancer', 'efo_id': 'EFO_0005537', 'num_edges': 3, 'z

In [16]:
# Number of non-cancer records after adding p-values.
print(len(noncancers))

446


In [11]:
import csv

def write_to_csv(filename, data):
    """Write a list of dicts to a UTF-8 CSV file with a header row.

    Args:
        filename: destination path; an existing file is overwritten.
        data: sequence of dicts, one per row.

    The columns are the union of all row keys in first-seen order, so rows may
    carry different keys; a key missing from a row is written as an empty cell.
    With empty ``data`` an empty file is created instead of raising
    ``IndexError``.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        if not fieldnames:
            return  # nothing to write: leave the file empty
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

In [12]:
# Export the cancer records (including p-values) to CSV.
write_to_csv("cancerous_final.csv", cancers)

In [15]:
# Export the non-cancer records (including p-values) to CSV.
write_to_csv("noncancerous_final.csv", noncancers)