In [1]:
import obonet  # For reading OBO files into network graphs
import networkx as nx  # For working with network/graph data structures
import pandas as pd

In [2]:
# Directory that holds every input file; the trailing slash matters because
# paths are built by plain string concatenation.
dataPath = "data/"
# Location of the OBO ontology file. Despite the name this is a local file
# path (it is handed to open() further down), not a web address.
url = f"{dataPath}www.ebi.ac.uk.txt"

In [3]:
# Parse the OBO ontology file into a graph. The context manager closes the
# file handle as soon as parsing is done, so it is not left open for the
# lifetime of the kernel.
with open(url, "r", encoding="utf8") as obo_file:
    efo_graph = obonet.read_obo(obo_file)

In [4]:
# obonet orients edges from a child term to its parent term, so
# nx.ancestors() of a term yields its *subterms* (the more specific terms),
# not its parents. The queried term itself is not part of the result.
# nx.ancestors() already returns a set, so no extra de-duplication is needed.
# NOTE(review): nx.ancestors follows every edge in the graph regardless of
# relationship type — confirm that links other than is_a should count here.
allDiseases = list(nx.ancestors(efo_graph, "EFO:0000408"))  # all subterms of the "disease" term
print(len(allDiseases))
allCancers = list(nx.ancestors(efo_graph, "MONDO:0004992"))  # all subterms of MONDO:0004992 (presumably the cancer root — confirm)
print(len(allCancers))

19317
2958


In [5]:
# Load the GWAS Catalog associations export into a DataFrame.
gwasCatalogEns = pd.read_csv(
    dataPath + "gwas_catalog_v1.0.2-associations_e113_r2024-12-19.tsv",
    sep="\t",  # the export is tab-delimited
    low_memory=False,  # read the file in one pass so mixed-type columns do not trigger dtype warnings
)

In [6]:
# Derive an ontology ID of the form "PREFIX:NUMBER" from each mapped-trait URI:
# keep whatever follows the final "/" and turn underscores into colons.
# NOTE(review): if a row carries several URIs in one field, only the segment
# after the very last "/" survives — confirm that is acceptable.
gwasCatalogEns["efoID"] = (
    gwasCatalogEns["MAPPED_TRAIT_URI"]
    .str.rsplit("/", n=1)  # one split from the right is enough to isolate the tail
    .str.get(-1)  # the tail is the ID in underscore form
    .str.replace("_", ":")  # underscore form -> colon form
)

In [7]:
# Expose the mapped-trait label under the shorter column name "name".
gwasCatalogEns["name"] = gwasCatalogEns["MAPPED_TRAIT"]
print(gwasCatalogEns.shape[0])  # number of association rows loaded

692444


In [8]:
# Distinct IDs that occur in the catalog but are not among the disease terms.
notDiseasesInGWAS = list(set(gwasCatalogEns.efoID) - set(allDiseases))

In [9]:
# Distinct catalog IDs that are also disease terms.
efoID_intersection = set(gwasCatalogEns.efoID) & set(allDiseases)
# Association rows whose ID is one of those disease terms.
filtered_df = gwasCatalogEns.loc[gwasCatalogEns["efoID"].isin(efoID_intersection)]
# ID -> trait label. When an ID appears on several rows, the label of the
# last such row is the one that is kept.
allDiseasesInGWAS = {
    efo_id: label
    for efo_id, label in zip(filtered_df["efoID"], filtered_df["name"])
}

In [10]:
# Write one "<id><TAB><name>" line per disease term found in the catalog.
# The encoding is pinned to match the one used when reading the ontology, so
# the file content does not depend on the platform's default encoding, and the
# output directory comes from dataPath like the inputs do.
try:
    with open(dataPath + "allDiseases.txt", "w", encoding="utf8") as file:
        for efo_id, name in allDiseasesInGWAS.items():
            file.write(f"{efo_id}\t{name}\n")
    print("Data successfully written to allDiseases.txt")
except OSError as e:
    # Only genuine file-system failures are reported here; any other error is
    # left to surface as a traceback instead of being mislabelled.
    print(f"An error occurred while writing to the file: {e}")

print(len(allDiseasesInGWAS))
    

Data successfully written to allDiseases.txt
1322


In [12]:
# Distinct catalog IDs that are also cancer terms.
efoID_cancer_intersection = set(gwasCatalogEns.efoID).intersection(allCancers)
# Association rows whose ID is one of those cancer terms.
filtered_df = gwasCatalogEns[gwasCatalogEns['efoID'].isin(efoID_cancer_intersection)]
# ID -> trait label; for an ID seen on several rows the last row's label wins.
allCancersInGWAS = dict(zip(filtered_df['efoID'], filtered_df['name']))

# Write one "<id><TAB><name>" line per cancer term. The encoding is pinned to
# match the one used when reading the ontology, so the file content does not
# depend on the platform's default encoding, and the output directory comes
# from dataPath like the inputs do.
try:
    with open(dataPath + "allCancers.txt", "w", encoding="utf8") as file:
        for efo_id, name in allCancersInGWAS.items():
            file.write(f"{efo_id}\t{name}\n")
    print("Data successfully written to allCancers.txt")
except OSError as e:
    # Only genuine file-system failures are reported here; any other error is
    # left to surface as a traceback instead of being mislabelled.
    print(f"An error occurred while writing to the file: {e}")

print(len(allCancersInGWAS))

Data successfully written to allCancers.txt
123


In [14]:
# Distinct catalog IDs that are disease terms but not cancer terms.
efoID_noncancer_intersection = (
    set(gwasCatalogEns.efoID).intersection(allDiseases).difference(allCancers)
)
# Association rows whose ID is one of those non-cancer disease terms.
filtered_df = gwasCatalogEns[gwasCatalogEns['efoID'].isin(efoID_noncancer_intersection)]
# ID -> trait label; for an ID seen on several rows the last row's label wins.
allNonCancersInGWAS = dict(zip(filtered_df['efoID'], filtered_df['name']))

# Write one "<id><TAB><name>" line per non-cancer disease term. The file goes
# into the data directory next to allDiseases.txt and allCancers.txt rather
# than into the current working directory, and the encoding is pinned so the
# content does not depend on the platform's default encoding.
try:
    with open(dataPath + "allNonCancers.txt", "w", encoding="utf8") as file:
        for efo_id, name in allNonCancersInGWAS.items():
            file.write(f"{efo_id}\t{name}\n")
    print("Data successfully written to allNonCancers.txt")
except OSError as e:
    # Only genuine file-system failures are reported here; any other error is
    # left to surface as a traceback instead of being mislabelled.
    print(f"An error occurred while writing to the file: {e}")

print(len(allNonCancersInGWAS))

Data successfully written to allNonCancers.txt
1199
