In [2]:
from rdflib import Graph

# Load the EDAM ontology OWL file
owl_file_path = 'EDAM/EDAM_dev.owl'
g = Graph()
g.parse(owl_file_path, format='xml')

# SPARQL query to retrieve EDAM terms
sparql_query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX edam: <http://edamontology.org/>

    SELECT ?term
    WHERE {
        ?term rdf:type owl:Class ;
              rdfs:subClassOf* edam:data_0006 .
    }
"""

# Execute the SPARQL query
results = g.query(sparql_query)

# Extract EDAM terms from the results
edam_terms = [str(result.term) for result in results]

# Print the list of EDAM terms
print(edam_terms)


['http://edamontology.org/data_0006', 'http://edamontology.org/data_0842', 'http://edamontology.org/data_0976', 'http://edamontology.org/data_0977', 'http://edamontology.org/data_1190', 'http://edamontology.org/data_1191', 'http://edamontology.org/data_1192', 'http://edamontology.org/data_1193', 'http://edamontology.org/data_1194', 'http://edamontology.org/data_1195', 'http://edamontology.org/data_0982', 'http://edamontology.org/data_0984', 'http://edamontology.org/data_0987', 'http://edamontology.org/data_2706', 'http://edamontology.org/data_0990', 'http://edamontology.org/data_0997', 'http://edamontology.org/data_0998', 'http://edamontology.org/data_0999', 'http://edamontology.org/data_1000', 'http://edamontology.org/data_1001', 'http://edamontology.org/data_1005', 'http://edamontology.org/data_1007', 'http://edamontology.org/data_2899', 'http://edamontology.org/data_1009', 'http://edamontology.org/data_1012', 'http://edamontology.org/data_1013', 'http://edamontology.org/data_2755', 

In [None]:
# Download them...

## Via CSV from the bioportal page

https://data.bioontology.org/ontologies/EDAM/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv

In [82]:
import pandas as pd

edam_data = pd.read_csv("EDAM/EDAM.csv")
edam_topics = edam_data['Preferred Label']

In [83]:
edam_topics.to_csv("full_edam_list.txt", header=False, index=False)

### NOTE:

Currently quotes (single and double) (and forward slashes?) need to be manually purged from the data. 

In [84]:
import pandas as pd

edam_data = pd.read_csv("EDAM/EDAM.csv")

In [85]:
edam_data = edam_data[edam_data['Class ID'].str.startswith("http://edamontology.org/topic_")].sort_values(by='Preferred Label')

In [86]:
edam_data.head()

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Obsolete,CUI,Semantic Types,Parents,Citation,Created in,...,Old parent,Old related,Ontology used,Organisation,refactor_candidate,refactor_comment,Regular expression,Related term,Repository,thematic_editor
2996,http://edamontology.org/topic_3521,2D PAGE experiment,,Two-dimensional gel electrophoresis experiment...,True,,,http://www.w3.org/2002/07/owl#DeprecatedClass,,1.8,...,http://www.w3.org/2002/07/owl#Thing,,,,,,,,,
3002,http://edamontology.org/topic_0174,Ab initio structure prediction,,The prediction of three-dimensional structure ...,True,,,http://www.w3.org/2002/07/owl#DeprecatedClass,,beta12orEarlier,...,http://www.w3.org/2002/07/owl#Thing,,,,,,,,,
17,http://edamontology.org/topic_4029,Acoustics,,"The study of mechanical waves in liquids, soli...",False,,,http://edamontology.org/topic_3318,,1.26,...,,,,,,,,,,
3366,http://edamontology.org/topic_3810,Agricultural science,Agriculture|Agronomy|Agroecology|Animal breedi...,"Multidisciplinary study, research and developm...",False,,,http://edamontology.org/topic_3070,,1.20,...,,,,,,,,,,
3329,http://edamontology.org/topic_0083,Alignment,,The alignment (equivalence between sites) of m...,True,,,http://www.w3.org/2002/07/owl#DeprecatedClass,,beta12orEarlier,...,http://www.w3.org/2002/07/owl#Thing,,,,,,,,,


In [87]:
edam_data['Preferred Label'].to_csv("edam_topics.txt", header=False, index=False)

## Remove Topics

'Laboratory techniques', 'Literature and language',  'Experimental design and studies', and 'Mathematics' and all children

### Drop topics whose "Parents" are part of a deprecated class


In [88]:
edam_data[~edam_data['Parents'].str.contains("http://edamontology.org/topic_")].head()[['Class ID', 'Preferred Label', 'Parents']]

Unnamed: 0,Class ID,Preferred Label,Parents
2996,http://edamontology.org/topic_3521,2D PAGE experiment,http://www.w3.org/2002/07/owl#DeprecatedClass
3002,http://edamontology.org/topic_0174,Ab initio structure prediction,http://www.w3.org/2002/07/owl#DeprecatedClass
3329,http://edamontology.org/topic_0083,Alignment,http://www.w3.org/2002/07/owl#DeprecatedClass
1690,http://edamontology.org/topic_0786,Arabidopsis,http://www.w3.org/2002/07/owl#DeprecatedClass
2435,http://edamontology.org/topic_3075,Biological system modelling,http://www.w3.org/2002/07/owl#DeprecatedClass


In [89]:
edam_data = edam_data[edam_data['Parents'].str.contains("http://edamontology.org/topic_")]

### Remove topics based on Parent ID

Laboratory Techniques - 3361

Literate and Language - 3068

Experimental design and studies - 3678

Mathematics - 3315

In [90]:
from collections import defaultdict

topics = ['3361', '3068', '3678', '3315']

subtopics = defaultdict(lambda: [])

def get_children_topics(parent_id):
    children_ids = edam_data[edam_data['Parents'].str.contains(parent_id)]['Class ID'].apply(lambda url: url.split('topic_')[1]).to_list()
    if not len(children_ids):
        return []
    
    # print(parent_id, children_ids)
    
    grandchildren = []
    for child_id in children_ids:
        grandchildren.append(get_children_topics(child_id))
    
    children_ids.append(grandchildren)
    return children_ids

for parent_topic in topics:
    subtopics[parent_topic] = get_children_topics(parent_topic)

In [91]:
# Flatten each topics lists
for key, value in subtopics.items():
    print(f"{key}: {value}")

def flatten_list(nested_list):
    flattened = []
    for item in nested_list:
        if isinstance(item, list):
            flattened.extend(flatten_list(item))
        else:
            flattened.append(item)
    return flattened

# Flatten each value in the dictionary
flattened_dict = {key: flatten_list(value) for key, value in subtopics.items()}

# Print the flattened dictionary
print()
for key, value in flattened_dict.items():
    print(f"{key}: {value}")

3361: ['3940', '3934', '3516', '3382', '3656', '3518', '3519', '3957', '3520', '3523', '3168', '3524', [[], [], [], ['3383', '3954', '4016', '4014', '0611', '3385', '3444', '3384', '0593', '3448', '3452', '2828', [[], [], [], [], ['4017', [[]]], [], [], [], [], [], [], []]], ['3179', '3169', '3674', '3794', [[], [], [], []]], [], [], [], [], [], ['3169', '3676', '3923', '3837', '3170', '4028', '3673', [[], [], [], [], [], [], []]], []]]
3068: ['0218', [[]]]
3678: ['3679', '3517', '3379', [[], [], []]]
3315: ['3569', '3570', '2269', [[], [], []]]

3361: ['3940', '3934', '3516', '3382', '3656', '3518', '3519', '3957', '3520', '3523', '3168', '3524', '3383', '3954', '4016', '4014', '0611', '3385', '3444', '3384', '0593', '3448', '3452', '2828', '4017', '3179', '3169', '3674', '3794', '3169', '3676', '3923', '3837', '3170', '4028', '3673']
3068: ['0218']
3678: ['3679', '3517', '3379']
3315: ['3569', '3570', '2269']


In [92]:
topics_to_remove = topics

for key, value in flattened_dict.items():
    topics.extend(value)

topics_to_remove = set(topics)

print('Number of Topics to remove: ', len(topics))

Number of Topics to remove:  47


In [93]:
edam_data[edam_data['Class ID'].apply(lambda topic: str(topic).split('topic_')[1] in topics_to_remove)][['Class ID', 'Preferred Label']]

Unnamed: 0,Class ID,Preferred Label
1810,http://edamontology.org/topic_3679,Animal study
2963,http://edamontology.org/topic_3569,Applied mathematics
2501,http://edamontology.org/topic_3383,Bioimaging
1106,http://edamontology.org/topic_3179,ChIP-on-chip
52,http://edamontology.org/topic_3169,ChIP-seq
1748,http://edamontology.org/topic_3940,Chromosome conformation capture
3486,http://edamontology.org/topic_4017,Cryogenic electron microscopy
399,http://edamontology.org/topic_3934,Cytometry
983,http://edamontology.org/topic_3954,Echography
2089,http://edamontology.org/topic_4016,Electrocardiography


In [94]:
edam_data = edam_data[~edam_data['Class ID'].apply(lambda topic: str(topic).split('topic_')[1] in topics_to_remove)]

In [97]:
# Note, all children of the above topics are removed even if they are a child of another topic. Adjust code if necessary for the other behavior

edam_data['Preferred Label'].to_csv("edam_topics.txt", header=False, index=False)

## Verify topics in EDAM topic list and dataset

In [98]:
dataset = pd.read_csv('datasets/staging_test_set.csv')

In [100]:
with open('edam_topics.txt', 'r') as edam_file:
    edam_topics = edam_file.readlines()

edam_topics = [topic.strip() for topic in edam_topics]

In [103]:
indices_true = dataset.loc[dataset['EDAM Topics'].apply(lambda edam_list: not all(term in edam_topics for term in edam_list))].index

In [104]:
for index in indices_true:
    edam_list = dataset.loc[index, 'EDAM Topics']
    terms_not_in_edam_topics = [term for term in edam_list if term not in edam_topics]
    
    print(f"Index {index}: Terms not in edam_topics: {terms_not_in_edam_topics}")

Index 0: Terms not in edam_topics: ['[', "'", 'c', 'e', 'l', "'", ',', ' ', "'", 'C', 'e', 'l', 'l', 'M', 'L', "'", ',', ' ', "'", 'C', 'e', 'l', 'l', ' ', 't', 'y', 'p', 'e', ' ', 'a', 'c', 'c', 'e', 's', 's', 'i', 'o', 'n', "'", ',', ' ', "'", 'Z', 'o', 'o', 'l', 'o', 'g', 'y', "'", ',', ' ', "'", 'L', 'a', 'b', 'o', 'r', 'a', 't', 'o', 'r', 'y', ' ', 'a', 'n', 'i', 'm', 'a', 'l', ' ', 's', 'c', 'i', 'e', 'n', 'c', 'e', "'", ',', ' ', "'", 'A', 'n', 'i', 'm', 'a', 'l', ' ', 's', 't', 'u', 'd', 'y', "'", ',', ' ', "'", 'S', 'y', 'n', 't', 'h', 'e', 't', 'i', 'c', ' ', 'b', 'i', 'o', 'l', 'o', 'g', 'y', "'", ',', ' ', "'", 'P', 'a', 't', 'h', 'w', 'a', 'y', ' ', 'I', 'D', ' ', '(', 'P', 'a', 'n', 't', 'h', 'e', 'r', ')', "'", ',', ' ', "'", 'P', 'a', 'n', 't', 'h', 'e', 'r', ' ', 'P', 'a', 't', 'h', 'w', 'a', 'y', 's', ' ', 'e', 'n', 't', 'r', 'y', ' ', 'f', 'o', 'r', 'm', 'a', 't', "'", ',', ' ', "'", 'C', 'e', 'l', 'l', ' ', 'l', 'i', 'n', 'e', ' ', 'n', 'a', 'm', 'e', "'", ',', ' ',