# Analyse EDAM
Get the number of concepts and the depth of the ontology

It is important to have rdflib, SPARQLWrapper, pandas, and numpy
* pip install rdflib
* pip install SPARQLWrapper
* pip install pandas
* pip install numpy

The version 1.25 of EDAM is already available in this repository.

However, you can download the lastest version from https://edamontology.org/page

In [1]:
from rdflib import Graph
from rdflib.namespace import RDFS
from rdflib import URIRef
import rdflib
import json
from collections import deque
import numpy as np

In [2]:
input_file = "EDAM_1.25.owl"
g = Graph()
g.parse(input_file)

<Graph identifier=Nde4853ed8ac24310b52bbb1e5e234204 (<class 'rdflib.graph.Graph'>)>

In [3]:
qres = g.query(
    """SELECT DISTINCT ?a
       WHERE {
          ?a a owl:Class .
          ?a oboInOwl:inSubset edam:topics
       }""")
topics = dict()
for row in qres:
#     print("%s" % row)
    topics[row[0]] = True
    
print(len(topics))  

264


In [4]:
qres = g.query(
    """SELECT DISTINCT ?a ?b
       WHERE {
          ?a rdfs:subClassOf ?b .
          ?a oboInOwl:inSubset edam:topics
       }""")

broaders = dict()
narrowers = dict()
for row in qres:
    if row[0] not in broaders:
        broaders[row[0]] = list()
    broaders[row[0]].append(row[1])
    if row[1] not in narrowers:
        narrowers[row[1]] = list()
    narrowers[row[1]].append(row[0])

# Checking if it is Polyhierarchical

If it prints items it means it is polyhierarchical

In [6]:
for key, broad in broaders.items():
    #print(len(broad))
    if (len(broad) > 1):
        print(key, broad)

http://edamontology.org/topic_0084 [rdflib.term.URIRef('http://edamontology.org/topic_3299'), rdflib.term.URIRef('http://edamontology.org/topic_3307')]
http://edamontology.org/topic_0085 [rdflib.term.URIRef('http://edamontology.org/topic_0622'), rdflib.term.URIRef('http://edamontology.org/topic_1775')]
http://edamontology.org/topic_0097 [rdflib.term.URIRef('http://edamontology.org/topic_0077'), rdflib.term.URIRef('http://edamontology.org/topic_0081')]
http://edamontology.org/topic_0122 [rdflib.term.URIRef('http://edamontology.org/topic_0622'), rdflib.term.URIRef('http://edamontology.org/topic_1317')]
http://edamontology.org/topic_0128 [rdflib.term.URIRef('http://edamontology.org/topic_0078'), rdflib.term.URIRef('http://edamontology.org/topic_0602')]
http://edamontology.org/topic_0176 [rdflib.term.URIRef('http://edamontology.org/topic_0082'), rdflib.term.URIRef('http://edamontology.org/topic_3892')]
http://edamontology.org/topic_0194 [rdflib.term.URIRef('http://edamontology.org/topic_00

# Assessing the depth

In [7]:
unhier = broaders
concepts = topics
for concept, value in concepts.items():
    queue = deque() 
    max_depth = value
    queue.append({"t":concept,"d":value})
    while len(queue) > 0:
        dequeued = queue.popleft()
        if dequeued["t"] in unhier:
            broads = unhier[dequeued["t"]]
            new_depth = dequeued["d"]+1
            if new_depth > max_depth:
                max_depth = new_depth
            for broader in broads:
                queue.append({"t":broader,"d":dequeued["d"]+1})
    
    concepts[concept] = max_depth

In [8]:
import pandas as pd
list_of_depths = pd.DataFrame.from_dict(concepts, orient='index', columns=['depth'])

In [9]:
list_of_depths.sort_values('depth', inplace=True, ascending=False)
list_of_depths.head()

Unnamed: 0,depth
http://edamontology.org/topic_3941,7
http://edamontology.org/topic_0798,6
http://edamontology.org/topic_2885,6
http://edamontology.org/topic_0204,6
http://edamontology.org/topic_2830,6


In [10]:
list_of_depths.sort_values('depth', inplace=True, ascending=True)
list_of_depths.head()

Unnamed: 0,depth
http://edamontology.org/topic_0003,True
http://edamontology.org/topic_3678,2
http://edamontology.org/topic_0605,2
http://edamontology.org/topic_3307,2
http://edamontology.org/topic_3361,2


# Top Concepts

In [11]:
qres = g.query(
    """SELECT DISTINCT ?a ?b
       WHERE {
          ?a rdfs:subClassOf :topic_0003 .
          ?a rdfs:label ?b
       }""")
topics = dict()
for row in qres:
    print(row[1])
 

Informatics
Literature and language
Biology
Data management
Medicine
Computational biology
Chemistry
Mathematics
Computer science
Physics
Biomedical science
Laboratory techniques
Omics
Experimental design and studies
Open science
