# Analyse the Computer Science Ontology
Get the number of concepts and the depth of the ontology

It is important to have rdflib, SPARQLWrapper, pandas, tqdm, and numpy
* pip install rdflib
* pip install SPARQLWrapper
* pip install pandas
* pip install numpy
* pip install tqdm


The version 3.3 of CSO is already available in this repository.

However, you can download the lastest version from http://cso.kmi.open.ac.uk/downloads

In [1]:
from rdflib import Graph
from rdflib.namespace import RDFS
from rdflib import URIRef
import rdflib
import json
from collections import deque
import numpy as np
from tqdm import tqdm

In [2]:
input_file = "CSO.3.3.owl"
g = Graph()
g.parse(input_file)

<Graph identifier=N778b8ce0bb4046d0bf3faadb2533d89d (<class 'rdflib.graph.Graph'>)>

In [3]:
qres = g.query(
    """SELECT DISTINCT ?a
       WHERE {
          ?a rdf:type <http://cso.kmi.open.ac.uk/schema/cso#Topic>
       }""")
topics = dict()
for row in qres:
    topics[row[0]] = True
    
print(len(topics))  

14290


In [4]:
qres = g.query(
    """SELECT DISTINCT ?a ?b
       WHERE {
          ?a <http://cso.kmi.open.ac.uk/schema/cso#superTopicOf> ?b .
       }""")

broaders = dict()
narrowers = dict()
for row in qres:
    if row[0] not in narrowers:
        narrowers[row[0]] = list()
    narrowers[row[0]].append(row[1])
    if row[1] not in broaders:
        broaders[row[1]] = list()
    broaders[row[1]].append(row[0])

# Checking if it is Polyhierarchical

In [5]:
count = 0
for key, broad in broaders.items():
    if (len(broad) > 1):
        count += 1
        #print(key, broad)
        
print("Found {} topics that have more than one parent".format(count))

Found 9685 topics that have more than one parent


# Assessing the depth

In [6]:
unhier = broaders
concepts = topics
with tqdm(total=len(concepts)) as pbar:
    for concept, value in concepts.items():       
        queue = deque() 
        max_depth = value
        queue.append({"t":concept,"d":value})
        while len(queue) > 0:
            dequeued = queue.popleft()
            if dequeued["t"] in unhier:
                broads = unhier[dequeued["t"]]
                new_depth = dequeued["d"]+1
                if new_depth > max_depth:
                    max_depth = new_depth
                for broader in broads:
                    queue.append({"t":broader,"d":dequeued["d"]+1})

        concepts[concept] = max_depth
        pbar.update(1)

100%|██████████| 14290/14290 [00:07<00:00, 1820.71it/s]


In [7]:
import pandas as pd
list_of_depths = pd.DataFrame.from_dict(concepts, orient='index', columns=['depth'])

In [8]:
list_of_depths.sort_values('depth', inplace=True, ascending=False)
list_of_depths.head()

Unnamed: 0,depth
https://cso.kmi.open.ac.uk/topics/rfid_tag_antennas,13
https://cso.kmi.open.ac.uk/topics/passive_tags,13
https://cso.kmi.open.ac.uk/topics/ultrawideband_applications,12
https://cso.kmi.open.ac.uk/topics/ultra-high_frequency,12
https://cso.kmi.open.ac.uk/topics/passive_uhf_rfid,12


In [9]:
list_of_depths.sort_values('depth', inplace=True, ascending=True)
list_of_depths.head()

Unnamed: 0,depth
https://cso.kmi.open.ac.uk/topics/computer_science,True
https://cso.kmi.open.ac.uk/topics/semantics,True
https://cso.kmi.open.ac.uk/topics/geology,True
https://cso.kmi.open.ac.uk/topics/sociology,True
https://cso.kmi.open.ac.uk/topics/engineering,True


# Top Concepts

In [10]:
top_concepts = set(narrowers.keys())-set(broaders.keys())
for concept in top_concepts:
    print(concept)

https://cso.kmi.open.ac.uk/topics/linguistics
https://cso.kmi.open.ac.uk/topics/semantics
https://cso.kmi.open.ac.uk/topics/communication
https://cso.kmi.open.ac.uk/topics/sociology
https://cso.kmi.open.ac.uk/topics/education
https://cso.kmi.open.ac.uk/topics/topology
https://cso.kmi.open.ac.uk/topics/mathematics
https://cso.kmi.open.ac.uk/topics/geology
https://cso.kmi.open.ac.uk/topics/computer_science
https://cso.kmi.open.ac.uk/topics/geometry
https://cso.kmi.open.ac.uk/topics/engineering
https://cso.kmi.open.ac.uk/topics/economics
