# Analyse ChemOnt
Get the number of concepts and the depth of the ontology

This notebook requires rdflib, SPARQLWrapper, pandas, numpy, and tqdm:
* pip install rdflib
* pip install SPARQLWrapper
* pip install pandas
* pip install numpy
* pip install tqdm

Download the latest version of ChemOnt from http://classyfire.wishartlab.com/tax_nodes.json

In [3]:
import json
from collections import deque, defaultdict
from tqdm import tqdm

# When True, the download cell below fetches the ChemOnt JSON before it is
# loaded; set to False to reuse a copy that is already on disk.
DOWNLOAD = True

In [4]:
# Local path the ChemOnt JSON is written to by the download cell and read
# from by the load cell.
chem_ont_file = "tax_nodes.json"

In [5]:
if DOWNLOAD:
    # Imported lazily: urllib is only needed when the file is actually fetched.
    import urllib.request

    # Source of the ChemOnt taxonomy nodes, saved locally as `chem_ont_file`.
    url = 'http://classyfire.wishartlab.com/tax_nodes.json'

    print('Beginning file download with ChemOnt...')
    urllib.request.urlretrieve(url, chem_ont_file)

Beginning file download with urllib2...


In [6]:
# Pin the encoding: without it open() falls back to the platform's locale
# encoding, which can mis-decode non-ASCII concept names on some systems.
with open(chem_ont_file, 'r', encoding='utf-8') as file:
    chemont = json.load(file)

# Peek at the first record to confirm the keys the later cells rely on
# ('chemont_id' and 'parent_chemont_id') are present.
print(chemont[0])

{'name': 'Chemical entities', 'chemont_id': 'CHEMONTID:9999999', 'parent_chemont_id': None}


In [8]:
# broaders:  concept id -> ids of its direct parents (broader concepts)
# narrowers: concept id -> ids of its direct children (narrower concepts)
broaders = defaultdict(list)
narrowers = defaultdict(list)
# topics: concept id -> depth, initialised to 0 and filled in by the depth
# cells further down. A plain dict is used on purpose: with a defaultdict a
# lookup of an unknown id would silently add an entry and inflate the count.
topics = {}
for node in chemont:
    concept_id = node["chemont_id"]
    parent_id = node["parent_chemont_id"]
    topics[concept_id] = 0
    # A record whose parent id is None has no broader concept to link to.
    if parent_id is not None:
        broaders[concept_id].append(parent_id)
        narrowers[parent_id].append(concept_id)

print("Number of concepts: {}".format(len(topics)))

Number of concepts: 4825


In [9]:
def find_depth(concept, unhier):
    """Return the depth of ``concept`` within the hierarchy ``unhier``.

    The hierarchy is walked breadth-first from ``concept`` upwards through
    its broader concepts. A concept with no broader concept has depth 1, and
    each further level of ancestors adds 1.

    Args:
        concept: Identifier of the concept to start from.
        unhier: Mapping from a concept id to the list of its direct broader
            (parent) ids. A concept that is missing from the mapping, or
            mapped to an empty list, is treated as having no parents.

    Returns:
        int: The computed depth.

    Side effects:
        The depth is also stored in the module-level ``concepts`` mapping
        under ``concept``, which the calling cell relies on.
    """
    inspected = set()
    max_depth = 1
    queue = deque([(concept, max_depth)])
    while queue:
        current, depth = queue.popleft()
        # Membership test first so a defaultdict is never grown by the lookup.
        broads = unhier[current] if current in unhier else ()
        if not broads:
            # No parents: this node does not add a level.
            continue
        parent_depth = depth + 1
        if parent_depth > max_depth:
            max_depth = parent_depth
        for broader in broads:
            # Each ancestor is expanded once, even if reachable by many paths.
            if broader not in inspected:
                inspected.add(broader)
                queue.append((broader, parent_depth))

    concepts[concept] = max_depth
    return max_depth

In [10]:
# find_depth() records each result in the module-level `concepts` mapping, so
# that name has to be bound before the loop; here it aliases `topics`.
unhier = broaders
concepts = topics
with tqdm(total=len(concepts)) as pbar:
    # Loop over a snapshot of the ids: find_depth() writes into `concepts`
    # while the loop runs, so the live dict is not iterated directly.
    for concept in list(concepts):
        find_depth(concept, unhier)
        pbar.update(1)

100%|██████████| 4825/4825 [00:00<00:00, 150097.66it/s]


In [11]:
import pandas as pd

# One row per concept id (the index), with its depth in a single 'depth' column.
list_of_depths = pd.DataFrame.from_dict(
    data=concepts,
    orient='index',
    columns=['depth'],
)

In [19]:
print("CHEMONTID:9999999 is the concept scheme not a topic. So max depth is 1 less")
# Rebind rather than sort in place, so the frame built in the earlier cell is
# not mutated behind the reader's back and this cell is safe to re-run.
list_of_depths = list_of_depths.sort_values('depth', ascending=False)
# Deepest concepts first.
list_of_depths.head(50)

CHEMONTID:9999999 is the concept scheme not a topic. So max depth is 1 less


Unnamed: 0,depth
CHEMONTID:0004261,12
CHEMONTID:0004279,11
CHEMONTID:0004278,11
CHEMONTID:0004267,11
CHEMONTID:0004260,11
CHEMONTID:0003048,11
CHEMONTID:0002522,11
CHEMONTID:0003442,11
CHEMONTID:0004673,10
CHEMONTID:0003573,10


In [15]:
# A concept with more than one entry in `broaders` has several parents.
# The banner named "Nature Subjects", but the data checked here is ChemOnt.
print("If it does not print anything after this line, it means ChemOnt is monohierarchical (a narrower has only one broader)")
for concept_id, parents in broaders.items():
    if len(parents) > 1:
        print("{} has {} parents".format(concept_id, len(parents)))

If it does not print anything after this line, it means Nature Subjects is monohierarchical (a narrower has only one broader)
