# Analysis of number of concepts and depth of UMLS

!important

To run this code, you need to have your own instance of UMLS installed in a MySQL database

## Import

In [1]:
import json
import pandas as pd
import mysql.connector
from mysql.connector import Error
from collections import deque
from tqdm import tqdm

# Parameters

In [2]:
#########################
## PARAMETERS

HOST='localhost'
DB_NAME='umls'
USERNAME='umls'
PASS='umls'
#########################

# Read Concepts

In [3]:
concepts = dict()
try:
    connection = mysql.connector.connect(host=HOST,
                                         database=DB_NAME,
                                         user=USERNAME,
                                         password=PASS)
    
    if connection.is_connected():
        cursor = connection.cursor(dictionary=True)
        query = "SELECT DISTINCT(CUI) FROM `MRCONSO`"
        #CUI1 is the specific topic, #CUI2 is the general
        #print(query)
        cursor.execute(query)
        rows = cursor.fetchall()

        for row in rows:
            if row['CUI'] not in concepts:
                concepts[row['CUI']] = 1
            
    
except Error as e:
    print("Error while connecting to MySQL", e)
finally:
    if (connection.is_connected()):
        cursor.close()
        connection.close()
        print("MySQL connection is closed")

MySQL connection is closed


# Number of Concepts

In [4]:
print(len(concepts))

2983840


# Read relationships

In [5]:
hier = dict()
unhier = dict()
try:
    connection = mysql.connector.connect(host=HOST,
                                         database=DB_NAME,
                                         user=USERNAME,
                                         password=PASS)
    
    if connection.is_connected():
        cursor = connection.cursor(dictionary=True)
        query = "SELECT CUI1, CUI2 FROM `MRREL` WHERE `REL` LIKE 'RB'"
        #CUI1 is the specific topic, #CUI2 is the general
        #print(query)
        cursor.execute(query)
        rows = cursor.fetchall()

        for row in rows:
            if row['CUI2'] not in hier:
                hier[row['CUI2']] = list()
            hier[row['CUI2']].append(row['CUI1'])
            
            if row['CUI1'] not in unhier:
                unhier[row['CUI1']] = list()
            unhier[row['CUI1']].append(row['CUI2'])
            
    
except Error as e:
    print("Error while connecting to MySQL", e)
finally:
    if (connection.is_connected()):
        cursor.close()
        connection.close()
        print("MySQL connection is closed")

MySQL connection is closed


In [6]:
# Cleaning
for key, value in unhier.items():
    unhier[key] = list(set(value))

In [7]:
hier[list(hier.keys())[1]]

['C0003787', 'C0027972', 'C0028914', 'C0039711', 'C0003787', 'C0027972']

In [8]:
unhier[list(unhier.keys())[0]]

['C0270715', 'C0524851']

In [13]:
i=0
with tqdm(total=len(concepts)) as pbar:
    for concept, value in concepts.items():
        i+=1
        pbar.update(i)
        queue = deque() 
        max_depth = value
        queue.append({"t":concept,"d":value})
        while len(queue) > 0:
            dequeued = queue.popleft()
            if dequeued["t"] in unhier:
                broaders = unhier[dequeued["t"]]
                new_depth = dequeued["d"]+1
                if new_depth > max_depth:
                    max_depth = new_depth
                for broader in broaders:
                    queue.append({"t":broader,"d":dequeued["d"]+1})

        concepts[concept] = max_depth

  0%|          | 10/2983840 [00:51<4284:20:47,  5.17s/it]


KeyboardInterrupt: 

In [None]:
with open('new_concepts.json','w') as file:
    json.dump(concepts, file)

## Reloading the file


In [5]:
concepts = dict()
with open('new_concepts.json','r') as file:
    concepts = json.load(file)

In [6]:
import pandas as pd
list_of_depths = pd.DataFrame.from_dict(concepts, orient='index', columns=['depth'])

In [7]:
list_of_depths.sort_values('depth', inplace=True, ascending=False)
list_of_depths.head()

Unnamed: 0,depth
C4284744,30
C1416058,29
C1416093,29
C1416092,29
C1416091,29
