In [1]:
import json
import os
from datetime import datetime

import numpy as np
import pandas as pd
import requests
import xmltodict
from lxml.html import fromstring

In [2]:
# Wall-clock timestamp taken at the start of the run; the last cell subtracts
# it from end_time to report the notebook's total runtime.
start_time = datetime.now()

# Read the Excel file

In [3]:
# Load the workbook that holds the disease names to look up.
df = pd.read_excel('Disease test set.xlsx')

# Selecting with a one-element column list keeps a DataFrame, so the resulting
# array is two-dimensional with a single column (one row per disease).
Disease = df.loc[:, ["Disease from model"]].to_numpy()
Disease

array([['hypertension'],
       ['cancer'],
       ['tumors']], dtype=object)

# Declare the UMLS source vocabularies to search

In [4]:
# Source vocabulary abbreviations; get_UMLS_codes passes each one to the
# search endpoint as the `sabs` filter.
source = np.array(['SNOMEDCT_US', 'MSH', 'OMIM', 'MDR'])
# `n` is a second name bound to the very same array object.
# NOTE(review): no visible cell reads this global (the functions use their own
# local `n`) - presumably a leftover; confirm before removing.
n = source

# Set the API key and get the ticket-granting ticket (TGT)

In [5]:
# The UMLS API key is a credential, so it is read from the environment rather
# than being stored in the notebook.
# NOTE(review): a real key used to be committed in this cell; treat that key as
# exposed and rotate it.
api_key = os.environ.get('UMLS_API_KEY', '').strip()
if not api_key:
    # Fail here with a clear message instead of letting the login request
    # further down fail in a less obvious way.
    raise RuntimeError(
        "UMLS_API_KEY is not set; export your UMLS API key before running this notebook."
    )
# Base URL of the login service that the TGT request is posted to.
uri="https://utslogin.nlm.nih.gov"

In [6]:
# Exchange the API key for a ticket-granting ticket (TGT). The login service
# answers with an HTML document whose <form action="..."> attribute is the TGT
# URL; the lookup functions below post to that URL to obtain service tickets.
h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent":"python" }
params = {'apikey': api_key}
auth_endpoint = "/cas/v1/api-key"
# A timeout keeps the notebook from hanging forever on a stalled connection.
r = requests.post(uri+auth_endpoint,data=params,headers=h,timeout=30)
# Surface a rejected key or a server error as an HTTP error right here instead
# of an unrelated IndexError when the form is missing from the error page.
r.raise_for_status()
response = fromstring(r.text)
tgt = next(iter(response.xpath('//form/@action')), None)
if tgt is None:
    raise RuntimeError("UMLS login response did not contain a TGT form action")

# Get Disease Codes from UMLS

In [7]:
def get_UMLS_codes(tgt,Disease):
    """Search UMLS for each disease name in every configured source vocabulary.

    For every combination of a vocabulary in the module-level ``source`` array
    and a name in ``Disease``, a new service ticket is requested from ``tgt``
    and page 1 of the UMLS search endpoint is queried. Only that first page of
    hits is collected.

    Parameters
    ----------
    tgt : str
        Ticket-granting-ticket URL that service-ticket requests are posted to.
    Disease : array-like of str
        Disease names. May be flat or nested (for example the ``(n, 1)`` column
        array produced by ``df[[col]].to_numpy()``); it is flattened before
        use so that each search term is the bare name.

    Returns
    -------
    numpy.ndarray
        Array with four rows and one column per search hit. Row 0 is the source
        vocabulary, row 1 the disease name that was searched, row 2 the hit's
        ``ui`` and row 3 the hit's ``name``. Shape is ``(4, 0)`` when there are
        no hits at all.

    Raises
    ------
    requests.HTTPError
        If a ticket request or a search request returns an error status.
    """
    service = "http://umlsks.nlm.nih.gov"
    ticket_headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent":"python" }
    search_url = 'https://uts-ws.nlm.nih.gov/rest/search/current'
    timeout_seconds = 30

    # Flatten first: iterating an (n, 1) array yields one-element arrays, and
    # str() of those is "['hypertension']" rather than "hypertension".
    disease_names = [str(name) for name in np.asarray(Disease, dtype=object).ravel()]

    # Plain lists are grown in the loop; the array is built once at the end.
    source_list = []
    disease_list = []
    ui_list = []
    synonyms_list = []

    for source_name in source:
        for disease_name in disease_names:
            # A new service ticket is obtained before every search request.
            ticket_response = requests.post(
                tgt,
                data={'service': service},
                headers=ticket_headers,
                timeout=timeout_seconds,
            )
            ticket_response.raise_for_status()

            # Let requests build and percent-encode the query string, so names
            # containing spaces or punctuation are transmitted intact.
            query = {
                'string': disease_name,
                'pageNumber': 1,
                'sabs': str(source_name),
                'returnIdType': 'sourceUi',
                # The parameter used to be misspelled "sinputType".
                'inputType': 'atom',
                'includeObsolete': 'false',
                'ticket': ticket_response.text,
            }
            search_response = requests.get(search_url, params=query, timeout=timeout_seconds)
            search_response.raise_for_status()

            for hit in search_response.json()['result']['results']:
                source_list.append(str(source_name))
                disease_list.append(disease_name)
                ui_list.append(str(hit['ui']))
                synonyms_list.append(str(hit['name']))

    Final_Array = np.array([source_list,disease_list,ui_list,synonyms_list])
    return Final_Array
    
            

# Get Parent Information

In [8]:
def get_UMLS_Parents(tgt,source,child_ui):
    """Fetch the parents of one source-asserted identifier from UMLS.

    A new service ticket is requested from ``tgt`` and used for a single call
    to the ``/content/current/source/<source>/<child_ui>/parents`` endpoint.

    Parameters
    ----------
    tgt : str
        Ticket-granting-ticket URL that the service-ticket request is posted to.
    source : str
        Source vocabulary abbreviation placed in the request path.
    child_ui : str
        Identifier, within ``source``, whose parents are requested.

    Returns
    -------
    numpy.ndarray
        Array with four rows and one column per parent. Row 0 repeats
        ``source``, row 1 repeats ``child_ui``, row 2 is each parent's ``ui``
        and row 3 each parent's ``name``. Shape is ``(4, 0)`` when the result
        list is empty.

    Raises
    ------
    requests.HTTPError
        If the ticket request or the parents request returns an error status.
    """
    timeout_seconds = 30

    ticket_response = requests.post(
        tgt,
        data={'service': "http://umlsks.nlm.nih.gov"},
        headers={"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent":"python" },
        timeout=timeout_seconds,
    )
    ticket_response.raise_for_status()

    base_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/source/'+source+'/'+child_ui+'/parents'
    # The ticket goes through `params` so requests percent-encodes it.
    parents_response = requests.get(
        base_url,
        params={'ticket': ticket_response.text},
        timeout=timeout_seconds,
    )
    # Raise on an error status so callers see an HTTP error rather than a
    # KeyError from an error payload that has no 'result' entry.
    parents_response.raise_for_status()
    records = parents_response.json()['result']

    # Build each row in one pass; the array is created once at the end.
    source_list = [source for _ in records]
    child_list = [child_ui for _ in records]
    ui_list = [record['ui'] for record in records]
    parents_list = [record['name'] for record in records]

    Final_Array = np.array([source_list,child_list,ui_list,parents_list])

    return Final_Array
    

# Iterate through the list of children

In [9]:
def itertate_child(tgt,child):
    """Look up the parents of every entry in a four-row lookup array.

    Parameters
    ----------
    tgt : str
        Ticket-granting-ticket URL, passed through to ``get_UMLS_Parents``.
    child : numpy.ndarray
        Array whose row 0 holds source vocabularies and whose row 2 holds the
        identifiers to look up (the layout returned by ``get_UMLS_codes``,
        ``get_UMLS_Parents`` and this function).

    Returns
    -------
    numpy.ndarray
        Array with four rows: source, child identifier, parent identifier and
        parent name. Every entry of ``child`` contributes one column per parent
        found. An entry whose lookup fails, or whose identifier is the
        placeholder ``'None'``, contributes one column whose parent identifier
        and parent name are both ``'None'``. Shape is ``(4, 0)`` for empty
        input.
    """
    placeholder = 'None'
    blocks = []   # one (4, k) block per entry; concatenated once at the end
    cache = {}    # (source, identifier) -> block, so repeats cost no request

    for source_name, child_ui in zip(child[0], child[2]):
        key = (str(source_name), str(child_ui))
        block = cache.get(key)
        if block is None:
            # 'None' is the placeholder written by an earlier level for entries
            # without parents, so there is nothing to look up for it.
            if key[1] != placeholder:
                try:
                    block = get_UMLS_Parents(tgt, source_name, child_ui).reshape((4, -1))
                except Exception:
                    # Best effort: any failed lookup becomes a placeholder
                    # column. Unlike a bare `except:`, KeyboardInterrupt and
                    # SystemExit still propagate, so the run can be stopped.
                    block = None
            if block is None:
                block = np.array([source_name, child_ui, placeholder, placeholder]).reshape((4, -1))
            cache[key] = block
        blocks.append(block)

    if not blocks:
        # Keep the four-row layout so callers can still index rows 0..3.
        return np.empty((4, 0), dtype=str)
    return np.concatenate(blocks, axis=1)


In [10]:
# Search every source vocabulary for every disease name. The rows of the
# returned array are, in order: source, disease, UI and matched name.
synonyms = get_UMLS_codes(tgt, Disease)

# Turn the four rows into one record per hit, then label the columns by position.
source_df = pd.DataFrame(list(zip(*synonyms[:4])))
source_df = source_df.rename(columns={
    source_df.columns[0]: "Source",
    source_df.columns[1]: "Disease",
    source_df.columns[2]: "UID",
    source_df.columns[3]: "Synonyms",
})
source_df.head()

Unnamed: 0,Source,Disease,UID,Synonyms
0,SNOMEDCT_US,hypertension,34742003,Portal hypertension
1,SNOMEDCT_US,hypertension,70995007,Pulmonary hypertension
2,SNOMEDCT_US,hypertension,59621000,Essential hypertension
3,SNOMEDCT_US,hypertension,271719001,Raised intracranial pressure
4,SNOMEDCT_US,hypertension,123799005,Renovascular hypertension


In [11]:
# First level: parents of every matched concept. The rows of p1 are, in order:
# source, child UI, parent UI and parent name.
p1 = itertate_child(tgt, synonyms)

# One record per (child, parent) pair, labelled by position.
p1_df = pd.DataFrame(list(zip(*p1[:4])))
p1_df = p1_df.rename(columns={
    p1_df.columns[0]: "Source",
    p1_df.columns[1]: "Child_UI",
    p1_df.columns[2]: "Parent_UI",
    p1_df.columns[3]: "Parent",
})
p1_df.head()

Unnamed: 0,Source,Child_UI,Parent_UI,Parent
0,SNOMEDCT_US,34742003,234072000,Venous hypertension
1,SNOMEDCT_US,34742003,128056006,Disorder of portal venous system
2,SNOMEDCT_US,70995007,39785005,Disorder of pulmonary circulation
3,SNOMEDCT_US,59621000,38341003,Hypertensive disorder
4,SNOMEDCT_US,271719001,299732003,Finding of intracranial pressure


In [12]:
# Attach the first-level parents to each match; the left join keeps matches
# that have no parent row.
# NOTE: source_df is rebound to the merged frame, so running this cell a second
# time merges into the already-merged frame.
source_df = source_df.merge(
    p1_df,
    how='left',
    left_on=['Source', 'UID'],
    right_on=['Source', 'Child_UI'],
)
source_df.head()

Unnamed: 0,Source,Disease,UID,Synonyms,Child_UI,Parent_UI,Parent
0,SNOMEDCT_US,hypertension,34742003,Portal hypertension,34742003,234072000,Venous hypertension
1,SNOMEDCT_US,hypertension,34742003,Portal hypertension,34742003,128056006,Disorder of portal venous system
2,SNOMEDCT_US,hypertension,70995007,Pulmonary hypertension,70995007,39785005,Disorder of pulmonary circulation
3,SNOMEDCT_US,hypertension,59621000,Essential hypertension,59621000,38341003,Hypertensive disorder
4,SNOMEDCT_US,hypertension,271719001,Raised intracranial pressure,271719001,299732003,Finding of intracranial pressure


In [13]:
# Second level: parents of the first-level parents (row 2 of p1 holds their UIs).
p2 = itertate_child(tgt, p1)

# NOTE: position 2 of p2 holds the parent UI and position 3 the parent name, so
# the "Parent" / "Parent_UI" labels below are swapped relative to the contents.
# A later cell merges on this frame's "Parent" column as the UI key, so the
# labels are kept as they are here.
p2_df = pd.DataFrame(list(zip(*p2[:4])))
p2_df = p2_df.rename(columns={
    p2_df.columns[0]: "Source",
    p2_df.columns[1]: "Child_UI",
    p2_df.columns[2]: "Parent",
    p2_df.columns[3]: "Parent_UI",
})
p2_df.head()

Unnamed: 0,Source,Child_UI,Parent,Parent_UI
0,SNOMEDCT_US,234072000,90507008,Disorder of vein
1,SNOMEDCT_US,128056006,128052008,Vascular disease of abdomen
2,SNOMEDCT_US,128056006,90507008,Disorder of vein
3,SNOMEDCT_US,39785005,373434004,Disorder of blood vessels of thorax
4,SNOMEDCT_US,38341003,366157005,Cardiovascular measurement - finding


In [14]:
# Attach the second-level parents by matching each first-level parent UI to the
# child UI in p2_df. Column names present in both frames receive the default
# _x (left) and _y (right) suffixes.
# NOTE: source_df is rebound again here, so this cell is not safe to re-run on
# its own.
source_df = source_df.merge(
    p2_df,
    how='left',
    left_on=['Source', 'Parent_UI'],
    right_on=['Source', 'Child_UI'],
)
source_df.head()

Unnamed: 0,Source,Disease,UID,Synonyms,Child_UI_x,Parent_UI_x,Parent_x,Child_UI_y,Parent_y,Parent_UI_y
0,SNOMEDCT_US,hypertension,34742003,Portal hypertension,34742003,234072000,Venous hypertension,234072000,90507008,Disorder of vein
1,SNOMEDCT_US,hypertension,34742003,Portal hypertension,34742003,128056006,Disorder of portal venous system,128056006,128052008,Vascular disease of abdomen
2,SNOMEDCT_US,hypertension,34742003,Portal hypertension,34742003,128056006,Disorder of portal venous system,128056006,90507008,Disorder of vein
3,SNOMEDCT_US,hypertension,70995007,Pulmonary hypertension,70995007,39785005,Disorder of pulmonary circulation,39785005,373434004,Disorder of blood vessels of thorax
4,SNOMEDCT_US,hypertension,59621000,Essential hypertension,59621000,38341003,Hypertensive disorder,38341003,366157005,Cardiovascular measurement - finding


In [None]:
# Third level: parents of the second-level parents (row 2 of p2 holds their UIs).
p3 = itertate_child(tgt, p2)

# NOTE: as in the p2 cell, position 2 holds the parent UI and position 3 the
# parent name, so the "Parent" / "Parent_UI" labels are swapped relative to the
# contents.
p3_df = pd.DataFrame(list(zip(*p3[:4])))
p3_df = p3_df.rename(columns={
    p3_df.columns[0]: "Source",
    p3_df.columns[1]: "Child_UI",
    p3_df.columns[2]: "Parent",
    p3_df.columns[3]: "Parent_UI",
})
p3_df.head()

In [None]:
# Attach the third-level parents. "Parent_y" is the column that came from
# p2_df's "Parent" label, which holds the second-level parent UI, so it is the
# key matched against p3_df's child UI.
source_df1 = source_df.merge(
    p3_df,
    how='left',
    left_on=['Source', 'Parent_y'],
    right_on=['Source', 'Child_UI'],
)
source_df1.head()

In [None]:
# Drop the join-key copies and give the three parent levels their final names.
# Columns are addressed by name rather than by position so that re-running the
# cell is harmless: keys that are already gone are ignored by both calls.
source_df1 = source_df1.drop(columns=['Child_UI_x', 'Child_UI_y', 'Child_UI'], errors='ignore')

# Which incoming column holds what:
#   level 1 (from p1_df): "Parent_UI_x" holds the UI, "Parent_x" the name.
#   levels 2 and 3 (from p2_df / p3_df): the labels are swapped upstream, so
#   "Parent_y" and "Parent" hold UIs while "Parent_UI_y" and "Parent_UI" hold
#   names.
# The previous positional rename labelled every UI column "ParentN" and every
# name column "ParentN_UI"; this mapping puts each label on the right data.
source_df1 = source_df1.rename(columns={
    'Parent_UI_x': 'Parent1_UI',
    'Parent_x': 'Parent1',
    'Parent_y': 'Parent2_UI',
    'Parent_UI_y': 'Parent2',
    'Parent': 'Parent3_UI',
    'Parent_UI': 'Parent3',
})

# Keep the same left-to-right label order the cell produced before.
source_df1 = source_df1[[
    'Source', 'Disease', 'UID', 'Synonyms',
    'Parent1', 'Parent1_UI',
    'Parent2', 'Parent2_UI',
    'Parent3', 'Parent3_UI',
]]

source_df1.head()

In [None]:
# Wall-clock timestamp taken once every lookup and merge cell has run; the next
# cell subtracts start_time from it.
end_time = datetime.now()

In [None]:
# Report the elapsed wall-clock time; subtracting the two datetimes gives a
# timedelta, which print() renders after the message.
print('The total time taken to run this workbook is:',(end_time-start_time))