## **Retrieve concept descriptions for nodes from KG**

### **1. Get concept definitions of a concept name from UMLS APIs**

In [43]:
import requests
import pdb

def results_list(input_string, combined_top_k=10):
    apikey = "c48c3caa-a1cf-4ec5-8ff7-a91a77113bf5"
    version = "current"
    uri = "https://uts-ws.nlm.nih.gov"
    search_endpoint = "/rest/search/"+version
    def_endpoint = "/rest/content/"+version+"/CUI/<CUI>/definitions"
    full_url_search = uri + search_endpoint
    full_url_def = uri + def_endpoint
    cui_dict = {}
    def_dict = {}
    try:
        query = {'string':input_string, 'apiKey':apikey}
        rs = requests.get(full_url_search, params=query)
        rs.raise_for_status()
        # print(rs.url)
        rs.encoding = 'utf-8'
        os  = rs.json()
        items = (([os['result']])[0])['results']
        if len(items) == 0:
            print('No results found.'+'\n')
        for result in items[:combined_top_k]:
            cui_dict[result['ui']] = result['name']
        query = {'apiKey':apikey}
        for cui, name in cui_dict.items():
            full_url_def = full_url_def.replace('<CUI>', cui)
            rd = requests.get(full_url_def, params=query)
            rd.raise_for_status()
            # print(rd.url)
            rd.encoding = 'utf-8'
            od  = rd.json()
            if len(od['result']) > 0:
                def_dict[name] = od['result'][0]['value']
    except Exception as except_error:
        print(except_error)
    return def_dict



In [36]:
# get the most relevant concept from UMLS
defs = results_list("hepatocellular carcinoma", combined_top_k=2)

https://uts-ws.nlm.nih.gov/rest/search/current?string=hepatocellular+carcinoma&apiKey=c48c3caa-a1cf-4ec5-8ff7-a91a77113bf5
https://uts-ws.nlm.nih.gov/rest/content/current/CUI/C2239176/definitions?apiKey=c48c3caa-a1cf-4ec5-8ff7-a91a77113bf5
https://uts-ws.nlm.nih.gov/rest/content/current/CUI/C2239176/definitions?apiKey=c48c3caa-a1cf-4ec5-8ff7-a91a77113bf5


In [37]:
defs

{'Liver carcinoma': 'A primary malignant neoplasm of epithelial liver cells. It ranges from a well-differentiated tumor with EPITHELIAL CELLS indistinguishable from normal HEPATOCYTES to a poorly differentiated neoplasm. The cells may be uniform or markedly pleomorphic, or form GIANT CELLS. Several classification schemes have been suggested.',
 'Fibrolamellar Hepatocellular Carcinoma': 'A primary malignant neoplasm of epithelial liver cells. It ranges from a well-differentiated tumor with EPITHELIAL CELLS indistinguishable from normal HEPATOCYTES to a poorly differentiated neoplasm. The cells may be uniform or markedly pleomorphic, or form GIANT CELLS. Several classification schemes have been suggested.'}

### **2. Get all nodes and retrieve their desciptions as the new property**

In [24]:
import pandas as pd

df = pd.read_csv("E:\\dataset\\biomed-kg\\PrimeKG.csv", low_memory=False)

node_label_dict = {'gene/protein': 'GeneProtein', 'anatomy': 'Anatomy', 'biological_process': 'BiologicalProcess', 'cellular_component': 'CellularComponent', 'disease': 'Disease', 'drug': 'Drug', 'effect/phenotype': 'EffectPhenotype', 'exposure': 'Exposure', 'molecular_function': 'MolecularFunction', 'pathway':'Pathway'}
node_dict = {}
count = 0
for index, row in df.iterrows():
    if row["x_id"] not in node_dict:
        node_dict[row["x_id"]] = {'name':row["x_name"], 'label':node_label_dict[row["x_type"]]}
    if row["y_id"] not in node_dict:
        node_dict[row["y_id"]] = {'name':row["y_name"], 'label':node_label_dict[row["y_type"]]}
    count += 1
    if count % 100000 == 0:
        print("Processed: " + str(count) + " lines.")

Processed: 100000 nodes.
Processed: 200000 nodes.
Processed: 300000 nodes.
Processed: 400000 nodes.
Processed: 500000 nodes.
Processed: 600000 nodes.
Processed: 700000 nodes.
Processed: 800000 nodes.
Processed: 900000 nodes.
Processed: 1000000 nodes.
Processed: 1100000 nodes.
Processed: 1200000 nodes.
Processed: 1300000 nodes.
Processed: 1400000 nodes.
Processed: 1500000 nodes.
Processed: 1600000 nodes.
Processed: 1700000 nodes.
Processed: 1800000 nodes.
Processed: 1900000 nodes.
Processed: 2000000 nodes.
Processed: 2100000 nodes.
Processed: 2200000 nodes.
Processed: 2300000 nodes.
Processed: 2400000 nodes.
Processed: 2500000 nodes.
Processed: 2600000 nodes.
Processed: 2700000 nodes.
Processed: 2800000 nodes.
Processed: 2900000 nodes.
Processed: 3000000 nodes.
Processed: 3100000 nodes.
Processed: 3200000 nodes.
Processed: 3300000 nodes.
Processed: 3400000 nodes.
Processed: 3500000 nodes.
Processed: 3600000 nodes.
Processed: 3700000 nodes.
Processed: 3800000 nodes.
Processed: 3900000 no

In [None]:
node_desc_dict = {}

In [51]:
import json
count = 0
start = 79000
for id, node in node_dict.items():
    if count <= start:
        count += 1
        continue
    if id in node_desc_dict:
        continue
    else:
        defs = results_list(node['name'], combined_top_k=1)
        if defs:
            for k,v in defs.items():
                node_desc_dict[id] = v
    count += 1
    if count % 1000 == 0:
        with open('./cache/node_desc_' + str(count), 'w') as f:
            json.dump(node_desc_dict, f)
        print("Processed " + str(count) + " nodes")

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

No results found.

404 Client Error: Not Found for url: https://uts-ws.nlm.nih.gov/rest/content/current/CUI/C1519230/definitions?apiKey=c48c3caa-a1cf-4ec5-8ff7-a91a77113bf5
No results found.

No results found.

No results

In [52]:
with open('./cache/node_desc_' + str(count), 'w') as f:
    json.dump(node_desc_dict, f)

In [29]:
defs = results_list("PHYHIP", combined_top_k=1)

https://uts-ws.nlm.nih.gov/rest/search/current?string=PHYHIP&apiKey=c48c3caa-a1cf-4ec5-8ff7-a91a77113bf5
404 Client Error: Not Found for url: https://uts-ws.nlm.nih.gov/rest/content/current/CUI/C1424604/definitions?apiKey=c48c3caa-a1cf-4ec5-8ff7-a91a77113bf5


In [None]:
from py2neo import Graph, Node, Relationship, Subgraph

graph = Graph('bolt://localhost:7687', auth=('neo4j', 'zongc0725'))
