### Create subgraphs for each question based on UMLS CUIs

In [None]:
import json
import os
import sys
import time
from collections import defaultdict

# sys.path entries must be directories (or zip archives), not module files,
# so register the directory that holds subgraph.py rather than the file itself.
sys.path.append(os.path.dirname(os.path.abspath("subgraph.py")))
from subgraph import (
    get_fourhop_subgraph,
    get_onehop_subgraph,
    get_threehop_subgraph,
    get_twohop_subgraph,
)

In [None]:
# Absolute location of the dataset on the author's machine; adjust when
# running elsewhere.
data_root = '/Users/arvin/dev/GreaseLM/data_kg-umls2hop_q-sciel-noex'
nephqa_root = f'{data_root}/nephqa'

# UMLS-linked statement files: path1 = train, path2 = dev, path3 = test.
linked_q_file_path1, linked_q_file_path2, linked_q_file_path3 = (
    f'{nephqa_root}/statement/{split}.statement.umls_linked.jsonl'
    for split in ('train', 'dev', 'test')
)

In [None]:
start = time.time()
subgraphs3 = await get_fourhop_subgraph(linked_q_file_path3)
#subgraphs3 = await get_threehop_subgraph(linked_q_file_path3)
#subgraphs3 = await get_twohop_subgraph(linked_q_file_path3)
#subgraphs3 = await get_onehop_subgraph(linked_q_file_path3)
end = time.time()
f"Time elapsed (s): {(end-start)}"

In [None]:
start = time.time()
subgraphs2 = await get_fourhop_subgraph(linked_q_file_path2)
#subgraphs2 = await get_threehop_subgraph(linked_q_file_path2)
#subgraphs2 = await get_twohop_subgraph(linked_q_file_path2)
#subgraphs2 = await get_onehop_subgraph(linked_q_file_path2)
end = time.time()
f"Time elapsed (s): {(end-start)}"

In [None]:
start = time.time()
subgraphs1 = await get_fourhop_subgraph(linked_q_file_path1)
#subgraphs1 = await get_threehop_subgraph(linked_q_file_path1)
#subgraphs1 = await get_twohop_subgraph(linked_q_file_path1)
#subgraphs1 = await get_onehop_subgraph(linked_q_file_path1)
end = time.time()
f"Time elapsed (s): {(end-start)}"

In [None]:
# Combine the per-split results in train, dev, test order.
# NOTE(review): presumably each subgraphsN is a list with one entry per
# question — confirm against subgraph.py.
subgraphs = subgraphs1 + subgraphs2 + subgraphs3

In [None]:
# Bare expression: shown as this cell's output when run in a notebook.
subgraphs

In [None]:
# For every graph in `subgraphs`, count the entries whose length is exactly 4.
graph_counts = [
    sum(1 for path in graph if len(path) == 4)
    for graph in subgraphs
]

In [None]:
import numpy as np

# Mean, standard deviation (NumPy's default settings), maximum and minimum of
# the per-graph counts, shown as a 4-tuple in the cell output.
tuple(stat(graph_counts) for stat in (np.mean, np.std, np.max, np.min))

### Create relevant DB files

In [None]:
# Output directory for the generated DB files, placed under the data root.
db_dir = f"{data_root}/ddb"
# exist_ok=True creates the directory if needed and is a no-op when it already
# exists, avoiding the race between a separate existence check and creation.
os.makedirs(db_dir, exist_ok=True)

# Destination files written by the cells below.
save_entities_file = os.path.join(db_dir, "ddb_names.json")
save_relations_file = os.path.join(db_dir, "ddb_relas.json")
save_ddb_to_umls_cui_file = os.path.join(db_dir, "ddb_to_umls_cui.txt")

In [None]:
# Auto-incrementing id maps: the first lookup of an unseen key stores the
# mapping's current size as its id; later lookups return that same id.
db_entity_ids = defaultdict(lambda: len(db_entity_ids))
db_relation_ids = defaultdict(lambda: len(db_relation_ids))

# Entity-side tables, filled by the population loop.
db_entities_json = {}   # entity name -> [db id, "1"]
db_to_umls = set()      # (db id, CUI) pairs
umls_to_db = {}         # CUI -> db id

# Relation table: relation id -> [subject db id, object db id, relation].
db_relations_json = {}

In [None]:
# Walk every graph and fold its edges into the entity / relation tables.
# Only groups of exactly 4 items are used; every other group is skipped.
for graph in subgraphs:
    for paths in graph:
        if len(paths) != 4:
            continue
        for path in paths:
            # Each item is indexed as
            # (subject CUI, subject name, object CUI, object name, relation).
            subj_cui, subj_name = path[0], path[1]
            obj_cui, obj_name = path[2], path[3]
            rel = path[4]

            # Ids are handed out on first lookup, subject before object.
            subj_id = db_entity_ids[subj_cui]
            obj_id = db_entity_ids[obj_cui]

            # Register both endpoints in the entity tables.
            endpoints = (
                (subj_cui, subj_name, subj_id),
                (obj_cui, obj_name, obj_id),
            )
            for cui, name, db_id in endpoints:
                db_to_umls.add((db_id, cui))
                umls_to_db[cui] = db_id
                # NOTE(review): the meaning of the constant "1" is not
                # visible here — confirm against the consumer of ddb_names.json.
                db_entities_json[name] = [db_id, "1"]

            # Register the edge; identical triples share one relation id.
            triple = (subj_id, obj_id, rel)
            db_relations_json[db_relation_ids[triple]] = list(triple)

In [None]:
# Dump the (db id, CUI) pairs as a tab-separated table, sorted, with a header
# row; the first and last columns are filled with the constant "0".
with open(save_ddb_to_umls_cui_file, 'w', encoding='utf-8') as f:
    header = ["LinkItemsToUMLSCUIID", "ItemPTR", "CUI", "ItemToUMLSCUILinkTypePTR"]
    f.write('\t'.join(header) + '\n')
    for db_ptr, cui in sorted(db_to_umls):
        f.write('\t'.join(("0", str(db_ptr), cui, "0")) + '\n')

In [None]:
# Persist the entity table (db_entities_json) as JSON.
# The dump call was commented out, which left the `with` block without a body
# (a syntax error) and would otherwise have truncated the file to empty.
with open(save_entities_file, 'w', encoding='utf-8') as f:
    json.dump(db_entities_json, f)

In [None]:
# Persist the relation table (db_relations_json) as JSON; the json module
# writes its integer keys as strings.
# The dump call was commented out, which left the `with` block without a body
# (a syntax error) and would otherwise have truncated the file to empty.
with open(save_relations_file, 'w', encoding='utf-8') as f:
    json.dump(db_relations_json, f)

In [None]:
# NOTE(review): looks like a leftover scratch cell — confirm it can be removed.
print(1)

In [None]:
# NOTE(review): looks like a leftover scratch cell — confirm it can be removed.
print(2)