### Create Subgraphs for each Question based on UMLS CUI

In [None]:
import json
import sys
# NOTE(review): sys.path entries are expected to be directories (or zip
# archives). "relation.py" is a file name, so this line most likely has no
# effect on the `relation` import below, which presumably resolves through the
# working directory instead -- confirm, and append the containing directory if
# the import ever needs help.
sys.path.append("relation.py")
from relation import async_get_relations, get_all_relations

In [None]:
# Root directory of the NephQA data; the paths built below are relative to it.
nephqa_root = "/Users/arvin/dev/GreaseLM/data_kg-dd-db-10text-sciel-1zero_q-sciel-noex/nephqa"

In [None]:
# Load the entity-linking results: a JSON Lines file, one JSON record per line.
basefile = f'{nephqa_root}/statement/test.statement.umls_linked.jsonl'

# Read with an explicit encoding so the result does not depend on the
# platform's default locale.
with open(basefile, encoding='utf-8') as f:
    lines = f.readlines()

# Skip blank lines (e.g. a trailing empty line at the end of the file), which
# json.loads would otherwise reject.
json_lines = [json.loads(line) for line in lines if line.strip()]

In [None]:
# Collect, for every question, the 'Concept ID' of each linking result attached
# to the entities of its stem. question_cuis[i] is a flat list for question i.
question_cuis = []
for record in json_lines:
    stem_cuis = []
    for entity in record['question']['stem_ents']:
        # Entities without a 'linking_results' entry contribute nothing.
        if 'linking_results' not in entity:
            continue
        for match in entity['linking_results']:
            stem_cuis.append(match['Concept ID'])
    question_cuis.append(stem_cuis)

In [None]:
# Collect, for every question, one list of 'Concept ID' values per answer
# choice. answer_cuis[i][j] is the flat list for choice j of question i.
answer_cuis = [
    [
        [
            match['Concept ID']
            for entity in option['text_ents']
            for match in entity['linking_results']
        ]
        for option in record['question']['choices']
    ]
    for record in json_lines
]

In [None]:
async def get_all_related_cuis(CUI, session):
    """Fetch the relations of one concept and flatten them into triples.

    Args:
        CUI: Concept identifier passed to ``async_get_relations``.
        session: Session object forwarded unchanged to ``async_get_relations``.

    Returns:
        A list of ``(related_cui, relationLabel, additionalRelationLabel)``
        tuples, one per entry of the response's ``"result"`` list. The list is
        empty when the response has no ``"result"`` key.
    """
    response = await async_get_relations(CUI, session)
    if "result" not in response:
        return []

    triples = []
    for relation in response["result"]:
        # "relatedId" is slash-separated; its last segment is used as the
        # identifier of the related concept.
        related_cui = relation["relatedId"].rsplit('/', 1)[-1]
        triples.append((related_cui,
                        relation["relationLabel"],
                        relation["additionalRelationLabel"]))
    return triples


async def get_two_hop_paths(source_cuis, dest_cuis, session, index):
    """Find every two-hop path from a source concept to a destination concept.

    For each source concept its related concepts are fetched with
    ``get_all_related_cuis``; for each of those intermediates the related
    concepts are fetched again, and a path is kept when the second hop lands
    on one of ``dest_cuis``.

    Args:
        source_cuis: Iterable of concept identifiers to start from.
        dest_cuis: Iterable of (hashable) concept identifiers that a path
            must end on.
        session: Session object forwarded to ``get_all_related_cuis``.
        index: Label printed in front of the progress output.

    Returns:
        A list of paths. Each path is a two-element list
        ``[(source, intermediate, rel1, rel2), (intermediate, target, rel1, rel2)]``.
        The list is empty when either input is empty.
    """
    # Build the membership set once instead of scanning a list for every
    # candidate target.
    dest_lookup = set(dest_cuis)
    if not dest_lookup:
        # No path can match, so skip the remote lookups entirely.
        return []

    two_hop_paths = []
    for i, source_cui in enumerate(source_cuis):
        print(f"{index}  {i}")
        int_cuis_rels = await get_all_related_cuis(source_cui, session)
        for j, (int_cui, int_rel1, int_rel2) in enumerate(int_cuis_rels):
            print(f"{index}  {i}.{j}")
            target_cuis_rels = await get_all_related_cuis(int_cui, session)
            first_hop = (source_cui, int_cui, int_rel1, int_rel2)
            two_hop_paths.extend(
                [first_hop, (int_cui, target_cui, target_rel1, target_rel2)]
                for target_cui, target_rel1, target_rel2 in target_cuis_rels
                if target_cui in dest_lookup
            )
    return two_hop_paths

In [None]:
# get 2-hop subgraphs from question_cuis and answer_cuis using parallelized approach

import asyncio
import aiohttp
import time


# for parallelizing http requests, reference this: https://stackoverflow.com/questions/57126286/fastest-parallel-requests-in-python
async def main(question_cuis, answer_cuis):
    """Gather the two-hop paths for every (question, answer choice) pair.

    Args:
        question_cuis: One list of concept identifiers per question.
        answer_cuis: One list per question, holding one list of concept
            identifiers per answer choice. Paired with ``question_cuis`` via
            ``zip``.

    Returns:
        A flat list with one ``get_two_hop_paths`` result per
        (question, choice) pair, in question-then-choice order.
    """
    async with aiohttp.ClientSession() as session:
        # One coroutine per (question, choice) pair, all sharing the session.
        pending = []
        for q_idx, (stem_cuis, choices) in enumerate(zip(question_cuis, answer_cuis)):
            for c_idx, choice_cuis in enumerate(choices):
                pending.append(
                    get_two_hop_paths(stem_cuis, choice_cuis, session, (q_idx, c_idx))
                )
        subgraphs = await asyncio.gather(*pending)
    print("Finalized all. Return is a list of len {} outputs.".format(len(subgraphs)))
    return subgraphs


# Wall-clock timestamps around the run; `end - start` is the elapsed time in
# seconds (not printed anywhere in the visible cells).
start = time.time()
# Top-level `await`: relies on the notebook/IPython session providing a running
# event loop; a plain script would need asyncio.run(main(...)) instead.
subgraphs = await main(question_cuis, answer_cuis)
end = time.time()

In [None]:
# Scratch cell: prints 1. Its purpose is not evident from the code
# (possibly a check that the kernel is responsive).
print(1)

In [None]:
# Display the first gathered result, i.e. the two-hop paths found for the
# first (question, choice) pair.
subgraphs[0]

In [None]:
# For every subgraph from index 1000 onward, count its entries that have
# exactly two elements.
graph_counts = []
for subgraph in subgraphs[1000:]:
    graph_counts.append(sum(1 for path in subgraph if len(path) == 2))

In [None]:
import numpy as np

# Summary of the per-subgraph counts: mean, standard deviation, maximum,
# minimum, and how many subgraphs were counted.
# NOTE(review): np.max / np.min raise on an empty sequence, so this fails if
# `subgraphs` has 1000 or fewer entries -- confirm the expected size.
np.mean(graph_counts), np.std(graph_counts), np.max(graph_counts), np.min(graph_counts), len(graph_counts)