In [9]:
import os
from glob import glob
import json
import re
from tqdm.notebook import tqdm
import itertools
import pandas as pd

In [3]:
input_folder = "/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/"#"../data/merged-folder/merge_disease_chemical/"
sorted_files = sorted(glob(f'{input_folder}*.json'),key=lambda f: int(''.join(filter(str.isdigit, f))))
for i in sorted_files[:20]:
    print(i)

/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged1.json
/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged2.json
/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged3.json
/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged4.json
/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged5.json
/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged6.json
/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged7.json
/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged8.json
/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged9.json
/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged10.json
/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged11.json
/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged12.json
/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged13.json
/home/rafsan/aitslab/nlp/EasyNER/results/merged-folder_v2/merged14.json
/

In [20]:
def load_json(input_file):
    with open(input_file, "r",encoding="utf-8") as f:
        return json.loads(f.read())

def get_batch_index(input_file, k="n"):
    m = re.search(r'\d+$', input_file)
    return int(m.group()) if m else None
    #return int(os.path.splitext(os.path.basename(input_file))[0].split(k)[-1])
    
        

def get_pairs(sorted_files, entity1= "disease", entity2="chemical"):
    '''
    Description:
        get pairs of entity1 and entity2 from the list of articles 
    Parameters:
        sorted_files: sorted files from folder
        output_file: dataframe
        entity1: first entity
        entity2: second entity
        
    Returns:
        dictionary of pairs
    '''
    d = {}
    for input_file in tqdm(sorted_files):
        articles = {}
        # d = load_json(input_file=input_file)
        # batch = {k:d[k] for k in list(d)[:20]}
        batch = load_json(input_file=input_file)
        batch_idx = get_batch_index(input_file=input_file)
    
        
        for idx in batch:
            article=batch[idx]
            
            for s_idx, sent in enumerate(article["sentences"]):
                if len(sent["entities"])>=2:
                    if entity1 in sent["entities"] and entity2 in sent["entities"]:
                        
                        # create an entry for the dict for e1
                        for e1 in sent["entities"][entity1]:
                            if e1 not in d:
                                d[e1] = {}
                            
                            #for each pair e1 e2 create an entry in the subdict
                            for e2 in sent["entities"][entity2]:
                                if e2 not in d[e1]:
                                    d[e1][e2] = {"freq":0, "pmid":set(), "sent":set()}

                                d[e1][e2]["freq"]+=1
                                d[e1][e2]["pmid"].update([idx])
                                d[e1][e2]["sent"].update([sent["text"]])
                        


    return d

def get_self_pairs(sorted_files, entity= "disease"):
    '''
    Description:
        get co occurrance pairs within same model from the list of articles 
    Parameters:
        sorted_files: sorted files from folder
        output_file: dataframe
        entity1: first entity
        entity2: second entity
        
    Returns:
        dictionary of pairs
    '''
    d = {}
    l = []
    for input_file in tqdm(sorted_files):
        articles = {}
        # d = load_json(input_file=input_file)
        # batch = {k:d[k] for k in list(d)[:20]}
        batch = load_json(input_file=input_file)
        batch_idx = get_batch_index(input_file=input_file)
    
        
        for idx in tqdm(batch, desc=f'batch:{batch_idx}'):
            article=batch[idx]
            
            for s_idx, sent in enumerate(article["sentences"]):
                if entity in sent["entities"]: 
                    if len(sent["entities"][entity])>=2:
                        
                        for i in range(len(sent["entities"][entity])):
                            for j in range(i+1, len(sent["entities"][entity])):
                                
                                e1 = sent["entities"][entity][i]
                                s1 = sent["entity_spans"][entity][i]
                                e2 = sent["entities"][entity][j]
                                s2 = sent["entity_spans"][entity][j]
                                if e1!=e2 and len(e1)>1 and len(e2)>1:
                                    
                                    if (e1,e2) in d:
                                        d[(e1,e2)]["freq"]+=1
                                        d[(e1,e2)]["pmid"].update([idx])
#                                         print("first", d)
                                    
                                    elif (e2,e1) in d:
                                        d[(e2,e1)]["freq"]+=1
                                        d[(e2,e1)]["pmid"].update([idx])
#                                         print("sec", d)
                                        
                                    
                                    else:
                                        d[(e1,e2)]= {"freq":1, "pmid":set([idx])}


    return d
# 
# def create_df_from_pairs(d):
#     """
#     create a dataframe of pairs from the nested dictionary
#     """
#     l = []

#     for e1, val in tqdm(sorted(d.items())):
#         for e2 in sorted(val, key=lambda x: (val[x]['freq']), reverse=True):
#             l.append([e1,e2,val[e2]["freq"],",".join(val[e2]["pmid"])])

#     df = pd.DataFrame(l, columns=["entity_1", "entity_2", "frequency", "pmid"])
#     return df

def create_df_from_pairs2(d):
    """
    create a dataframe of pairs from the nested dictionary
    """
    l = []

    for (e1,e2), val in tqdm(sorted(d.items())):
            l.append([e1,e2,val["freq"],",".join(val["pmid"])])

    df = pd.DataFrame(l, columns=["entity_1", "entity_2", "frequency", "pmid"])
    return df.sort_values("frequency", ascending=False)


def get_pairs(sorted_files, entity1= "disease", entity2="chemical"):
    '''
    Description:
        get pairs of entity1 and entity2 from the list of articles 
    Parameters:
        sorted_files: sorted files from folder
        output_file: dataframe
        entity1: first entity
        entity2: second entity
        
    Returns:
        dictionary of pairs
    '''
    d = {}
    for input_file in tqdm(sorted_files):
        articles = {}
        # d = load_json(input_file=input_file)
        # batch = {k:d[k] for k in list(d)[:20]}
        batch = load_json(input_file=input_file)
        batch_idx = get_batch_index(input_file=input_file)
    
        
        for idx in batch:
            article=batch[idx]
            
            for s_idx, sent in enumerate(article["sentences"]):
                if len(sent["entities"])>=2:
                    if entity1 in sent["entities"] and entity2 in sent["entities"]:
                        
                        # create an entry for the dict for e1
                        for e1 in sent["entities"][entity1]:
                            if e1 not in d:
                                d[e1] = {}
                            
                            #for each pair e1 e2 create an entry in the subdict
                            for e2 in sent["entities"][entity2]:
                                if e2 not in d[e1]:
                                    d[e1][e2] = {"freq":0, "pmid":set(), "sent":set()}

                                d[e1][e2]["freq"]+=1
                                d[e1][e2]["pmid"].update([idx])
                                d[e1][e2]["sent"].update([sent["text"]])
                        


    return d

def create_df_from_pairs(d, entity1="disease", entity2="chemicals", entity1_alternate="", entity2_alternate=""):

    l = []

    for e1, val in tqdm(sorted(d.items())):
        for e2 in sorted(val, key=lambda x: (val[x]['freq']), reverse=True):
            l.append([e1,e2,val[e2]["freq"],",".join(val[e2]["pmid"]), val[e2]["sent"]])
    
    columns = [str(entity1), str(entity2), "frequency", "pmid", "sentences"]
    
    if entity1_alternate!="":
        columns[0]=str(entity1_alternate)
    if entity2_alternate!="":
        columns[1]=str(entity2_alternate)
#     print(columns)
    df = pd.DataFrame(l, columns=columns)
    return df.sort_values("frequency", ascending=False)

# Self pairs ex: disease-disease

In [11]:
d = get_self_pairs(sorted_files[:2], entity="disease")
d

  0%|          | 0/2 [00:00<?, ?it/s]

batch:None:   0%|          | 0/15377 [00:00<?, ?it/s]

batch:None:   0%|          | 0/13415 [00:00<?, ?it/s]

{('carcinosarcoma', 'hyperglycemia'): {'freq': 2, 'pmid': {'22'}},
 ('hypothermia', 'analgesia'): {'freq': 2, 'pmid': {'25', '6716'}},
 ('haemolytic crisis', 'pnh'): {'freq': 1, 'pmid': {'33'}},
 ('hypoxia', 'hypercapnia'): {'freq': 20,
  'pmid': {'103',
   '13884',
   '13941',
   '14277',
   '16390',
   '19864',
   '2406',
   '37370',
   '38862',
   '40092',
   '42038',
   '821',
   '8420',
   '8991'}},
 ('respiratory acidosis', 'asphyxia'): {'freq': 1, 'pmid': {'103'}},
 ('gastritis', 'ulcer'): {'freq': 1, 'pmid': {'123'}},
 ('gastritis', 'atrophy'): {'freq': 1, 'pmid': {'123'}},
 ('heartburn', 'epigastric pain'): {'freq': 1, 'pmid': {'123'}},
 ('heartburn', 'bile vomiting'): {'freq': 1, 'pmid': {'123'}},
 ('epigastric pain', 'bile vomiting'): {'freq': 1, 'pmid': {'123'}},
 ('papillary stenosis', 'ductal stones'): {'freq': 1, 'pmid': {'125'}},
 ('perforation', 'duodenocholedochal junction'): {'freq': 1, 'pmid': {'125'}},
 ('metabolic alkalaemia', 'decline in cardiac output'): {'freq'

In [12]:
df = create_df_from_pairs2(d)
df

  0%|          | 0/18237 [00:00<?, ?it/s]

Unnamed: 0,entity_1,entity_2,frequency,pmid
17137,tumor,tumors,68,"49416,50131,37817,2936,3660,42207,26872,49053,..."
7394,hepatoma,cirrhosis,34,54018491124782857625
4431,depression,anxiety,33,"23793,32777,19370,17303,28711,25022,11425,1248..."
17121,tumor,normal,28,"58067,24847,41958,33797,37814,25438,29133,3631..."
182,acidosis,alkalosis,27,"31597,38508,911,34811,54123,7231,15306,4259,24..."
...,...,...,...,...
6419,germ cell carcinoma,cancer,1,27442
6420,germ cell carcinoma,cryptorchidism,1,27442
6422,germ cell tumors,neoplasms,1,47695
6423,germ cell tumors,yolk sac tumor,1,47695


# Pairs ex: chemical-disease

In [16]:
d_ = get_pairs(sorted_files[:2], entity1="disease", entity2="chemical")
d_

  0%|          | 0/2 [00:00<?, ?it/s]

{'carcinosarcoma': {'ba 1': {'freq': 1,
   'pmid': {'22'},
   'sent': {'Experiments in 214 DS carcinosarcoma bearing Wistar rats have shown that BA 1, at a dosage of only about 12 percent LD50 (150 mg kg) and negligible lethality (1.7 percent), results in a recovery rate of 40 percent without hyperglycemia and, in one test, of 80 percent with hyperglycemia.'}},
  'if': {'freq': 1,
   'pmid': {'22'},
   'sent': {'Since IF can be taken as one of the most efficient cancerostatics--there is no other chemotherapeutic known up to now that has a more significant effect on the DS carcinosarcoma in rats -- these findings are of special importance.'}},
  'glucose': {'freq': 1,
   'pmid': {'979'},
   'sent': {'Tumour peracidity in otherwise moderately hyperacidulated tumours or tumour regions of DS carcinosarcoma-bearing Wistar rats attained by glucose infusion was substantially increased by simultaneous infusion of amygdalin and intratumoral i.m. or i.v. application of beta-glucosidase.'}},
  'a

In [21]:
df_ = create_df_from_pairs(d_)
df_

  0%|          | 0/4239 [00:00<?, ?it/s]

Unnamed: 0,disease,chemicals,frequency,pmid,sentences
1357,anxiety,lorazepam,34,"25820,24646,12915,30765,30768,30767,31063,2566...",{Twelve studies were undertaken under a common...
7977,hypertension,propranolol,30,"7576,14499,17892,6161,18018,32007,7688,40062,1...","{Thus, administration of the beta-adrenergic r..."
16245,tumor,bleomycin,30,"49220,56439,51079,56131,56230,53278,51922,6050...",{The further use of 99mTc-bleomycin scintigrap...
1358,anxiety,diazepam,28,"40840,27465,31063,35189,22419,20284,1213,27927...","{Of these thirteen, ten did not return and wer..."
9292,hypoxia,oxygen,27,"13884,1292,821,36182,3095,23212,29054,34112,25...",{The respiratory burst of human neutrophils wa...
...,...,...,...,...,...
6356,gm1 - gangliosidosis type 1 or type 2a,ganglioside,1,27879,{There was no active fraction for GM1-ganglios...
6357,gm1 gangliosidosis,ganglioside,1,23231,{The type A or 'acid' and type B or 'neutral' ...
6358,gm1 gangliosidosis,hemosiderin,1,45156,{Human splenic sinuses were observed for the i...
6359,gm2 - gangliosidosis,- ganglioside,1,53174,{6. Tay-Sachs disease (GM2-gangliosidosis): ce...
