## Setup

In [1]:
import os
from glob import glob
import json
import re
from tqdm.notebook import tqdm
import itertools
import pandas as pd


# Setup
input_folder = "/proj/berzelius-2021-21/users/x_caoll/EasyNer_ner_output/merged/"


def file_number(path):
    """Return the digits found in the file name of ``path`` as an int.

    Only the base name is inspected, so digits in the directory part
    (e.g. "berzelius-2021-21") never end up in the sort key.
    Returns -1 when the file name contains no digit at all.
    """
    digits = ''.join(filter(str.isdigit, os.path.basename(path)))
    return int(digits) if digits else -1


# Numeric sort, so that merged10.json comes after merged9.json.
sorted_files = sorted(glob(f'{input_folder}*.json'), key=file_number)

# Preview the first ten batch files.
for i in sorted_files[:10]:
    print(i)

/proj/berzelius-2021-21/users/x_caoll/EasyNer_ner_output/merged/merged1.json
/proj/berzelius-2021-21/users/x_caoll/EasyNer_ner_output/merged/merged2.json
/proj/berzelius-2021-21/users/x_caoll/EasyNer_ner_output/merged/merged3.json
/proj/berzelius-2021-21/users/x_caoll/EasyNer_ner_output/merged/merged4.json
/proj/berzelius-2021-21/users/x_caoll/EasyNer_ner_output/merged/merged5.json
/proj/berzelius-2021-21/users/x_caoll/EasyNer_ner_output/merged/merged6.json
/proj/berzelius-2021-21/users/x_caoll/EasyNer_ner_output/merged/merged7.json
/proj/berzelius-2021-21/users/x_caoll/EasyNer_ner_output/merged/merged8.json
/proj/berzelius-2021-21/users/x_caoll/EasyNer_ner_output/merged/merged9.json
/proj/berzelius-2021-21/users/x_caoll/EasyNer_ner_output/merged/merged10.json


## Old code (now refactored into `co_occurence.py`)

In [10]:
def load_json(input_file):
    """Parse the UTF-8 encoded JSON file at ``input_file`` and return its content."""
    with open(input_file, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def get_batch_index(input_file, k="n"):
    '''
    Description:
        extract the batch number from a batch file name, e.g. "merged12.json" -> 12
    Parameters:
        input_file: path (or bare name) of the batch file, as a string
        k: unused; kept so that existing calls passing it keep working

    Returns:
        the trailing number as an int, or None when the name has no trailing number
    '''
    # Strings that already end in digits keep their previous result.
    m = re.search(r'\d+$', input_file)
    if m is None:
        # A path such as ".../merged12.json" ends with its extension, so the
        # pattern above cannot match it; retry on the extension-less base name.
        stem = os.path.splitext(os.path.basename(input_file))[0]
        m = re.search(r'\d+$', stem)
    return int(m.group()) if m else None
    
        

def get_pairs(sorted_files, entity1= "disease", entity2="chemical"):
    '''
    Description:
        count sentence-level co-occurrences of entity1 mentions with entity2
        mentions over every article of the given batch files
    Parameters:
        sorted_files: iterable of paths to the batch JSON files
        entity1: key in sent["entities"] holding the first entity's mentions
        entity2: key in sent["entities"] holding the second entity's mentions
        
    Returns:
        nested dictionary d[e1][e2] = {"freq": number of co-occurrences,
        "pmid": set of article ids (the batch keys), "sent": set of sentence texts}
    '''
    # NOTE(review): get_pairs is defined a second time further down in this
    # cell; that later definition is the one that stays bound after the cell runs.
    d = {}
    for input_file in tqdm(sorted_files):
        batch = load_json(input_file=input_file)

        for idx, article in batch.items():
            for sent in article["sentences"]:
                entities = sent["entities"]
                # The sentence must carry at least two entity types, and both
                # requested ones have to be among them.
                if len(entities) < 2 or entity1 not in entities or entity2 not in entities:
                    continue

                for e1 in entities[entity1]:
                    # An e1 entry is created even when there are no e2 mentions.
                    e1_pairs = d.setdefault(e1, {})

                    for e2 in entities[entity2]:
                        if e2 not in e1_pairs:
                            e1_pairs[e2] = {"freq":0, "pmid":set(), "sent":set()}

                        pair = e1_pairs[e2]
                        pair["freq"] += 1
                        pair["pmid"].add(idx)
                        pair["sent"].add(sent["text"])

    return d

def get_self_pairs(sorted_files, entity= "disease"):
    '''
    Description:
        count sentence-level co-occurrences between two different mentions of
        the same entity type over every article of the given batch files
    Parameters:
        sorted_files: iterable of paths to the batch JSON files
        entity: key in sent["entities"] whose mentions are paired with each other
        
    Returns:
        dictionary keyed by (mention_a, mention_b) tuples; each value is
        {"freq": number of co-occurrences, "pmid": set of article ids}.
        A pair is stored once, under the orientation in which it was first seen.
    '''
    d = {}
    for input_file in tqdm(sorted_files):
        batch = load_json(input_file=input_file)
        batch_idx = get_batch_index(input_file=input_file)

        for idx in tqdm(batch, desc=f'batch:{batch_idx}'):
            for sent in batch[idx]["sentences"]:
                if entity not in sent["entities"]:
                    continue

                # combinations() yields every (i, j) with i < j in the same
                # order as two nested index loops, and nothing for < 2 mentions.
                for e1, e2 in itertools.combinations(sent["entities"][entity], 2):
                    # Skip identical mentions and single-character mentions.
                    if e1 == e2 or len(e1) <= 1 or len(e2) <= 1:
                        continue

                    # Reuse the stored orientation so (a, b) and (b, a) share a counter.
                    if (e1, e2) in d:
                        key = (e1, e2)
                    elif (e2, e1) in d:
                        key = (e2, e1)
                    else:
                        key = (e1, e2)
                        d[key] = {"freq": 0, "pmid": set()}

                    d[key]["freq"] += 1
                    d[key]["pmid"].add(idx)

    return d
# 
# def create_df_from_pairs(d):
#     """
#     create a dataframe of pairs from the nested dictionary
#     """
#     l = []

#     for e1, val in tqdm(sorted(d.items())):
#         for e2 in sorted(val, key=lambda x: (val[x]['freq']), reverse=True):
#             l.append([e1,e2,val[e2]["freq"],",".join(val[e2]["pmid"])])

#     df = pd.DataFrame(l, columns=["entity_1", "entity_2", "frequency", "pmid"])
#     return df

def create_df_from_pairs2(d):
    """
    create a dataframe of pairs from the flat pair dictionary

    Parameters:
        d: dictionary keyed by (entity_1, entity_2) tuples whose values hold
           "freq" (a count) and "pmid" (a collection of id strings), i.e. the
           structure returned by get_self_pairs

    Returns:
        DataFrame with the columns entity_1, entity_2, frequency and pmid
        (ids joined with ","), ordered by descending frequency
    """
    rows = [
        # "pmid" is a set, so the ids are sorted to get a reproducible string.
        [e1, e2, val["freq"], ",".join(sorted(val["pmid"]))]
        for (e1, e2), val in tqdm(sorted(d.items()))
    ]

    df = pd.DataFrame(rows, columns=["entity_1", "entity_2", "frequency", "pmid"])
    # A stable sort keeps the alphabetical pair order among equal frequencies.
    return df.sort_values("frequency", ascending=False, kind="stable")


def get_pairs(sorted_files, entity1= "disease", entity2="chemical"):
    '''
    Description:
        count sentence-level co-occurrences of entity1 mentions with entity2
        mentions over every article of the given batch files
    Parameters:
        sorted_files: iterable of paths to the batch JSON files
        entity1: key in sent["entities"] holding the first entity's mentions
        entity2: key in sent["entities"] holding the second entity's mentions
        
    Returns:
        nested dictionary d[e1][e2] = {"freq": number of co-occurrences,
        "pmid": set of article ids (the batch keys), "sent": set of sentence texts}
    '''
    # NOTE(review): this is the second definition of get_pairs in this cell; it
    # replaces the one defined earlier in the cell when the cell is executed.
    d = {}
    for input_file in tqdm(sorted_files):
        batch = load_json(input_file=input_file)

        for idx, article in batch.items():
            for sent in article["sentences"]:
                entities = sent["entities"]
                # The sentence must carry at least two entity types, and both
                # requested ones have to be among them.
                if len(entities) < 2 or entity1 not in entities or entity2 not in entities:
                    continue

                for e1 in entities[entity1]:
                    # An e1 entry is created even when there are no e2 mentions.
                    e1_pairs = d.setdefault(e1, {})

                    for e2 in entities[entity2]:
                        if e2 not in e1_pairs:
                            e1_pairs[e2] = {"freq":0, "pmid":set(), "sent":set()}

                        pair = e1_pairs[e2]
                        pair["freq"] += 1
                        pair["pmid"].add(idx)
                        pair["sent"].add(sent["text"])

    return d

def create_df_from_pairs(d, entity1="disease", entity2="chemicals", entity1_alternate="", entity2_alternate=""):
    """
    create a dataframe of pairs from the nested dictionary

    Parameters:
        d: nested dictionary d[e1][e2] holding "freq", "pmid" (collection of
           id strings) and "sent", i.e. the structure returned by get_pairs
        entity1: name of the first column
        entity2: name of the second column
        entity1_alternate: when not "", used instead of entity1 as column name
        entity2_alternate: when not "", used instead of entity2 as column name

    Returns:
        DataFrame with one row per (e1, e2) pair and the columns
        <entity1>, <entity2>, frequency, pmid (ids joined with ",") and
        sentences (the "sent" value as stored), ordered by descending frequency
    """
    rows = []

    for e1, val in tqdm(sorted(d.items())):
        for e2 in sorted(val, key=lambda x: (val[x]['freq']), reverse=True):
            pair = val[e2]
            # "pmid" is a set, so the ids are sorted to get a reproducible string.
            rows.append([e1, e2, pair["freq"], ",".join(sorted(pair["pmid"])), pair["sent"]])

    columns = [str(entity1), str(entity2), "frequency", "pmid", "sentences"]

    if entity1_alternate!="":
        columns[0]=str(entity1_alternate)
    if entity2_alternate!="":
        columns[1]=str(entity2_alternate)

    df = pd.DataFrame(rows, columns=columns)
    # A stable sort keeps the row order built above among equal frequencies.
    return df.sort_values("frequency", ascending=False, kind="stable")

# Self pairs, e.g. disease-disease

In [15]:
# Import get_self_pairs from co_occurence.py; this rebinds the name that the
# "Old code" cell defines inside the notebook.
from co_occurence import get_self_pairs
# Build the disease-disease pair dictionary from the first two batch files only.
d = get_self_pairs(sorted_files[:2], entity="disease")
d


[AException ignored in: <function tqdm.__del__ at 0x7f2456959550>
Traceback (most recent call last):
  File "/home/x_caoll/.conda/envs/easyner_env/lib/python3.9/site-packages/tqdm/std.py", line 1162, in __del__
    self.close()
  File "/home/x_caoll/.conda/envs/easyner_env/lib/python3.9/site-packages/tqdm/notebook.py", line 288, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'
batch:None: 100%|██████████| 15377/15377 [00:00<00:00, 336024.20it/s]

batch:None: 100%|██████████| 13415/13415 [00:00<00:00, 305385.07it/s]

Processing files: 100%|██████████| 2/2 [00:01<00:00,  1.54it/s]


{('carcinosarcoma', 'hyperglycemia'): {'freq': 2, 'pmid': {'22'}},
 ('hypothermia', 'analgesia'): {'freq': 2, 'pmid': {'25', '6716'}},
 ('haemolytic crisis', 'pnh'): {'freq': 1, 'pmid': {'33'}},
 ('hypoxia', 'hypercapnia'): {'freq': 20,
  'pmid': {'103',
   '13884',
   '13941',
   '14277',
   '16390',
   '19864',
   '2406',
   '37370',
   '38862',
   '40092',
   '42038',
   '821',
   '8420',
   '8991'}},
 ('respiratory acidosis', 'asphyxia'): {'freq': 1, 'pmid': {'103'}},
 ('gastritis', 'ulcer'): {'freq': 1, 'pmid': {'123'}},
 ('gastritis', 'atrophy'): {'freq': 1, 'pmid': {'123'}},
 ('heartburn', 'epigastric pain'): {'freq': 1, 'pmid': {'123'}},
 ('heartburn', 'bile vomiting'): {'freq': 1, 'pmid': {'123'}},
 ('epigastric pain', 'bile vomiting'): {'freq': 1, 'pmid': {'123'}},
 ('papillary stenosis', 'ductal stones'): {'freq': 1, 'pmid': {'125'}},
 ('perforation', 'duodenocholedochal junction'): {'freq': 1, 'pmid': {'125'}},
 ('metabolic alkalaemia', 'decline in cardiac output'): {'freq'

In [9]:
# Turn the pair dictionary `d` into a DataFrame.
# NOTE(review): the recorded output of this cell is a KeyError: 'sent' and its
# execution count is older than the cell that builds `d` — re-run after a kernel restart to confirm it works.
df = create_df_from_pairs2(d)
df

Creating DataFrame:   0%|          | 0/18237 [00:00<?, ?it/s]


KeyError: 'sent'

# Cross-entity pairs, e.g. disease-chemical

In [2]:
# Import get_pairs from co_occurence.py; this rebinds the name that the
# "Old code" cell defines inside the notebook.
from co_occurence import get_pairs
# Pair "disease" mentions with "phenoma" mentions over the first 20 batch files.
# NOTE(review): "phenoma" presumably has to match an entity key in the NER output — confirm the spelling.
d_ = get_pairs(sorted_files[:20], entity1="disease", entity2="phenoma")
d_

Processing files:   0%|          | 0/20 [00:00<?, ?it/s]

Processing files: 100%|██████████| 20/20 [00:09<00:00,  2.19it/s]


{('depression', 'stream'): {'freq': 1,
  'pmid': {'168'},
  'sent': {'In 4 dogs injected intravenously (i.v.) with 125I labeled fibrinogen, 51Cr labeled platelets and 99mTc labeled albumin, and subjected to successively increasing amounts of i.v. infused monomethylmethacrylate, doses corresponding to the amounts released into the blood stream following implantation of acrylic cement during total hip replacements did not affect the clotting mechanism, did not cause trapping of platelets and fibrin in the lungs, did not generate fat emboli, and did not cause depression of the arterial oxygen tension or blood pressure.'}},
 ('pneumococcal', 'current'): {'freq': 1,
  'pmid': {'421'},
  'sent': {'The quellung test and gentamicin plate show improved sensitivity over current techniques for pneumococcal detection and can be recommended for general use.'}},
 ('pdi', 'current'): {'freq': 1,
  'pmid': {'507'},
  'sent': {'Skin potential, short-circuit current and their relationship to PDI were di

In [3]:
# Import create_df_from_pairs from co_occurence.py; this rebinds the name that
# the "Old code" cell defines inside the notebook.
from co_occurence import create_df_from_pairs
# Convert the pair dictionary `d_` into a DataFrame and display it.
df_ = create_df_from_pairs(d_)
df_

Creating DataFrame: 100%|██████████| 2091/2091 [00:00<00:00, 634287.24it/s]


Unnamed: 0,entity_1,entity_2,frequency,pmids,sentences
145,cancer,current,40,"397756,453167,469237,107379,376111,380617,3677...",These observations are discussed in terms of t...
111,pain,current,27,"298841,285695,474949,380617,316769,359212,4610...",Results were as follows: no effect in nine cas...
174,depression,wave,26,"335012,607494,46817,426983,427282,65272,234363...","At the maximum tolerable dose of 1400 mg/m2, t..."
219,tumor,current,25,"68945,192689,367731,95452,167462,424548,189626...","Moreover, according to our current estimate, t..."
245,seizures,wave,19,"97076,67023,416952,89944,477631,388250,477642,...",The EEG showed right parietal sharp wave disch...
...,...,...,...,...,...
833,adrenal hemorrhage,current,1,205758,Two-thirds of the current series had impaired ...
831,deficiency of ldl receptors,current,1,205553,The current findings are consistent with the c...
829,lipolysis,ice,1,205405,In the reconstructed lipid micelles which cons...
828,amebic peritonitis,current,1,205190,Emetine hydrochloride alone or followed by met...


In [10]:
# save_pairs_to_csv is only needed by the CSV export that is commented out below.
from co_occurence import save_pairs_to_csv

# save_pairs_to_csv(df_, "/home/x_caoll/EasyNer/results/co_occurrences.csv")
# Write the pair table `df_` to an Excel workbook (absolute, user-specific path).
df_.to_excel("/home/x_caoll/EasyNer/results/co_occurrences.xlsx")
