In [2]:
## Prepare BERN output (named entity recognition and normalization) for binary relation extraction

In [2]:
import os
import pandas as pd
import json
from tqdm.notebook import tqdm
from collections import defaultdict
import random

In [8]:
# Directory that holds the BERN-normalized output, one "<doc_id>.json" file per document.
norm_path = "../Data/Abstract_Normalized/"
# List the directory through norm_path so the path is defined in exactly one place.
norm_files = os.listdir(norm_path)

In [6]:
# Load the sentence-split abstracts; later cells index this as sentences[doc_id][sent_id].
# The encoding is explicit so the result does not depend on the platform's default locale.
with open("../Data/Abstract_Splitted/total.json", "r", encoding="utf-8") as f:
    sentences = json.load(f)

In [9]:
# Document ids are the file names without their ".json" extension.
# Entries that are not ".json" files (checkpoint folders, OS metadata, ...) are skipped
# instead of raising ValueError from str.index.
norm_ids = [name[:-len(".json")] for name in norm_files if name.endswith(".json")]

In [10]:
# Flatten the BERN output into one record per entity mention.
# Each document file is a list with one element per sentence; an empty element is skipped.
# el["logits"] maps an entity type to a list of results, where res[0] is a dict holding at
# least "start"/"end" character offsets into el["text"] and res[-1] is used as the confidence.
all_entities = []
for doc_id in tqdm(norm_ids):
    with open(os.path.join(norm_path, doc_id + ".json"), "r", encoding="utf-8") as f:
        norm = json.load(f)
    for sent_id, el in enumerate(norm):
        if not el:
            continue
        for ent_type, results in el["logits"].items():
            for res in results:
                # res[0] is extended in place; `norm` is discarded after this document.
                feat_dict = res[0]
                feat_dict["text"] = el["text"][feat_dict["start"]:feat_dict["end"]]
                feat_dict["confidence"] = res[-1]
                feat_dict["type"] = ent_type
                feat_dict["doc_id"] = doc_id
                feat_dict["sent_id"] = sent_id
                all_entities.append(feat_dict)

HBox(children=(FloatProgress(value=0.0, max=74727.0), HTML(value='')))




In [11]:
# Total number of entity mentions collected across all documents.
print(len(all_entities))

727460


## Save All Entities

In [15]:
# One row per entity mention; the columns are the keys of the collected dicts.
df = pd.DataFrame(all_entities)

In [16]:
# Distinct entity types present in the table.
set(df["type"])

{'disease', 'drug', 'gene', 'species'}

In [18]:
# Write the full entity table to disk as JSON.
df.to_json("../Data/Abstract_Entities/entities.json")

## Finding Pairs

In [20]:
# Build every unordered pair of entity mentions that occur in the same sentence.
#
# The mentions were already extracted into `all_entities` (one dict per mention, tagged with
# "doc_id" and "sent_id"), so they are grouped by sentence here instead of re-reading and
# re-parsing every document file. Insertion order is preserved, so pairs come out in the same
# document / sentence / mention order as before.
entities_by_sentence = defaultdict(list)
for ent in all_entities:
    entities_by_sentence[(ent["doc_id"], ent["sent_id"])].append(ent)

all_pairs = []
for (doc_id, sent_id), sent_entities in tqdm(entities_by_sentence.items()):
    if len(sent_entities) < 2:
        # A single mention cannot form a pair; skip before touching `sentences`.
        continue
    sentence = sentences[doc_id][sent_id]
    for pos, t1 in enumerate(sent_entities):
        # Pair each mention only with the mentions listed before it, so every unordered
        # pair is produced exactly once (e1 is the later mention, e2 the earlier one).
        for t2 in sent_entities[:pos]:
            all_pairs.append({
                "sentence": sentence,
                "e1": t1["text"],
                "e2": t2["text"],
                "e1_type": t1["type"],
                "e2_type": t2["type"],
                # Mentions without an "id" key get the string placeholder "None".
                "e1_id": t1["id"] if "id" in t1 else "None",
                "e2_id": t2["id"] if "id" in t2 else "None",
                "e1_start": t1["start"],
                "e1_end": t1["end"],
                "e2_start": t2["start"],
                "e2_end": t2["end"],
                "doc_id": doc_id,
                "sent_id": sent_id,
            })

HBox(children=(FloatProgress(value=0.0, max=74727.0), HTML(value='')))




In [21]:
# Preview the candidate pairs as a table (one row per pair).
pd.DataFrame(all_pairs)

Unnamed: 0,sentence,e1,e2,e1_type,e2_type,e1_id,e2_id,e1_start,e1_end,e2_start,e2_end,doc_id,sent_id
0,Rubella virus (RV) genomic RNA contains two la...,NSPs,Rubella,gene,disease,CUI-less,MESH:D012409\tBERN:106907501,129,133,0,7,05185a9f8f03e3f557765009111155e015a2aa9fa2a3e1...,0
1,Rubella virus (RV) genomic RNA contains two la...,Rubella virus,Rubella,species,disease,NCBI:txid11041,MESH:D012409\tBERN:106907501,0,13,0,7,05185a9f8f03e3f557765009111155e015a2aa9fa2a3e1...,0
2,Rubella virus (RV) genomic RNA contains two la...,Rubella virus,NSPs,species,gene,NCBI:txid11041,CUI-less,0,13,129,133,05185a9f8f03e3f557765009111155e015a2aa9fa2a3e1...,0
3,Proteolytic processing of the RV NSP ORF trans...,p200,RV NSP ORF translation product,gene,gene,MIM:609539\tHGNC:18037\tEnsembl:ENSG0000018907...,CUI-less,61,65,30,60,05185a9f8f03e3f557765009111155e015a2aa9fa2a3e1...,1
4,Processing of p200 to two mature products (p15...,p150,p200,gene,gene,MIM:601246\tHGNC:1910\tEnsembl:ENSG00000167670...,MIM:609539\tHGNC:18037\tEnsembl:ENSG0000018907...,43,47,14,18,05185a9f8f03e3f557765009111155e015a2aa9fa2a3e1...,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115685,Prothrombin time was prolonged in 4 patients.,patient,Prothrombin,species,gene,,MIM:176930\tHGNC:3535\tEnsembl:ENSG00000180210...,36,43,0,11,beba54a62be2847562966396efb334f9bf951d8632d55f...,8
1115686,Four patients were diagnosed with disseminated...,DIC,disseminated intravascular coagulation,disease,disease,MESH:D004211\tBERN:107003001,MESH:D004211\tBERN:107003001,74,77,34,72,beba54a62be2847562966396efb334f9bf951d8632d55f...,10
1115687,Four patients were diagnosed with disseminated...,patient,disseminated intravascular coagulation,species,disease,,MESH:D004211\tBERN:107003001,5,12,34,72,beba54a62be2847562966396efb334f9bf951d8632d55f...,10
1115688,Four patients were diagnosed with disseminated...,patient,DIC,species,disease,,MESH:D004211\tBERN:107003001,5,12,74,77,beba54a62be2847562966396efb334f9bf951d8632d55f...,10


In [22]:
# Total number of same-sentence entity pairs.
len(all_pairs)

1115690

## Filtering for drug-gene pairs

In [31]:
def _first_id(entity_id):
    """Collapse a list of ids to its first element; return any other value unchanged.

    An empty list yields the string "None", the same placeholder the pair-building
    cell uses for mentions without an id, instead of raising IndexError.
    """
    if isinstance(entity_id, list):
        return entity_id[0] if entity_id else "None"
    return entity_id


def _as_drug_gene(pair):
    """Return a copy of ``pair`` oriented so that e1 is the drug and e2 is the gene.

    Returns None when the pair is not made of exactly one drug and one gene.
    The input dict is never modified.
    """
    kinds = (pair["e1_type"], pair["e2_type"])
    if kinds == ("drug", "gene"):
        drug, gene = "e1", "e2"
    elif kinds == ("gene", "drug"):
        drug, gene = "e2", "e1"
    else:
        return None

    # Copy first so key order (and therefore DataFrame column order) is preserved
    # and the dicts stored in all_pairs stay untouched.
    oriented = dict(pair)
    for suffix in ("", "_type", "_id", "_start", "_end"):
        oriented["e1" + suffix] = pair[drug + suffix]
        oriented["e2" + suffix] = pair[gene + suffix]
    oriented["e1_id"] = _first_id(oriented["e1_id"])
    oriented["e2_id"] = _first_id(oriented["e2_id"])
    return oriented


# Keep only drug-gene pairs, always stored as (e1 = drug, e2 = gene).
dg_pairs = []
# Distinct (drug_id, gene_id) combinations; both orientations of the input are
# recorded in this same order.
unique_pairs = set()
only_drug = True  # NOTE(review): not read anywhere in this cell; kept in case later cells use it.
for p in all_pairs:
    oriented = _as_drug_gene(p)
    if oriented is None:
        continue
    dg_pairs.append(oriented)
    unique_pairs.add((oriented["e1_id"], oriented["e2_id"]))

In [32]:
# Tabular view of the drug-gene pairs (one row per pair).
dg_pairs_d = pd.DataFrame(dg_pairs)

In [33]:
# Number of drug-gene pairs kept by the filter.
len(dg_pairs)

46166

In [34]:
# Write the drug-gene pair table to disk as JSON.
dg_pairs_d.to_json("../Data/Abstract_Entities/pairs.json")

In [39]:
# Spot-check one randomly chosen drug-gene pair.
# randrange excludes its upper bound; randint(0, len(dg_pairs)) includes it and could
# therefore return an index one past the end of the list.
idx = random.randrange(len(dg_pairs))
print(idx)
print("Sentence:")
print(dg_pairs[idx]["sentence"])
print("\n")
print("Entity 1:", dg_pairs[idx]["e1"], "\t", dg_pairs[idx]["e1_type"], '\t', dg_pairs[idx]["e1_id"])
print("Entity 2:", dg_pairs[idx]["e2"], "\t", dg_pairs[idx]["e2_type"], '\t', dg_pairs[idx]["e2_id"])

26732
Sentence:
Deduced amino acid sequences for both HEV strains revealed 89/88, 80, 93/92 and 95/94% identities with the structural proteins HE, S, M and N of BCoV and HCoV-OC43, respectively.


Entity 1: amino acid 	 drug 	 CHEBI:33704	BERN:282227403
Entity 2: HE 	 gene 	 CUI-less
