In [5]:
%load_ext autoreload
%autoreload 2
import os
import re
import json
import copy
import sys
from tqdm import tqdm
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns

import pytrec_eval
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch
from haystack.pipelines import Pipeline
from haystack.nodes import BM25Retriever, ElasticsearchRetriever
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever, SentenceTransformersRanker
import warnings
warnings.filterwarnings('ignore')

sys.path.append('../../')
import globals
from elastic_search_utils import elastic_utils
from haystack_utils.retrievers import BioASQ_Retriever
import bioasq_eval

working_folder = globals.PATH.home + '/data/working_folder'
eval_home = globals.PATH.eval_home + '/'
gs_google_docs = eval_home + '/examples/aueb_google_docs/aueb_nlp-bioasq6b-submissions/'
index_name = globals.BIOASQ.index + 'working_folder'
model_id = 'doc_retrieval_test'

es = Elasticsearch(globals.ES.server)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained('michiyasunaga/BioLinkBERT-base')
model = AutoModel.from_pretrained('michiyasunaga/BioLinkBERT-base')
inputs = tokenizer("Sunitinib is a tyrosine kinase inhibitor", return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state

In [33]:
v = last_hidden_states.detach().numpy()[0,-1,:]
print(v.shape)
distance.cosine(v, v)

(768,)


0

In [7]:
#evaluate over aueb documents
test_batch_docs = [ #('','8b5_ES_30_full.json')
                ('6B1_golden.json', gs_google_docs+'1-aueb-nlp-4.json'),
                ('6B2_golden.json', gs_google_docs+'2-aueb-nlp-4.json'),
                ('6B3_golden.json', gs_google_docs+'3-aueb-nlp-4.json'),
                ('6B4_golden.json', gs_google_docs+'4-aueb-nlp-4.json'),
                ('6B5_golden.json', gs_google_docs+'5-aueb-nlp-4.json')
               ]

In [10]:
# set document store
document_store = ElasticsearchDocumentStore()
# create the retriever
retriever = BioASQ_Retriever(document_store = document_store)
# create the Query Pipeline
pipeline = Pipeline()
pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
prediction = pipeline.run(query="covid", params={"Retriever": {"top_k": 100}})
# predict
print([p.id for p in prediction['documents']])

['35805530', '34290652', '33818619', '34622965', '32161968', '32594211', '36260597', '34303669', '32915702', '35144461', '33028754', '34812083', '36040960', '33486531', '36107493', '34266454', '35144622', '36352477', '36062398', '33685285', '33278457', '33738812', '33713816', '36309479', '35165971', '36208038', '35627510', '36238713', '34926521', '35319081', '35760548', '33259695', '33686558', '36011996', '34378115', '33225288', '34882130', '34114480', '35280932', '34206226', '34078004', '35837898', '33208116', '34150333', '33909072', '36324261', '35806891', '34414930', '32996452', '34399573', '33196505', '35502213', '36085292', '33419040', '33450530', '34558870', '33799284', '35883244', '36416240', '34866519', '33465496', '33218796', '33686325', '35891225', '34631652', '34478463', '34164664', '36326380', '33577740', '36350626', '33886442', '34192604', '36043349', '36459751', '34655644', '34696311', '36309368', '35240494', '35079646', '34891707', '36255221', '36401405', '35028662', '34

In [42]:
sim_vector = [(0, 0.9019076228141785), (1, 0.8989342451095581), (2, 0.9023183584213257), (3, 0.8771029710769653), (4, 0.8976577520370483), (5, 0.8969278335571289)]
sorted_list = sorted(sim_vector, key=lambda x: x[1], reverse=False)
print(sorted_list)

[(3, 0.8771029710769653), (5, 0.8969278335571289), (4, 0.8976577520370483), (1, 0.8989342451095581), (0, 0.9019076228141785), (2, 0.9023183584213257)]


In [43]:
from scipy.spatial import distance

df = pd.DataFrame( columns=('batch', 'Mean precision', 'Recall', 'F-Measure', 'MAP', 'GMAP') )

def tokenize(text):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    return last_hidden_states

def similarity_q_doc(q, doc_text):
    v_q = tokenize(q).detach().numpy()[0,-1,:]
    v_doc = tokenize(doc_text).detach().numpy()[0,-1,:]
    sim = (1 - distance.cosine(v_q, v_doc))
    return sim

def rerank(q, docs, top=10):
    sim_vector = []
    for i, d in enumerate(docs):
        doc_text = d.meta['title'] + d.meta['abstract'] 
        sim = similarity_q_doc(q, doc_text[0:511])
        sim_vector.append((i,sim))
    sorted_list = sorted(sim_vector, key=lambda x: x[1], reverse=True)
    sorted_docs = []
    for i, score in sorted_list:
        sorted_docs.append(docs[i])
    return sorted_docs
    
for i, batch_file in enumerate(test_batch_docs):
    test_batch_json = json.load(open(batch_file[1]))
    for sample in tqdm(test_batch_json['questions'], position=0):
        prediction = pipeline.run(query=sample['body'], params={"Retriever": {"top_k": 100}})
        docs = prediction['documents']
        reranked_docs = rerank(sample['body'],docs)
        doc_list = [ globals.BIOASQ.doc_relative_url + doc.id for doc in reranked_docs ]
        sample['documents'] = doc_list[0:10]
            
    submission = test_batch_json.copy()
    submission_file_name = working_folder + "/" + model_id + '_' + batch_file[1].split('/')[-1]
    json.dump(submission, open(submission_file_name, 'w'))
    docs_score, pass_score = bioasq_eval.get_scores_phaseA(batch_file[0], submission, path_home=eval_home)
    print('Document Scores',docs_score)
    print('Passage Scores',pass_score)
    df.loc[i] = [ batch_file[0].split('.')[0] + '_' + batch_file[1].split('/')[-1].split('.')[0] ] + pass_score
    break

df.to_csv(working_folder + "/" + model_id+'.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [21:57<00:00, 13.18s/it]


Document Scores [0.05099999999999999, 0.1598460705960706, 0.061606786243698995, 0.020852380952380952, 0.0002485286946946103]
Passage Scores [0.21269651172236054, 0.25011684801025924, 0.19239072397187112, 0.16589622913215266, 0.011068641301611446]


In [46]:
similarity_q_doc('what are the symtoms of covid','we are the champions my friend')

0.8489047884941101

In [49]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('michiyasunaga/BioLinkBERT-base')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/559 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/447k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/379 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/225k [00:00<?, ?B/s]



In [52]:
embedding_1= model.encode('Early symptoms of COVID-19 may include a loss of taste or smell. Other symptoms can include: Shortness of breath or difficulty breathing;', convert_to_tensor=True)
embedding_2 = model.encode('Early symptoms of COVID-19 may include a loss of taste or smell. Other symptoms can include: Shortness of breath or difficulty breathing', convert_to_tensor=True)

util.pytorch_cos_sim(embedding_1, embedding_2)

tensor([[0.9979]])