In [1]:
%load_ext autoreload
%autoreload 2
import os
import re
import json
import copy
import sys
from tqdm import tqdm
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns

import pytrec_eval
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch
from haystack.pipelines import Pipeline
from haystack.nodes import BM25Retriever, ElasticsearchRetriever
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever, SentenceTransformersRanker
from haystack.nodes.reader import FARMReader
from haystack.utils import print_answers
import warnings
warnings.filterwarnings('ignore')

sys.path.append('../../')
import globals
from elastic_search_utils import elastic_utils
from haystack_utils.retrievers import BioASQ_Retriever
import bioasq_eval

working_folder = globals.PATH.home + '/data/working_folder'
eval_home = globals.PATH.eval_home + '/'
gs_google_docs = eval_home + '/examples/aueb_google_docs/aueb_nlp-bioasq6b-submissions/'
index_name = globals.BIOASQ.index + 'working_folder'
model_id = 'doc_retrieval_test'

es = Elasticsearch(globals.ES.server)

Home path : /opt/bioasq/col-un-bioasq11
Eval path : /opt/bioasq/Evaluation-Measures


In [2]:
test_batch_doc = f'{working_folder}/test11b/BioASQ-task11bPhaseA-testset1.json'

In [3]:
test_batch_json = json.load(open(test_batch_doc))

In [4]:
# set document store
document_store = ElasticsearchDocumentStore()

# Create pipeline with only BM25

In [7]:
# create the retriever
retriever = BioASQ_Retriever(document_store = document_store)
# create the Query Pipeline
pipeline = Pipeline()
pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])

In [18]:
for sample in tqdm(test_batch_json['questions'], position=0):
    prediction = pipeline.run(query=sample['body'], params={"Retriever": {"top_k": 15}})
    doc_list = [ globals.BIOASQ.doc_relative_url + doc.id for doc in prediction['documents'] ]
    sample['documents'] = doc_list

100%|███████████████████████████████████████████████████████████████████████████| 75/75 [00:48<00:00,  1.54it/s]


In [23]:
submission = test_batch_json.copy()
submission_file_name =  working_folder + "/test11b/" + test_batch_doc.split('/')[-1].replace('.json','_bm25.json')
json.dump(submission, open(submission_file_name, 'w'))

# Create a pipeline with BM25 and Ranker

In [24]:
pipeline = Pipeline()
pipeline.add_node(component=retriever, name="BM25Retriever", inputs=["Query"])
ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")
pipeline.add_node(component=ranker, name="Ranker", inputs=["BM25Retriever"])

In [28]:
for sample in tqdm(test_batch_json['questions'], position=0):
    prediction = pipeline.run(query=sample['body'], params={"BM25Retriever": {"top_k": 100}, "Ranker": {"top_k": 20}, })
    doc_list = [ globals.BIOASQ.doc_relative_url + doc.id for doc in prediction['documents'] ]
    sample['documents'] = doc_list

100%|███████████████████████████████████████████████████████████████████████████| 75/75 [04:25<00:00,  3.54s/it]


In [29]:
submission = test_batch_json.copy()
submission_file_name =  working_folder + "/test11b/" + test_batch_doc.split('/')[-1].replace('.json','_bm25_crossenc_ranker_20.json')
json.dump(submission, open(submission_file_name, 'w'))

# Create a Pipeline with Ranker and Reader

In [11]:
# set document store
document_store = ElasticsearchDocumentStore()
# create the retriever
retriever = BioASQ_Retriever(document_store = document_store)

# create the Sentence Transformer Ranker
ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")

# create the Query Pipeline
pipeline = Pipeline()

# add bm25 retriever
pipeline.add_node(component=retriever, name="BM25Retriever", inputs=["Query"])
pipeline.add_node(component=ranker, name="Ranker", inputs=["BM25Retriever"])
my_model = "deepset/roberta-base-squad2"
reader = FARMReader(model_name_or_path=my_model, use_gpu=True)
pipeline.add_node(component=reader, name="Reader", inputs=["Ranker"])

# run the pipeline
prediction = pipeline.run(query="Which factors drive replisome disassembly during DNA replication termination and mitosis", params={"BM25Retriever": {"top_k": 100}})

# predict
print([p.id for p in prediction['documents']])

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

['30979826', '30340827', '34269473', '34195792', '28368371', '34700328', '35798141', '31545170', '26255844', '32490508']


In [None]:
pred = prediction['answers'][1]

def get_snipet_from_answer(pred):
    is_extractive = pred.type == 'extractive'
    answer = {}
    answer['document'] =  globals.BIOASQ.doc_relative_url + pred.document_ids[0]
    answer['text'] = pred.context
    answer['offsetInBeginSection'] = pred.meta['abstract'].rfind(pred.context)
    answer['offsetInEndSection'] = answer['offsetInBeginSection'] + len(pred.context)
    answer["beginSection"] = "abstract",
    answer["endSection"] = "abstract"
    return answer
    
for sample in tqdm(test_batch_json['questions'], position=0):
    prediction = pipeline.run(query=sample['body'], params={"BM25Retriever": {"top_k": 50}, "Ranker": {"top_k": 30}, "Reader": {"top_k": 30} })
    snippets = []
    doc_list_by_snnipets = []
    for ans in prediction['answers']:
        if ans.score > 0.2:
            snippet = get_snipet_from_answer(ans)
            doc_list_by_snnipets.append(snippet['document'])
            snippets.append(snippet)
            
    sample['snippets'] = snippets
    doc_list = [ globals.BIOASQ.doc_relative_url + d.id for d in prediction['documents'] ]
    #sample['documents'] = doc_list[0:10]
    sample['documents'] = list(set(doc_list_by_snnipets))
    
# mindlab base ans.score > 0.3:
# mindlab tns ans.score > 0.4 and only snnipets docs

In [137]:
submission_file_name =  working_folder + "/test11b/b4_cnn_submit_" + test_batch_doc.split('/')[-1].replace('.json','_bm25_crossenc_ranker_20.json')

with open(submission_file_name, "w") as outfile:
    json.dump(test_batch_json, outfile)

In [85]:
#test_batch_json

### Improve passages

In [98]:
import editdistance
editdistance.eval('banana', 'bahama')
import nltk.data
import src.elastic_search_utils.elastic_utils as es_util

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
data = "This is a sentence. This is another."
sentences = tokenizer.tokenize(data)
print('\n-----\n'.join(sentences))

This is a sentence.
-----
This is another.


In [103]:
from rapidfuzz.distance import Levenshtein
Levenshtein.normalized_similarity("levenshtein", "levenshtein")

1.0

In [135]:
for sample in tqdm(test_batch_json['questions'], position=0):
    for s in sample['snippets']:
        text_raw = s['text']
        doc_id = s['document'].replace(globals.BIOASQ.doc_relative_url,'')
        res = es_util.search_doc_by_id(doc_id)
        doc_title = res[doc_id]['title']
        doc_abstract =  res[doc_id]['abstract']
        sentences = tokenizer.tokenize(doc_abstract)
        # validate if the sentence is similar to the snippet
        for sentence in sentences:
            if Levenshtein.normalized_similarity(sentence,text_raw) > 0.7:
                if len(sentence) > len(text_raw):
                    s['text'] = sentence

100%|███████████████████████████████████████████████████████████████████████████| 75/75 [00:02<00:00, 26.39it/s]


In [120]:
submission_file_name =  working_folder + "/test11b/submit_improved2_" + test_batch_doc.split('/')[-1].replace('.json','_bm25_crossenc_ranker_20.json')

with open(submission_file_name, "w") as outfile:
    json.dump(test_batch_json, outfile)

## Rank Pasaages

In [131]:
name = "submit_improved2_BioASQ-task11bPhaseA-testset1_bm25_crossenc_ranker_20.json"
#name = "submit_experimental_improved_BioASQ-task11bPhaseA-testset1_bm25_crossenc_ranker_20.json"
#name = "submit_tns_BioASQ-task11bPhaseA-testset1_bm25_crossenc_ranker_20.json"
#name = "submit_mindlabbase_BioASQ-task11bPhaseA-testset1_bm25_crossenc_ranker_20.json"

submission_file_name = f'{working_folder}/test11b/{name}'
submission_file = json.load(open(submission_file_name))
for sample in tqdm(submission_file['questions'], position=0):
    sample['documents'] = list(set(sample['documents']))
with open(submission_file_name, "w") as outfile:
    json.dump(submission_file, outfile)

100%|███████████████████████████████████████████████████████████████████████| 75/75 [00:00<00:00, 283910.47it/s]


In [None]:
submission_file

In [132]:
submission_file_name

'/opt/bioasq/col-un-bioasq11/data/working_folder/test11b/submit_experimental_improved_BioASQ-task11bPhaseA-testset1_bm25_crossenc_ranker_20.json'

# Generative QA

In [5]:
from haystack.pipelines import GenerativeQAPipeline
from haystack.utils import print_answers
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import RAGenerator, DensePassageRetriever, Seq2SeqGenerator


# Initialize RAG Generator
"""
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=True,
    top_k=1,
    max_length=200,
    min_length=2,
    embed_title=True,
    num_beams=2,
)
"""
generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")

# add bm25 retriever
first_retriever = BM25Retriever(document_store=document_store)

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True
)

# create the Query Pipeline
pipeline = GenerativeQAPipeline(generator=generator, retriever=retriever)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.



Query: Centor criteria are used for which disease
Answers:
[]

Query: Which are the main transcriptional activators of circadian oscillations
Answers:
[]

Query: What is the link between TADs and GRBs
Answers:
[]


In [10]:
document_store.update_embeddings(retriever)

# create the Query Pipeline
pipeline = GenerativeQAPipeline(generator=generator, retriever=retriever)

QUESTIONS = [
    "Centor criteria are used for which disease",
    "Which are the main transcriptional activators of circadian oscillations",
    "What is the link between TADs and GRBs",
]

for question in QUESTIONS:
    res = pipeline.run(query=question, params={"Retriever": {"top_k": 5}})
    print_answers(res, details="all")

Updating embeddings: 0 Docs [00:00, ? Docs/s]




Query: Centor criteria are used for which disease
Answers:
[]

Query: Which are the main transcriptional activators of circadian oscillations
Answers:
[]

Query: What is the link between TADs and GRBs
Answers:
[]


In [8]:
from haystack.nodes import EmbeddingRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline

retriever = EmbeddingRetriever(document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1", use_gpu=True)
document_store.update_embeddings(retriever=retriever)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
presidents_qa = ExtractiveQAPipeline(reader=reader, retriever=retriever)

Updating embeddings: 0 Docs [00:00, ? Docs/s]

In [9]:
from haystack.utils import print_answers

#result = presidents_qa.run("Who was the 1st president of the USA?")
result = presidents_qa.run("What year was the 1st president of the USA born?")

print_answers(result, "minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]




Query: What year was the 1st president of the USA born?
Answers:
[]


In [14]:
from transformers import DPRReader, DPRReaderTokenizer


tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
encoded_inputs = tokenizer(
    questions=["What is love ?"],
    titles=["Haddaway"],
    texts=["'What Is Love' is a song recorded by the artist Haddaway"],
    return_tensors="pt",
)
outputs = model(**encoded_inputs)
start_logits = outputs.start_logits
end_logits = outputs.end_logits
relevance_logits = outputs.relevance_logits

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/484 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRReaderTokenizer'.


Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-reader-single-nq-base were not used when initializing DPRReader: ['span_predictor.encoder.bert_model.pooler.dense.bias', 'span_predictor.encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRReader from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRReader from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
outputs

In [None]:
# from transformers import DPRContextEncoderTokenizer # colab environment
tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', return_dict=True, from_pt=True)

input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='tf')["input_ids"]
embeddings = model(input_ids).pooler_output


print(embeddings.shape)

In [16]:
from transformers import pipeline

qa_model = pipeline("question-answering")
question = "What is AUROC in context of predictive modeling?"
context = "The predictive accuracy of an individual biomarker on cognitive impairment was evaluated using area under the receiver operating characteristic curve (AUROC), and multivariate logistic regression was applied to evaluate predictive accuracy of biomarkers on cognitive impairment; 178 subjects (41 PD, 31 VaD and 106 normal controls) were included."
qa_model(question = question, context = context)

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.914626955986023,
 'start': 110,
 'end': 149,
 'answer': 'receiver operating characteristic curve'}

In [20]:
sentiment_model = pipeline(model="finiteautomata/bertweet-base-sentiment-analysis")

Downloading:   0%|          | 0.00/949 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/540M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/338 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/167 [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


[{'label': 'NEG', 'score': 0.933961808681488}]

In [25]:

if sentiment_model("not effective")[0]['label'] == 'NEG':
    

SyntaxError: invalid syntax (1211320475.py, line 1)

In [26]:
name = "BioASQ-task11bPhaseB-testset1.json"

submission_file_name = f'{working_folder}/test11b/{name}'
submission_file = json.load(open(submission_file_name))

for sample in tqdm(submission_file['questions'], position=0):
    context = ""
    question = sample["body"]
    q_type = sample["type"]
    
    for snippet in sample["snippets"]:
        context += snippet["text"]
        
    res = qa_model(question = question, context = context)
    sample["ideal_answer"] = res['answer']

100%|███████████████████████████████████████████████████████████████████████████| 75/75 [00:06<00:00, 11.13it/s]


In [41]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

pipe = pipeline("ner", model=model, tokenizer=tokenizer,aggregation_strategy='max') # pass device=0 if using gpu

In [43]:
res = pipe("""The patient reported no recurrence of palpitations at follow-up 6 months after the ablation.""")
print(res)

[{'entity_group': 'Sign_symptom', 'score': 0.9999311, 'word': 'palpitations', 'start': 38, 'end': 50}, {'entity_group': 'Clinical_event', 'score': 0.99975544, 'word': 'follow', 'start': 54, 'end': 60}, {'entity_group': 'Date', 'score': 0.999867, 'word': '6 months after', 'start': 64, 'end': 78}]


In [46]:
for sample in tqdm(submission_file['questions'], position=0):    
    q_type = sample["type"]
    query = sample["ideal_answer"]
    
    if q_type == 'yesno':
        sent = sentiment_model(query)[0]['label']
        if sent == 'NEG': 
            sample["exact_answer"] = 'no'
        else:
            sample["exact_answer"] = 'yes'
    
    res = pipe(query)
    entities = [ ent['word'] for ent in res ]
    
    if q_type == 'list':
        sample["exact_answer"] = entities[0:5]
        
    if q_type == 'factoid':
        sample["exact_answer"] = entities[0:5]

100%|███████████████████████████████████████████████████████████████████████████| 75/75 [00:01<00:00, 39.51it/s]


In [52]:
name = "submision_BioASQ-task11bPhaseB-testset1.json"
submission_file_name = f'{working_folder}/test11b/{name}'

with open(submission_file_name, "w") as outfile:
    json.dump(submission_file, outfile)

In [53]:
for sample in submission_file['questions']:
    q_type = sample["type"]
    if q_type != 'summary':
        for snippet in sample["snippets"]:
            context += snippet["text"]
        print('------------------------------------------------')
        print(sample['body'])
        print(context)
        print(sample['ideal_answer'])
        print(sample['exact_answer'])
        print('------------------------------------------------')

------------------------------------------------
Can losartan reduce brain atrophy in Alzheimer's disease?
Glofitamab Treatment in Relapsed or Refractory DLBCL after CAR T-Cell Therapy.In this study, we evaluated the safety and efficacy of a monotherapy with the bispecific CD20xCD3 antibody glofitamab in patients who progressed after CAR T treatment. We report nine consecutive patients with progressive DLBCL after preceding CAR T-cell therapy. Our data suggest that glofitamab treatment is well tolerated and effective in patients with DLBCL relapsing after CAR T-cell therapy and can enhance residual CAR T-cell activity.Bispecific antibodies such as epcoritamab, mosunetuzumab, and glofitamab, anti-CD19 antibody drug tafasitamab combined with lenalidomide, CD19 antibody drug conjugate loncastuximab tesirine, oral selective inhibitor of nuclear export selinexor, and several new agents have been investigated for DLBCL. Glofitamab, a novel CD20xCD3, T-cell-engaging bispecific antibody, exhib

In [29]:
types = []
for sample in tqdm(submission_file['questions'], position=0):
    types.append(sample["type"])

100%|███████████████████████████████████████████████████████████████████████| 75/75 [00:00<00:00, 626639.04it/s]


In [30]:
set(types)

{'factoid', 'list', 'summary', 'yesno'}

In [None]:

    
with open(submission_file_name, "w") as outfile:
    json.dump(submission_file, outfile)

In [None]:
  "questions": [
    {
      "documents": [
        "http://www.ncbi.nlm.nih.gov/pubmed/34687634"
      ],
      "snippets": [
        {
          "beginSection": "abstract",
          "endSection": "abstract",
          "text": "INTERPRETATION: 12 months of treatment with losartan was well tolerated but was not effective in reducing the rate of brain atrophy in individuals with clinically diagnosed mild-to-moderate Alzheimer\u0027s disease.",
          "document": "http://www.ncbi.nlm.nih.gov/pubmed/34687634",
          "offsetInBeginSection": 2574,
          "offsetInEndSection": 2784
        }
      ],

In [None]:
submission_file_name =  working_folder + "/test11b/submit_improved2_" + test_batch_doc.split('/')[-1].replace('.json','_bm25_crossenc_ranker_20.json')

with open(submission_file_name, "w") as outfile:
    json.dump(test_batch_json, outfile)