In [1]:
%load_ext autoreload
%autoreload 2
import copy
import json
import os
import re
import sys
import warnings
from pprint import pprint
from typing import List, Optional, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytrec_eval
import seaborn as sns
from elasticsearch import Elasticsearch
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.modeling.data_handler.inputs import QAInput, Question
from haystack.nodes import BaseReader
from haystack.nodes import BM25Retriever, ElasticsearchRetriever
from haystack.nodes import BM25Retriever, SentenceTransformersRanker
from haystack.pipelines import Pipeline
from haystack.schema import Document
from tqdm import tqdm
warnings.filterwarnings('ignore')

sys.path.append('../../')
import globals
from elastic_search_utils import elastic_utils
from haystack_utils.retrievers import BioASQ_Retriever
import bioasq_eval

# Paths and identifiers shared by the cells below; all are derived from the
# project-local `globals` module.
working_folder = globals.PATH.home + '/data/working_folder'
# Normalise to exactly one trailing slash so the concatenations below (and any
# consumer of `eval_home`) do not end up with a doubled '//' separator.
eval_home = globals.PATH.eval_home.rstrip('/') + '/'
# `eval_home` already ends with '/', so the sub-path must not start with one.
gs_google_docs = eval_home + 'examples/aueb_google_docs/aueb_nlp-bioasq6b-submissions/'
# NOTE(review): `index_name` and `es` are not referenced by the cells visible
# in this notebook; the index name is built without a separator - confirm intent.
index_name = globals.BIOASQ.index + 'working_folder'
# Prefix used for the submission files and the CSV written by the evaluation cells.
model_id = 'doc_retrieval_test'

es = Elasticsearch(globals.ES.server)

Home path : /opt/bioasq/col-un-bioasq11
Eval path : /opt/bioasq/Evaluation-Measures


## Load the test dataset

In [2]:
# Evaluate over the AUEB documents.
# Maps each golden file name (first element of every pair, later handed to the
# scorer) to the AUEB submission file of the same batch inside `gs_google_docs`.
# (An earlier, now disabled, entry was: ('', '8b5_ES_30_full.json').)
aueb_submission_by_golden = {
    '6B1_golden.json': '1-aueb-nlp-4.json',
    '6B2_golden.json': '2-aueb-nlp-4.json',
    '6B3_golden.json': '3-aueb-nlp-4.json',
    '6B4_golden.json': '4-aueb-nlp-4.json',
    '6B5_golden.json': '5-aueb-nlp-4.json',
}

# List of (golden file name, full path of the submission JSON) tuples, in batch order.
test_batch_docs = [
    (golden_name, gs_google_docs + submission_name)
    for golden_name, submission_name in aueb_submission_by_golden.items()
]

## Create Haystack Document Retrieval Pipeline

In [5]:
# Retrieval-only query pipeline: Query -> Retriever.
# The document store is created with its default settings (no arguments passed).
document_store = ElasticsearchDocumentStore()
retriever = BioASQ_Retriever(document_store=document_store)

pipeline = Pipeline()
pipeline.add_node(
    component=retriever,
    name="Retriever",
    inputs=["Query"],
)

# Smoke test: run a sample query with top_k=20 and list the ids of the returned documents.
prediction = pipeline.run(
    query="covid",
    params={"Retriever": {"top_k": 20}},
)
print([hit.id for hit in prediction['documents']])

['35805530', '34290652', '33818619', '34622965', '32161968', '32594211', '36260597', '34303669', '32915702', '35144461', '33028754', '34812083', '36040960', '33486531', '36107493', '34266454', '35144622', '36352477', '36062398', '33685285']


In [11]:
# Evaluate the current `pipeline` on every batch in `test_batch_docs`:
#   1. load the batch JSON,
#   2. replace each question's 'documents' with the pipeline's retrieved documents,
#   3. write the resulting submission to `working_folder`,
#   4. score it with `bioasq_eval.get_scores_phaseA` and collect one row per batch.
df = pd.DataFrame( columns=('batch', 'Mean precision', 'Recall', 'F-Measure', 'MAP', 'GMAP') )

for i, batch_file in enumerate(test_batch_docs):
    # Context manager guarantees the input handle is closed (the bare
    # `json.load(open(...))` form leaves that to the garbage collector).
    with open(batch_file[1], encoding='utf-8') as batch_fh:
        test_batch_json = json.load(batch_fh)

    for sample in tqdm(test_batch_json['questions'], position=0):
        prediction = pipeline.run(query=sample['body'], params={"Retriever": {"top_k": 10}})
        doc_list = [ globals.BIOASQ.doc_relative_url + doc.id for doc in prediction['documents'] ]
        # Keep at most 10 documents per question.
        sample['documents'] = doc_list[0:10]

    # Shallow copy: the question dicts mutated above are shared with `test_batch_json`.
    submission = test_batch_json.copy()
    submission_file_name =  working_folder + "/" + model_id + '_'+batch_file[1].split('/')[-1]
    # Close (and therefore flush) the submission file before scoring.
    with open(submission_file_name, 'w', encoding='utf-8') as submission_fh:
        json.dump(submission, submission_fh)

    docs_score, pass_score = bioasq_eval.get_scores_phaseA(batch_file[0], submission, path_home=eval_home)
    print('Document Scores',docs_score)
    print('Passage Scores',pass_score)
    # NOTE(review): the row stores `pass_score`, yet this loop only replaces the
    # retrieved documents - confirm whether `docs_score` was meant to be recorded.
    df.loc[i] = [ batch_file[0].split('.')[0] + '_' + batch_file[1].split('/')[-1].split('.')[0] ] + pass_score

df.to_csv(working_folder + "/" + model_id+'.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:26<00:00,  3.82it/s]


Document Scores [0.18599999999999997, 0.43153159340659336, 0.21292523884970727, 0.11427063492063488, 0.011824938688881879]
Passage Scores [0.21269651172236054, 0.25011684801025924, 0.19239072397187112, 0.16589622913215266, 0.011068641301611446]


100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:27<00:00,  3.64it/s]


Document Scores [0.17499999999999996, 0.41856246566732525, 0.19483745065379593, 0.11053650793650789, 0.009781915786034436]
Passage Scores [0.2872636424287026, 0.2146197751178933, 0.18501190236737292, 0.23371072162089984, 0.02310804468883905]


100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:29<00:00,  3.37it/s]


Document Scores [0.19800000000000006, 0.43640082940539116, 0.2037204740884041, 0.12090436507936503, 0.016996468090491323]
Passage Scores [0.26003786059540585, 0.2606508506979361, 0.2298338621471135, 0.23064062571728583, 0.03591235401768035]


100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:22<00:00,  4.54it/s]


Document Scores [0.13499999999999995, 0.38803580985005126, 0.15696586821137215, 0.0845662698412698, 0.0032919983740153425]
Passage Scores [0.17610509220044598, 0.23667781424156625, 0.1646042834423257, 0.14793238638640266, 0.003342406333163496]


100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:23<00:00,  4.25it/s]


Document Scores [0.11399999999999996, 0.20743502369663358, 0.12324007657666247, 0.06056388888888889, 0.0011357044111332242]
Passage Scores [0.14094343345395727, 0.18489952564391401, 0.12852251428873449, 0.10071564146215053, 0.002367187183784851]


## Add Ranker to Pipeline

In [5]:
# Two-stage query pipeline: Query -> BM25Retriever -> Ranker.
document_store = ElasticsearchDocumentStore()
retriever = BioASQ_Retriever(document_store=document_store)

# Sentence Transformers ranker used to re-order the retrieved documents.
# Alternative checkpoint kept for reference:
#   sentence-transformers/distilbert-base-nli-stsb-quora-ranking
ranker = SentenceTransformersRanker(
    model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2",
)

pipeline = Pipeline()
pipeline.add_node(
    component=retriever,
    name="BM25Retriever",
    inputs=["Query"],
)
pipeline.add_node(
    component=ranker,
    name="Ranker",
    inputs=["BM25Retriever"],
)

# Smoke test: the retriever is asked for top_k=100 documents, which are then
# passed through the ranker; print the ids of the documents that come out.
prediction = pipeline.run(
    query="covid",
    params={"BM25Retriever": {"top_k": 100}},
)
print([hit.id for hit in prediction['documents']])

Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/distilbert-base-nli-stsb-quora-ranking and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/507 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

['33554154', '36416240', '32996452', '33713816', '35760548', '36309479', '33799284', '35837898', '33486531', '33465496']


In [None]:
# Evaluate the current `pipeline` (retriever followed by ranker) on every batch
# in `test_batch_docs`: load the batch, replace each question's 'documents' with
# the pipeline output, write the submission, score it and collect one row per batch.
# NOTE(review): `model_id` is the same as in the retriever-only evaluation, so
# the submission files and the CSV written here overwrite that run's outputs.
df = pd.DataFrame( columns=('batch', 'Mean precision', 'Recall', 'F-Measure', 'MAP', 'GMAP') )

for i, batch_file in enumerate(test_batch_docs):
    # Context manager guarantees the input handle is closed (the bare
    # `json.load(open(...))` form leaves that to the garbage collector).
    with open(batch_file[1], encoding='utf-8') as batch_fh:
        test_batch_json = json.load(batch_fh)

    for sample in tqdm(test_batch_json['questions'], position=0):
        # The retriever fetches 100 candidates that the ranker then re-orders.
        prediction = pipeline.run(query=sample['body'], params={"BM25Retriever": {"top_k": 100}})
        doc_list = [ globals.BIOASQ.doc_relative_url + doc.id for doc in prediction['documents'] ]
        # Keep at most 10 documents per question.
        sample['documents'] = doc_list[0:10]

    # Shallow copy: the question dicts mutated above are shared with `test_batch_json`.
    submission = test_batch_json.copy()
    submission_file_name =  working_folder + "/" + model_id + '_'+batch_file[1].split('/')[-1]
    # Close (and therefore flush) the submission file before scoring.
    with open(submission_file_name, 'w', encoding='utf-8') as submission_fh:
        json.dump(submission, submission_fh)

    docs_score, pass_score = bioasq_eval.get_scores_phaseA(batch_file[0], submission, path_home=eval_home)
    print('Document Scores',docs_score)
    print('Passage Scores',pass_score)
    # NOTE(review): the row stores `pass_score`, yet this loop only replaces the
    # retrieved documents - confirm whether `docs_score` was meant to be recorded.
    df.loc[i] = [ batch_file[0].split('.')[0] + '_' + batch_file[1].split('/')[-1].split('.')[0] ] + pass_score

df.to_csv(working_folder + "/" + model_id+'.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████| 100/100 [07:46<00:00,  4.67s/it]


Document Scores [0.03800000000000001, 0.06213529526029525, 0.04019339829940179, 0.013253968253968258, 8.170379645422878e-05]
Passage Scores [0.21269651172236054, 0.25011684801025924, 0.19239072397187112, 0.16589622913215266, 0.011068641301611446]


100%|█████████████████████████████████████████████████████████████████████████| 100/100 [07:47<00:00,  4.67s/it]


Document Scores [0.03200000000000001, 0.05930236274098168, 0.032705732439466846, 0.009248015873015873, 7.15549459214587e-05]
Passage Scores [0.2872636424287026, 0.2146197751178933, 0.18501190236737292, 0.23371072162089984, 0.02310804468883905]


100%|█████████████████████████████████████████████████████████████████████████| 100/100 [07:42<00:00,  4.62s/it]


Document Scores [0.04699999999999999, 0.06116227591528132, 0.04020937457883919, 0.018455158730158728, 9.838571124953916e-05]
Passage Scores [0.26003786059540585, 0.2606508506979361, 0.2298338621471135, 0.23064062571728583, 0.03591235401768035]


 97%|███████████████████████████████████████████████████████████████████████▊  | 97/100 [07:19<00:13,  4.47s/it]

## Add Biomedical Ranker to Pipeline

In [19]:
# Query pipeline: Query -> BM25Retriever -> Ranker.
document_store = ElasticsearchDocumentStore()
retriever = BioASQ_Retriever(document_store=document_store)

# NOTE(review): this section is titled "Biomedical Ranker", but the checkpoint
# loaded here is the same MS MARCO cross-encoder used in the previous ranker
# cell - confirm which model was intended.
ranker = SentenceTransformersRanker(
    model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2",
)

pipeline = Pipeline()
pipeline.add_node(
    component=retriever,
    name="BM25Retriever",
    inputs=["Query"],
)
pipeline.add_node(
    component=ranker,
    name="Ranker",
    inputs=["BM25Retriever"],
)

# Smoke test: retrieve with top_k=100, re-rank, and print the resulting document ids.
prediction = pipeline.run(
    query="covid",
    params={"BM25Retriever": {"top_k": 100}},
)
print([hit.id for hit in prediction['documents']])

['36224705', '34926521', '34206226', '36326380', '33028754', '34567389', '36459751', '33686558', '34290652', '35240494']


In [9]:
from haystack.nodes.reader import FARMReader
from haystack.utils import print_answers

# Extractive QA reader built from a Hugging Face checkpoint; GPU use is requested.
my_model = "deepset/roberta-base-squad2"
reader = FARMReader(
    model_name_or_path=my_model,
    use_gpu=True,
)

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [24]:
# Full QA pipeline: Query -> BM25Retriever -> Ranker -> Reader.
# `reader` must already exist (it is created in the FARMReader cell).
document_store = ElasticsearchDocumentStore()
retriever = BioASQ_Retriever(document_store=document_store)

# Sentence Transformers ranker that re-orders the retrieved documents.
ranker = SentenceTransformersRanker(
    model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2",
)

pipeline = Pipeline()
pipeline.add_node(
    component=retriever,
    name="BM25Retriever",
    inputs=["Query"],
)
pipeline.add_node(
    component=ranker,
    name="Ranker",
    inputs=["BM25Retriever"],
)
pipeline.add_node(
    component=reader,
    name="Reader",
    inputs=["Ranker"],
)

# Ask one sample question (retriever top_k=100) and print the ids of the
# documents present in the pipeline result.
prediction = pipeline.run(
    query="Which factors drive replisome disassembly during DNA replication termination and mitosis",
    params={"BM25Retriever": {"top_k": 100}},
)
print([hit.id for hit in prediction['documents']])

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

['30979826', '30340827', '34269473', '34195792', '28368371', '34700328', '35798141', '31545170', '26255844', '32490508']


In [25]:
# Print the answers held in `prediction` (the result of the most recent
# `pipeline.run` call) using print_answers with details="minimal".
print_answers(prediction, details="minimal")




Query: Which factors drive replisome disassembly during DNA replication termination and mitosis
Answers:
[   <Answer {'answer': 'Cullin ubiquitin ligases', 'type': 'extractive', 'score': 0.9014731049537659, 'context': 'Cullin ubiquitin ligases drive replisome disassembly during DNA replication termination. In worm, frog and mouse cells, CUL2LRR1 is required to ubiqui', 'offsets_in_document': [{'start': 0, 'end': 24}], 'offsets_in_context': [{'start': 0, 'end': 24}], 'document_ids': ['34195792'], 'meta': {'title': 'Reconstitution of human CMG helicase ubiquitylation by CUL2LRR1 and multiple E2 enzymes.', 'issue': '478(14)', 'pages': '2825-2842', 'abstract': 'Cullin ubiquitin ligases drive replisome disassembly during DNA replication termination. In worm, frog and mouse cells, CUL2LRR1 is required to ubiquitylate the MCM7 subunit of the CMG helicase. Here, we show that cullin ligases also drive CMG-MCM7 ubiquitylation in human cells, thereby making the helicase into a substrate for the 

In [None]:
# Reference notes on the fields of the Answer objects shown in the output above.
# Presumably copied from Haystack's `Answer` documentation - confirm against the
# installed version.
"""
    :param answer: The answer string. If there's no possible answer (aka "no_answer" or "is_impossible") this will be an empty string.
    :param type: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
                 (i.e. we can locate an exact answer string in one of the documents) or from a generative model
                 (i.e. no pointer to a specific document, no offsets ...).
    :param score: The relevance score of the Answer determined by a model (e.g. Reader or Generator).
                  In the range of [0,1], where 1 means extremely relevant.
    :param context: The related content that was used to create the answer (i.e. a text passage, part of a table, image ...)
    :param offsets_in_document: List of `Span` objects with start and end positions of the answer **in the
                                document** (as stored in the document store).
                                For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start`
                                For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start`
                                (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
    :param offsets_in_context: List of `Span` objects with start and end positions of the answer **in the
                                context** (i.e. the surrounding text/table of a certain window size).
                                For extractive QA: Character where answer starts => `Answer.offsets_in_context[0].start`
                                For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_context[0].start`
                                (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
    :param document_ids: IDs of the documents the answer came from (if any).
                                For extractive QA, this will be a list of length 1.
                                For generative QA, this will be a list of length > 0.
    :param meta: Dict that can be used to associate any kind of custom meta data with the answer.
                 In extractive QA, this will carry the meta data of the document where the answer was found.
"""

In [None]:

class BioASQ_Reader(BaseReader):
    """
    Extractive Question Answering interface using the FARM framework (https://github.com/deepset-ai/FARM).

    NOTE(review): this class is a skeleton. `__init__` only stores its
    configuration and does not build an inferencer, and `predict` calls
    `_deduplicate_predictions` / `_extract_answers_of_predictions`, which are
    not defined in this class - presumably they are expected from FARMReader;
    confirm before using this reader in a pipeline.
    """

    def __init__(
        self,
        model_name_or_path: str,
        model_version: Optional[str] = None,
        return_no_answer: bool = False,
        use_gpu: bool = True,
        top_k: int = 10,
        use_confidence_scores: bool = True,
        confidence_threshold: Optional[float] = None
    ):
        """
        Store the reader configuration. No model is loaded here.

        :param model_name_or_path: Directory of a saved model or the name of a public model e.g. 'bert-base-cased',
        'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'.
        See https://huggingface.co/models for full list of available models.
        :param model_version: The version of model to use.
        :param use_gpu: Whether to use GPUs or the CPU. Falls back on CPU if no GPU is available.
        :param return_no_answer: Whether to include no_answer predictions in the results.
        :param top_k: The maximum number of answers to return
        :param use_confidence_scores: Determines the type of score that is used for ranking a predicted answer.
        :param confidence_threshold: Filters out predictions below confidence_threshold. Value should be between 0 and 1. Disabled by default.
        """
        super().__init__()

        self.return_no_answers = return_no_answer
        self.top_k = top_k
        # Stored only: the methods of this class do not read these two settings.
        self.use_confidence_scores = use_confidence_scores
        self.confidence_threshold = confidence_threshold
        self.model_name_or_path = model_name_or_path
        # Previously accepted but silently dropped; kept so that the code which
        # eventually builds the inferencer can honour them.
        self.model_version = model_version
        self.use_gpu = use_gpu
        # `predict` reads `self.inferencer`; define it up front so that a reader
        # without a loaded model fails with an explicit message instead of an
        # AttributeError. It must be assigned before `predict` is called.
        self.inferencer = None

    def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None):
        """
        Use loaded QA model to find answers for a query in the supplied list of Document.
        Returns dictionaries containing answers sorted by (desc.) score.
        Example:
        ```python
        {
            'query': 'Who is the father of Arya Stark?',
            'answers':[Answer(
                         'answer': 'Eddard,',
                         'context': "She travels with her father, Eddard, to King's Landing when he is",
                         'score': 0.9787139466668613,
                         'offsets_in_context': [Span(start=29, end=35)],
                         'offsets_in_document': [Span(start=347, end=353)],
                         'document_ids': ['88d1ed769d003939d3a0d28034464ab2']
                         ),...
                      ]
        }
        ```
        :param query: Query string
        :param documents: List of Document in which to search for the answer
        :param top_k: The maximum number of answers to return. Defaults to the `top_k` given at construction.
        :return: Dict with the keys "query", "no_ans_gap" and "answers"
        :raises RuntimeError: If no inferencer has been assigned to this reader.
        """
        if self.inferencer is None:
            raise RuntimeError(
                "BioASQ_Reader has no inferencer: assign `self.inferencer` before calling predict()."
            )
        if top_k is None:
            top_k = self.top_k

        # Convert the input to FARM format: one QAInput per document, with the
        # document id used as the question uid.
        inputs = [
            QAInput(doc_text=doc.content, questions=Question(text=query, uid=doc.id))
            for doc in documents
        ]

        # Get answers from the QA model.
        predictions = self.inferencer.inference_from_objects(
            objects=inputs, return_json=False, multiprocessing_chunksize=1
        )
        # Deduplicate same answers resulting from Document split overlap.
        predictions = self._deduplicate_predictions(predictions, documents)
        # Assemble answers from all the different documents & format them.
        answers, max_no_ans_gap = self._extract_answers_of_predictions(predictions, top_k)
        # TODO: potentially simplify return here to List[Answer] and handle no_ans_gap differently
        return {"query": query, "no_ans_gap": max_no_ans_gap, "answers": answers}

    def predict_batch(
        self,
        queries: List[str],
        documents: Union[List[Document], List[List[Document]]],
        top_k: Optional[int] = None,
        batch_size: Optional[int] = None,
    ):
        """
        Placeholder for batch prediction: the arguments are ignored and an
        empty list is always returned.

        :param queries: List of query strings (unused).
        :param documents: Documents, either one list shared by all queries or one list per query (unused).
        :param top_k: The maximum number of answers to return (unused).
        :param batch_size: Number of samples per batch (unused).
        :return: Always an empty list.
        """
        return []