In [1]:
%load_ext autoreload
%autoreload 2
import os
import re
import json
import copy
import sys
from tqdm import tqdm
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns

import pytrec_eval
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch
from haystack.pipelines import Pipeline
from haystack.nodes import BM25Retriever, ElasticsearchRetriever
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever, SentenceTransformersRanker
import warnings
warnings.filterwarnings('ignore')

sys.path.append('../../')
import globals
from elastic_search_utils import elastic_utils
from haystack_utils.retrievers import BioASQ_Retriever
import bioasq_eval

# Project paths and run identifiers, all derived from the project-level `globals` module.
working_folder = globals.PATH.home + '/data/working_folder'
# Keeps its trailing '/': this exact value is handed to bioasq_eval as `path_home`.
eval_home = globals.PATH.eval_home + '/'
# eval_home already ends with '/', so the suffix must not begin with a second one
# (the previous form produced '.../Evaluation-Measures//examples/...').
gs_google_docs = eval_home + 'examples/aueb_google_docs/aueb_nlp-bioasq6b-submissions/'
# NOTE(review): the index name is concatenated with no separator — confirm this is intended.
index_name = globals.BIOASQ.index + 'working_folder'
# Prefix used for the submission files and the CSV written by the evaluation cells.
model_id = 'doc_retrieval_test'

# Raw Elasticsearch client pointed at the server configured in `globals`.
es = Elasticsearch(globals.ES.server)

Home path : /opt/bioasq/col-un-bioasq11
Eval path : /opt/bioasq/Evaluation-Measures


## Load the test dataset

In [2]:
# Evaluation inputs, one pair per batch 1-5:
#   (golden file name, path of the matching AUEB system file under gs_google_docs)
test_batch_docs = [
    (f'6B{batch_no}_golden.json', gs_google_docs + f'{batch_no}-aueb-nlp-4.json')
    for batch_no in range(1, 6)
]

## Create Haystack Document Retrieval Pipeline

In [25]:
# Baseline query pipeline: Query -> BioASQ_Retriever.
# The document store is created with its default arguments.
document_store = ElasticsearchDocumentStore()
retriever = BioASQ_Retriever(document_store=document_store)

pipeline = Pipeline()
pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])

# Smoke test: request the top 20 hits for a sample query and print their ids.
smoke_params = {"Retriever": {"top_k": 20}}
prediction = pipeline.run(query="covid", params=smoke_params)
retrieved_ids = [hit.id for hit in prediction['documents']]
print(retrieved_ids)

['35805530', '34290652', '33818619', '34622965', '32161968', '32594211', '36260597', '34303669', '32915702', '35144461', '33028754', '34812083', '36040960', '33486531', '36107493', '34266454', '35144622', '36352477', '36062398', '33685285']


In [24]:
# Evaluate the current `pipeline` on every (golden file, system file) pair.
# Per batch: load the system file, overwrite each question's 'documents' with the
# pipeline's hits, save the result as a submission file, and score it against the
# golden file. One row of document scores per batch is collected into `df`.
metric_columns = ['batch', 'Mean precision', 'Recall', 'F-Measure', 'MAP', 'GMAP']
max_docs = 10  # hits requested from the retriever and kept per question

score_rows = []
for golden_file, system_file in test_batch_docs:
    system_name = os.path.basename(system_file)

    # Context manager so the input handle is closed instead of leaked.
    with open(system_file, encoding='utf-8') as fin:
        submission = json.load(fin)

    for sample in tqdm(submission['questions'], position=0):
        prediction = pipeline.run(query=sample['body'], params={"Retriever": {"top_k": max_docs}})
        doc_list = [globals.BIOASQ.doc_relative_url + doc.id for doc in prediction['documents']]
        sample['documents'] = doc_list[:max_docs]

    # Closing the output file guarantees the JSON is flushed to disk.
    submission_file_name = working_folder + "/" + model_id + '_' + system_name
    with open(submission_file_name, 'w') as fout:
        json.dump(submission, fout)

    docs_score, pass_score = bioasq_eval.get_scores_phaseA(golden_file, submission, path_home=eval_home)
    print('Document Scores',docs_score)
    print('Passage Scores',pass_score)

    # Only 'documents' is replaced above, so the document scores are the ones that
    # measure this pipeline; the table previously stored pass_score here.
    batch_label = golden_file.split('.')[0] + '_' + system_name.split('.')[0]
    score_rows.append([batch_label] + list(docs_score))

# Build the frame once instead of growing it row by row.
df = pd.DataFrame(score_rows, columns=metric_columns)
df.to_csv(working_folder + "/" + model_id+'.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:26<00:00,  3.72it/s]


Document Scores [0.18599999999999997, 0.43153159340659336, 0.21292523884970727, 0.11427063492063488, 0.011824938688881879]
Passage Scores [0.21269651172236054, 0.25011684801025924, 0.19239072397187112, 0.16589622913215266, 0.011068641301611446]


100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:28<00:00,  3.54it/s]


Document Scores [0.17499999999999996, 0.41856246566732525, 0.19483745065379593, 0.11053650793650789, 0.009781915786034436]
Passage Scores [0.2872636424287026, 0.2146197751178933, 0.18501190236737292, 0.23371072162089984, 0.02310804468883905]


100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:31<00:00,  3.21it/s]


Document Scores [0.19800000000000006, 0.43640082940539116, 0.2037204740884041, 0.12090436507936503, 0.016996468090491323]
Passage Scores [0.26003786059540585, 0.2606508506979361, 0.2298338621471135, 0.23064062571728583, 0.03591235401768035]


100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:22<00:00,  4.46it/s]


Document Scores [0.13499999999999995, 0.38803580985005126, 0.15696586821137215, 0.0845662698412698, 0.0032919983740153425]
Passage Scores [0.17610509220044598, 0.23667781424156625, 0.1646042834423257, 0.14793238638640266, 0.003342406333163496]


100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:23<00:00,  4.18it/s]


Document Scores [0.11399999999999996, 0.20743502369663358, 0.12324007657666247, 0.06056388888888889, 0.0011357044111332242]
Passage Scores [0.14094343345395727, 0.18489952564391401, 0.12852251428873449, 0.10071564146215053, 0.002367187183784851]


## Add Ranker to Pipeline

In [3]:
# Two-stage query pipeline: Query -> BioASQ_Retriever -> cross-encoder ranker.
# The document store is created with its default arguments.
document_store = ElasticsearchDocumentStore()
retriever = BioASQ_Retriever(document_store=document_store)
ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")

pipeline = Pipeline()
# The retriever node is registered under the name "BM25Retriever"; run-time params
# must be addressed to that name.
pipeline.add_node(component=retriever, name="BM25Retriever", inputs=["Query"])
pipeline.add_node(component=ranker, name="Ranker", inputs=["BM25Retriever"])

# Smoke test: retrieve 100 candidates for a sample query, let the ranker reorder
# them, and print the ids of the documents the pipeline returns.
smoke_params = {"BM25Retriever": {"top_k": 100}}
prediction = pipeline.run(query="covid", params=smoke_params)
ranked_ids = [hit.id for hit in prediction['documents']]
print(ranked_ids)

['36224705', '34926521', '34206226', '36326380', '33028754', '34567389', '36459751', '33686558', '34290652', '35240494']


In [None]:
# Evaluate the retriever + ranker `pipeline` on every (golden file, system file) pair.
# Per batch: load the system file, overwrite each question's 'documents' with the
# pipeline's hits, save the result as a submission file, and score it against the
# golden file. One row of document scores per batch is collected into `df`.
metric_columns = ['batch', 'Mean precision', 'Recall', 'F-Measure', 'MAP', 'GMAP']
candidate_pool = 100  # hits requested from the retriever node before ranking
max_docs = 10         # documents kept per question after ranking

# Separate output prefix for this run: reusing `model_id` as-is would overwrite the
# submission files and CSV written by the retriever-only evaluation.
run_id = model_id + '_ranker'

score_rows = []
for golden_file, system_file in test_batch_docs:
    system_name = os.path.basename(system_file)

    # Context manager so the input handle is closed instead of leaked.
    with open(system_file, encoding='utf-8') as fin:
        submission = json.load(fin)

    for sample in tqdm(submission['questions'], position=0):
        prediction = pipeline.run(query=sample['body'], params={"BM25Retriever": {"top_k": candidate_pool}})
        doc_list = [globals.BIOASQ.doc_relative_url + doc.id for doc in prediction['documents']]
        sample['documents'] = doc_list[:max_docs]

    # Closing the output file guarantees the JSON is flushed to disk.
    submission_file_name = working_folder + "/" + run_id + '_' + system_name
    with open(submission_file_name, 'w') as fout:
        json.dump(submission, fout)

    docs_score, pass_score = bioasq_eval.get_scores_phaseA(golden_file, submission, path_home=eval_home)
    print('Document Scores',docs_score)
    print('Passage Scores',pass_score)

    # Only 'documents' is replaced above, so the document scores are the ones that
    # measure this pipeline; the table previously stored pass_score here.
    batch_label = golden_file.split('.')[0] + '_' + system_name.split('.')[0]
    score_rows.append([batch_label] + list(docs_score))

# Build the frame once instead of growing it row by row.
df = pd.DataFrame(score_rows, columns=metric_columns)
df.to_csv(working_folder + "/" + run_id + '.csv', index=False)

  1%|▊                                                                          | 1/100 [00:03<05:16,  3.20s/it]