# Imports and Commons

In [3]:
%load_ext autoreload
%autoreload 2
import os
import re
import json
import copy
import sys
from tqdm import tqdm
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns

import pytrec_eval
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch
from haystack.pipelines import Pipeline
from haystack.nodes import BM25Retriever, ElasticsearchRetriever
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever, SentenceTransformersRanker
from haystack.nodes.reader import FARMReader
from haystack.utils import print_answers
import warnings
warnings.filterwarnings('ignore')

sys.path.append('../../')
import globals
from elastic_search_utils import elastic_utils
from haystack_utils.retrievers import BioASQ_Retriever
import bioasq_eval
import haystack_util

# --- Filesystem locations, all derived from the project-level `globals.PATH` ---
working_folder = globals.PATH.home + '/data/working_folder'
# `eval_home` keeps its trailing slash so it can be used as a plain string prefix.
eval_home = globals.PATH.eval_home + '/'
# No leading slash on the suffix: `eval_home` already ends with one, and the
# previous value contained a doubled separator ('...//examples/...').
gs_google_docs = eval_home + 'examples/aueb_google_docs/aueb_nlp-bioasq6b-submissions/'

# --- Elasticsearch index name and default run identifier ---
# NOTE(review): the configured index prefix and 'working_folder' are joined
# with no separator -- confirm this is the intended index name.
index_name = globals.BIOASQ.index + 'working_folder'
# Default id only; every experiment cell further down assigns its own `model_id`.
model_id = 'doc_retrieval_test'

# Low-level Elasticsearch client pointed at the configured server.
es = Elasticsearch(globals.ES.server)

# Haystack document store handed to every retriever in the cells below.
# NOTE(review): built with the library defaults, i.e. it does not receive
# `globals.ES.server` or `index_name` -- confirm the defaults target the
# intended server/index.
document_store = ElasticsearchDocumentStore()

Home path : /opt/bioasq/col-un-bioasq11
Eval path : /opt/bioasq/Evaluation-Measures


# Set Eval or Prediction

In [4]:
# File name of the BioASQ test batch processed by this notebook; it is also
# reused below to derive the submission file names.
batch_fn = 'BioASQ-task11bPhaseA-testset3.json'
test_batch_doc = f'{globals.PATH.home}/data/11b_testset/{batch_fn}'

# Read the batch through a context manager so the file handle is closed as soon
# as parsing finishes (the bare `open()` call previously left it open).
# JSON is decoded explicitly as UTF-8 instead of the locale-dependent default.
with open(test_batch_doc, encoding='utf-8') as batch_file:
    test_batch_json = json.load(batch_file)

In [5]:
# Run-mode switch forwarded as `is_eval` to every
# `haystack_util.evaluate_bioasq_phaseA_haystack_pipeline` call below.
# NOTE(review): False presumably selects prediction-only mode (no scoring
# against gold data) -- confirm against haystack_util.
is_eval = False

# Document Retrieval

## BM25

In [6]:
# Run settings: identifier used in the submission file name, plus the maximum
# number of documents / passages requested per question.
model_id = 'bm25-10docs'
num_docs = 10
num_passages = 10

# Single-node query pipeline: Query -> Retriever (project BioASQ_Retriever
# backed by the shared document store).
retriever = BioASQ_Retriever(document_store=document_store)
bm25_pipeline = Pipeline()
bm25_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipeline_params = {"Retriever": {"top_k": num_docs}}

# Run the pipeline over the loaded test batch and collect the submission object.
docs_submission = haystack_util.evaluate_bioasq_phaseA_haystack_pipeline(
    batch_json=test_batch_json,
    batch_json_fname=batch_fn,
    pipeline=bm25_pipeline,
    pipeline_params=pipeline_params,
    method_id=model_id,
    max_num_docs=num_docs,
    max_num_passages=num_passages,
    is_eval=is_eval,
)

# Persist the submission. The context manager guarantees the file is flushed
# and closed; passing a bare `open(...)` to json.dump left the handle open, so
# the file on disk could be incomplete while the kernel kept running.
submission_file_name = f'{globals.PATH.home}/data/processed/{batch_fn.replace(".json","")}_model_{model_id}.json'
with open(submission_file_name, 'w') as submission_file:
    json.dump(docs_submission, submission_file)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:36<00:00,  2.49it/s]


## BM25 with 100 documents

In [7]:
# Run settings: same retriever-only setup as the 10-document run, but asking
# for up to 100 documents per question.
model_id = 'bm25-100docs'
num_docs = 100
num_passages = 10

# Single-node query pipeline: Query -> Retriever.
retriever = BioASQ_Retriever(document_store=document_store)
bm25_pipeline = Pipeline()
bm25_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipeline_params = {"Retriever": {"top_k": num_docs}}

# Run the pipeline over the loaded test batch and collect the submission object.
docs_submission = haystack_util.evaluate_bioasq_phaseA_haystack_pipeline(
    batch_json=test_batch_json,
    batch_json_fname=batch_fn,
    pipeline=bm25_pipeline,
    pipeline_params=pipeline_params,
    method_id=model_id,
    max_num_docs=num_docs,
    max_num_passages=num_passages,
    is_eval=is_eval,
)

# Write the submission through a context manager so the handle is always
# flushed and closed (the previous bare `open(...)` was never closed).
submission_file_name = f'{globals.PATH.home}/data/processed/{batch_fn.replace(".json","")}_model_{model_id}.json'
with open(submission_file_name, 'w') as submission_file:
    json.dump(docs_submission, submission_file)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:59<00:00,  1.50it/s]


## BM25 + MiniLM-L-12-v2 Ranker

In [8]:
# Run settings for the retrieve-then-rerank experiment.
model_id = 'bm25-cross-encoder-MiniLM'
num_docs = 100
num_passages = 10
# Number of candidates the retriever hands to the ranker (was an inline 100).
num_candidates = 100

# Two-stage pipeline: Query -> BM25Retriever -> Ranker.
retriever = BioASQ_Retriever(document_store=document_store)
ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")
pipeline_bm25_ranker = Pipeline()
pipeline_bm25_ranker.add_node(component=retriever, name="BM25Retriever", inputs=["Query"])
pipeline_bm25_ranker.add_node(component=ranker, name="Ranker", inputs=["BM25Retriever"])
pipeline_params = {
    "BM25Retriever": {"top_k": num_candidates},
    "Ranker": {"top_k": num_docs},
}

# Run the pipeline over the loaded test batch and collect the submission object.
docs_submission = haystack_util.evaluate_bioasq_phaseA_haystack_pipeline(
    batch_json=test_batch_json,
    batch_json_fname=batch_fn,
    pipeline=pipeline_bm25_ranker,
    pipeline_params=pipeline_params,
    method_id=model_id,
    max_num_docs=num_docs,
    max_num_passages=num_passages,
    is_eval=is_eval,
)

# Write the submission through a context manager so the handle is always
# flushed and closed (the previous bare `open(...)` was never closed).
submission_file_name = f'{globals.PATH.home}/data/processed/{batch_fn.replace(".json","")}_model_{model_id}.json'
with open(submission_file_name, 'w') as submission_file:
    json.dump(docs_submission, submission_file)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [05:10<00:00,  3.45s/it]


## BM25 + distilbert-base-nli-stsb-quora-ranking Ranker

In [9]:
# Run settings for the retrieve-then-rerank experiment with a DistilBERT model.
model_id = 'bm25-distilbert'
num_docs = 100
num_passages = 10
# Number of candidates the retriever hands to the ranker (was an inline 100).
num_candidates = 100

# Two-stage pipeline: Query -> BM25Retriever -> Ranker.
retriever = BioASQ_Retriever(document_store=document_store)
# NOTE(review): the recorded output of this cell reports that the
# `classifier` / `pre_classifier` weights were newly initialized when this
# checkpoint was loaded as DistilBertForSequenceClassification, i.e. the
# scoring head used by the ranker is untrained. Confirm this checkpoint is
# appropriate for SentenceTransformersRanker before trusting these rankings.
ranker = SentenceTransformersRanker(model_name_or_path="sentence-transformers/distilbert-base-nli-stsb-quora-ranking")
pipeline_bm25_ranker = Pipeline()
pipeline_bm25_ranker.add_node(component=retriever, name="BM25Retriever", inputs=["Query"])
pipeline_bm25_ranker.add_node(component=ranker, name="Ranker", inputs=["BM25Retriever"])
pipeline_params = {
    "BM25Retriever": {"top_k": num_candidates},
    "Ranker": {"top_k": num_docs},
}

# Run the pipeline over the loaded test batch and collect the submission object.
docs_submission = haystack_util.evaluate_bioasq_phaseA_haystack_pipeline(
    batch_json=test_batch_json,
    batch_json_fname=batch_fn,
    pipeline=pipeline_bm25_ranker,
    pipeline_params=pipeline_params,
    method_id=model_id,
    max_num_docs=num_docs,
    max_num_passages=num_passages,
    is_eval=is_eval,
)

# Write the submission through a context manager so the handle is always
# flushed and closed (the previous bare `open(...)` was never closed).
submission_file_name = f'{globals.PATH.home}/data/processed/{batch_fn.replace(".json","")}_model_{model_id}.json'
with open(submission_file_name, 'w') as submission_file:
    json.dump(docs_submission, submission_file)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/distilbert-base-nli-stsb-quora-ranking and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [06:55<00:00,  4.62s/it]
