In [24]:
from beir import util, LoggingHandler

from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.reranking.models import CrossEncoder
from beir.reranking import Rerank
from operator import itemgetter
import datetime
import random

import logging
import pathlib, os, sys

sys.path.append('../../')
import globals
from elastic_search_utils import elastic_utils
import bioasq_eval
from elasticsearch import Elasticsearch

# Elasticsearch client for the server configured in the project-local `globals` module.
es = Elasticsearch(globals.ES.server)

#### Just some code to print debug information to stdout
# Configured exactly once: a repeated logging.basicConfig() call is a no-op while
# the root logger already has handlers, so the duplicated call was removed.
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Data folder handed to GenericDataLoader below.
bioasq_beir_home = f'{globals.PATH.home}/data/bioasq_beir/taskb_6_7'

# NOTE(review): this puts the data folder at the front of sys.path; kept as in
# the original -- confirm whether anything is actually imported from there.
sys.path.insert(0, bioasq_beir_home)

# Requires the `beir` package (pip install beir).
# Loads the "train" split into corpus, queries and qrels.
corpus, queries, qrels = GenericDataLoader(bioasq_beir_home).load(split="train")

2023-05-10 22:46:46 - Loading Corpus...


  0%|          | 0/21873 [00:00<?, ?it/s]

2023-05-10 22:46:46 - Loaded 21873 TRAIN Documents.
2023-05-10 22:46:46 - Doc Example: {'text': 'Hirschsprung disease (HSCR), or congenital intestinal aganglionosis, is a common hereditary disorder causing intestinal obstruction, thereby showing considerable phenotypic variation in conjunction with complex inheritance. Moreover, phenotypic assessment of the disease has been complicated since a subset of the observed mutations is also associated with several additional syndromic anomalies. Coding sequence mutations in e.g. RET, GDNF, EDNRB, EDN3, and SOX10 lead to long-segment (L-HSCR) as well as syndromic HSCR but fail to explain the transmission of the much more common short-segment form (S-HSCR). Furthermore, mutations in the RET gene are responsible for approximately half of the familial and some sporadic cases, strongly suggesting, on the one hand, the importance of non-coding variations and, on the other hand, that additional genes involved in the development of the enteric nervou

In [4]:
#### Elasticsearch connection settings for the BM25 retriever
index_name = 'bioasq-beir'
hostname = globals.ES.server

# BM25 search backend wrapped in BEIR's EvaluateRetrieval helper.
# Per the recorded output of this cell, constructing BM25 deletes and recreates
# the Elasticsearch index named above.
model = BM25(hostname=hostname, index_name=index_name)
bm25 = EvaluateRetrieval(model)

2023-05-10 21:14:11 - Activating Elasticsearch....
2023-05-10 21:14:11 - Elastic Search Credentials: {'hostname': 'http://localhost:9200', 'index_name': 'bioasq-beir', 'keys': {'title': 'title', 'body': 'txt'}, 'timeout': 100, 'retry_on_timeout': True, 'maxsize': 24, 'number_of_shards': 'default', 'language': 'english'}
2023-05-10 21:14:11 - Deleting previous Elasticsearch-Index named - bioasq-beir




2023-05-10 21:14:14 - Creating fresh Elasticsearch-Index named - bioasq-beir


In [None]:
#### Build a reduced corpus for quick experiments and index it with BM25
corpus, queries, qrels = GenericDataLoader(bioasq_beir_home).load(split="train")
corpus_ids, query_ids = list(corpus), list(queries)
# Title and body joined into one string per document id.
corpus_texts = {corpus_id: doc["title"] + " " + doc["text"] for corpus_id, doc in corpus.items()}

#### Tunables
sample = 512                   # number of randomly drawn documents to index
sample_seed = 42               # fixed seed so every run draws the same subset
include_relevant_docs = False  # True: also index every document listed in qrels
preview_size = 10              # how many remaining ids to print as a preview

#### Optionally start from all relevant documents (i.e. present in qrels)
corpus_set = set()
if include_relevant_docs:
    for query_id in qrels:
        corpus_set.update(qrels[query_id])

corpus_new = {corpus_id: corpus[corpus_id] for corpus_id in corpus_set if corpus_id in corpus}
print('Tam new corpus: '+str(len(corpus_new)))

#### Remove the already selected documents and sample from the rest
# Sorted so the seeded draw does not depend on set iteration order.
remaining_corpus = sorted(set(corpus_ids) - corpus_set, key=str)
# random.sample raises ValueError when asked for more items than are available.
sample_size = min(sample, len(remaining_corpus))
print('Remaining corpus: ',len(remaining_corpus),'   -- sample:',sample_size, '  set corpusids: ',len(set(corpus_ids)) )
# Print a short preview only; dumping every id floods the cell output.
print(remaining_corpus[:preview_size])
for corpus_id in random.Random(sample_seed).sample(remaining_corpus, sample_size):
    corpus_new[corpus_id] = corpus[corpus_id]

# Index only the reduced corpus.
bm25.retriever.index(corpus_new)

In [18]:
#### Reranking using Cross-Encoder model
# Relies on query_ids, queries, corpus_texts and bm25 defined in earlier cells.
reranker = CrossEncoder('cross-encoder/ms-marco-electra-base')

#### Saving benchmark times (query id -> latency in milliseconds)
time_taken_all = {}

for query_id in query_ids:
    query = queries[query_id]

    #### One timer covers both stages: BM25 top-100 retrieval and cross-encoder
    #### re-ranking, i.e. the end-to-end latency of a single query
    start = datetime.datetime.now()
    results = bm25.retriever.es.lexical_search(text=query, top_hits=100)
    hit_ids = [hit[0] for hit in results["hits"]]

    if hit_ids:
        sentence_pairs = [[query, corpus_texts[doc_id]] for doc_id in hit_ids]
        scores = reranker.predict(sentence_pairs, batch_size=100, show_progress_bar=False)
        hits = dict(zip(hit_ids, scores))
    else:
        # BM25 returned nothing for this query, so there is nothing to score.
        # NOTE(review): predicting on an empty batch may raise depending on the
        # installed cross-encoder library version -- skipped to be safe.
        hits = {}
    # Sorting stays inside the timed region, as in the original measurement.
    sorted_results = dict(sorted(hits.items(), key=itemgetter(1), reverse=True))
    end = datetime.datetime.now()

    #### Measuring time taken in ms (milliseconds)
    elapsed_ms = (end - start).total_seconds() * 1000
    time_taken_all[query_id] = elapsed_ms
    logging.info("{}: {} {:.2f}ms".format(query_id, query, elapsed_ms))

time_taken = list(time_taken_all.values())
# Report the mean latency; guarded so an empty query set does not divide by zero.
if time_taken:
    logging.info("Average time taken: {:.2f}ms".format(sum(time_taken) / len(time_taken)))

Downloading (…)lve/main/config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

2023-05-10 21:46:45 - Use pytorch device: cpu
2023-05-10 21:47:31 - 55031181e9bde69634000014: Is Hirschsprung disease a mendelian or a multifactorial disorder? 45888.07ms
2023-05-10 21:48:16 - 55046d5ff8aee20f27000007: List signaling molecules (ligands) that interact with the receptor EGFR? 44867.80ms
2023-05-10 21:49:01 - 54e25eaaae9738404b000017: Is the protein Papilin secreted? 44945.86ms
2023-05-10 21:49:45 - 535d292a9a4572de6f000003: Are long non coding RNAs spliced? 44847.90ms
2023-05-10 21:50:30 - 55262a9787ecba3764000009: Is RANKL secreted from the cells? 44585.67ms
2023-05-10 21:50:46 - 51406e6223fec90375000009: Does metformin interfere thyroxine absorption? 16424.22ms
2023-05-10 21:51:31 - 553fa78b1d53b76422000007: Which miRNAs could be used as potential biomarkers for epithelial ovarian cancer? 44893.55ms
2023-05-10 21:52:16 - 5149199dd24251bc05000040: Which acetylcholinesterase inhibitors are used for treatment of myasthenia gravis? 45110.56ms
2023-05-10 21:53:01 - 52bf1db6

2023-05-10 22:33:09 - 52d946c798d023950500000a: What is clathrin? 1765.11ms
2023-05-10 22:33:54 - 5322d9339b2d7acc7e000011: What are the main results of PRKAR1A Knockdown? 44808.79ms
2023-05-10 22:34:38 - 535d75ab7d100faa09000004: Is TENS machine effective in pain? 44598.70ms
2023-05-10 22:35:23 - 56d1d14567f0cb3d66000003: Is there any algorithm for enhancer identification from chromatin state? 44589.49ms
2023-05-10 22:36:08 - 532498959b2d7acc7e000017: Which enzyme is targeted by the drug Imetelstat? 44610.37ms
2023-05-10 22:36:52 - 56c1f005ef6e39474100003a: Which interleukins are inhibited by Dupilumab? 44453.32ms
2023-05-10 22:37:37 - 513ce3c8bee46bd34c000008: Which human genes are more commonly related to craniosynostosis? 44574.34ms
2023-05-10 22:38:21 - 553f78c7ab98a37113000008: Are transcribed ultraconserved regions involved in cancer? 44557.92ms
2023-05-10 22:39:06 - 56d06e043975bb303a000011: In which breast cancer patients can palbociclib be used? 44593.03ms
2023-05-10 22:39:50

KeyboardInterrupt: 

In [20]:
# Second BM25 backend on the same index, with initialize=True passed explicitly.
# The recorded output of this cell shows the index being deleted and recreated.
model = BM25(
    hostname=hostname,
    index_name=index_name,
    initialize=True,
)
retriever = EvaluateRetrieval(model)

2023-05-10 22:45:01 - Activating Elasticsearch....
2023-05-10 22:45:01 - Elastic Search Credentials: {'hostname': 'http://localhost:9200', 'index_name': 'bioasq-beir', 'keys': {'title': 'title', 'body': 'txt'}, 'timeout': 100, 'retry_on_timeout': True, 'maxsize': 24, 'number_of_shards': 'default', 'language': 'english'}
2023-05-10 22:45:01 - Deleting previous Elasticsearch-Index named - bioasq-beir
2023-05-10 22:45:03 - Creating fresh Elasticsearch-Index named - bioasq-beir


In [21]:
#### Retrieve first-stage results for all queries. `retriever` wraps the BM25
#### (lexical) Elasticsearch model built in the previous cell, so these are
#### BM25 results, not dense ones (format of results is identical to qrels).
#### NOTE(review): the recorded progress output (0/21873 docs, then query
#### batches) suggests this call indexes the full corpus first -- confirm.
results = retriever.retrieve(corpus, queries)

  0%|                                                                                                               | 0/21873 [00:00<?, ?docs/s]
que: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:28<00:00,  1.59s/it]


In [25]:
#### Cross-encoder re-ranking of the first-stage results, followed by evaluation
#### Pretrained cross-encoders: https://www.sbert.net/docs/pretrained_cross-encoders.html
#### Alternatives such as MiniLM or TinyBERT (https://www.sbert.net/docs/pretrained-models/ce-msmarco.html):
####   'cross-encoder/ms-marco-MiniLM-L-6-v2'
####   'cross-encoder/ms-marco-TinyBERT-L-6'
cross_encoder_model = CrossEncoder('cross-encoder/ms-marco-electra-base')
reranker = Rerank(cross_encoder_model, batch_size=128)

# Re-rank the top-100 entries of `results` with the cross-encoder.
rerank_results = reranker.rerank(corpus, queries, results, top_k=100)

#### Score the re-ranked results against qrels (NDCG@k, MAP@K ...)
metrics = EvaluateRetrieval.evaluate(qrels, rerank_results, retriever.k_values)
ndcg, _map, recall, precision = metrics

2023-05-10 22:46:53 - Use pytorch device: cpu
2023-05-10 22:46:55 - Starting To Rerank Top-100....


Batches:   0%|          | 0/1758 [00:00<?, ?it/s]

KeyboardInterrupt: 