In [2]:
!pip install -U sentence-transformers rank_bm25 datasets beir

Defaulting to user installation because normal site-packages is not writeable


In [108]:
from datasets import load_dataset

nq_data = load_dataset('BeIR/trec-news-generated-queries')


Found cached dataset json (/home/ubuntu/.cache/huggingface/datasets/BeIR___json/BeIR--trec-news-generated-queries-58e8f34dd4c75682/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

In [110]:
nq = nq_data['train'].shuffle(seed=42)

Loading cached shuffled indices for dataset at /home/ubuntu/.cache/huggingface/datasets/BeIR___json/BeIR--trec-news-generated-queries-58e8f34dd4c75682/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-8f0959c31593cac5.arrow


In [111]:
queries = []
passages = []
titles = []
for i in range(10000):
    queries.append(nq[i]['query'])
    passages.append(nq[i]['text'])
    titles.append(nq[i]['title'])

In [112]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch

if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")


#We use the Bi-Encoder to encode all passages, so that we can use it with sematic search
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 512     #Truncate long passages to 256 tokens

#The bi-encoder will retrieve 100 documents. We use a cross-encoder, to re-rank the results list to improve the quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [113]:
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [114]:
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np


# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)

        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenized_doc.append(token)
    return tokenized_doc


tokenized_corpus = []
for passage in tqdm(passages):
    tokenized_corpus.append(bm25_tokenizer(passage))

bm25 = BM25Okapi(tokenized_corpus)


  0%|          | 0/10000 [00:00<?, ?it/s]

In [115]:
def retrieve(query, num_bi_encoder, num_bm_25):
    hits = []
    if num_bm_25 > 0:
        bm25_scores = bm25.get_scores(bm25_tokenizer(query))
        top_n = np.argpartition(bm25_scores, -num_bm_25)[-num_bm_25:]
        hits.extend(top_n)
    if num_bi_encoder > 0:
        question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
        question_embedding = question_embedding.cuda()
        bi_encoder_hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=num_bi_encoder)
        bi_encoder_hits = bi_encoder_hits[0]  # Get the hits for the first query
        hits.extend(hit['corpus_id'] for hit in bi_encoder_hits)
    return list(set(hits))

In [116]:
def rerank(query, passage_ids, top_k, use_titles=False):
    if use_titles:
        cross_inp = [[query, passages[idx] + '[SEP]' + titles[idx]] for idx in passage_ids]
    else:
        cross_inp = [[query, passages[idx]] for idx in passage_ids]
    cross_scores = cross_encoder.predict(cross_inp)
    scores = [
        {
            'id': passage_ids[i],
            'cross_score': cross_scores[i]
        }
        for i in range(len(passage_ids))
    ]
    scores = sorted(scores, key=lambda x: x['cross_score'], reverse=True)
    return [score['id'] for score in scores[0:top_k]]

    

In [117]:
def search(query, top_k, num_bi_encoder, num_bm_25, use_titles=False):
    return rerank(query, retrieve(query, num_bi_encoder, num_bm_25), top_k, use_titles)

In [118]:
def search_bm_25_only(query, top_k):
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]
    return top_n

In [119]:
import random
random.seed(42)
sample = random.sample(range(1000), 400)

In [120]:
recall_at_1_combined = sum([
    1.0 if idx in search(queries[idx], 1, num_bi_encoder=25, num_bm_25=25) else 0.0
    for idx in sample
]) / len(sample)

In [121]:
# recall_at_1_combined_titles = sum([
#     1.0 if idx in search(queries[idx], 1, num_bi_encoder=25, num_bm_25=25, use_titles=True) else 0.0
#     for idx in sample
# ]) / len(sample)
recall_at_1_combined_titles = -1.0

In [122]:
recall_at_1_bi_ce = sum([
    1.0 if idx in search(queries[idx], 1, num_bi_encoder=50, num_bm_25=0) else 0.0
    for idx in sample
]) / len(sample)

In [123]:
recall_at_1_bm25_ce = sum([
    1.0 if idx in search(queries[idx], 1, num_bi_encoder=0, num_bm_25=50) else 0.0
    for idx in sample
]) / len(sample)

In [124]:
recall_at_1_bm25 = sum([
    1.0 if idx in search_bm_25_only(queries[idx], 1) else 0.0
    for idx in sample
]) / len(sample)

In [125]:
print('dataset: trec-covid')
print(f' recall@1 bm25: {recall_at_1_bm25:.2f}', )
print(f' recall@1 bm25 + cross-encoder: {recall_at_1_bm25_ce:.2f}')
print(f' recall@1 bi-encoder + cross-encdoer: {recall_at_1_bi_ce:.2f}')
print(f' recall@1 bm25 + bi-encoder + cross-encoder: {recall_at_1_combined:.2f}')

dataset: trec-covid
 recall@1 bm25: 0.49
 recall@1 bm25 + cross-encoder: 0.64
 recall@1 bi-encoder + cross-encdoer: 0.64
 recall@1 bm25 + bi-encoder + cross-encoder: 0.67


In [39]:
print(f' recall@1 bm25 + bi-encoder + cross-encoder with titles: {recall_at_1_combined_titles:.2f}')

 recall@1 bm25 + bi-encoder + cross-encoder with titles: 0.87


In [127]:
def search_print(*args, **kwargs):
    results = search(*args, **kwargs)
    for result in results:
        print(passages[result])

In [128]:
search_print('can you get covid from touch surfaces', 3, 25, 25, use_titles=True)

_**Dear Heloise** : I work in a public library and witness people sneeze and cough into their hands and then go on to use the computer keyboard or mouse._ _Others, unaware, use the same keyboard and mouse. I caution public computer users that germs abound on these surfaces._ _I use a disposable bleach wipe and give these surfaces a quick swipe._ _I benefit, but so do others who use the same computer workstation later!_ **Flannery C. in Columbus, Ohio** **Flannery C.** : Yuck and double yuck! How right you are about using any public keyboard: at a kiosk such as at the airport, an ATM or a parking garage, and even buttons in an elevator. Here’s a hint from Heloise: If you must touch a button, etc., use your knuckle rather than your fingertip. You are less likely to rub your eyes and transfer the germs to same if you use your knuckle to push the button. Stay healthy! _**Dear Heloise** : I’ve had many jobs and job interviews through the years. The interview question I used to fear the most

In [None]:
search_print('covid surface', 3, 25, 25)

Abstract Healthcare workers run the risk of contracting COVID-19 during the course of their work if required precautions and usage of appropriate personal protective equipment is not adhered to In the transfusion testing laboratory, indirect exposure to COVID-19 may result from environmental contamination that may occur through surface contact or be airborne Handling of potentially contaminated surfaces such as sample tubes and sample packaging, and subsequent self-inoculation through the mucous membranes of mouth, nose and eyes may occur Information on the risk of indirect contact transmission of COVID-19 from such surfaces is limited When risk assessments are conducted in the laboratory, assumptions often need to be based on studies derived from other coronaviruses and respiratory viruses such as HCoV-229E, SARS-CoV, MERS-CoV and Influenza The other mode of potential indirect transmission is through aerosols generated from certain laboratory processes The risk of inhaling aerosolized