In [13]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch

# Pick the best available accelerator and always bind a device, so the
# encode() calls below work on CUDA, Apple-silicon (MPS) and CPU-only machines.
# The variable keeps the name `mps_device` because later cells pass it to
# bi_encoder.encode(); it may hold a CUDA or CPU device as well.
_mps_backend = getattr(torch.backends, "mps", None)  # missing on older torch builds
if torch.cuda.is_available():
    mps_device = torch.device("cuda")
    print("Using CUDA")
elif _mps_backend is not None and _mps_backend.is_available():
    mps_device = torch.device("mps")
    print("Using MPS")
else:
    mps_device = torch.device("cpu")
    print("Warning: No GPU found. Please add GPU to your notebook")

# Bi-encoder: embeds queries and passages into a shared vector space.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
# Long passages are truncated to 256 tokens.
bi_encoder.max_seq_length = 256

# Number of passages we want to retrieve with the bi-encoder.
top_k = 32

# Cross-encoder: scores (query, passage) pairs.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

wikipedia_filepath = './doc_samples/simplewiki-2020-11-01.jsonl.gz'

# Download the dump only when it is not already on disk.
if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)

# Each line of the gzipped file is one JSON record with a 'paragraphs' list.
# Only the first paragraph of every record is indexed; to index all
# paragraphs, collect every entry of record['paragraphs'] instead.
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
    passages = [json.loads(raw_line.strip())['paragraphs'][0] for raw_line in fIn]

print("Passages:", len(passages))

# Embed every passage with the bi-encoder. This takes about 5 minutes
# (depends on your GPU speed); a progress bar is shown while it runs.
corpus_embeddings = bi_encoder.encode(
    passages,
    batch_size=128,
    show_progress_bar=True,
    device=mps_device,
)

print('done')

Using MPS
Passages: 169597


Batches:   0%|          | 0/1325 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [2]:
# We also compare the results to lexical search (keyword search). Here, we use
# the BM25 algorithm which is implemented in the rank_bm25 package.

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np


# Tokenizer used for BM25 indexing and querying.
def bm25_tokenizer(text):
    """Split ``text`` into lower-cased BM25 tokens.

    The text is lower-cased and split on whitespace; leading and trailing
    punctuation is stripped from each piece. Pieces that end up empty or
    that appear in sklearn's ``ENGLISH_STOP_WORDS`` are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in their original order.
    """
    stripped = (word.strip(string.punctuation) for word in text.lower().split())
    return [
        word
        for word in stripped
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]


# Tokenize every passage (tqdm reports progress), then build the BM25 index
# over the resulting token lists.
tokenized_corpus = [bm25_tokenizer(doc) for doc in tqdm(passages)]

bm25 = BM25Okapi(tokenized_corpus)

  0%|          | 0/169597 [00:00<?, ?it/s]

In [7]:
# This function will search all wikipedia articles for passages that
# answer the query
def search(query):
    """Search the indexed passages for ``query`` and print three rankings.

    Printed sections, each showing the top 3 hits:
      1. Lexical search using BM25 scores.
      2. Bi-encoder retrieval (``util.semantic_search`` over ``corpus_embeddings``).
      3. The same bi-encoder candidates re-ranked by the cross-encoder.

    Relies on notebook-level globals defined in earlier cells: ``bm25``,
    ``bm25_tokenizer``, ``passages``, ``bi_encoder``, ``cross_encoder``,
    ``corpus_embeddings``, ``top_k`` and ``mps_device``.

    Args:
        query: The question to search for, as a string.

    Returns:
        None. All results are written to stdout.
    """
    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # Keep the 5 best-scoring passages as candidates. np.argpartition rejects
    # a kth index outside the array, so fall back to "all passages" when the
    # corpus holds 5 or fewer entries.
    num_candidates = min(5, len(bm25_scores))
    if num_candidates < len(bm25_scores):
        top_n = np.argpartition(bm25_scores, -num_candidates)[-num_candidates:]
    else:
        top_n = np.arange(len(bm25_scores))
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    print("Top-3 lexical search (BM25) hits")
    for hit in bm25_hits[0:3]:
        print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, device=mps_device)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach each cross-encoder score to its hit so it can be sorted on later
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    # Output of top-3 hits from bi-encoder
    print("\n-------------------------\n")
    print("Top-3 Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    for hit in hits[0:3]:
        print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))

    # Output of top-3 hits from re-ranker
    print("\n-------------------------\n")
    print("Top-3 Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:3]:
        print("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']].replace("\n", " ")))


In [8]:
# Example query.
search("Who is the half-blood prince?")

Input question: Who is the half-blood prince?
Top-3 lexical search (BM25) hits
	17.182	Harry Potter and the Half-Blood Prince was written by J. K. Rowling and published on 16 July 2005. The book is the sixth Harry Potter book.
	12.445	Frank Stephenson Dillane (born 21 April 1991) is an English actor. He is known for his role as Nick Clark in the AMC series "Fear the Walking Dead" (2015–2018). His movie roles include Tom Riddle in "Harry Potter and the Half-Blood Prince" (2009) and Owen Coffin in "In the Heart of the Sea" (2015).
	11.512	Thomas Andrew "Tom" Felton (born 22 September 1987) is an English actor and singer. Felton was born in Epsom, Surrey. He is best known for portraying the character Draco Malfoy in the Harry Potter movie series. Felton has also won a MTV Movie Award for "Best Villain" in the film version of Harry Potter and the Half-Blood Prince in 2010.

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.548	Prince Daniel of Sweden, Duke of Västergötland (bor

In [10]:
# Example query.
search("When was Peru liberated from the Spanish")

Input question: When was Peru liberated from the Spanish
Top-3 lexical search (BM25) hits
	14.040	José de San Martín (1778 - 1850) was an Argentine general and politician. He was born in Yapeyú, Corrientes, Argentina. He liberated Argentina, Peru and Chile from Spain. In 1817, he crossed the Andes from Mendoza to Chile. Together with Simón Bolívar, San Martín is called one of the Liberators of South America.
	13.955	The Viceroyalty of Peru (in Spanish, "Virreinato del Perú") was a Spanish colonial administrative district that was created in 1542 and originally contained most of Spanish-ruled South America. It was governed from the capital of Lima.
	13.129	Simón Bolívar (1783 - 1830) was a Venezuelan military and political leader. He was born in Caracas, Venezuela. He liberated many countries from Spain in South America. Those countries included Venezuela, Colombia, Ecuador, Peru, and Bolivia. For a few years he was president of Gran Colombia, a country that no longer exists. The countr

In [11]:
# Example query.
search("who wrote the hobbit?")

Input question: who wrote the hobbit?
Top-3 lexical search (BM25) hits
	17.233	The Hobbit is a three-part fantasy movie. These parts are "The Hobbit: An Unexpected Journey" (2012), "The Hobbit: The Desolation of Smaug" (2013) and "The Hobbit: The Battle of the Five Armies" (2014). The movies are based on J. R. R. Tolkien's book "The Hobbit".
	15.883	The Hobbit: An Unexpected Journey is a 2012 epic fantasy and adventure movie. It is the first part of the Hobbit trilogy.
	13.941	The Hobbit, or There and Back Again, is a book written by J. R. R. Tolkien. It was first published on September 21, 1937. Tolkien wrote it in the 1930s for his children as a bedtime story. The story takes place before "The Lord of the Rings".

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.698	The Hobbit, or There and Back Again, is a book written by J. R. R. Tolkien. It was first published on September 21, 1937. Tolkien wrote it in the 1930s for his children as a bedtime story. The story takes pla

In [12]:
# Example query.
search("Who is Snape?")

Input question: Who is Snape?
Top-3 lexical search (BM25) hits
	16.830	Snape is a village and civil parish in Suffolk Coastal, Suffolk, England. In 2001 there were 623 people living in Snape.
	13.581	Severus Snape is a character from J.K. Rowling's fictional book series Harry Potter.
	10.051	Snape Maltings is a group of buildings including a famous concert hall in the village of Snape, Suffolk. Many concerts are given in the Maltings. Every year the Aldeburgh Festival has many of its concerts there. The composer Benjamin Britten used to live in Aldeburgh with the singer Peter Pears. They started a music school for young people. The Britten-Pears Foundation still continues today, with many concerts based in the Maltings. This foundation gives young musicians a chance to learn from famous music teachers and perform at concerts. Many of them have recently graduated from a music college.

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.685	Severus Snape is a character from J.