In [None]:
## https://github.com/UKPLab/sentence-transformers/tree/master/examples/applications/semantic-search/semantic_search_wikipedia_qa.py
## https://colab.research.google.com/drive/11GunvCqJuebfeTlgbJWkIMT0xJH6PWF1?usp=sharing

In [None]:
import gzip
import json
import logging
import math
import time
import os
import sys
from concurrent.futures import ThreadPoolExecutor
from typing import Generator, Iterable, List

import click
import torch
from sentence_transformers import SentenceTransformer, util

# A (batch_id, batch_data) pair, as yielded by get_data_in_x_batches.
Batch = tuple[int, Iterable]
# One passage is a [title, paragraph] pair of strings (see get_passages).
Passage = List[str]

In [None]:
def get_logger() -> logging.Logger:
    """Return the "simple-wiki-job" logger, configured to write to stdout.

    The logger and its handler are both set to DEBUG. `logging.getLogger`
    returns the same object for the same name, so the stdout handler is
    attached only when the logger has no handler yet; calling this function
    again (for example by re-running the notebook cell) does not duplicate
    every log line.

    Returns:
        The configured `logging.Logger` instance.
    """
    logger = logging.getLogger("simple-wiki-job")
    logger.setLevel(logging.DEBUG)

    # Only attach a handler once per process; handlers accumulate otherwise.
    if not logger.handlers:
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            "[%(asctime)s] [%(name)s] [%(levelname)s] [%(funcName)s]: %(message)s"
        )
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

In [None]:
# Notebook-wide logger; the helper functions defined in later cells log through it.
logger = get_logger()

In [None]:
def info_on_gpu_setup():
    """Log the CUDA devices visible to torch, or a warning when there are none."""
    if not torch.cuda.is_available():
        logger.warning("No GPU found. Please add GPU to your setup.")
        return

    no_of_gpus = torch.cuda.device_count()
    logger.info("CUDA found. Available devices:")
    # One log line per device index, with the name torch reports for it.
    for i in range(no_of_gpus):
        logger.info(f"Device {i}: {torch.cuda.get_device_name(i)}")

In [None]:
def get_data_in_x_batches(data: Iterable, num_batches: int = 1) -> Generator:
    """Split `data` into at most `num_batches` contiguous batches.

    The batch size is `ceil(len(data) / num_batches)`, so fewer than
    `num_batches` batches may be produced (e.g. 5 items in 4 batches gives
    batches of 2, 2 and 1). Empty `data` produces no batches.

    Args:
        data: A sized, sliceable collection (it must support `len()` and
            slicing, e.g. a list).
        num_batches: Upper bound on the number of batches; must be >= 1.

    Yields:
        `(batch_id, batch)` tuples, where `batch_id` counts up from 0 and
        `batch` is the corresponding slice of `data`.

    Raises:
        ValueError: If `num_batches` is smaller than 1 (raised when the
            generator is first iterated).
    """
    if num_batches < 1:
        raise ValueError(f"num_batches must be >= 1, got {num_batches}")

    total = len(data)
    if total == 0:
        # A zero batch size would make range() raise on its step argument.
        return

    batch_size = math.ceil(total / num_batches)
    for batch_id, start in enumerate(range(0, total, batch_size)):
        yield (batch_id, data[start : start + batch_size])

In [None]:
def process_batch(one_model_per_gpu: list):
    """Build a callback that encodes a batch with the model matching its id.

    Args:
        one_model_per_gpu: Models indexed by GPU number; entry `k` is used
            for the batch whose id is `k`, with device `cuda:k`.

    Returns:
        A function taking a `(batch_id, data)` payload and returning the
        result of that model's `encode(..., convert_to_tensor=True)`.
    """

    def _process_batch(payload: Batch):
        gpu_index, texts = payload
        encoder = one_model_per_gpu[gpu_index]
        target_device = f"cuda:{gpu_index}"
        return encoder.encode(texts, convert_to_tensor=True, device=target_device)

    return _process_batch

In [None]:
def get_simple_wikipedia_path(wikipedia_filepath: str) -> str:
    """Make sure the prepared Simple English Wikipedia dump exists locally.

    If nothing exists at `wikipedia_filepath`, the prepared dump is
    downloaded to that location first. The dump is later split into
    paragraphs that are encoded with the bi-encoder.

    Args:
        wikipedia_filepath: Where the gzipped JSONL dump is (or should be) stored.

    Returns:
        `wikipedia_filepath`, unchanged.
    """
    if os.path.exists(wikipedia_filepath):
        return wikipedia_filepath

    logger.info("Simple English Wikipedia not found locally. Downloading.")
    source_url = "http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz"
    util.http_get(source_url, wikipedia_filepath)
    return wikipedia_filepath

In [None]:
def get_passages(wikipedia_filepath: str) -> List[Passage]:
    """Read the gzipped JSONL dump and return one passage per paragraph.

    Each line of the file is parsed as a JSON object with a "title" and a
    list of "paragraphs"; every paragraph becomes a `[title, paragraph]` pair.

    Args:
        wikipedia_filepath: Path to the gzip-compressed, UTF-8 JSONL file.

    Returns:
        All `[title, paragraph]` pairs, in file order.
    """
    with gzip.open(wikipedia_filepath, "rt", encoding="utf8") as fIn:
        articles = (json.loads(raw_line.strip()) for raw_line in fIn)
        # We encode the passages as [title, text]
        passages = [
            [article["title"], paragraph]
            for article in articles
            for paragraph in article["paragraphs"]
        ]

    logger.info(f"Number of passages: {len(passages)}")
    return passages

In [None]:
def get_model_per_gpu(
    cache_folder: str, model_name: str, no_of_gpus: int
) -> List[SentenceTransformer]:
    """Instantiate the model once per GPU.

    Args:
        cache_folder: Directory handed to `SentenceTransformer` as its cache folder.
        model_name: Name of the model to load.
        no_of_gpus: Number of copies to create; copy `k` is created with
            device `cuda:k`.

    Returns:
        The models, ordered by GPU index.
    """
    models = []
    for gpu_index in range(no_of_gpus):
        replica = SentenceTransformer(
            model_name,
            cache_folder=cache_folder,
            device=f"cuda:{gpu_index}",
        )
        models.append(replica)
    return models

In [None]:
def save_single_pt_file(
    model_name: str, embeddings_dir: str, pt_file_prefix: str, results: List
):
    """Concatenate all per-GPU embeddings on cuda:0 and save them as one file.

    The target file is `<embeddings_dir>/<pt_file_prefix>--<model_name>.pt`.
    Failures while concatenating or saving are logged, not raised, so the
    caller can fall back to saving one file per GPU.

    Args:
        model_name: Model name, used in the file name.
        embeddings_dir: Directory the file is written to.
        pt_file_prefix: Prefix of the file name.
        results: Tensors to concatenate, in order.
    """
    # Build the path before entering the try block so the error handler can
    # always reference it.
    trg = os.path.join(embeddings_dir, f"{pt_file_prefix}--{model_name}.pt")
    try:
        # torch.cat needs all tensors on one device; gather them on cuda:0.
        cated = torch.cat([tensor.to("cuda:0") for tensor in results])
        torch.save(cated, trg)
        logger.info(f"Saved file: {trg}")
    except Exception as ex:
        logger.error(
            f"Failed to save embedding as single file {trg=}. Try again as"
            f" separate files. Error we got: {ex}"
        )


In [None]:
def save_multiple_pt_files(
    model_name: str,
    embeddings_dir: str,
    pt_file_prefix: str,
    results: List,
    no_of_gpus: int,
):
    """Save each per-GPU result to its own file.

    File `i` is written to
    `<embeddings_dir>/<pt_file_prefix>--<model_name>.<i>.pt`. At most
    `no_of_gpus` files are written; when `results` holds fewer entries
    (the batching step can produce fewer batches than GPUs) only the
    available entries are saved and a warning is logged. A failure while
    saving is logged, not raised, and stops the remaining saves.

    Args:
        model_name: Model name, used in the file names.
        embeddings_dir: Directory the files are written to.
        pt_file_prefix: Prefix of the file names.
        results: One entry per encoded batch, indexed like the GPUs.
        no_of_gpus: Number of GPUs that were used for encoding.
    """
    # Pre-bind so the error handler can always reference it.
    trg = None
    files_to_write = min(no_of_gpus, len(results))
    if files_to_write < no_of_gpus:
        logger.warning(
            f"Got {len(results)} result(s) for {no_of_gpus} GPU(s); "
            f"saving {files_to_write} file(s)."
        )
    try:
        for i in range(files_to_write):
            trg = os.path.join(embeddings_dir, f"{pt_file_prefix}--{model_name}.{i}.pt")
            logger.info(f"Saving cuda:{i} to {trg=}")
            torch.save(results[i], trg)
            logger.info(f"Saved file: {trg}")
    except Exception as ex:
        logger.error(
            f"Failed to save embedding as a separate file {trg=}. Error we got: {ex}"
        )


In [None]:
# Notebook configuration.
# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
model_name = 'nq-distilbert-base-v1'
model_cache_dir = '/home/jupyter/models'
data_dir = '/home/jupyter/data/workdir'
embeddings_dir = '/home/jupyter/data/embeddings'
no_of_gpus = 4  # number of model copies / data batches used for encoding
save_single_pt = 1  # truthy: save one combined .pt file; falsy: save one file per GPU
pt_file_prefix = 'createdbyjupyter'  # prefix of the saved embedding file name(s)

# This instance is the one `search` uses to encode queries.
bi_encoder = SentenceTransformer(model_name, cache_folder=f"{model_cache_dir}/{model_name}")
top_k = 5  # Number of passages we want to retrieve with the bi-encoder

In [None]:
# Per-model cache directory; created below and passed to get_model_per_gpu.
model_cache_folder = os.path.join(model_cache_dir, model_name)

In [None]:
# Create the working directories up front; exist_ok=True keeps the cell safe to re-run.
os.makedirs(model_cache_folder, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)
os.makedirs(embeddings_dir, exist_ok=True)

In [None]:
# Local path of the gzipped Simple English Wikipedia dump inside data_dir.
wikipedia_filepath = os.path.join(data_dir, "simplewiki-2020-11-01.jsonl.gz")

In [None]:
# Log which CUDA devices are visible before starting the encoding job.
info_on_gpu_setup()

In [None]:
# Download the dump if it is not on disk yet; the call returns the path it was given.
get_simple_wikipedia_path(wikipedia_filepath)

In [None]:
# Every paragraph of the dump as a [title, paragraph] pair; `search` indexes into this list.
passages = get_passages(wikipedia_filepath)

In [None]:
# One model instance per GPU, created on devices cuda:0 .. cuda:(no_of_gpus - 1).
one_model_per_gpu = get_model_per_gpu(model_cache_folder, model_name, no_of_gpus)

In [None]:
# Materialise the batches as a list: the generator returned by
# get_data_in_x_batches can be consumed only once, so re-running the encoding
# cell on its own would otherwise silently produce an empty result list.
data_batches = list(get_data_in_x_batches(passages, num_batches=no_of_gpus))

In [None]:
# Bind the per-GPU models into a callable that encodes one (batch_id, data) pair.
process_batch_with_model = process_batch(one_model_per_gpu)

In [None]:
# Encode all batches concurrently, one worker thread per GPU.
# executor.map yields results in the order of its inputs, so results[i]
# holds the embeddings of the i-th batch.
with ThreadPoolExecutor(max_workers=no_of_gpus) as executor:
    results = list(executor.map(process_batch_with_model, data_batches))

In [None]:
# Persist the embeddings: either one combined file or one file per GPU,
# depending on the `save_single_pt` flag from the configuration cell.
# Both helpers log save failures instead of raising them.
if save_single_pt:
    logger.info("Trying to put all embeddings to one GPU.")
    save_single_pt_file(model_name, embeddings_dir, pt_file_prefix, results)
else:
    logger.info("Saving one file per GPU.")
    save_multiple_pt_files(
        model_name, embeddings_dir, pt_file_prefix, results, no_of_gpus
    )

In [None]:
# Path of the combined embeddings file, built the same way save_single_pt_file
# names it, so it follows any change to the configuration values.
embeddings_filepath = os.path.join(embeddings_dir, f"{pt_file_prefix}--{model_name}.pt")

In [None]:
# Load the saved embeddings onto the CPU regardless of the device they were saved from.
# NOTE(review): torch.load unpickles the file; only load embedding files you created yourself.
corpus_embeddings = torch.load(embeddings_filepath, map_location=torch.device('cpu'))
# Cast to float32 before the embeddings are used for similarity search.
corpus_embeddings = corpus_embeddings.float()

In [None]:
def search(query):
    """Print the `top_k` passages most similar to `query`.

    The query is encoded with the notebook-level `bi_encoder` and compared
    against `corpus_embeddings`; each hit is printed as its score followed by
    the matching entry of `passages`. The elapsed time covers encoding plus
    retrieval only.

    Args:
        query: The question or keywords to search for.
    """
    started_at = time.time()
    # Encode the query using the bi-encoder and find potentially relevant passages
    query_vector = bi_encoder.encode(query, convert_to_tensor=True)
    per_query_hits = util.semantic_search(query_vector, corpus_embeddings, top_k=top_k)
    best_matches = per_query_hits[0]  # only one query was submitted
    elapsed = time.time() - started_at

    # Output of top-k hits
    print("Input question:", query)
    print("Results (after {:.3f} seconds):".format(elapsed))
    for match in best_matches:
        matched_passage = passages[match['corpus_id']]
        print("\t{:.3f}\t{}".format(match['score'], matched_passage))

In [None]:
search(query = "What is the capital of the France?")

In [None]:
search(query = "What is the best orchestra in the world?")

In [None]:
search(query = "Number countries Europe")

In [None]:
search(query = "When did the cold war end?")

In [None]:
search(query = "How long do cats live?")

In [None]:
search(query = "How many people live in Toronto?")

In [None]:
search(query = "Oldest US president")

In [None]:
search(query = "Coldest place earth")

In [None]:
search(query = "When was Barack Obama born?")

In [None]:
search(query = "Paris eiffel tower")

In [None]:
search(query = "Which US president was killed?")

In [None]:
search(query="When is Chinese New Year")

In [None]:
search(query="what is the name of manchester united stadium")

In [None]:
search(query="who wrote cant get you out of my head lyrics")

In [None]:
search(query="where does the story the great gatsby take place")

In [None]:
search(query="who turned out to be the mother on how i met your mother")

In [None]:
# Sanity check: score one query against a single hand-written passage.
# SentenceTransformer and util come from the imports cell at the top of the
# notebook. The model is loaded via `model_name` so the checkpoint always
# matches the cache folder it is stored in.
model = SentenceTransformer(model_name, cache_folder=f"{model_cache_dir}/{model_name}")

query_embedding = model.encode('How many people live in London?')

# The passages are encoded as [title, text]
passage_embedding = model.encode([['London', 'London has 9,787,426 inhabitants at the 2011 census.']])

print("Similarity:", util.pytorch_cos_sim(query_embedding, passage_embedding))

In [None]:
# Compare one query against two hand-picked passages: the first is about the
# show's "Mother" character, the second is about a song.
query_embedding = model.encode('who turned out to be the mother on how i met your mother')

# The passages are encoded as [title, text]
passage_embedding = model.encode([['The Mother (How I Met Your Mother)', 'The Mother (How I Met Your Mother) Tracy McConnell (colloquial: "The Mother") is the title character from the CBS television sitcom "How I Met Your Mother". The show, narrated by Future Ted (Bob Saget), tells the story of how Ted Mosby (Josh Radnor) met The Mother. Tracy McConnell appears in eight episodes, from "Lucky Penny" to "The Time Travelers", as an unseen character; she was first seen fully in "Something New" and was promoted to a main character in season 9. The Mother is played by Cristin Milioti. The story of how Ted met The Mother is the framing device'],
                                  ['Make It Easy on Me', 'and Pete Waterman on her 1993 album "Good \'N\' Ready", on which a remixed version of the song is included. "Make It Easy On Me", a mid-tempo R&B jam, received good reviews (especially for signalling a different, more soulful and mature sound atypical of the producers\' Europop fare), but failed to make an impact on the charts, barely making the UK top 100 peaking at #99, and peaking at #52 on the "Billboard" R&B charts. The pop group Steps covered the song on their 1999 album "Steptacular". It was sung as a solo by Lisa Scott-Lee. Make It Easy on']])

# Prints the cosine similarity between the query and each of the two passages.
print("Similarity:", util.pytorch_cos_sim(query_embedding, passage_embedding))

In [None]:
# Compare one query against two hand-picked passages: the first is about the
# novel the query asks about, the second is about a 1967 film.
query_embedding = model.encode('where does the story the great gatsby take place')
# The passages are encoded as [title, text]
passage_embedding = model.encode([['The Great Gatsby', 
 'The Great Gatsby The Great Gatsby is a 1925 novel written by American author F. Scott Fitzgerald that follows a cast of characters living in the fictional towns of West Egg and East Egg on prosperous Long Island in the summer of 1922. The story primarily concerns the young and mysterious millionaire Jay Gatsby and his quixotic passion and obsession with the beautiful former debutante Daisy Buchanan. Considered to be Fitzgerald\'s magnum opus, "The Great Gatsby" explores themes of decadence, idealism, resistance to change, social upheaval, and excess, creating a portrait of the Roaring Twenties that has been described as'],
 ['The Producers (1967 film)', '2005 (to coincide with the remake released that year). In 2011, MGM licensed the title to Shout! Factory to release a DVD and Blu-ray combo pack with new HD transfers and bonus materials. StudioCanal (worldwide rights holder to all of the Embassy Pictures library) released several R2 DVD editions and Blu-ray B releases using a transfer slightly different from the North Ameri can DVD and BDs. The Producers (1967 film) The Producers is a 1967 American satirical comedy film written and directed by Mel Brooks and starring Zero Mostel, Gene Wilder, Dick Shawn, and Kenneth Mars. The film was Brooks\'s directorial']
])

# Prints the cosine similarity between the query and each of the two passages.
print("Similarity:", util.pytorch_cos_sim(query_embedding, passage_embedding))
