# 1. Get query and document encoders

In [5]:
import numpy as np

In [6]:
from sentence_transformers import SentenceTransformer
import torch

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The query side and the document side each get their own encoder instance,
# both loaded from the same checkpoint name.
MODEL_NAME = "all-MiniLM-L6-v2"

query_model = SentenceTransformer(MODEL_NAME, device=device)

document_model = SentenceTransformer(MODEL_NAME, device=device)

# Sentences we want to encode. Example:

# Sentences are encoded by calling model.encode()

# 2a. Get dataset

In [7]:
from datasets import load_dataset

In [8]:
# Load the "v2.1" configuration of the "ms_marco" dataset and keep a direct
# handle on its training split.
dataset = load_dataset("ms_marco", "v2.1")
train_dataset = dataset["train"]

Downloading readme: 100%|██████████| 9.48k/9.48k [00:00<00:00, 9.97MB/s]
Downloading data: 100%|██████████| 210M/210M [00:01<00:00, 108MB/s]  
Downloading data: 100%|██████████| 240M/240M [00:02<00:00, 112MB/s]  
Downloading data: 100%|██████████| 240M/240M [00:02<00:00, 111MB/s]  
Downloading data: 100%|██████████| 241M/241M [00:02<00:00, 111MB/s]  
Downloading data: 100%|██████████| 242M/242M [00:02<00:00, 102MB/s]  
Downloading data: 100%|██████████| 242M/242M [00:02<00:00, 110MB/s]  
Downloading data: 100%|██████████| 242M/242M [00:02<00:00, 112MB/s]  
Downloading data: 100%|██████████| 244M/244M [00:02<00:00, 112MB/s]  
Downloading data: 100%|██████████| 204M/204M [00:02<00:00, 99.7MB/s] 
Generating validation split: 100%|██████████| 101093/101093 [00:01<00:00, 72970.01 examples/s]
Generating train split: 100%|██████████| 808731/808731 [00:10<00:00, 77588.90 examples/s]
Generating test split: 100%|██████████| 101092/101092 [00:01<00:00, 81164.25 examples/s]


In [9]:
def convert_to_query_pairs(row):
    """Expand one dataset row into a list of query/passage dicts.

    Every text in ``row["passages"]["passage_text"]`` is paired with
    ``row["query"]``. The result holds one ``{"query": ..., "passage": ...}``
    dict per passage, in the same order as the passages appear in the row.
    """
    return [
        {"query": row["query"], "passage": text}
        for text in row["passages"]["passage_text"]
    ]

In [10]:

# pairs =  convert_to_query_pairs(one_dataset[0])

# Keep only the training rows that carry at least one well-formed answer.
filtered_dataset = [row for row in train_dataset if len(row["wellFormedAnswers"]) > 0]

# One query per kept row, in row order.
queries = [row["query"] for row in filtered_dataset]

# All passage texts of the kept rows, flattened into a single list
# (row order first, then passage order within each row).
passages = [
    text
    for row in filtered_dataset
    for text in row["passages"]["passage_text"]
]
        

In [11]:
# Encode every collected passage with the document encoder. The passage list
# is large, so the progress bar is forced on instead of depending on the
# logging level; convert_to_numpy is spelled out because the result is handed
# to FAISS and pickled further down.
document_embeddings = document_model.encode(
    passages,
    show_progress_bar=True,
    convert_to_numpy=True,
)

In [12]:
import pickle
# Store the passages and their embeddings together in one pickle file.
with open("my-embeddings.pkl", "wb") as out_file:
    payload = {'sentences': passages, 'embeddings': document_embeddings}
    pickle.dump(payload, out_file)


In [13]:
# Embedding widths reported by the two encoders.
document_dim = document_model.get_sentence_embedding_dimension()
query_dim = query_model.get_sentence_embedding_dimension()

# The FAISS index is sized with document_dim but searched with vectors from
# the query encoder, so the two widths have to agree.
assert document_dim == query_dim, (
    f"query dim {query_dim} != document dim {document_dim}"
)

# Apply FAISS


In [14]:
import faiss

In [15]:
# Build a faiss.IndexFlatL2 sized to the document embedding dimension.

index = faiss.IndexFlatL2(document_dim)

# Shown as the cell result; the recorded output for this run is True.
index.is_trained

True

In [16]:
# Only populate an empty index: running this cell a second time would
# otherwise append the same vectors again and duplicate every search hit.
if index.ntotal == 0:
    index.add(document_embeddings)

In [17]:
# Display how many entries the index holds; expected to equal len(passages)
# after the add step — TODO confirm.
index.ntotal

1533292

In [28]:
# Number of nearest neighbours to retrieve for the query.
k = 10

# Encode the single example question with the query encoder.
xq = query_model.encode(['What colours make green?'])

# Look up the k closest entries. I holds the positions of the hits (it is used
# to index `passages` in the next cell); D presumably holds the matching
# distances — confirm against the FAISS documentation.
D, I = index.search(xq, k)

print(I)

[[ 257789 1503477  930554 1503480  190644 1522522  152765  993454  257794
  1438502]]


In [29]:
# Show each returned index next to the passage text it points to.
[f'{hit}: {passages[hit]}' for hit in I[0]]

['257789: Green is the color between blue and yellow on the spectrum of visible light. It is evoked by light with a predominant wavelength of roughly 495–570 nm.',
 '1503477: Varieties of the color green may differ in hue, chroma (also called saturation or intensity) or lightness (or value, tone, or brightness), or in two or three of these qualities. Variations in value are also called tints and shades, a tint being a green or other hue mixed with white, a shade being mixed with black.ine green is a rich shade of spring green that resembles the color of pine trees. It is an official Crayola color (since 1949) that is this exact shade in the Crayola crayon, but in the markers, it is known as crocodile green.',
 '930554: Quick Answer. Green is made by mixing blue and yellow. A mixture containing more blue than yellow makes dark green, while a mixture with more yellow than blue makes lime green, and a blue-and-yellow mixture with white added makes pastel green. Continue Reading.',
 '15034