
#### Intro
This example demonstrates Approximate Nearest Neighbor (ANN) search with [Hnswlib](https://github.com/nmslib/hnswlib/).

Install it with: `pip install hnswlib` (along with other requirements)

The embedding model is 'msmarco-distilbert-base-v4' from SBERT (Sentence-Transformers).

The dataset is our own (at `../dish-washer-data.csv`); because it is small, we can also run an exact (naive) search over it and compare that with the ANN results.



In [None]:
!pip install hnswlib
!pip install pandas
!pip3 install torch
!pip install sentence_transformers

In [1]:
import os
import csv
import pickle
import time
import hnswlib
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Name of the pretrained checkpoint handed to SentenceTransformer below.
model_name = 'msmarco-distilbert-base-v4'
embeddings_model = SentenceTransformer(model_name)
# Used as the `dim` of the hnswlib index, so it has to agree with the length of
# the vectors the model produces (the cached-embeddings cell prints 768).
embedding_size = 768    #Size of embeddings
# Number of neighbours requested from both the ANN index and the exact search.
top_k_hits = 5         #Output k hits

In [3]:
def get_embedding(text: str, model: SentenceTransformer) -> list[float]:
    """Encode a single text with ``model``.

    The text is wrapped in a one-element list before it is passed to
    ``model.encode``; whatever ``encode`` returns for that one-item batch is
    handed back unchanged, so callers receive a batch of one embedding rather
    than a bare vector.
    """
    single_item_batch = [text]
    return model.encode(single_item_batch)

def get_doc_embedding(text: str, model: SentenceTransformer) -> list[float]:
    """Embed a document passage by delegating to ``get_embedding``."""
    doc_embedding = get_embedding(text, model)
    return doc_embedding

def get_query_embedding(text: str, model: SentenceTransformer) -> list[float]:
    """Embed a search query by delegating to ``get_embedding``."""
    query_embedding = get_embedding(text, model)
    return query_embedding

def compute_doc_embeddings(df: pd.DataFrame, model: SentenceTransformer) -> dict[tuple[str, str], list[float]]:
    """
    Embed the ``content`` field of every row in the dataframe.

    Newlines in the content are replaced with spaces before encoding.
    Return a dictionary that maps each row's index label to the embedding computed for that row.
    """
    embeddings_by_label = {}
    for row_label, row in df.iterrows():
        flattened_content = row.content.replace("\n", " ")
        embeddings_by_label[row_label] = get_doc_embedding(flattened_content, model)
    return embeddings_by_label

In [4]:
# Cache file name is derived from the model name ('/' replaced so it is a valid file name).
embeddings_cache_path = f"dishwasher-repair-manual-embeddings-{model_name.replace('/', '_')}.pkl"

# Compute the embeddings once and cache them on disk; later runs reuse the cache.
# Both branches must leave `corpus_sentences` (DataFrame) and `embeddings_by_row`
# (index label -> embedding) defined, so the cells below behave the same on a
# first run as on a cached run.
if not os.path.exists(embeddings_cache_path):
    df = pd.read_csv('../dish-washer-data.csv')
    df["tokens"] = pd.to_numeric(df["tokens"])  # convert column "tokens" of a DataFrame
    df = df.set_index(["title", "heading"])
    print(f"{len(df)} rows in the data.")
    # sample() raises if asked for more rows than exist, so cap the sample size
    print(df.sample(min(10, len(df))))
    # TODO get some stats on max/min content length

    # This could take a bit of time
    print("Encoding the corpus. This might take a while...")
    embeddings_by_row = compute_doc_embeddings(df, embeddings_model)

    print("Store file...")
    with open(embeddings_cache_path, "wb") as f_out:
        pickle.dump({'sentences': df, 'embeddings': embeddings_by_row}, f_out)
    corpus_sentences = df
else:
    print("Loading pre-computed embeddings from disc...")
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    with open(embeddings_cache_path, "rb") as f_in:
        cache_data = pickle.load(f_in)
    corpus_sentences = cache_data['sentences']
    embeddings_by_row = cache_data['embeddings']
    print("Loading done.")

corpus_size = len(corpus_sentences)
# Drop the index labels: downstream cells address embeddings by row position.
corpus_embeddings = list(embeddings_by_row.values())
print(f'Length, corpus sentences:{len(corpus_sentences)}')
print(f'Length, corpus embeddings:{len(corpus_embeddings)}')
if corpus_embeddings:
    # Each stored embedding is a one-item batch, hence the extra [0]
    print(f'one vector dimension: {len(corpus_embeddings[0][0])}')

print(f"Sentence samples\n {corpus_sentences.sample(min(5, len(corpus_sentences)))}")
        


Loading pre-computed embeddings from disc...
Loading done.
Length, corpus sentences:149
Length, corpus embeddings:149
one vector dimension: 768
Sentence samples
                                                              content  tokens
title     heading                                                           
Chapter_6 352       However, if you're a DIY'er, it's a BIG probl...     105
          436       If your machine uses a blower to dry the dish...      85
Chapter_3 174       Etching is caused by overly acidic conditions...     116
Chapter_6 361       If you see dotted or shaded lines around a gr...      73
          378       Sometimes you can eliminate possibilities jus...      97


In [7]:
#Defining our hnswlib index
index_path = "./hnswlib.index"
index = hnswlib.Index(space = 'cosine', dim = embedding_size)

# Normalise the embeddings to a 2-D (n_items, embedding_size) array up front.
# An axis-less squeeze() would also collapse the item axis when the corpus holds
# a single document, leaving a 1-D array whose len() no longer equals the number
# of items; an explicit reshape keeps the item axis in every case.
num_items = len(corpus_embeddings)
corpus_embeddings = np.asarray(corpus_embeddings).reshape(num_items, embedding_size)

if os.path.exists(index_path):
    # NOTE: the saved index is loaded as-is; nothing here checks that it was
    # built from the current corpus. Delete the file to force a rebuild.
    print("Loading index...")
    index.load_index(index_path)
    print('Loading index done.')
else:
    ### Create the HNSWLIB index
    print("Start creating HNSWLIB index")
    # TODO check is ef_consturction and M and appropriate for this dataset
    index.init_index(max_elements = num_items, ef_construction = 400, M = 64)

    # Row position doubles as the item id, so query hits can be mapped back to
    # rows of corpus_sentences positionally.
    item_ids = list(range(num_items))
    print(item_ids)
    print('shape', corpus_embeddings.shape)
    print(f'corpus embeddings:\n{corpus_embeddings}')
    index.add_items(corpus_embeddings, item_ids)
    # ?? index.add_items(corpus_embeddings, list(corpus_sentences.index))

    print("Saving index to:", index_path)
    index.save_index(index_path)

# Controlling the recall by setting ef:
# ef must not be lower than the number of neighbours requested per query.
index.set_ef(max(50, top_k_hits))



Start creating HNSWLIB index
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148]
shape (149, 768)
corpus embeddings:
[[ 0.19535312  0.3732101   0.00957511 ... -0.33160087  0.40261486
   0.00784518]
 [ 0.2158837   0.24450128  0.04495094 ... -0.44080067  0.05832485
  -0.31278768]
 [ 0.02364053 -0.06990882 -0.00177497 ... -0.30113485  0.51544154
   0.23895846]
 ...
 [-0.24814965 -0.4099893   0.35149598 ... -0.107722

In [8]:
# Choose questions
# Exactly one assignment should be active; the following cells embed and search
# for whatever `inp_question` holds.
# inp_question = "Why is my dishwasher leaking?"
inp_question = "Why do we use a dishwasher?"

In [9]:
start_time = time.time()
question_embedding = embeddings_model.encode(inp_question)

# Ask the hnswlib index for the top_k_hits nearest neighbours of the question
corpus_ids, distances = index.knn_query(question_embedding, k=top_k_hits)

# Only one query was issued, so row 0 holds its ids and distances.
# The reported score is 1 - distance; best score first.
hits = [
    {'corpus_id': neighbour_id, 'score': 1 - distance}
    for neighbour_id, distance in zip(corpus_ids[0], distances[0])
]
hits.sort(key=lambda entry: entry['score'], reverse=True)
end_time = time.time()
print(f'Hits\n, {hits}')

print("Input question:", inp_question)
print("Results (after {:.3f} seconds):".format(end_time - start_time))
for hit in hits[:top_k_hits]:
    # corpus_id is a row position, so look the passage up with iloc
    print(f"\t{hit['score']:.3f}\t{corpus_sentences.iloc[hit['corpus_id']].content}")
    



Hits
, [{'corpus_id': np.uint64(1), 'score': np.float32(0.76414967)}, {'corpus_id': np.uint64(106), 'score': np.float32(0.63856494)}, {'corpus_id': np.uint64(0), 'score': np.float32(0.60734546)}, {'corpus_id': np.uint64(15), 'score': np.float32(0.6050055)}, {'corpus_id': np.uint64(3), 'score': np.float32(0.59901273)}]
Input question: Why do we use a dishwasher?
Results (after 0.344 seconds):
	0.764	 The main reason dishwashers exist is that they allow dishes to be washed in water much hotter than you can use when washing dishes by hand. This allows greater grease-cutting and sterilization of the dishes. They are NOT made to operate under cold water conditions or to ingest your disgusting, moldy leftovers, no matter what the sales literature says. And using cheap soap and hard water (without making some adjustments) can shorten their lives considerably.
	0.639	 Nowadays, dishwashers are being made as efficient as possible, due in no small part to government energy efficiency requirement

In [10]:
# Approximate Nearest Neighbor (ANN) is not exact, it might miss entries with high cosine similarity
# Here, we compute the recall of ANN compared to the exact results
correct_hits = util.semantic_search(torch.tensor(question_embedding),
                                    torch.tensor(corpus_embeddings).squeeze(),
                                    top_k=top_k_hits)[0]

print(f'Correct hits:\n {correct_hits}')

for hit in correct_hits[0:top_k_hits]:
    print(f"\t{hit['score']:.3f}\t{corpus_sentences.iloc[hit['corpus_id']].content}")

# Normalise ids to plain ints so the exact-search ids and the numpy integer ids
# returned by the ANN index compare as the same set members.
correct_hits_ids = {int(hit['corpus_id']) for hit in correct_hits}
ann_corpus_ids = {int(hit['corpus_id']) for hit in hits}
if len(ann_corpus_ids) != len(correct_hits_ids):
    print("Approximate Nearest Neighbor returned a different number of results than expected")

# Fraction of the exact top-k that the ANN search also returned
recall = len(ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids)
print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))

if recall < 1:
    print("Missing results:")
    for hit in correct_hits[0:top_k_hits]:
        if int(hit['corpus_id']) not in ann_corpus_ids:
            # corpus_sentences is a DataFrame: plain [] would be a column lookup,
            # so fetch the row by position and print its content.
            print("\t{:.3f}\t{}".format(hit['score'], corpus_sentences.iloc[hit['corpus_id']].content))
print("\n\n========\n")


Correct hits:
 [{'corpus_id': 1, 'score': 0.7641496062278748}, {'corpus_id': 106, 'score': 0.638565182685852}, {'corpus_id': 0, 'score': 0.6073455214500427}, {'corpus_id': 15, 'score': 0.60500568151474}, {'corpus_id': 3, 'score': 0.5990124940872192}]
	0.764	 The main reason dishwashers exist is that they allow dishes to be washed in water much hotter than you can use when washing dishes by hand. This allows greater grease-cutting and sterilization of the dishes. They are NOT made to operate under cold water conditions or to ingest your disgusting, moldy leftovers, no matter what the sales literature says. And using cheap soap and hard water (without making some adjustments) can shorten their lives considerably.
	0.639	 Nowadays, dishwashers are being made as efficient as possible, due in no small part to government energy efficiency requirements. Heating water can use a lot of energy, so designers are mimimizing water usage and heater operation. The trick is in achieving a balance; tha