In [2]:
"""
This example uses Approximate Nearest Neighbor Search (ANN) with Hnswlib  (https://github.com/nmslib/hnswlib/).
Install it with `pip install hnswlib`

For an embeddings model, we use the SBERT model 'msmarco-distilbert-base-v4'
For a dataset we use our own; gives us a way to compare with naive search
"""

"\nThis example uses Approximate Nearest Neighbor Search (ANN) with Hnswlib  (https://github.com/nmslib/hnswlib/).\nInstall it with `pip install hnswlib`\n\nFor an embeddings model, we use the SBERT model 'msmarco-distilbert-base-v4'\nFor a dataset we use our own; gives us a way to compare with naive search\n"

In [3]:
import os
import csv
import pickle
import time
import hnswlib
import pandas as pd
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
model_name = 'msmarco-distilbert-base-v4'
embeddings_model = SentenceTransformer(model_name)
embedding_size = 768    #Size of embeddings
top_k_hits = 5         #Output k hits


In [5]:
def get_embedding(text: str, model: SentenceTransformer) -> list[float]:
    embeddings = model.encode([text])
    return embeddings

def get_doc_embedding(text: str, model: SentenceTransformer) -> list[float]:
    return get_embedding(text, model)

def get_query_embedding(text: str, model: SentenceTransformer) -> list[float]:
    return get_embedding(text, model)

def compute_doc_embeddings(df: pd.DataFrame, model: SentenceTransformer) -> dict[tuple[str, str], list[float]]:
    """
    Create an embedding for each row in the dataframe.
    Return a dictionary that maps between each embedding vector and the index of the row that it corresponds to.
    """
    return {
        idx: get_doc_embedding(r.content.replace("\n", " "), model) for idx, r in df.iterrows()
    }

In [6]:
embeddings_cache_path = f"dishwasher-repair-manual-embeddings-{model_name.replace('/', '_')}.pkl"
# print(embeddings_cache_path)

#Check if embeddings cache path exists
if not os.path.exists(embeddings_cache_path):
    # Check if the dataset exists. If not, read them
    df = pd.read_csv('../dish-washer-data.csv')
    df["tokens"] = pd.to_numeric(df["tokens"])  # convert column "tokens" of a DataFrame
    df = df.set_index(["title", "heading"])
    print(f"{len(df)} rows in the data.")
    print(df.sample(10))
    # TODO get some stats on max/min content length
    corpus_size = df.shape[0]

    # This could take a bit of time
    print("Encoding the corpus. This might take a while...")
    corpus_embeddings = compute_doc_embeddings(df, embeddings_model)

    print("Store file...")
    with open(embeddings_cache_path, "wb") as fOut:
        pickle.dump({'sentences': df, 'embeddings': corpus_embeddings}, fOut)
else:
    print("Loading pre-computed embeddings from disc...")
    with open(embeddings_cache_path, "rb") as f_in:
        cache_data = pickle.load(f_in)
        corpus_sentences = cache_data['sentences']
        corpus_embeddings = cache_data['embeddings']
        corpus_embeddings = [v for k, v in corpus_embeddings.items()]
        print("Loading done.")
        print(f'Length, corpus sentences:{len(corpus_sentences)}')
        print(f'Length, corpus embeddings:{len(corpus_embeddings)}')
        print(f'type corpus embeddings: {type(corpus_embeddings)}')
        print(f'one vector dimension: {len(corpus_embeddings[0][0])}')

        print(f"sentence samples")
        print(corpus_sentences.sample(10))
        


Loading pre-computed embeddings from disc...
Loading done.
Length, corpus sentences:149
Length, corpus embeddings:149
type corpus embeddings: <class 'list'>
one vector dimension: 768
sentence samples
                                                             content  tokens
title     heading                                                           
Chapter_3 183       Cutlery with wood, bone or horn handles may c...      74
Chapter_4 209       The solution is to shut off the water supply ...      73
Chapter_2 98        Disassemble or remove the pump and motor unit...      91
Chapter_6 365       Remember that for something to be energized, ...      88
Chapter_4 199       To get through the cycle you've already start...      74
          216       In sidewinder machines with butterfly drain v...      78
Chapter_6 408       A motor that is trying to start, but can't fo...      85
Chapter_2 104       Take the pump housing apart as described in C...      81
Chapter_5 260       You usuall

In [12]:
#Defining our hnswlib index
index_path = "./hnswlib.index"
#We use Inner Product (dot-product) as Index. We will normalize our vectors to unit length, then is Inner Product equal to cosine similarity
import hnswlib
import numpy as np
index = hnswlib.Index(space = 'cosine', dim = embedding_size)

if os.path.exists(index_path):
    print("Loading index...")
    index.load_index(index_path)
    print('Loading index done.')
else:
    ### Create the HNSWLIB index
    print("Start creating HNSWLIB index")
    # TODO check is ef_consturction and M and appropriate for this dataset
    index.init_index(max_elements = len(corpus_embeddings), ef_construction = 400, M = 64)

    print(list(range(len(corpus_embeddings))))
    # Then we train the index to find a suitable clustering
    corpus_embeddings = np.array(corpus_embeddings).squeeze()
    print('shape', corpus_embeddings.shape)
    index.add_items(corpus_embeddings, list(range(len(corpus_embeddings))))

    print("Saving index to:", index_path)
    index.save_index(index_path)

# Controlling the recall by setting ef:
index.set_ef(50)  # ef should always be > top_k_hits



Start creating HNSWLIB index
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148]
shape (149, 768)
Saving index to: ./hnswlib.index


In [24]:
######### Search in the index ###########

print(f"Corpus loaded with {len(corpus_sentences)} sentences")


# inp_question = input("Enter your question: ")
inp_question = "Why is my dishwasher leaking?"



Corpus loaded with 149 sentences


In [28]:
start_time = time.time()
question_embedding = embeddings_model.encode(inp_question)
print('length of question embedding', len(question_embedding))

#We use hnswlib knn_query method to find the top_k_hits
corpus_ids, distances = index.knn_query(question_embedding, k=top_k_hits)
# print('corpus_ids', corpus_ids)

# We extract corpus ids and scores for the first query
hits = [{'corpus_id': id, 'score': 1 - score} for id, score in zip(corpus_ids[0], distances[0])]
hits = sorted(hits, key=lambda x: x['score'], reverse=True)
print('hits', hits)
end_time = time.time()

print("Input question:", inp_question)
print("Results (after {:.3f} seconds):".format(end_time-start_time))
print('type', type(corpus_sentences))
# print(corpus_sentences)
for hit in hits[0:top_k_hits]:
    print("\t{:.3f}\t{}".format(hit['score'], corpus_sentences.iloc[hit['corpus_id']]))

# # Approximate Nearest Neighbor (ANN) is not exact, it might miss entries with high cosine similarity
# # Here, we compute the recall of ANN compared to the exact results
# correct_hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k_hits)[0]
# correct_hits_ids = set([hit['corpus_id'] for hit in correct_hits])

# ann_corpus_ids = set([hit['corpus_id'] for hit in hits])
# if len(ann_corpus_ids) != len(correct_hits_ids):
#     print("Approximate Nearest Neighbor returned a different number of results than expected")

# recall = len(ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids)
# print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))

# if recall < 1:
#     print("Missing results:")
#     for hit in correct_hits[0:top_k_hits]:
#         if hit['corpus_id'] not in ann_corpus_ids:
#             print("\t{:.3f}\t{}".format(hit['score'], corpus_sentences[hit['corpus_id']]))
# print("\n\n========\n")


length of question embedding 768
hits [{'corpus_id': 55, 'score': 0.7006855607032776}, {'corpus_id': 20, 'score': 0.5618450045585632}, {'corpus_id': 56, 'score': 0.5229777693748474}, {'corpus_id': 40, 'score': 0.507260799407959}, {'corpus_id': 1, 'score': 0.505429208278656}]
Input question: Why is my dishwasher leaking?
Results (after 0.026 seconds):
type <class 'pandas.core.frame.DataFrame'>
	0.701	content     There are two general areas where you will co...
tokens                                                    77
Name: (Chapter_4, 190), dtype: object
	0.562	content     POOR WASH QUALITY: This is the most common co...
tokens                                                    77
Name: (Chapter_2, 81), dtype: object
	0.523	content     A slow, under-tub leak may go years without b...
tokens                                                    71
Name: (Chapter_4, 191), dtype: object
	0.507	content     NOTE: This Chapter assumes that the motor is ...
tokens                              