## FAISS index

In [None]:
import faiss


time: 36.6 ms (started: 2023-10-16 02:04:29 +05:30)


In [None]:
from math import sqrt

# Lower end (4*sqrt(N)) of the 4*sqrt(N)..16*sqrt(N) cluster-count range
# suggested for a FAISS IVF index, with N = number of corpus embeddings.
4 * sqrt(len(corpus_embeddings))

518.5672569686598

time: 1.19 ms (started: 2023-10-16 02:04:42 +05:30)


In [None]:
embedding_size = 768  # Size of embeddings
top_k_hits = 5  # Output k hits
# Number of clusters used for faiss. Select a value 4*sqrt(N) to 16*sqrt(N) - https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
# NOTE(review): 400 is below the 4*sqrt(N) ~= 519 computed for this corpus;
# presumably a deliberate choice - confirm.
n_clusters = 400

time: 402 µs (started: 2023-10-16 02:05:33 +05:30)


In [None]:
# Flat inner-product index handed to the IVF index as its quantizer.
quantizer = faiss.IndexFlatIP(embedding_size)
# IVF index over `embedding_size`-dimensional vectors, split into `n_clusters`
# clusters and scored by inner product.
faiss_index = faiss.IndexIVFFlat(
    quantizer, embedding_size, n_clusters, faiss.METRIC_INNER_PRODUCT
)

# Number of clusters to explore at search time.
faiss_index.nprobe = 10

time: 375 µs (started: 2023-10-16 02:10:43 +05:30)


In [None]:
import numpy as np

# Scale every corpus vector to unit length. The index scores by inner product,
# which equals cosine similarity only when both the stored vectors and the
# query are unit-length.
corpus_embeddings_lin = corpus_embeddings / np.linalg.norm(
    corpus_embeddings, axis=1, keepdims=True
)

# Train the index on the *normalized* vectors to find a suitable clustering.
# (Previously the raw `corpus_embeddings` were passed here, leaving the
# normalized copy unused.)
faiss_index.train(corpus_embeddings_lin)

# Add the same normalized vectors; ids are assigned in row order, so a returned
# id is the row position in `corpus_embeddings`.
faiss_index.add(corpus_embeddings_lin)


time: 3.84 s (started: 2023-10-16 02:10:47 +05:30)


In [None]:
# Free-text query that the next cell embeds and looks up in the FAISS index.
inp_question = "dying alone"

time: 175 µs (started: 2023-10-16 02:12:32 +05:30)


In [None]:
# Embed the query text.
question_embedding = model.encode(inp_question)

# FAISS works with inner product (dot product). When we normalize vectors to unit length, inner product is equal to cosine similarity
question_embedding = question_embedding / np.linalg.norm(question_embedding)
# FAISS searches a batch of queries, so add a leading batch axis.
question_embedding = np.expand_dims(question_embedding, axis=0)

# Search in FAISS. It returns a matrix with distances and corpus ids.
distances, corpus_ids = faiss_index.search(question_embedding, top_k_hits)

# We extract corpus ids and scores for the first query.
# FAISS pads the result with id -1 when the probed clusters contain fewer than
# `top_k_hits` vectors; drop those placeholders so they are never reported as
# hits (or used as a negative index into the corpus).
hits = [
    {"corpus_id": corpus_id, "score": score}
    for corpus_id, score in zip(corpus_ids[0], distances[0])
    if corpus_id != -1
]
hits = sorted(hits, key=lambda x: x["score"], reverse=True)

print("Input question:", inp_question)
for hit in hits[0:top_k_hits]:
    print("\t{:.3f}\t{}".format(hit["score"], hit["corpus_id"]))

Input question: dying alone
	0.479	1974
	0.475	5600
	0.460	10532
	0.444	650
	0.430	16801
time: 10.2 ms (started: 2023-10-16 02:14:21 +05:30)


In [None]:
# Show the text stored at corpus id 16801, the lowest-scoring of the five hits
# printed by the FAISS search.
print(poems[16801])

By: Vera Pavlova
 When the very last griefdeadens all our pain,I will follow you thereon the very next train,not because I lack strength to ponder the end result,but maybe you forgot to bringpills, a necktie, razor blades . .
time: 216 µs (started: 2023-10-16 02:14:16 +05:30)


## HNSW

In [None]:
import hnswlib
import os

time: 3.01 ms (started: 2023-09-12 16:54:23 +00:00)


In [None]:
embedding_size = 768  # dimensionality of the vectors stored in the index
hnsw_index_path = "./hnswlib.index"  # file the built index is saved to


# hnswlib index over `embedding_size`-dimensional vectors in the 'cosine' space.
hnsw_index = hnswlib.Index(space = 'cosine', dim = embedding_size)


time: 1.55 ms (started: 2023-09-10 13:36:41 +00:00)


In [None]:
# Loading a previously saved index from `hnsw_index_path` is currently
# disabled, so the HNSW index is rebuilt from scratch on every run.
num_vectors = len(corpus_embeddings)
hnsw_index.init_index(max_elements=num_vectors, ef_construction=400, M=64)

# Insert every embedding, labelled with its row position in
# `corpus_embeddings`, so a label returned by a query maps back to a corpus row.
hnsw_index.add_items(corpus_embeddings, list(range(num_vectors)))

# Persist the freshly built index.
print("Saving index to:", hnsw_index_path)
hnsw_index.save_index(hnsw_index_path)

# `ef` controls recall at query time; it should always be > top_k_hits.
hnsw_index.set_ef(50)

print("Corpus loaded with {} poems / embeddings".format(len(poem)))

Saving index to: ./hnswlib.index
Corpus loaded with 13716 poems / embeddings
time: 3.04 s (started: 2023-09-10 13:36:55 +00:00)


In [None]:
# Free-text query that the next cell embeds and looks up in the HNSW index.
inp_question = "being homosexual"

time: 796 µs (started: 2023-09-10 13:37:03 +00:00)


In [None]:
top_k_hits = 5
question_embedding = model.encode(inp_question)

# Ask hnswlib for the `top_k_hits` nearest neighbours of the query.
corpus_ids, distances = hnsw_index.knn_query(question_embedding, k=top_k_hits)

# Results for the first (only) query. The index was created in the 'cosine'
# space, so each distance is turned into a similarity score via 1 - distance.
hits = [
    {"corpus_id": neighbour_id, "score": 1 - cosine_distance}
    for neighbour_id, cosine_distance in zip(corpus_ids[0], distances[0])
]
# Best match first.
hits.sort(key=lambda entry: entry["score"], reverse=True)

print("Input question:", inp_question)
for hit in hits[:top_k_hits]:
    print("\t{:.3f}\t{}".format(hit["score"], hit["corpus_id"]))

Input question: being homosexual
	0.385	10761
	0.360	4497
	0.348	8778
	0.341	11450
	0.317	12093
time: 40 ms (started: 2023-09-10 13:37:06 +00:00)


In [None]:
# Show poet and poem text for corpus id 12093, the last of the five hits
# printed by the HNSW search.
print( poet[12093], poem[12093])

Bob Hicok You can’t trust lesbians. You invite them to your party and they don’t come, they’re too busy tending vaginal flowers, hating football, walking their golden and chocolate labs. X gave me a poem in which she was in love with a woman and the church but the church couldn’t accept four breasts in one bed. When I asked if our coworkers knew, she dropped her head and I said nothing for years until this morning I realized no one reads poems: my secrets and hersare safe in verse. I knew she’d have enjoyed the Beaujolais and I want to meet Dianne, Mona Lisa, Betty, Alice, the name’s been changed to protect women who can’t stand in a room holding hands because you can’t trust heterosexuals to love love, however it comes. So I recorded the party for her, for them, the mica bit away from the action to catch the feel of waves touching shore and letting go, the wash of moods across the hours of drink and yes, some grapes were thrown and I breathed the quickening revelationof a cigarette, s

In [None]:
# Compare the approximate (ANN) hits with the results of
# `util.semantic_search` over the whole corpus to see which ones ANN missed.
correct_hits = util.semantic_search(
    question_embedding, corpus_embeddings, top_k=top_k_hits
)[0]
correct_hits_ids = {exact["corpus_id"] for exact in correct_hits}

ann_corpus_ids = {approx["corpus_id"] for approx in hits}
if len(ann_corpus_ids) != len(correct_hits_ids):
    print("Approximate Nearest Neighbor returned a different number of results than expected")

# Fraction of the reference neighbours that the ANN search also returned.
shared_ids = ann_corpus_ids & correct_hits_ids
recall = len(shared_ids) / len(correct_hits_ids)
print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))

if recall < 1:
    print("Missing results:")
    # List every reference hit that the ANN result set does not contain.
    for hit in correct_hits[:top_k_hits]:
        if hit["corpus_id"] in ann_corpus_ids:
            continue
        print("\t{:.3f}\t{}".format(hit["score"], poem[hit["corpus_id"]]))
print("\n\n========\n")


Approximate Nearest Neighbor Recall@5: 100.00



time: 21.9 ms (started: 2023-09-10 13:39:18 +00:00)
