<a href="https://colab.research.google.com/github/akanksha0911/ANN-Methods/blob/main/ANN_methods_using_FAISS_Annoy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [151]:
!apt install libomp-dev
!python -m pip install --upgrade faiss
import faiss

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libomp-dev is already the newest version (5.0.1-1).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [152]:
!pip install annoy



In [153]:
import annoy

In [154]:
from annoy import AnnoyIndex
import random

In [155]:
 !pip install sentence-transformers




In [156]:
import requests
from io import StringIO
import pandas as pd

In [157]:
data = pd.read_csv('/content/Untitled spreadsheet - Sheet1 (1).csv')
data.head()

Unnamed: 0,Sentences
0,Good health is central to handling stress and ...
1,"Health is a state of complete physical, mental..."
2,Mental and physical health are probably the tw...
3,Football is the world’s most popular ball game...
4,Modern football originated in Britain in the 1...


In [158]:
sentences = data['Sentences'].tolist()
sentences[:5]

['Good health is central to handling stress and living a longer, more active life',
 'Health is a state of complete physical, mental, and social well-being and not merely the absence of disease or infirmity',
 'Mental and physical health are probably the two most frequently discussed types of health.',
 'Football is the world’s most popular ball game in numbers of participants and spectators',
 'Modern football originated in Britain in the 19th century.']

In [159]:
sentences = data['Sentences'].tolist()
len(set(sentences)) 

9

In [160]:
# remove duplicates and NaN
sentences = [word for word in list(set(sentences)) if type(word) is str]

In [161]:
from sentence_transformers import SentenceTransformer
# initialize sentence transformer model
model = SentenceTransformer('bert-base-nli-mean-tokens')
# create sentence embeddings
sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape

(9, 768)

In [162]:
d = sentence_embeddings.shape[1]
d

768

In [163]:
nbits=d*2
index = faiss.IndexLSH(d,nbits)

In [164]:
index.is_trained

True

In [165]:
index.add(sentence_embeddings)

In [166]:
index.ntotal

9

In [167]:
k = 4
xq = model.encode(["health"])

In [168]:

D, I = index.search(xq, k)  # search
print(I)

[[2 5 4 8]]


In [169]:
data['Sentences'].iloc[[1,6,7,8]]

1    Health is a state of complete physical, mental...
6    he variance in rules made it difficult for pub...
7    The consequence was that northern clubs, with ...
8    n its most general form, the activities descri...
Name: Sentences, dtype: object

In [170]:
sentence_embeddings[I[0]]

array([[ 0.48714146,  0.54696697,  0.4888124 , ...,  0.5850455 ,
        -0.33022356,  0.16207601],
       [ 0.17000906, -0.15413846,  1.4202784 , ..., -0.1878344 ,
        -0.40962493, -0.65946245],
       [-0.04052224,  0.46956408,  0.38203186, ..., -0.4890077 ,
         0.23508275, -0.49799684],
       [ 0.31459934, -0.07955608,  1.086542  , ...,  0.53169256,
        -1.0007148 ,  0.24592972]], dtype=float32)

In [171]:
# and calculate the cosine similarity between each of these vectors and xq[0]
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(sentence_embeddings[I[0]], [xq[0]])



array([[0.57913065],
       [0.5669739 ],
       [0.4887184 ],
       [0.25352752]], dtype=float32)

In [172]:
D

array([[477., 488., 524., 655.]], dtype=float32)

**Exhaustive Search**

In [173]:
index = faiss.IndexFlatL2(d)
index.is_trained



True

In [174]:
index.add(sentence_embeddings)
index.ntotal


9

In [175]:
k = 4
xq = model.encode(["Someone sprints with a football"])


In [176]:
D, I = index.search(xq, k)  # search
print(I)


[[1 7 6 8]]


In [177]:
data['Sentences'].iloc[[1,7,6 ,8]]

1    Health is a state of complete physical, mental...
7    The consequence was that northern clubs, with ...
6    he variance in rules made it difficult for pub...
8    n its most general form, the activities descri...
Name: Sentences, dtype: object

**HNSW Implementation**

In [178]:
# set HNSW index parameters
M = 64  # number of connections each vertex will have
ef_search = 32  # depth of layers explored during search
ef_construction = 64  # depth of layers explored during index construction

# initialize index (d == 128)
index = faiss.IndexHNSWFlat(d, M)
# set efConstruction and efSearch parameters
index.hnsw.efConstruction = ef_construction
index.hnsw.efSearch = ef_search
# add data to index
index.add(sentence_embeddings)

# search as usual
D, I = index.search(sentence_embeddings[I[0]], k=5)

print(I)

[[1 0 4 3 6]
 [7 3 8 2 4]
 [6 1 4 3 0]
 [8 7 4 5 2]]


**Product Quantization**

In [179]:
nlist = 3  # how many cells
m = 2  # number of centroid IDs in final compressed vectors
bits = 2 # number of bits in each centroid

quantizer = faiss.IndexFlatL2(d)  # we keep the same L2 distance flat index
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, bits) 

In [180]:
index.is_trained

False

In [181]:
index.train(sentence_embeddings)

In [182]:
index.add(sentence_embeddings)


number of nearby cells to search  with nprobe.

In [183]:
index.nprobe = 10 

In [184]:
D, I = index.search(xq, k)
print(I)


[[1 8 4 3]]


In [185]:


[f'{i}: {sentences[i]}' for i in I[0]]



['1: Football is the world’s most popular ball game in numbers of participants and spectators',
 '8: n its most general form, the activities describing music as an art form or cultural activity include the creation of works of music',
 '4: Mental and physical health are probably the two most frequently discussed types of health.',
 '3: he variance in rules made it difficult for public schoolboys entering university to continue playing except with former schoolmates']

**trees and graphs**

In [186]:
number_of_trees=5


index = annoy.AnnoyIndex(d)
for i, vec in enumerate(sentence_embeddings):
  index.add_item(i, vec.tolist())
index.build(number_of_trees)


  after removing the cwd from sys.path.


True

In [187]:
k=4
I = index.get_nns_by_vector(vec.tolist(), k, search_k=-1)                                         
print(I)



[8, 7, 4, 5]


In [189]:
data['Sentences'].iloc[[8,7,4,5]]

8    n its most general form, the activities descri...
7    The consequence was that northern clubs, with ...
4    Modern football originated in Britain in the 1...
5    Each school had its own rules; some allowed li...
Name: Sentences, dtype: object