In [1]:
import requests
from io import StringIO
import pandas as pd

In [2]:
%pip install faiss-cpu


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
res = requests.get('https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt')
data = pd.read_csv(StringIO(res.text), sep='\t')
data.head()

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,4.5,NEUTRAL
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,3.2,NEUTRAL
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,4.7,ENTAILMENT
3,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,3.4,NEUTRAL
4,9,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,3.7,NEUTRAL


In [4]:
sentences = data['sentence_A'].tolist()
sentence_b = data['sentence_B'].tolist()
sentences.extend(sentence_b)
len(set(sentences))

4802

since this is not a very large number, we need more sentences

In [5]:
urls = [
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]


In [6]:
for url in urls:
    res = requests.get(url)
    data = pd.read_csv(StringIO(res.text), sep='\t', header=None, on_bad_lines='skip')
    print(data.shape)
    sentences.extend(data[1].tolist())
    sentences.extend(data[2].tolist())

(723, 3)
(701, 3)
(750, 3)
(561, 3)
(750, 3)
(750, 3)
(1500, 3)


In [7]:
len(set(sentences))

14505

remove duplicates now

In [8]:
sentences = [word for word in list(set(sentences)) if type(word) is str]

In [9]:
%pip install sentence_transformers
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('bert-base-nli-mean-tokens')
sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


(14504, 768)

let us use indexflatl2, i.e the euclidian distance between all points and our query

In [10]:
import faiss
d = sentence_embeddings.shape[1]
d

768

In [11]:
index = faiss.IndexFlatL2(d)

some indexes need to be trained before loading, so we can check using:

In [12]:
index.is_trained

True

In [13]:
index.add(sentence_embeddings)
index.ntotal

14504

let number of nearest neighbors be k, and search query be xq

In [14]:
k = 4
xq = model.encode(["Someone sprints with a football"])


In [15]:
%%time
D, I = index.search(xq, k)
print(I)

[[10947 10138  8981  9991]]
CPU times: user 7.24 ms, sys: 2.1 ms, total: 9.34 ms
Wall time: 7.17 ms


In [16]:
for i in [5866, 3917, 6183, 9407]:
    print(f"{i}\t{sentences[i]}")

5866	Two boys in a field kicking a soccer ball.
3917	Brown dog running on grass.
6183	A black and white dog is swimming in a large green lake.
9407	Two women shearing a white sheep in a wooden stall.


these are clearly great matches, now let us extract the actual numerical vectors from faiss

In [17]:
import numpy as np
vecs = np.zeros((k,d))

for i, val in enumerate(I[0].tolist()):
    vecs[i, :] = index.reconstruct(val)

In [18]:
vecs.shape

(4, 768)

In [19]:
vecs[0][:100]

array([ 0.01627047,  0.22325911, -0.15037431, -0.30747238, -0.27122411,
       -0.10593214, -0.06460946,  0.04738141, -0.73349041, -0.37657666,
       -0.76762801,  0.16902865,  0.53107691,  0.51176602,  1.14415848,
       -0.0856294 , -0.67240065, -0.96637076,  0.0254542 , -0.21559823,
       -1.25656521, -0.82982141, -0.09824999, -0.21850841,  0.50610226,
        0.10527912,  0.50396878,  0.6524294 , -1.39458692,  0.6584751 ,
       -0.21525329, -0.22487469,  0.81818318,  0.08464345, -0.76141715,
       -0.28928268, -0.09825802, -0.73046142,  0.07855809, -0.84354591,
       -0.59242058,  0.77471322, -1.20920563, -0.22757955, -1.30733585,
       -0.23081516, -1.31322575,  0.0162904 , -0.97285455,  0.19308192,
        0.4742457 ,  1.18920982, -1.96741343, -0.70061046, -0.29638696,
        0.60533744,  0.62407446, -0.70340371, -0.8675428 ,  0.17673112,
       -0.19170494, -0.02951982,  0.22623539, -0.16695476, -0.80402517,
       -0.45918974,  0.69675493, -0.24928208, -1.01478684, -0.92

indexflatl2 is computationally expensive when used alone, and does not scale well. this is because it is an exhaustive search and our query vector is compared to every other vector in our index.

better approach is to partition the index using voronoi cells