In [1]:
!pip install -q faiss-cpu faiss-gpu sentence-transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Toy LSH setup: number of random hyperplanes (one hash bit each) and the vector dimensionality.
nbits, d = 4, 2

In [9]:
import numpy as np
# One row per hyperplane normal. Shifting uniform [0, 1) samples by 0.5 centres every
# component on zero, so the normals point in all directions.
# NOTE(review): no random seed is set, so the values differ on every fresh run.
plane_norms = -0.5 + np.random.rand(nbits, d)
plane_norms

array([[-0.4614033 ,  0.34868215],
       [-0.13359095, -0.12400435],
       [ 0.22152424, -0.07139028],
       [-0.49876006, -0.19509125]])

In [10]:
# Three toy 2-d points to hash.
a, b, c = (np.asarray(point) for point in ([1, 2], [2, 1], [3, 1]))

In [12]:
# Compare shapes before taking dot products: plane_norms is (nbits, d) and a is (d,).
plane_norms.shape, a.shape

((4, 2), (2,))

In [13]:
# Project `a` onto every hyperplane normal at once: one dot product per row of plane_norms.
plane_norms.dot(a)

array([ 0.23596099, -0.38159964,  0.07874368, -0.88894255])

In [16]:
# Dot each toy point with every plane normal, giving one value per hyperplane.
# Only the sign of each value matters for the hash.
a_dot, b_dot, c_dot = (np.dot(point, plane_norms.T) for point in (a, b, c))
a_dot

array([ 0.23596099, -0.38159964,  0.07874368, -0.88894255])

In [18]:
# Record which side of each hyperplane a point falls on: True where the projection is positive.
a_dot, b_dot, c_dot = (projection > 0 for projection in (a_dot, b_dot, c_dot))
a_dot

array([ True, False,  True, False])

In [22]:
# Turn the boolean side indicators into 0/1 so each point becomes a binary hash code,
# then show the three codes one per line.
a_dot, b_dot, c_dot = (side.astype(int) for side in (a_dot, b_dot, c_dot))
print(a_dot, b_dot, c_dot, sep="\n")

[1 0 1 0]
[0 0 1 0]
[0 0 1 0]


In [24]:
# Group vector positions by hash code: vectors whose bit strings are identical share a bucket.
vecs = [a_dot, b_dot, c_dot]
buckets = {}
for i, code in enumerate(vecs):
  key = "".join(code.astype(str))  # e.g. array([1, 0, 1, 0]) -> '1010'
  buckets.setdefault(key, []).append(i)
buckets

{'1010': [0], '0010': [1, 2]}

In [25]:
import shutil
from contextlib import closing
from urllib import request

# Download the SIFT archive only when it is not already on disk, so re-running the notebook
# does not fetch it again. The payload is streamed into a ".part" file and renamed only after
# the copy completes, so an interrupted download is never mistaken for a finished archive.
import os

if not os.path.exists('sift.tar.gz'):
  with closing(request.urlopen('ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz')) as r:
    with open('sift.tar.gz.part', 'wb') as f:
      shutil.copyfileobj(r, f)
  os.replace('sift.tar.gz.part', 'sift.tar.gz')

In [26]:
import tarfile
# Unpack the archive into the working directory. The context manager closes the archive's
# file handle even if extraction fails; a bare open() call would leave it open.
# NOTE(review): the archive comes from a remote server. On Python versions that support
# extraction filters, consider tar.extractall(filter='data') to reject unsafe member paths.
with tarfile.open('sift.tar.gz', 'r:gz') as tar:
  tar.extractall()

In [27]:
def read_fvecs(fvecfile):
  """Load a .fvecs file into a 2-D float32 array with one vector per row.

  The file is read as a flat run of 4-byte words in which every record is one
  int32 dimension header followed by that many float32 components.

  Args:
    fvecfile: Path or open binary file accepted by ``np.fromfile``.

  Returns:
    A float32 array of shape (n_vectors, dim). An empty file yields an empty
    array of shape (0, 0) instead of raising ``IndexError``.

  Raises:
    ValueError: If the leading dimension is negative, the file length is not a
      whole number of records, or the records disagree on the dimension.
  """
  words = np.fromfile(fvecfile, dtype='int32')
  if words.size == 0:
    return np.empty((0, 0), dtype='float32')

  dim = int(words[0])
  if dim < 0 or words.size % (dim + 1) != 0:
    raise ValueError(
        f"malformed fvecs data: {words.size} words cannot be split into records of dimension {dim}")

  records = words.reshape(-1, dim + 1)
  # Every record repeats the dimension in its first word; a mismatch means the
  # file is corrupt or is not in fvecs format.
  if not np.all(records[:, 0] == dim):
    raise ValueError("malformed fvecs data: records disagree on the vector dimension")

  # Drop the header column. The copy makes the payload contiguous so the int32
  # words can be reinterpreted in place as float32.
  return records[:, 1:].copy().view('float32')

In [28]:
# Load the base vectors (to be indexed) and the query vectors, then confirm their shapes.
xb, xq = map(read_fvecs, ('./sift/sift_base.fvecs', './sift/sift_query.fvecs'))
print(xb.shape, xq.shape)

(1000000, 128) (10000, 128)


In [29]:
import faiss

In [31]:
# Build a FAISS LSH index over the base vectors using nbits-bit codes.
# `d` and `nbits` are rebound here, replacing the toy values used earlier in the notebook.
nbits = 4
d = xb.shape[-1]
index = faiss.IndexLSH(d, nbits)
index.add(xb)

In [32]:
# Use the first base vector as the query; the added leading axis makes it a batch of one row.
# NOTE(review): this is taken from xb, not xq — confirm that querying with an indexed
# vector is intended.
xq0 = xb[0][np.newaxis]
xq0.shape

(1, 128)

In [34]:
%%time
# Time a 10-neighbour search for the single query row. `I` holds the positions of the
# returned vectors in xb; `D` holds the matching distances reported by the index.
D, I = index.search(xq0, k=10)
I

CPU times: user 18.1 ms, sys: 0 ns, total: 18.1 ms
Wall time: 12.3 ms


array([[ 0,  2,  6, 25, 26, 43, 47, 70, 73, 74]])

In [37]:
# Index the base vectors with the returned ids to fetch the 10 retrieved vectors.
xb[I[0]].shape

(10, 128)

In [42]:
from sentence_transformers import util
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
# Cosine similarity between each retrieved vector and the query, via the sentence-transformers helper.
util.pytorch_cos_sim(xb[I[0]], xq0)

tensor([[1.0000],
        [0.9638],
        [0.9529],
        [0.1947],
        [0.1713],
        [0.2192],
        [0.2208],
        [0.8036],
        [0.8438],
        [0.8415]])

In [43]:
# The same retrieved-vs-query comparison computed with scikit-learn's cosine_similarity, as a cross-check.
cosine_similarity(xb[I[0]], xq0)

array([[1.        ],
       [0.9637785 ],
       [0.95294523],
       [0.1947029 ],
       [0.17127216],
       [0.21918732],
       [0.22081813],
       [0.8035712 ],
       [0.8437884 ],
       [0.84145665]], dtype=float32)

In [45]:
# Grow k in steps of 100 until the search returns at least one non-zero distance,
# i.e. until the results reach past the entries whose distance to the query is zero.
k = 100
D, I = index.search(xq0, k=k)
while not D.any():
  k += 100
  D, I = index.search(xq0, k=k)
k

172100

In [46]:
# Average number of vectors per bucket if the codes were spread evenly over all 2**nbits buckets.
for nbits in (2, 4, 8, 16, 24, 32):
  buckets = 2 ** nbits
  print(f"nbits=={nbits}", f"{xb.shape[0]} / {buckets} = {xb.shape[0]/buckets}", sep="\n")

nbits==2
1000000 / 4 = 250000.0
nbits==4
1000000 / 16 = 62500.0
nbits==8
1000000 / 256 = 3906.25
nbits==16
1000000 / 65536 = 15.2587890625
nbits==24
1000000 / 16777216 = 0.059604644775390625
nbits==32
1000000 / 4294967296 = 0.00023283064365386963


In [49]:
# For each code length, rebuild the index and report the mean cosine similarity between the
# query and its k retrieved vectors. `k` is whatever value an earlier cell left in scope.
for nbits in (2, 4, 8, 16, 24, 32, 64, 128):
  index = faiss.IndexLSH(d, nbits)
  index.add(xb)
  D, I = index.search(xq0, k=k)
  cos = cosine_similarity(xb[I[0]], xq0)
  print(cos.mean())

0.4149835
0.37523153
0.72134036
0.6057699
0.70601416
0.7077736
0.74506366
0.75215524


In [51]:
# Rebuild a 4-bit LSH index over the base vectors; this replaces the index left by the sweep loop.
index = faiss.IndexLSH(d, 4)
index.add(xb)

In [56]:
# Copy the codes stored inside the index into a NumPy array so they can be inspected.
arr = faiss.vector_to_array(index.codes)
arr

array([ 5, 12,  5, ..., 15, 13, 12], dtype=uint8)

In [57]:
# With a 4-bit code each vector fits in one byte, so there is one entry per indexed vector.
arr.shape

(1000000,)