In [102]:
import shutil
from contextlib import closing
from urllib import request

import os

# Cache guard: only contact the FTP server when no local copy exists, so that
# "Restart & Run All" does not re-download the archive every time.
# Delete 'sift.tar.gz' to force a fresh download.
if not os.path.exists('sift.tar.gz'):
  # Stream into a temporary name first so an interrupted transfer never leaves
  # a truncated 'sift.tar.gz' behind. The timeout bounds each blocking socket
  # operation so a stalled server cannot hang the kernel indefinitely.
  with closing(request.urlopen('ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz', timeout=60)) as r:
    with open('sift.tar.gz.part', 'wb') as f:
      shutil.copyfileobj(r, f)
  # Publish the file under its final name only once it is complete.
  os.replace('sift.tar.gz.part', 'sift.tar.gz')

In [103]:
import tarfile
# Open the archive in a context manager so the file handle is closed even if
# extraction fails part-way through.
with tarfile.open('sift.tar.gz', 'r:gz') as tar:
  # The archive was downloaded from the network: where the running Python
  # provides extraction filters, use the 'data' filter, which rejects members
  # that would escape the destination directory. Older interpreters without
  # filter support fall back to the previous unfiltered behaviour.
  if hasattr(tarfile, 'data_filter'):
    tar.extractall(filter='data')
  else:
    tar.extractall()

In [104]:
import numpy as np

# Read the whole base file as a flat array of 32-bit integers; the first word
# is taken as the vector dimension d and printed.
a = np.fromfile('./sift/sift_base.fvecs', dtype='int32')
d = a[0]
print(d)

128


In [105]:
# Arrange the words as rows of d + 1 entries and drop the leading column
# (presumably the per-vector dimension header) to inspect the matrix shape.
a.reshape(-1, d + 1)[:, 1:].shape

(1000000, 128)

In [106]:
def read_fvecs(fvecfile):
  """Load a .fvecs file into a 2-D float32 array.

  The file is read as a flat sequence of 32-bit words. The first word is taken
  as the dimension d, the words are arranged as rows of d + 1 entries, the
  leading word of every row is dropped and the remaining words are
  reinterpreted as float32.

  Args:
    fvecfile: path (or open file object) accepted by np.fromfile.

  Returns:
    A new float32 array of shape (n_vectors, d).

  Raises:
    ValueError: if the file is empty, the leading dimension word is not
      positive, the file length is not a whole number of (d + 1)-word rows,
      or the rows do not all start with the same dimension word.
  """
  a = np.fromfile(fvecfile, dtype='int32')
  if a.size == 0:
    raise ValueError(f'{fvecfile!r}: file is empty')
  # Convert to a plain Python int so later arithmetic cannot wrap in int32.
  d = int(a[0])
  if d <= 0:
    raise ValueError(f'{fvecfile!r}: invalid vector dimension {d}')
  if a.size % (d + 1) != 0:
    raise ValueError(
        f'{fvecfile!r}: {a.size} words is not a multiple of d + 1 = {d + 1}')
  rows = a.reshape(-1, d + 1)
  # Every row must begin with the same dimension word; otherwise the file is
  # corrupt (or not fixed-dimension) and the reshape above is meaningless.
  if not (rows[:, 0] == d).all():
    raise ValueError(f'{fvecfile!r}: inconsistent per-vector dimension headers')
  # copy() yields a compact, C-contiguous array that no longer carries the
  # header column; view() then reinterprets the same bytes as float32.
  return rows[:, 1:].copy().view('float32')

In [107]:
# Load the database vectors (xb) and the query vectors (xq) with read_fvecs
# and print both shapes as a sanity check.
xb = read_fvecs('./sift/sift_base.fvecs')
xq = read_fvecs('./sift/sift_query.fvecs')
print(xb.shape, xq.shape)

(1000000, 128) (10000, 128)


In [108]:
# Keep only the first query vector, re-wrapped as a one-row 2-D batch.
# Note: this rebinds xq, so the full query matrix is no longer reachable under
# that name for the rest of the notebook.
xq = xq[0][np.newaxis, :]
xq.shape

(1, 128)

In [109]:
# Display the single query vector used by all searches below.
xq

array([[  1.,   3.,  11., 110.,  62.,  22.,   4.,   0.,  43.,  21.,  22.,
         18.,   6.,  28.,  64.,   9.,  11.,   1.,   0.,   0.,   1.,  40.,
        101.,  21.,  20.,   2.,   4.,   2.,   2.,   9.,  18.,  35.,   1.,
          1.,   7.,  25., 108., 116.,  63.,   2.,   0.,   0.,  11.,  74.,
         40., 101., 116.,   3.,  33.,   1.,   1.,  11.,  14.,  18., 116.,
        116.,  68.,  12.,   5.,   4.,   2.,   2.,   9., 102.,  17.,   3.,
         10.,  18.,   8.,  15.,  67.,  63.,  15.,   0.,  14., 116.,  80.,
          0.,   2.,  22.,  96.,  37.,  28.,  88.,  43.,   1.,   4.,  18.,
        116.,  51.,   5.,  11.,  32.,  14.,   8.,  23.,  44.,  17.,  12.,
          9.,   0.,   0.,  19.,  37.,  85.,  18.,  16., 104.,  22.,   6.,
          2.,  26.,  12.,  58.,  67.,  82.,  25.,  12.,   2.,   2.,  25.,
         18.,   8.,   2.,  19.,  42.,  48.,  11.]], dtype=float32)

In [110]:
%%time
import faiss
d = 128
k = 10
index = faiss.IndexFlatIP(d)
index.add(xb)
index.ntotal

CPU times: user 157 ms, sys: 277 ms, total: 434 ms
Wall time: 421 ms


1000000

In [111]:
%%time
# Query the current index for the k nearest entries to xq; D receives the
# scores and I the corresponding row indices into the indexed set.
D, I = index.search(xq, k)

CPU times: user 47 ms, sys: 0 ns, total: 47 ms
Wall time: 46.2 ms


In [112]:
# Code length for the LSH index: four bits per input dimension.
nbit = 4 * d
# Rebinds `index`; the previously built index is no longer reachable by name.
index = faiss.IndexLSH(d, nbit)

In [113]:
%%time
# Insert all base vectors into the current (LSH) index; timed to show the
# indexing cost.
index.add(xb)

CPU times: user 3.35 s, sys: 1.3 s, total: 4.65 s
Wall time: 5.03 s


In [114]:
%%time
# Retrieve the 5 best matches for xq from the current index and display their
# row indices. Note the literal 5 here, not the k defined earlier.
D, I = index.search(xq, 5)
I

CPU times: user 9.85 ms, sys: 0 ns, total: 9.85 ms
Wall time: 19.8 ms


array([[435345, 931632, 708177, 813701, 934876]])

In [116]:
%%time
M = 64
ef_search = 32
ef_construction = 64
index = faiss.IndexHNSWFlat(d, M)
index.hnsw.efConstruction = ef_construction
index.hnsw.efSearch = ef_search
index.add(xb)

CPU times: user 9min 17s, sys: 1.56 s, total: 9min 18s
Wall time: 9min 29s


In [117]:
%%time
# Retrieve the 5 best matches for xq from the current (HNSW) index and display
# their row indices.
D, I = index.search(xq, 5)
I

CPU times: user 1.65 ms, sys: 0 ns, total: 1.65 ms
Wall time: 4.07 ms


array([[932085, 934876, 561813, 708177, 706771]])

In [124]:
%%time
nlist = 128
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

CPU times: user 564 µs, sys: 0 ns, total: 564 µs
Wall time: 489 µs


In [125]:
%%time
# Train the current index on the base vectors, then insert those same vectors.
# train() is called before add() here.
index.train(xb)
index.add(xb)

CPU times: user 1.61 s, sys: 298 ms, total: 1.9 s
Wall time: 1.89 s


In [126]:
%%time
# Retrieve the 5 best matches for xq from the current (IVF) index and display
# their row indices.
# NOTE(review): nprobe is never set in this notebook, so the search runs with
# the library default — confirm that is the intended recall/speed trade-off.
D, I = index.search(xq, 5)
I

CPU times: user 2.51 ms, sys: 0 ns, total: 2.51 ms
Wall time: 1.37 ms


array([[932085, 708177, 706771, 701258, 455537]])