In [1]:
from diskann import *

In [2]:
import numpy as np
import os
import ftplib
import tarfile

## Sample Data
- The SIFTSMALL dataset (ftp://ftp.irisa.fr/local/texmex/corpus/siftsmall.tar.gz) contains 10,000 base vectors, each with 128 dimensions.
- These vectors are used to build the index.
- The same vectors are then used as the search queries, so each vector is expected to appear among its own nearest neighbours.

In [3]:
# Download the SIFTSMALL archive over anonymous FTP.
# `ftplib` and `os` come from the import cell at the top of the notebook.
FILENAME = "siftsmall.tar.gz"
# Derive the destination from FILENAME so the two cannot drift apart.
ARCHIVE_PATH = os.path.join("/tmp", FILENAME)

ftp_url = "ftp.irisa.fr"
# Skip the transfer when a previous run already fetched the archive, so that
# "Restart & Run All" does not hit the network every time.
if not os.path.exists(ARCHIVE_PATH):
    # Write to a side file first: an interrupted transfer must not leave a
    # truncated archive at ARCHIVE_PATH that later runs would treat as valid.
    partial_path = ARCHIVE_PATH + ".part"
    with ftplib.FTP(ftp_url) as ftp:
        ftp.login()  # no credentials supplied
        ftp.cwd("local/texmex/corpus")
        with open(partial_path, 'wb') as f:
            ftp.retrbinary('RETR ' + FILENAME, f.write)
    # Only publish the archive once it has been fully written and closed.
    os.replace(partial_path, ARCHIVE_PATH)


In [4]:
# Unpack the downloaded archive into /tmp (the listing cell below shows the
# files landing in /tmp/siftsmall).
# The context manager closes the archive even if extraction fails part-way.
with tarfile.open('/tmp/siftsmall.tar.gz') as archive:
    if hasattr(tarfile, "data_filter"):
        # The archive comes from the network: where the interpreter supports
        # extraction filters, refuse members that would escape the target
        # directory or create special files.
        archive.extractall('/tmp', filter="data")
    else:
        # Older interpreters have no `filter` argument; fall back to the
        # original unfiltered extraction.
        archive.extractall('/tmp')

In [5]:
def fvecs_read(filename, c_contiguous=True):
    """Load an ``.fvecs`` file into a 2-D float32 array.

    The file is read as a flat float32 buffer and reshaped into rows of
    ``1 + dim`` values. The first 4 bytes of each row are reinterpreted as an
    int32 holding that row's dimension; this header column is checked and then
    stripped from the returned array.

    Args:
        filename: Path of the file to read.
        c_contiguous: When true (the default) return a contiguous copy;
            otherwise return a strided view into the buffer read from disk.

    Returns:
        A float32 array of shape ``(num_vectors, dim)``. An empty file yields
        an empty float32 array of shape ``(0, 0)``.

    Raises:
        AssertionError: If the first header value is not positive.
        IOError: If the rows do not all declare the same dimension.
        ValueError: Raised by the reshape when the number of values in the
            file is not a multiple of ``1 + dim``.
    """
    fv = np.fromfile(filename, dtype=np.float32)
    if fv.size == 0:
        # Keep the dtype consistent with the non-empty path (float32).
        return np.zeros((0, 0), dtype=np.float32)
    # The leading value of the file is the dimension, stored as int32 bits.
    dim = fv.view(np.int32)[0]
    assert dim > 0
    fv = fv.reshape(-1, 1 + dim)
    # Vectorised check; the builtin all() would iterate row by row in Python.
    headers = fv.view(np.int32)[:, 0]
    if not np.all(headers == dim):
        raise IOError("Non-uniform vector sizes in " + filename)
    # Drop the per-row dimension header.
    fv = fv[:, 1:]
    if c_contiguous:
        fv = fv.copy()
    return fv

In [6]:
# List /tmp/siftsmall to confirm which dataset files are available after extraction.
! ls /tmp/siftsmall

siftsmall_base.fvecs        siftsmall_learn.fvecs
siftsmall_groundtruth.ivecs siftsmall_query.fvecs


In [7]:
# Load the base vectors that will be both indexed and used as queries.
path = "/tmp/siftsmall"
vecs_file = os.path.join(path, "siftsmall_base.fvecs")
np_sift = fvecs_read(vecs_file)

# Fail fast on a truncated or wrong download: the dataset is described above
# as 10,000 vectors of 128 dimensions.
assert np_sift.shape == (10000, 128), "unexpected shape {}".format(np_sift.shape)

In [9]:
from diskann import IndexBuildParams, IndexSearchParams, DiskANN

# NOTE(review): the .dylib suffix looks macOS-specific; other platforms
# presumably need a different library file here — confirm before reuse.
shared_lib_path = "../../build/lib/pylib/libpydisk_index.dylib"
# Prefix for the index files; they are written next to each other under
# ./DiskANN_data (see the build log below).
index_path = "./DiskANN_data/siftsmall"

# Make sure the output directory exists before the build tries to write there.
os.makedirs(os.path.dirname(index_path), exist_ok=True)

# np_sift is the numpy array containing the input vectors
diskAnn = DiskANN(shared_lib_path)
idx_bld_params = IndexBuildParams(metric="l2",
                                  graph_degree=32,
                                  search_list_size=50,
                                  max_mem_build=1.0)

# Last expression of the cell: its return value is displayed as the output.
diskAnn.build_disk_index(index_path, np_sift, idx_bld_params)

Starting index build: R=32 L=50 Query RAM budget: 3.22123e+08 Indexing ram budget: 1 T: 32
Compressing 128-dimensional data into 128 bytes per vector.
Opened: /var/folders/hd/8ct1rg6n3w71kzr3lkhz1ffh0000gn/T/tmpnx6rqi70, size: 5120008, cache_size: 5120008
Training data loaded of size 10000
 Stat(./DiskANN_data/siftsmall_pq_pivots.bin) returned: 0
Reading bin file ./DiskANN_data/siftsmall_pq_pivots.bin ...
Metadata: #pts = 256, #dims = 128...
PQ pivot file exists. Not generating again
Opened: /var/folders/hd/8ct1rg6n3w71kzr3lkhz1ffh0000gn/T/tmpnx6rqi70, size: 5120008, cache_size: 5120008
 Stat(./DiskANN_data/siftsmall_pq_pivots.bin) returned: 0
Reading bin file ./DiskANN_data/siftsmall_pq_pivots.bin_centroid.bin ...
Metadata: #pts = 128, #dims = 1...
Reading bin file ./DiskANN_data/siftsmall_pq_pivots.bin_rearrangement_perm.bin ...
Metadata: #pts = 128, #dims = 1...
Reading bin file ./DiskANN_data/siftsmall_pq_pivots.bin_chunk_offsets.bin ...
Metadata: #pts = 129, #dims = 1...
Reading b

1

In [10]:
# How many neighbours to request for every query vector.
num_neighbours = 10

# Search-time settings handed to the index.
idx_srch_params = IndexSearchParams(
    num_nodes_to_cache=100000,
    num_threads=32,
    beam_width=4,
    search_list_size=60,
)

# Query the index with the very same vectors that were used to build it.
query_res = diskAnn.search_disk_index(
    np_sift,
    num_neighbours,
    idx_srch_params,
)

Reading bin file /var/folders/hd/8ct1rg6n3w71kzr3lkhz1ffh0000gn/T/tmp69ckl4mh ...Metadata: #pts = 10000, #dims = 128, aligned_dim = 128...allocating aligned memory, 5120000 bytes...done. Copying data... done.
********* Loaded query binary file...
Using AVX2 functions for dist_cmp and dist_cmp_float
Reading bin file ./DiskANN_data/siftsmall_pq_compressed.bin ...
Metadata: #pts = 10000, #dims = 128...
Reading bin file ./DiskANN_data/siftsmall_pq_pivots.bin ...
Metadata: #pts = 256, #dims = 128...
 Stat(./DiskANN_data/siftsmall_pq_pivots.bin_chunk_offsets.bin) returned: 0
Reading bin file ./DiskANN_data/siftsmall_pq_pivots.bin_rearrangement_perm.bin ...
Metadata: #pts = 128, #dims = 1...
Reading bin file ./DiskANN_data/siftsmall_pq_pivots.bin_chunk_offsets.bin ...
Metadata: #pts = 129, #dims = 1...
PQ data has 128 bytes per point.
Reading bin file ./DiskANN_data/siftsmall_pq_pivots.bin_centroid.bin ...
Metadata: #pts = 128, #dims = 1...
PQ Pivots: #ctrs: 256, #dims: 128, #chunks: 128
Load

Opened file : ./DiskANN_data/siftsmall_disk.index


 Stat(./DiskANN_data/siftsmall_disk.index_medoids.bin) returned: -1
Loading centroid data from medoids vector data of 1 medoid(s)
 Stat(./DiskANN_data/siftsmall_disk.index_max_base_norm.bin) returned: -1
done..
*********** Loaded Flash Index...
Caching 100000 BFS nodes around medoid(s)
 Stat(./DiskANN_data/siftsmall_sample_data.bin) returned: 0
Reading bin file ./DiskANN_data/siftsmall_sample_data.bin ...Metadata: #pts = 10000, #dims = 128, aligned_dim = 128...allocating aligned memory, 5120000 bytes...done. Copying data... done.
Loading the cache list into memory....done.
************ Loaded query cache from sample queries
Done searching. Now saving results 
Writing bin: /tmp/query_res_60_idx_uint32.bin
bin: #pts = 10000, #dims = 10, size = 400008B
Finished writing bin.
Writing bin: /tmp/query_res_60_dists_float.bin
bin: #pts = 10000, #dims = 10, size = 400008B
Finished writing bin.
Clearing scratch


In [11]:
# Self-recall check: query number q is the q-th indexed vector, so its own id
# should show up among the neighbours returned for it.
num_queries = len(np_sift)

# query_res is flat: each query owns a run of `num_neighbours` consecutive
# entries, and every entry is a tuple consisting of (id, dist).
ctr = sum(
    1
    for query_id in range(num_queries)
    if query_id not in [
        hit[0]
        for hit in query_res[query_id * num_neighbours:(query_id + 1) * num_neighbours]
    ]
)

recall = 1 - ctr / num_queries
print("Number of unmatched vectors={}, recall={}".format(ctr, recall))

Number of unmatched vectors=0, recall=1.0
