This benchmarking notebook is adapted from the following NMSLIB example notebook:

https://github.com/nmslib/nmslib/blob/master/python_bindings/notebooks/search_vector_dense_optim.ipynb

In [1]:
from simtool.indexing import Indexer
import nmslib
import numpy as np
import sys 
import nmslib 
import time 
import math 
from sklearn.neighbors import NearestNeighbors

In [2]:
# Project helper from simtool.indexing; used below to load the stored
# train/test embeddings.
indexer = Indexer()

In [3]:

# Root directory holding the pre-computed embeddings; change this one constant
# to point the benchmark at another dataset.
# NOTE(review): this is an absolute, user-specific path -- consider making it
# configurable (environment variable or a path relative to the project).
EMBEDDINGS_DIR = '/s/mlsc/abake116/geodata/embeddings'

# Load the vectors that get indexed (train) and the vectors used as queries (test).
# Each call returns an (embeddings, paths) pair.
# NOTE(review): _gather_embeddings is a private Indexer method -- prefer a
# public loader if the project offers one.
train_embeddings, train_paths = indexer._gather_embeddings(EMBEDDINGS_DIR + '/train')
test_embeddings, test_paths = indexer._gather_embeddings(EMBEDDINGS_DIR + '/test')

# No index is built in this cell: `index` is created and built (with timing)
# further down.  An extra index built here was rebound before ever being
# queried, so it only added indexing time and memory.

In [5]:
# Index-construction settings.
num_threads = 4

# These two are the most important ones.
M = 15
efC = 100

# NOTE(review): index_time_params is assigned again in the "Create an index"
# cell right before createIndex() is called, so the 'post' and
# 'skip_optimized_index' entries set here never reach the timed index build.
index_time_params = {
    'M': M,
    'indexThreadQty': num_threads,
    'efConstruction': efC,
    'post' : 0,
    'skip_optimized_index' : 1,  # using non-optimized index!
}

In [6]:
# How many nearest neighbours are requested per query, both from the nmslib
# index and from the brute-force baseline.
K = 50

In [7]:
# nmslib space (distance) name.  It has to correspond to the metric used by
# the brute-force search, otherwise the two result sets are not comparable.
space_name = 'cosinesimil'

In [8]:

# Initialize an HNSW index over `space_name` and add the training vectors.
# The batch-add call stays the last expression of the cell so that its return
# value is displayed (presumably the number of points added -- confirm).
index = nmslib.init(space=space_name, method='hnsw')
index.addDataPointBatch(train_embeddings)

28384

In [9]:
# Build the index and report how long the build took.
# NOTE(review): index_time_params is rebound here, replacing the dict from the
# "Set index parameters" cell (which also carried 'post' and
# 'skip_optimized_index'); only the three keys below reach createIndex().
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# time is adjusted during the measurement.
start = time.perf_counter()
index.createIndex(index_time_params)
end = time.perf_counter()
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % (end-start))

Index-time parameters {'M': 15, 'indexThreadQty': 4, 'efConstruction': 100}
Indexing time = 1.495295


In [10]:

# Query-time parameters: applied to the index after it has been built.
efS = 100
query_time_params = dict([('efSearch', efS)])
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 100}


In [11]:
# Run the approximate kNN search for every test vector and time the whole batch.
query_qty = test_embeddings.shape[0]

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# time is adjusted during the measurement.
start = time.perf_counter()
nbrs = index.knnQueryBatch(test_embeddings, k = K, num_threads = num_threads)
end = time.perf_counter()

elapsed = end - start
# The batch is processed with num_threads threads, so the wall-clock time per
# query is also reported multiplied by the thread count.
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
      (elapsed, elapsed / query_qty, num_threads * elapsed / query_qty))

kNN time total=0.364536 (sec), per query=0.000036 (sec), per query adjusted for thread number=0.000144 (sec)


In [12]:
# Compute the gold-standard (exact) neighbours with scikit-learn's brute-force
# search, using the same K and a cosine metric, and time both phases.
print('Computing gold-standard data')

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# time is adjusted during the measurement.
start = time.perf_counter()
sindx = NearestNeighbors(n_neighbors=K, metric='cosine', algorithm='brute').fit(train_embeddings)
end = time.perf_counter()

print('Brute-force preparation time %f' % (end - start))

# gs is indexed as gs[1][i] by the recall computation, i.e. its second element
# holds the neighbour indices for each query.
start = time.perf_counter()
gs = sindx.kneighbors(test_embeddings)
end = time.perf_counter()

elapsed = end - start
print('brute-force kNN time total=%f (sec), per query=%f (sec)' %
      (elapsed, elapsed / query_qty) )

Computing gold-standard data
Brute-force preparation time 0.010092
brute-force kNN time total=8.708914 (sec), per query=0.000860 (sec)


In [13]:
# Recall: for each query, the fraction of the brute-force neighbours that the
# approximate search also returned, averaged over all queries.
exact_ids = gs[1]
recall_sum = 0.0
for q in range(query_qty):
    truth = set(exact_ids[q])
    found = set(nbrs[q][0])
    recall_sum += float(len(truth & found)) / len(truth)
recall = recall_sum / query_qty
print('kNN recall %f' % recall)

kNN recall 0.999103
