In [1]:
import time

import numpy
import scipy
import unittest
import nearpy.utils.utils
from nearpy import Engine
from nearpy.distances import EuclideanDistance

from nearpy.hashes import RandomBinaryProjections, RandomBinaryProjectionsBias, RandomBinaryProjectionTree, HashPermutations, HashPermutationMapper


In [2]:
def print_results(results):
    """Print neighbour results as a two-column ``Data | Distance`` table.

    Every entry of ``results`` is read positionally: element 1 goes into the
    ``Data`` column and element 2 is rendered with six decimals in the
    ``Distance`` column. Element 0 is not used.
    """
    print('  Data \t| Distance')
    row_template = '  {}  \t| {:.6f}'
    for entry in results:
        print(row_template.format(entry[1], entry[2]))


In [3]:
# Dimension of the feature space (length of every stored vector and of the query)
DIM = 100

# Number of data points to index. Keep this moderate: the exact (brute-force)
# search used as the baseline at the end loops over every single point.
POINTS = 100000



In [4]:
print('Creating engines')


# Create engine 1: a projection-tree hash searched with Euclidean distance.
# The hash is built with the values 20 and 20 - presumably 20 projections and
# a minimum result size of 20 (TODO confirm against the
# RandomBinaryProjectionTree signature). The earlier note of "12 projections"
# did not match the arguments actually passed.
rbpt = RandomBinaryProjectionTree('rbpt', 20, 20)
engine_rbpt = Engine(DIM, lshashes=[rbpt], distance=EuclideanDistance())


Creating engines
*** engine init done ***


In [5]:
# Create engine 2: a single RandomBinaryProjectionsBias hash with Euclidean distance.
# NOTE(review): presumably 5 is the projection count and 1 the bias-related
# parameter - confirm against the RandomBinaryProjectionsBias signature.
# Unlike the other engines, this one is bound to the plain name `engine`.
rbp1 = RandomBinaryProjectionsBias('rbp1', 5,1)
engine = Engine(DIM, lshashes=[rbp1], distance=EuclideanDistance())


*** engine init done ***


In [6]:
# Create engine 3: a HashPermutations meta-hash wrapping one binary-projection hash.
# Create permutations meta-hash
permutations = HashPermutations('permut')

# Create the binary hash that will act as the child hash
# (20 is presumably the projection count - TODO confirm)
rbp_perm = RandomBinaryProjections('rbp_perm', 20)
# Per-child configuration handed to the meta-hash together with the child.
# NOTE(review): the meaning of each key is defined by HashPermutations, not
# visible here - verify against the library before tuning these values.
rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

# Add rbp as child hash of permutations hash
permutations.add_child_hash(rbp_perm, rbp_conf)

# The engine only sees the meta-hash; the child hash is reached through it.
engine_perm = Engine(DIM, lshashes=[permutations], distance=EuclideanDistance())


*** engine init done ***


In [7]:
# Create engine 4: a HashPermutationMapper meta-hash wrapping one binary-projection hash.
# Create permutations meta-hash
permutations2 = HashPermutationMapper('permut2')

# Create the binary hash that will act as the child hash
# (12 is presumably the projection count - TODO confirm)
rbp_perm2 = RandomBinaryProjections('rbp_perm2', 12)

# Add rbp as child hash of permutations hash.
# Unlike engine 3, no configuration dict is passed alongside the child.
permutations2.add_child_hash(rbp_perm2)

engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=EuclideanDistance())



*** engine init done ***


In [8]:
print('Indexing %d random vectors of dimension %d' % (POINTS, DIM))

# Index POINTS random vectors in all four engines. Every vector is also kept
# as row i of `matrix` so the exact search can later scan the same data, and
# the row index i is passed to each engine as the data attached to the vector.
# NOTE(review): numpy.random is not seeded in the visible cells, so the data
# (and every result below) differs from run to run.
# NOTE(review): re-running this cell calls store_vector again on the same
# engines - presumably that adds a second batch of vectors on top of the
# first; recreate the engines before re-running.
matrix = numpy.zeros((POINTS,DIM))
for i in range(POINTS):
    v = numpy.random.randn(DIM)
    matrix[i, :] = v
    engine.store_vector(v, i)
    engine_rbpt.store_vector(v, i)
    engine_perm.store_vector(v, i)
    engine_perm2.store_vector(v, i)

# Report the number of bucket keys held under each hash name for engines 2
# and 1 respectively (the "1"/"2" in the labels do not follow the engine
# numbering used in the comments above).
print('Buckets 1 = %d' % len(engine.storage.buckets['rbp1'].keys()))
print('Buckets 2 = %d' % len(engine_rbpt.storage.buckets['rbpt'].keys()))


Indexing 100000 random vectors of dimension 100
Buckets 1 = 32
Buckets 2 = 91048


In [9]:
print('Building permuted index for HashPermutations')

# Build the permuted index of the HashPermutations meta-hash (engine 3) now
# that the vectors have been stored.
# NOTE(review): presumably this has to be repeated whenever more vectors are
# stored afterwards - confirm against the HashPermutations documentation.
permutations.build_permuted_index()


Building permuted index for HashPermutations


In [10]:
print('Generate random data')

# Draw one random query vector of length DIM from the standard normal
# distribution; the same vector is used for every engine and for the exact
# search so the results are comparable.
query = numpy.random.randn(DIM)


Generate random data


In [11]:
# Query engine 1 (engine_rbpt, RandomBinaryProjectionTree) with the random query vector
print('\nNeighbour distances with RandomBinaryProjectionTree:')
print('  -> Candidate count is %d' % engine_rbpt.candidate_count(query))

# Wall-clock timing of the neighbour lookup only (the candidate count above
# is computed outside the timed region).
t0=time.time()
results = engine_rbpt.neighbours(query)
t1 = time.time()
print('Query took %f seconds' % (t1-t0))

print_results(results)



Neighbour distances with RandomBinaryProjectionTree:
  -> Candidate count is 26
Query took 0.001529 seconds
  Data 	| Distance
  13854  	| 13.006561
  13955  	| 13.019674
  44488  	| 13.374769
  88062  	| 13.450678
  87574  	| 13.642172
  50347  	| 13.791151
  29583  	| 13.811685
  80151  	| 13.879819
  47402  	| 14.012909
  49383  	| 14.095808


In [12]:
# Query engine 2 (`engine`) with the random query vector.
# NOTE(review): the printed label says "RandomBinaryProjections", but this
# engine was built from RandomBinaryProjectionsBias - confirm which name the
# output should show.
print('\nNeighbour distances with RandomBinaryProjections:')
print('  -> Candidate count is %d' % engine.candidate_count(query))

# Wall-clock timing of the neighbour lookup only (the candidate count above
# is computed outside the timed region).
t0=time.time()
results = engine.neighbours(query)
t1 = time.time()
print('Query took %f seconds' % (t1-t0))

print_results(results)



Neighbour distances with RandomBinaryProjections:
  -> Candidate count is 3838
Query took 0.039614 seconds
  Data 	| Distance
  7830  	| 11.414758
  26368  	| 11.590515
  95468  	| 11.661299
  73457  	| 11.670204
  45173  	| 11.679428
  75149  	| 11.794335
  67002  	| 11.794519
  3588  	| 11.923831
  92726  	| 12.002960
  36262  	| 12.026924


In [13]:
# Query engine 3 (engine_perm, HashPermutations) with the random query vector
print('\nNeighbour distances with HashPermutations:')
print('  -> Candidate count is %d' % engine_perm.candidate_count(query))

# Wall-clock timing of the neighbour lookup only (the candidate count above
# is computed outside the timed region).
t0=time.time()
results = engine_perm.neighbours(query)
t1 = time.time()
print('Query took %f seconds' % (t1-t0))

print_results(results)



Neighbour distances with HashPermutations:
  -> Candidate count is 114
Query took 0.003474 seconds
  Data 	| Distance
  92545  	| 11.751412
  87321  	| 11.848985
  51618  	| 12.143929
  26920  	| 12.294305
  39463  	| 12.464323
  79005  	| 12.477592
  85569  	| 12.493771
  3232  	| 12.529598
  60619  	| 12.546336
  37061  	| 12.554748


In [14]:
# Query engine 4 (engine_perm2) with the random query vector.
# NOTE(review): the label "HashPermutations2" refers to the engine built from
# HashPermutationMapper, not to a second HashPermutations instance.
print('\nNeighbour distances with HashPermutations2:')
print('  -> Candidate count is %d' % engine_perm2.candidate_count(query))

# Wall-clock timing of the neighbour lookup only (the candidate count above
# is computed outside the timed region).
t0=time.time()
results = engine_perm2.neighbours(query)
t1 = time.time()
print('Query took %f seconds' % (t1-t0))

print_results(results)



Neighbour distances with HashPermutations2:
  -> Candidate count is 2847
Query took 0.023810 seconds
  Data 	| Distance
  45271  	| 10.822296
  63441  	| 11.462094
  38317  	| 11.509614
  72292  	| 11.547821
  63882  	| 11.614826
  97137  	| 11.688304
  24577  	| 11.751268
  21204  	| 11.791053
  67002  	| 11.794519
  50328  	| 11.810473


In [15]:
# Real neighbours: exact (brute-force) baseline to compare the engines against
print('\nReal neighbour distances:')


# One distance object is enough for the whole scan; building a new
# EuclideanDistance() for each of the POINTS rows was loop-invariant work.
exact_distance = EuclideanDistance()

dists = numpy.zeros(POINTS)
t0 = time.time()
# The loop variable is `idx`, not `iter`: as a cell-level name it stays in the
# kernel namespace afterwards, and `iter` would shadow the builtin for every
# cell executed later.
for idx in range(POINTS):
    dists[idx] = exact_distance.distance(matrix[idx], query)
# `dists` is already one-dimensional, so it can be sorted directly.
dists_argsort = numpy.argsort(dists)
t1 = time.time()
print('Query took %f seconds' % (t1-t0))

# print_results reads position 1 (data) and position 2 (distance) of each
# entry; position 0 is unused, hence None. The data shown is the row index.
results = [(None, d, dists[d]) for d in dists_argsort[:10]]
print_results(results)


Real neighbour distances:
Query took 0.678506 seconds
  Data 	| Distance
  45271  	| 10.822296
  10193  	| 11.026911
  44906  	| 11.182873
  82730  	| 11.363449
  43733  	| 11.366635
  8831  	| 11.393744
  97447  	| 11.402475
  95583  	| 11.409419
  7830  	| 11.414758
  63441  	| 11.462094
