In [1]:
import numpy
import scipy
import unittest
import nearpy.utils.utils
from nearpy import Engine
from nearpy.distances import CosineDistance

from nearpy.hashes import RandomBinaryProjections, RandomBinaryProjectionTree, HashPermutations, HashPermutationMapper


In [2]:
def print_results(results):
    """Print a two-column table (data, distance) for a list of results.

    Every entry of ``results`` is read positionally: element 1 goes into the
    "Data" column and element 2 into the "Distance" column, the latter
    formatted with four decimal places. Element 0 is never looked at.
    """
    row_template = '  {} \t| {:.4f}'
    print('  Data \t| Distance')
    for entry in results:
        print(row_template.format(entry[1], entry[2]))


In [3]:
# Dimension of feature space
DIM = 100

# Number of data points (keep this modest: the exact search used as a
# reference computes the distance from the query to every indexed point)
POINTS = 10000

# Seed for numpy's global random generator, so the random vectors and the
# query drawn through numpy.random are identical on every Restart & Run All.
# NOTE(review): whether nearpy's random projections also follow this global
# seed depends on the installed nearpy version — confirm.
SEED = 42
numpy.random.seed(SEED)



In [4]:
print('Creating engines')

# Tree-based binary projection hash, created with the values 20 and 20.
# An earlier version of this comment said "12 projections, 20 results at
# least", but the value actually passed for the projections is 20.
# NOTE(review): argument meaning (projection count, minimum result size) is
# taken from that earlier comment — confirm against the nearpy signature.
rbpt = RandomBinaryProjectionTree('rbpt', 20, 20)

# Create engine 1 (uses the projection tree hash)
engine_rbpt = Engine(DIM, lshashes=[rbpt], distance=CosineDistance())

# Create a stand-alone binary projection hash (value 20); unlike rbp_perm
# and rbp_perm2 below it is passed to an engine directly, not added as a
# child of a permutation hash
rbp = RandomBinaryProjections('rbp1', 20)

# Create engine 2 (uses the plain binary projection hash)
engine = Engine(DIM, lshashes=[rbp], distance=CosineDistance())

# Create permutations meta-hash
permutations = HashPermutations('permut')

# Create binary hash as child hash, plus the configuration dict that is
# handed to the meta-hash together with it
# NOTE(review): the meaning of the three config keys is defined by nearpy,
# not by this notebook — see the HashPermutations documentation
rbp_perm = RandomBinaryProjections('rbp_perm', 20)
rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

# Add rbp_perm as child hash of permutations hash
permutations.add_child_hash(rbp_perm, rbp_conf)

# Create engine 3 (uses the HashPermutations meta-hash)
engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

# Create the second meta-hash, this time a HashPermutationMapper
permutations2 = HashPermutationMapper('permut2')

# Create binary hash as child hash (value 12 here, not 20)
rbp_perm2 = RandomBinaryProjections('rbp_perm2', 12)

# Add rbp_perm2 as child hash of the mapper; no config dict is passed
permutations2.add_child_hash(rbp_perm2)

# Create engine 4 (uses the HashPermutationMapper meta-hash)
engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())



Creating engines


In [5]:
print('Indexing %d random vectors of dimension %d' % (POINTS, DIM))

# Every random vector is stored in all four engines with its row index as
# the attached data; the `unitvec`-normalised copy is kept in row i of
# `matrix` for the exact search done later in the notebook.
matrix = numpy.zeros((POINTS,DIM))
all_engines = (engine, engine_rbpt, engine_perm, engine_perm2)
for i in range(POINTS):
    v = numpy.random.randn(DIM)
    matrix[i] = nearpy.utils.utils.unitvec(v)
    for target in all_engines:
        target.store_vector(v, i)

# Report how many bucket keys the storage of the first two engines holds
rbp_bucket_count = len(engine.storage.buckets['rbp1'].keys())
print('Buckets 1 = %d' % rbp_bucket_count)
rbpt_bucket_count = len(engine_rbpt.storage.buckets['rbpt'].keys())
print('Buckets 2 = %d' % rbpt_bucket_count)


Indexing 10000 random vectors of dimension 100
Buckets 1 = 9900
Buckets 2 = 9903


In [6]:
print('Building permuted index for HashPermutations')

# The vectors have been stored by now, so update the permuted index of the
# HashPermutations meta-hash (the one behind engine_perm).
# NOTE(review): presumably this has to be repeated whenever more vectors are
# stored afterwards — confirm against the nearpy documentation.
permutations.build_permuted_index()


Building permuted index for HashPermutations


In [7]:
print('Generate random data')

# Draw one random query vector of dimension DIM (standard normal entries);
# the same vector is used for every engine query and for the exact search
query = numpy.random.randn(DIM)


Generate random data


In [8]:
# Query engine 1 (RandomBinaryProjectionTree): first report how many
# candidates the engine finds for `query`, then list its neighbours
print('\nNeighbour distances with RandomBinaryProjectionTree:')
rbpt_candidates = engine_rbpt.candidate_count(query)
print('  -> Candidate count is %d' % rbpt_candidates)
results = engine_rbpt.neighbours(query)
print_results(results)



Neighbour distances with RandomBinaryProjectionTree:
  -> Candidate count is 22
  Data 	| Distance
  7445 	| 0.8079
  3521 	| 0.8104
  3615 	| 0.8259
  1325 	| 0.8306
  4111 	| 0.8528
  5106 	| 0.8880
  8363 	| 0.9416
  290 	| 0.9518
  5093 	| 0.9528
  1764 	| 0.9597


In [9]:
# Query engine 2 (plain RandomBinaryProjections): first report how many
# candidates the engine finds for `query`, then list its neighbours.
# NOTE: a count of 0 and an empty table can occur here. In the recorded run
# the 10000 indexed vectors were spread over 9900 buckets, so the bucket the
# query falls into was presumably empty.
print('\nNeighbour distances with RandomBinaryProjections:')
rbp_candidates = engine.candidate_count(query)
print('  -> Candidate count is %d' % rbp_candidates)
results = engine.neighbours(query)
print_results(results)



Neighbour distances with RandomBinaryProjections:
  -> Candidate count is 0
  Data 	| Distance


In [10]:
# Query engine 3 (HashPermutations meta-hash): first report how many
# candidates the engine finds for `query`, then list its neighbours
print('\nNeighbour distances with HashPermutations:')
perm_candidates = engine_perm.candidate_count(query)
print('  -> Candidate count is %d' % perm_candidates)
results = engine_perm.neighbours(query)
print_results(results)



Neighbour distances with HashPermutations:
  -> Candidate count is 102
  Data 	| Distance
  4295 	| 0.6480
  5840 	| 0.7074
  4214 	| 0.7408
  4475 	| 0.7563
  6193 	| 0.7636
  808 	| 0.7773
  3115 	| 0.7786
  3487 	| 0.7842
  320 	| 0.7852
  4509 	| 0.7878


In [11]:
# Query engine 4 (HashPermutationMapper meta-hash): first report how many
# candidates the engine finds for `query`, then list its neighbours
print('\nNeighbour distances with HashPermutations2:')
perm2_candidates = engine_perm2.candidate_count(query)
print('  -> Candidate count is %d' % perm2_candidates)
results = engine_perm2.neighbours(query)
print_results(results)



Neighbour distances with HashPermutations2:
  -> Candidate count is 371
  Data 	| Distance
  2988 	| 0.6716
  5091 	| 0.6899
  7837 	| 0.7055
  537 	| 0.7134
  4320 	| 0.7199
  8162 	| 0.7277
  5973 	| 0.7280
  8247 	| 0.7367
  9673 	| 0.7404
  1329 	| 0.7411


In [12]:
# Real neighbours: exact search over all indexed vectors, as a reference
# for the approximate results of the engines
print('\nReal neighbour distances:')

# Keep the normalised / reshaped query under new names. `query` itself stays
# the raw 1-D vector, so this cell can be re-run and the engine query cells
# still receive the same object when they are executed after it.
query_unit = nearpy.utils.utils.unitvec(query)
query_column = query_unit.reshape((DIM, 1))

# Distance between the query and every row of `matrix`, flattened to 1-D
dists = CosineDistance().distance(matrix, query_column)
dists = dists.reshape((-1,))

# Row indices ordered by increasing distance; row i of `matrix` belongs to
# the vector that was stored with data value i
dists_argsort = numpy.argsort(dists)

# Same positional layout that print_results reads (element 1 = data,
# element 2 = distance); the first slot is unused, hence None
results = [(None, d, dists[d]) for d in dists_argsort[:10]]
print_results(results)


Real neighbour distances:
  Data 	| Distance
  9227 	| 0.6192
  4295 	| 0.6480
  5667 	| 0.6595
  2988 	| 0.6716
  6495 	| 0.6741
  5198 	| 0.6777
  1375 	| 0.6783
  3482 	| 0.6856
  5091 	| 0.6899
  3751 	| 0.6923
