### FLANN - Fast Library for Approximate Nearest Neighbors

#### For more information, see e.g. https://www.cs.ubc.ca/research/flann/

#### Manual: http://www.cs.ubc.ca/research/flann/uploads/FLANN/flann_manual-1.8.4.pdf

#### The C++ version has an LSH implementation too, but this seems to be missing from the Python bindings.

In [1]:
# The pip install only works with python2!
#!pip install pyflann

In [2]:
from pyflann import *
import numpy as np
import time

In [3]:
# Toy reference set: one point per row, five coordinates per point.
# The leading `1.` makes NumPy store the whole array as floats.
dataset = np.array([
    [1., 1, 1, 2, 3],
    [10, 10, 10, 3, 2],
    [100, 100, 2, 30, 1],
    [40, 10, 22, 30, 9],
])

# Report how many points there are and their dimensionality.
n_points, n_dims = dataset.shape
print(n_points, n_dims, '-dimensional points')

(4, 5, '-dimensional points')


In [4]:
# Two query points (one per row) with the same five coordinates as the dataset rows.
queryset = np.array([
    [1., 1, 1, 1, 1],
    [90, 90, 10, 10, 1],
])

# Expect (number of queries, number of dimensions).
print(queryset.shape)

(2, 5)


In [5]:
# For every row of `queryset`, look up its 3 nearest rows of `dataset`
# using the "kmeans" algorithm. `result` receives the neighbour indices and
# `dists` the matching distances.
flann = FLANN()
result, dists = flann.nn(
    dataset,
    queryset,
    num_neighbors=3,
    algorithm="kmeans",
    branching=32,
    iterations=7,
    checks=16,
)

In [6]:
# Display the signature and docstring of FLANN.nn for reference.
help(flann.nn)

Help on method nn in module pyflann.index:

nn(self, pts, qpts, num_neighbors=1, **kwargs) method of pyflann.index.FLANN instance
    Returns the num_neighbors nearest points in dataset for each point
    in testset.



In [7]:
# Row i lists the indices (row numbers in `dataset`) of the 3 nearest
# neighbours of query point i; in the output shown, the closest comes first.
print(result)

[[0 1 3]
 [2 3 1]]


In [8]:
# Row i lists the distances from query point i to its 3 nearest neighbours in the dataset.
# The printed values are squared Euclidean distances: e.g. query [1, 1, 1, 1, 1] vs.
# dataset row [1, 1, 1, 2, 3] gives 0 + 0 + 0 + 1 + 4 = 5.
print(dists)

[[5.000e+00 2.480e+02 2.948e+03]
 [6.640e+02 9.508e+03 1.285e+04]]


In [9]:
# Larger synthetic benchmark: 10000 reference points and 1000 query points,
# each with 128 coordinates drawn uniformly from [0, 1).
# A dedicated, seeded generator keeps the data identical across
# "Restart & Run All" without touching NumPy's global random state.
# RandomState is used because it also exists in old NumPy releases.
rng = np.random.RandomState(42)
dataset = rng.rand(10000, 128)
queryset = rng.rand(1000, 128)

In [10]:
# Fresh FLANN object for the random data: fetch the 5 nearest dataset rows
# for each query row with the "kmeans" algorithm.
flann = FLANN()
result, dists = flann.nn(
    dataset,
    queryset,
    num_neighbors=5,
    algorithm="kmeans",
    branching=32,
    iterations=7,
    checks=16,
)

In [11]:
# Autotuned index construction; left commented out because it takes a few minutes to run.
#params = flann.build_index(dataset, algorithm="autotuned", target_precision=0.5, log_level="info")
#print(params)
#result, dists = flann.nn_index(queryset, 5, checks=params["checks"])

In [12]:
# One row per query point, one column per requested neighbour;
# each entry is a row index into `dataset`.
print(result.shape)
print(result)

(1000, 5)
[[3541 2746 9940 7916 6447]
 [8835  660 7696 7303 2088]
 [8763 1758 6792 5828 6278]
 ...
 [8104 7879 4497 3143 6283]
 [5343 4939 4004 2753 6616]
 [ 227 4867  761 3556 6675]]


In [13]:
# Same layout as `result`: entry [i, j] is the distance from query i to its
# j-th returned neighbour (increasing along each row in the output shown).
print(dists.shape)
print(dists)

(1000, 5)
[[16.33106196 16.38779936 17.03451737 18.49504967 18.76710884]
 [15.38906065 16.35455037 16.51432489 17.64265026 17.82126537]
 [18.79513261 19.91920865 20.20926194 20.78662022 20.85424496]
 ...
 [14.04298852 15.56481968 16.68514896 17.11880522 17.31772277]
 [15.84126084 16.52776562 17.0601385  17.15884764 17.22149323]
 [16.52784255 17.36488808 17.40824558 17.4163009  17.48064692]]


In [14]:
# Wall-clock timing of a 5-nearest-neighbour lookup with algorithm="kdtree", trees=4.
# NOTE(review): nn() is handed the raw dataset, so the measured time presumably
# covers index construction as well as the search itself -- confirm.
start = time.time()
result, dists = flann.nn(dataset, queryset, num_neighbors=5, algorithm="kdtree", trees=4)
elapsed = time.time() - start
print('Query took %f seconds' % elapsed)

# Both outputs should have one row per query and one column per neighbour.
print(result.shape)
print(dists.shape)


Query took 0.123095 seconds
(1000, 5)
(1000, 5)


In [15]:
# Same measurement with algorithm="linear" for comparison with the other cells.
start = time.time()
result, dists = flann.nn(dataset, queryset, num_neighbors=5, algorithm="linear")
elapsed = time.time() - start
print('Query took %f seconds' % elapsed)

# Both outputs should have one row per query and one column per neighbour.
print(result.shape)
print(dists.shape)

Query took 1.116628 seconds
(1000, 5)
(1000, 5)
