### FLANN - Fast Library for Approximate Nearest Neighbors

#### For more information, see e.g. https://github.com/flann-lib/flann

#### Documentation: https://github.com/flann-lib/flann/blob/master/doc/manual.tex

#### The C++ version also has an LSH implementation, but this seems to be missing from the Python bindings.

In [1]:
# Install the pyflann package (the Python bindings for FLANN) with pip.
# NOTE: the pip install only works with Python 2!
!pip install pyflann

[31mmoviepy 1.0.0 requires tqdm<5.0,>=4.11.2, which is not installed.[0m
[31mproglog 0.1.9 requires tqdm, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 20.3.4 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
from pyflann import *
import numpy as np
import time

### Example 1

In [3]:
# Create a small toy dataset: one point per row, one coordinate per column.
# The values are stored as float64.

dataset = np.array(
    [
        [1, 1, 1, 2, 3],
        [10, 10, 10, 3, 2],
        [100, 100, 2, 30, 1],
        [40, 10, 22, 30, 9],
    ],
    dtype=np.float64,
)

# dataset.shape is (number of points, number of dimensions)
print('We have {} {}-dimensional points in the dataset'.format(*dataset.shape))

We have 4 5-dimensional points in the dataset


In [4]:
# Create the query set: the points whose nearest neighbours we want to look up.
# Same layout as the dataset (one point per row), stored as float64.

queryset = np.array(
    [
        [1, 1, 1, 1, 1],
        [90, 90, 10, 10, 1],
    ],
    dtype=np.float64,
)

# queryset.shape is (number of query points, number of dimensions)
print('The query set contains {} {}-dimensional points'.format(*queryset.shape))


The query set contains 2 5-dimensional points


In [5]:
# Create a FLANN data structure and query it in one call.
# There are various algorithms you can choose from; "kmeans" is used here.

flann = FLANN()
result, dists = flann.nn(
    dataset,
    queryset,
    num_neighbors=3,  # ask for the 3 nearest neighbours of every query point
    algorithm="kmeans",
    branching=32,
    iterations=7,
    checks=16,
)

In [6]:
# Show the signature and docstring of the nn method used above.
help(flann.nn)

Help on method nn in module pyflann.index:

nn(self, pts, qpts, num_neighbors=1, **kwargs) method of pyflann.index.FLANN instance
    Returns the num_neighbors nearest points in dataset for each point
    in testset.



In [7]:
print(result)
# The indices (row numbers in `dataset`) of the 3 nearest neighbours of the 2 query points:
# one row per query point, one column per neighbour.

[[0 1 3]
 [2 3 1]]


In [8]:
print(dists)
# The distances from each of the 2 query points to its 3 nearest neighbours in the dataset.
# NOTE: these are *squared* Euclidean distances, e.g. for the first query point
# [1, 1, 1, 1, 1] and dataset row 0 [1, 1, 1, 2, 3]: 0 + 0 + 0 + 1 + 4 = 5.

[[5.000e+00 2.480e+02 2.948e+03]
 [6.640e+02 9.508e+03 1.285e+04]]


### Example 2

In [9]:
# Draw both point sets from a generator with a fixed seed, so that every
# "Restart Kernel -> Run All" works on exactly the same data. A local
# RandomState is used so NumPy's global random state is left untouched.
rng = np.random.RandomState(42)

# 10000 reference points with 128 coordinates each, uniform in [0, 1)
dataset = rng.rand(10000, 128)
print('We have {} {}-dimensional points in the dataset'.format(dataset.shape[0],dataset.shape[1]))

# 1000 query points drawn from the same generator
queryset = rng.rand(1000, 128)
print('The query set contains {} {}-dimensional points'.format(queryset.shape[0],queryset.shape[1]))


We have 10000 128-dimensional points in the dataset
The query set contains 1000 128-dimensional points


In [10]:
# Fresh FLANN object; search the 5 nearest neighbours of every query point
# with the "kmeans" algorithm and the same settings as in Example 1.
flann = FLANN()
result, dists = flann.nn(
    dataset,
    queryset,
    num_neighbors=5,
    algorithm="kmeans",
    branching=32,
    iterations=7,
    checks=16,
)

In [11]:
# Alternative (disabled): build an index with algorithm="autotuned" and query it with nn_index.
# Left commented out because this would take a few minutes to run ...
#params = flann.build_index(dataset, algorithm="autotuned", target_precision=0.5, log_level = "info");
#print(params)
#result, dists = flann.nn_index(queryset,5, checks=params["checks"]);

In [12]:
# Neighbour indices: one row per query point, one column per requested neighbour.
print(result.shape)
print(result)

(1000, 5)
[[4432 5090 1787 8018  480]
 [2719 9627 2720 1300 2891]
 [2105 9242 8438  480 5356]
 ...
 [ 933 8654 3642 7746 9636]
 [2692 3125 3350 1029 3891]
 [5254  693 6764 4383 5911]]


In [13]:
# Distances that belong to the neighbour indices in `result` (same shape).
# NOTE(review): Example 1 shows FLANN reporting squared Euclidean distances;
# presumably the same holds here - confirm before comparing against other libraries.
print(dists.shape)
print(dists)

(1000, 5)
[[15.28320257 15.45376435 16.41858967 16.92903959 17.03648633]
 [16.16246798 16.6267674  16.91570521 17.83611935 18.46422683]
 [14.98482739 17.29532222 17.76185872 18.45351986 18.47682788]
 ...
 [15.59720834 16.29963063 18.32725127 19.02224202 19.39307164]
 [15.60386668 15.69104885 16.2268636  16.40207419 16.7072947 ]
 [15.19437224 15.72291309 15.84173138 16.36839645 16.37319553]]


### Example 3

In [14]:
# Same 5-nearest-neighbour query as before, now with the "kdtree" algorithm
# (4 trees), timed with wall-clock timestamps around the nn() call.

t0 = time.time()
result, dists = flann.nn(
    dataset,
    queryset,
    num_neighbors=5,
    algorithm="kdtree",
    trees=4,
)
t1 = time.time()
print('Query took %f seconds' % (t1 - t0))

# Both arrays have one row per query point and one column per neighbour.
print(result.shape)
print(dists.shape)


Query took 0.133844 seconds
(1000, 5)
(1000, 5)


### Example 4

In [15]:
# Same query once more with algorithm="linear" (brute-force search according
# to the FLANN manual), timed the same way for comparison with the kd-tree run.

t0 = time.time()
result, dists = flann.nn(
    dataset,
    queryset,
    num_neighbors=5,
    algorithm="linear",
)
t1 = time.time()
print('Query took %f seconds' % (t1 - t0))

# Both arrays have one row per query point and one column per neighbour.
print(result.shape)
print(dists.shape)

Query took 1.138500 seconds
(1000, 5)
(1000, 5)
