# Searching a pairwise distance matrix for specific points:

In this notebook you can compare the performance of three ways of finding the indices of the two points in an array that are separated by a specific distance (e.g., the minimal one): a classic combinatorics method (itertools), scipy's squareform method, and a [triangular numbers approach](https://mathworld.wolfram.com/TriangularNumber.html).

The first half of this notebook steps through an example of how the triangular numbers approach works. In the second half, you can compare the time required by each method, which clearly shows how much computation time the triangular numbers approach saves.

In [1]:
# import required functionalities

from __future__ import division, print_function

import time
import numpy as np
import matplotlib.pyplot as plt

from os import getcwd
from os.path import join
from itertools import combinations
from scipy.spatial.distance import pdist, squareform

%matplotlib inline

In [2]:
# define triangular numbers-based search function

def index_pair_from_condensed_form(array, condensed_idx):

    '''this function uses triangular numbers formulas to convert a condensed form distance matrix index
       (condensed_idx) into a pair of indices for returning points from the original pointset (array).

       The condensed form (e.g., the output of scipy's pdist) stores one distance per
       pair of points (k, k_star) with k < k_star, row by row: first every pair with
       k = 0, then every pair with k = 1, and so on. With N = array.shape[0] points and
       n = N - 1, row k holds n - k entries, so the whole condensed form holds T(n) entries.


       Let T(x) be the x'th triangular number (i.e., the sum of the numbers from 1 to x).
       The formula for T(x) is simple:
       T(x) = (x * (x + 1)) / 2


       Given y = T(x) we can calculate x by inverting the above formula:
       x = (((8y + 1) ** 0.5) - 1) / 2


       Counted from the end, the last d rows hold T(d) entries. If y entries follow
       condensed_idx, the index lies in the d'th row from the end, where d is the smallest
       integer with T(d) > y. That gives k = n - d, and the offset of condensed_idx within
       that row gives k_star.

       Parameters:
           array: the original pointset; only array.shape[0] (the number of points) is read.
           condensed_idx: integer position in the condensed form, 0 <= condensed_idx < T(n).

       Returns:
           (k, k_star): two Python ints with k < k_star.

       Raises:
           ValueError: if condensed_idx does not address an entry of the condensed form
                       (negative, too large, or the pointset has fewer than two points).'''

    i = int(condensed_idx)
    n = array.shape[0] - 1

    # n * (n + 1) is always even, so floor division is exact and keeps everything integral
    t_n = (n * (n + 1)) // 2

    if not 0 <= i < t_n:
        raise ValueError('condensed_idx %d is out of range for a condensed form with %d entries'
                         % (i, t_n))

    # number of condensed entries that come after index i
    y = t_n - i - 1

    # floating-point estimate from the inverted triangular number formula ...
    d = 1 + int((((8 * y + 1) ** 0.5) - 1) / 2)

    # ... corrected with exact integer arithmetic, so that a rounding error in the
    # square root can never move the result into a neighbouring row
    while (d * (d + 1)) // 2 <= y:
        d += 1
    while ((d - 1) * d) // 2 > y:
        d -= 1

    k = n - d

    # row k starts at condensed index t_n - T(d) and its first entry is the pair (k, k + 1)
    k_star = 1 + i + k + (d * (d + 1)) // 2 - t_n

    return int(k), int(k_star)

### Create a matrix with shape (n_pts, m_dim) to store n_pts points of interest with dimension m_dim (e.g., a pointcloud with m_dim = 3)

In [3]:
# size of the example point set
n_pts = 5000          # number of points
m_dim = 3             # dimension of each point
largest_val = 500     # coordinates are drawn from [0, largest_val)

# fixed seed so that a fresh "Restart & Run All" reproduces the same points
random_seed = 0
rng = np.random.RandomState(random_seed)

arr = rng.randint(0, largest_val, (n_pts, m_dim))

### Compute the pairwise distance matrix of the point set

In [4]:
# condensed (1-D) vector of Euclidean distances between every pair of rows in arr
d = pdist(arr, metric='euclidean')

### Find the condensed-array index of the distance of interest (e.g., the minimal distance)

In [5]:
# position of the smallest distance within the condensed vector
idx = d.argmin()

### Assign the distance found at that index to a variable

In [6]:
# look the distance up by its index and check it against the minimum of the same array
value = d[idx]
print(np.amin(d) == value)

True


### Convert index to pair of indices in original point set

In [7]:
# translate the condensed index back into the two row indices of arr
idx_1, idx_2 = index_pair_from_condensed_form(array=arr, condensed_idx=idx)

### Confirm the returned indices reproduce the same distance as `value`

In [8]:
# fetch the two points and recompute their Euclidean distance independently of pdist
p1 = arr[idx_1, ...]
p2 = arr[idx_2, ...]
confirm_val = np.linalg.norm(p1 - p2)

# compare with a tolerance: the two distances come from different routines and may
# differ in the last floating-point digit, which an exact == would report as False
print(np.isclose(value, confirm_val))

True


### If the result is confirmed (prints True), print the two points from the original array

In [9]:
# show both points of the pair, one per line
for label, point in (('point 1:', p1), ('point 2:', p2)):
    print(label, point)

[ 71 344 393] [ 70 344 393]


### Compare how the computation time of all three methods scales, from the smallest comparable set (2 points) up to your dataset's size, and plot the results

In [None]:
# benchmark data: seeded so that repeated runs time the same point set
bench_rng = np.random.RandomState(0)
a = bench_rng.randint(0, largest_val, (n_pts, m_dim))

# subset sizes to benchmark, from the smallest comparable set (2 points) up to and
# including the full set. Increase subset_step to shorten the run: the itertools
# method builds the list of every index pair for each subset size.
subset_step = 1
subset_sizes = range(2, n_pts + 1, subset_step)
n_loops = len(subset_sizes)

time_pdist = np.zeros((n_loops, 1))
time_tri = np.copy(time_pdist)
time_comb_idx_det = np.copy(time_pdist)
time_sqform_unravel = np.copy(time_pdist)

# high-resolution clock where available (Python 3), plain wall clock otherwise
timer = getattr(time, 'perf_counter', time.time)

for counter, i in enumerate(subset_sizes):
    a_subset = a[:i, :]

    # condensed distance vector; kept in its own name so the `d` computed for the
    # full point set earlier in the notebook is not overwritten
    t_0 = timer()
    d_subset = pdist(a_subset)
    time_pdist[counter] = timer() - t_0

    # method 1: triangular numbers
    t_0 = timer()
    idx_tri = index_pair_from_condensed_form(a_subset, np.argmax(d_subset))
    time_tri[counter] = timer() - t_0

    # method 2: enumerate every index pair with itertools
    t_0 = timer()
    indices = list(combinations(range(i), 2))
    comb_idx = indices[np.argmax(d_subset)]
    time_comb_idx_det[counter] = timer() - t_0

    # method 3: expand to the square matrix and unravel the flat argmax
    t_0 = timer()
    d_sq = squareform(d_subset)
    idx_unravel = np.unravel_index(np.argmax(d_sq), d_sq.shape)
    time_sqform_unravel[counter] = timer() - t_0

    # sanity check (outside the timed sections): both condensed-index methods must
    # name the same pair. The squareform result is not asserted because its argmax
    # can land on a diagonal entry when every distance in the subset is zero.
    assert tuple(idx_tri) == tuple(comb_idx), (i, idx_tri, comb_idx)

# initialize plot
plt.figure(figsize=(10,10))

# x axis is the actual number of points compared, not the loop counter
x = np.array(subset_sizes)

# plot three datasets (x = n_pts, y = comp_time)
plt.plot(x, time_comb_idx_det, color = 'darkturquoise', label = 'itertools')
plt.plot(x, time_sqform_unravel, color = 'orange', label = 'scipy.spatial')
plt.plot(x, time_tri, color = 'purple', label = 'triangular numbers')

# plot specs
x_step = int(n_pts/5)
plt.xticks(range(0,int(n_pts*1.1),x_step), fontsize = 24)
plt.xlim(0,n_pts)

y_max = np.amax(np.concatenate((time_tri, time_comb_idx_det, time_sqform_unravel)))
plt.yticks(fontsize = 24)
plt.ylim(0, y_max*1.1)

plt.xlabel('N points pairwise compared', fontsize = 36)
plt.ylabel('time (seconds)', fontsize = 36)
plt.legend(fontsize = 24)
plt.tight_layout()
plt.show()