## Distances and Angles between Images

__Distance__ and __angle__ are useful beyond their usual interpretation. They are useful for describing __similarity__ between objects.

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import scipy

In [None]:
%matplotlib inline 
#ipympl
%config InlineBackend.figure_format = 'svg'

In [None]:
%%javascript
// Some javascript to provide better layout for plots we have later.
// Replaces the output area's `_should_scroll` hook with a function that
// always answers false, whatever `lines` is (presumably so that long outputs
// are never collapsed into a scroll box; confirm against the notebook
// front-end in use).
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
};


Recall that the distance defined by the dot product is 
$$ d(\boldsymbol{x},\boldsymbol{y}) = \lVert \boldsymbol{x} - \boldsymbol{y} \rVert, $$
and the angle defined by the dot product is 
$$ \boldsymbol{x}^T \boldsymbol{y} = \lVert \boldsymbol{x} \rVert \lVert \boldsymbol{y} \rVert \cos \theta. $$

In [None]:
def distance(x, y):
    """Compute the Euclidean distance between two vectors using the dot product.

    Arguments
    ----------
    x, y: array-likes holding the same number of elements. Multi-dimensional
        inputs (e.g. 28x28 images) are flattened before comparison.

    Returns
    --------
    The scalar sqrt((x - y)^T (x - y)).
    """
    # The builtin `float` replaces the `np.float` alias, which was removed in
    # NumPy 1.24. Converting to float also prevents wrap-around when the
    # inputs are unsigned integers (200 - 0 is fine, 0 - 200 is not).
    x = np.asarray(x, dtype=float).ravel()  # ravel() "flattens" the ndarray
    y = np.asarray(y, dtype=float).ravel()
    diff = x - y
    return np.sqrt(diff @ diff)

def angle(x, y):
    """Compute the angle (in radians) between two vectors using the dot product.

    Arguments
    ----------
    x, y: array-likes holding the same number of elements. Multi-dimensional
        inputs are flattened first.

    Returns
    --------
    The angle theta in [0, pi] with x^T y = ||x|| ||y|| cos(theta).
    When the angle is undefined (a zero-length vector or NaN input) 0 is
    returned.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    # The normalisation needs the length of *both* vectors.
    norm_product = np.sqrt(x @ x) * np.sqrt(y @ y)
    if norm_product == 0:
        # A zero vector has no direction; 0 is this function's convention.
        return 0.0
    # Rounding can push the cosine marginally outside [-1, 1], where arccos
    # is undefined, so clamp it first.
    cosine = np.clip((x @ y) / norm_product, -1.0, 1.0)
    theta = np.arccos(cosine)
    return 0.0 if np.isnan(theta) else theta

def pairwise_distance_matrix(X, Y):
    """Compute the pairwise distance between rows of X and rows of Y

    Arguments
    ----------
    X: ndarray of size (N, D)
    Y: ndarray of size (M, D)
    
    Returns
    --------
    D: matrix of shape (N, M), each entry D[i,j] is the distance between
    X[i,:] and Y[j,:] using the dot product.

    Notes
    -----
    The computation is vectorized through the identity
    ||x - y||^2 = x^T x + y^T y - 2 x^T y, so a single matrix product
    replaces the double Python loop. Because of floating-point cancellation,
    entries for (nearly) identical rows can come out as a tiny positive
    number rather than exactly 0.
    """
    # Builtin `float` (the `np.float` alias was removed in NumPy 1.24); the
    # conversion also keeps integer inputs from overflowing when squared.
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    x_sq = np.sum(X * X, axis=1)[:, np.newaxis]  # shape (N, 1)
    y_sq = np.sum(Y * Y, axis=1)[np.newaxis, :]  # shape (1, M)
    squared = x_sq + y_sq - 2.0 * (X @ Y.T)      # broadcasts to (N, M)
    # Cancellation may leave slightly negative values; clamp before sqrt.
    np.maximum(squared, 0.0, out=squared)
    return np.sqrt(squared)

For `pairwise_distance_matrix`, you may be tempted to iterate through the
rows of X and Y and fill in the distance matrix, but that is slow! Can you
think of some way to vectorize your computation (i.e. make it faster by using numpy/scipy operations only)?

In [None]:
def plot_vector(v, w):
    """Draw the 2-dimensional vectors `v` and `w` as red arrows from the origin.

    A new 4x4 inch figure is created with both axes limited to [-2, 2] and
    the grid toggled on.
    """
    fig = plt.figure(figsize=(4,4))
    ax = fig.gca()
    ax.set_xlim([-2, 2])
    ax.set_ylim([-2, 2])
    ax.grid()
    # Both arrows share exactly the same appearance.
    arrow_style = dict(head_width=0.05, head_length=0.1,
                       length_includes_head=True, linewidth=2, color='r')
    for vec in (v, w):
        ax.arrow(0, 0, vec[0], vec[1], **arrow_style)

In [None]:
# Some sanity checks, you may want to have more interesting test cases to test your implementation
a = np.array([1,0])
b = np.array([0,1])
np.testing.assert_almost_equal(distance(a, b), np.sqrt(2))
# Compare with a tolerance: exact `==` on a floating-point result is brittle.
np.testing.assert_almost_equal(np.degrees(angle(a, b)), 90)
print('correct')

In [None]:
# Draw the two vectors `b` and `a`.
plot_vector(b, a)

In [None]:
import sklearn
from sklearn.datasets import fetch_openml
from ipywidgets import interact
# `fetch_mldata` was removed from scikit-learn, so the data set is loaded
# from OpenML instead. `as_frame=False` requests plain ndarrays in
# `MNIST.data` / `MNIST.target`.
# NOTE(review): the row order of the OpenML copy may differ from the old
# mldata copy, so "image number k" need not be the same picture as before.
MNIST = fetch_openml('mnist_784', version=1, as_frame=False,
                     data_home='../../_data/MNIST')
# OpenML delivers the labels as strings; make them numeric so comparisons
# such as `MNIST.target == 0` keep working.
MNIST.target = MNIST.target.astype(float)

In [None]:
# Display the first image labelled 0 as a 28x28 grayscale picture.
plt.imshow(MNIST.data[MNIST.target == 0][0].reshape(28, 28), cmap='gray');

But we have the following questions:

1. What does it mean for two digits in the MNIST dataset to be _different_ by our distance function? 
2. Furthermore, how do the different classes of MNIST digits differ from one another? Let's find out!

### Pairwise distance between digits

In [None]:
def pairwise_map(data, fn, N=0):
    """Execute any function on each data pair and return as 2d-array

    Arguments
    ----------
    data: container indexable by the integers 0..N-1 (ndarray, list, or a
        dict with those keys)
    fn: callable taking two items of `data` and returning a scalar
    N: number of leading items to use; 0 (the default) means len(data)

    Returns
    --------
    Z: float ndarray of shape (N, N) with Z[i, j] = fn(data[i], data[j])
    """
    if N == 0:
        N = len(data)
    # Allocate with the final shape directly: reshaping a flat array with -1
    # fails when N is 0, whereas this yields an empty (0, 0) result.
    Z = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            Z[i, j] = fn(data[i], data[j])
    return Z

In [None]:
# Distance between every pair of the first 500 images, rounded for display.
Z = pairwise_map(MNIST.data, fn=distance, N=500)
np.round(Z)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# The same 500 images run through scikit-learn's pairwise_distances, rounded for display.
np.round(pairwise_distances(MNIST.data[:500]))

In [None]:
def find_similar(x, data=None, fn=None):
    """Find the item closest to item number `x`, never returning `x` itself.

    Arguments
    ----------
    x: index of the query item in `data`
    data: indexable collection of items; defaults to `MNIST.data`
    fn: callable giving the distance between two items; defaults to
        `distance`

    Returns
    --------
    (min_dist, min_index): the smallest distance found and the index of the
    item that attains it (the first such index on ties). If `data` holds no
    other item, (np.inf, x) is returned.
    """
    # Defaults are resolved at call time so the notebook globals are used.
    if data is None:
        data = MNIST.data
    if fn is None:
        fn = distance
    query = data[x]
    min_dist, min_index = np.inf, x
    # Every index is a candidate, including 0, except the query itself,
    # which would otherwise always win with distance 0.
    for i in range(len(data)):
        if i == x:
            continue
        dist = fn(query, data[i])
        if min_dist > dist:
            min_dist, min_index = dist, i
    return min_dist, min_index
# Look up the closest match for image 0; the result is a (distance, index) pair.
find_similar(0)

In [None]:
# Most similar image: rank every image by its distance to `img_1` and take
# the runner-up (rank 0 is presumably `img_1` itself at distance 0).
img_1 = 0
np.array([distance(MNIST.data[img_1], candidate) for candidate in MNIST.data]).argsort()[1]

### Sort similar digits pairwise

In [None]:
# Row i lists the indices of the first 500 images ordered by distance to image i.
pairwise_distances(MNIST.data[:500]).argsort()

### Find distance between 2 images

In [None]:
@interact(first=(0, 499), second=(0, 499), continuous_update=False)
def show_img(first, second):
    """Show two MNIST images and where their distance falls in a histogram.

    Arguments
    ----------
    first, second: row indices into `MNIST.data` (sliders from 0 to 499)

    The left column shows the two 28x28 images. The right panel shows a
    histogram of distances with a vertical line at the distance between the
    two selected images.
    """
    f = MNIST.data[first].reshape(28, 28)
    s = MNIST.data[second].reshape(28, 28)
    d = distance(f, s)
    # Population for the histogram: distances from the first image to each of
    # the 500 images selectable with the sliders.
    # NOTE(review): no `distances` variable is defined in the visible
    # notebook, so this choice is an assumption; confirm it is the intended
    # population.
    distances = [distance(f, img) for img in MNIST.data[:500]]

    plt.figure(figsize=(12, 5))
    ax2 = plt.subplot2grid((2, 2), (0, 1), rowspan=2)
    ax0 = plt.subplot2grid((2, 2), (0, 0))
    ax1 = plt.subplot2grid((2, 2), (1, 0))

    ax0.imshow(f, cmap='gray')
    ax1.imshow(s, cmap='gray')
    ax2.hist(np.array(distances), bins=50)
    # axvline's ymin/ymax are fractions of the axes height (0..1), not data
    # values, so the full-height line is ymax=1.
    ax2.axvline(x=d, ymin=0, ymax=1, color='C4', linewidth=4)
    ax2.set(xlabel='distance', ylabel='number of images')
    # Anchor the label in axes coordinates so it stays visible whatever the
    # histogram's data range is.
    ax2.text(0.97, 0.95, "Distance is {:.2f}".format(d), size=12,
             ha='right', va='top', transform=ax2.transAxes)
    plt.show()

### Label means

In [None]:
# Mean image of every digit class, keyed by the integer class label.
# The builtin `int` replaces the `np.int` alias, which was removed in
# NumPy 1.24.
means = {}
for n in np.unique(MNIST.target).astype(int):
    means[n] = np.mean(MNIST.data[MNIST.target==n], axis=0)
means[0].shape

### Pairwise metrics

In [None]:
# Distance (rounded to integers) and angle (rounded to 2 decimals) between
# every pair of class-mean images.
MD = np.round(pairwise_map(means, distance))
AG = np.round(pairwise_map(means, angle), 2)

For each pair of classes, we compute the distance between the class means and
store it in `MD` (mean distances). We store the angles between the mean digits in `AG`.

### Visualise pairwise metrics

In [None]:
def heatmap(X, title=''):
    """Render the matrix `X` as a colour-coded image with a colorbar.

    Both axes get one tick per row index of `X` and are labelled
    'class of digits'. `title` is inserted into the figure title as the
    name of the metric.
    """
    n_classes = X.shape[0]
    tick_positions = range(n_classes)
    fig, ax = plt.subplots()
    image = ax.imshow(X, interpolation='nearest')
    ax.set_title('Differences between classes of digits\nMetric = {}'.format(title))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xlabel('class of digits')
    ax.set_ylabel('class of digits')
    fig.colorbar(image)

In [None]:
# One heatmap per metric: the distances in MD, then the angles in AG.
heatmap(MD, 'Distance')
heatmap(AG, 'Angle')