# Benchmark naive mean and covariance vs. NumPy

### Import

In [None]:
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')
from sklearn.datasets import fetch_lfw_people, fetch_mldata, fetch_olivetti_faces
import time
import timeit

In [None]:
%matplotlib inline
from ipywidgets import interact

Next, we are going to retrieve the Olivetti faces dataset.

When working with some datasets, before digging into further analysis, it is almost always
useful to do a few things to understand your dataset. First of all, answer the following
set of questions:

1. What is the size of your dataset?
2. What is the dimensionality of your data?

Datasets are usually stored as 2D matrices, so it is really important
to know which axis indexes the data points in the dataset and which axis
indexes the dimensions (features) of each data point.

In [None]:
# Pixel grid of one face; each row of `faces` is reshaped to this size for display.
image_shape = (64, 64)
# Load faces data
dataset = fetch_olivetti_faces()
# `data` holds one flattened image per row (first axis = data points).
faces = dataset.data

# Report the full array shape and the number of data points (first axis).
print('Shape of the faces dataset: {}'.format(faces.shape))
print('{} data points'.format(faces.shape[0]))

When your dataset consists of images, it's a really good idea to see what they look like.

One very
convenient tool in Jupyter is the `interact` widget, which we use to visualize the images (faces). For more information on how to use interact, have a look at the documentation [here](http://ipywidgets.readthedocs.io/en/stable/examples/Using%20Interact.html).

In [None]:
@interact(n=(0, len(faces)-1))
def display_faces(n=0):
    """Show the n-th face of the dataset as a grayscale image.

    Arguments
    ---------
    n: index of the face to display; the `interact` slider restricts it
       to the range 0 .. len(faces) - 1.
    """
    plt.figure()
    # Reuse the module-level `image_shape` instead of repeating the literal,
    # so the image size is defined in exactly one place.
    plt.imshow(faces[n].reshape(image_shape), cmap='gray')
    plt.show()

## Mean and Covariance of a Dataset

__When you implement the functions for your assignment, make sure you read
the docstring to find out which dimension of your inputs corresponds to the number of data points and which
corresponds to the dimension of the dataset.__

In [None]:
def mean_naive(X):
    """Compute the mean of a dataset one dimension at a time.

    Arguments
    ---------
    X: (N, D) ndarray representing the dataset.

    Returns
    -------
    mean: (D, ) ndarray which is the mean of the dataset.
    """
    # Unpacking the shape also rejects input that is not 2-D.
    _, n_dims = X.shape
    column_means = np.zeros(n_dims)
    # Iterating over X.T yields one column (one dimension) of X at a time.
    for dim, column in enumerate(X.T):
        column_means[dim] = np.mean(column)
    return column_means

def cov_naive(X):
    """Compute the covariance for a dataset by iterating over the dataset

    Each data point contributes the outer product of its centered vector;
    the accumulated sum is normalised by N - 1, so the result agrees with
    `np.cov(X, rowvar=False)` up to floating-point rounding.

    Arguments
    ---------
    X: (N, D) ndarray representing the dataset.

    Returns
    -------
    covariance: (D, D) ndarray which is the covariance matrix of the dataset.
        The shape is (D, D) for every D, including D == 1. With N == 1 the
        normalisation divides by zero and the entries are nan.
    """
    N, D = X.shape
    mu = np.mean(X, axis=0)
    covariance = np.zeros((D, D))
    for n in range(N):
        centered = X[n, :] - mu
        # Outer product (D, D), not the inner product, of the centered point.
        covariance += np.outer(centered, centered)
    return covariance / (N - 1)

In [None]:
def mean(X):
    """Compute the mean of a dataset with a single vectorised NumPy call.

    Arguments
    ---------
    X: (N, D) ndarray representing the dataset.

    Returns
    -------
    mean: (D, ) ndarray which is the mean of the dataset.
    """
    # axis=0 averages over the data points, leaving one value per dimension.
    return np.mean(X, axis=0)
 
# ===YOU SHOULD EDIT THIS FUNCTION===
def cov(X):
    """Compute the covariance for a dataset without an explicit Python loop

    Arguments
    ---------
    X: (N, D) ndarray representing the dataset.

    Returns
    -------
    covariance_matrix: (D, D) ndarray which is the covariance matrix of the dataset.

    """
    # Looping over the data in Python tends to be slow, so the whole
    # computation is delegated to NumPy.
    # Unpacking the shape also rejects input that is not 2-D.
    _n_points, _n_dims = X.shape
    # rowvar=False: each row is a data point, each column a dimension.
    return np.cov(X, rowvar=False)

With the `mean` function implemented, let's take a look at the _mean_ face of our dataset!

In [None]:
# Average over the data points (axis 0) and reshape the result to 64x64 for display.
# The trailing semicolon is presumably there to suppress the notebook's text output.
plt.imshow(np.mean(faces, axis=0).reshape((64, 64)), cmap='gray');

To put things into perspective, we can benchmark the two different implementations with the `%time` magic
in the following way:

In [None]:
# We have some huge data matrix, and we want to compute its mean
# (100000 rows of 20 standard-normal values each).
X = np.random.randn(100000, 20)
# Benchmarking time for computing mean
%time mean_naive(X)
%time mean(X)
# NOTE(review): `pass` presumably keeps the cell from echoing a result -- confirm.
pass

In [None]:
# Benchmarking time for computing covariance
# Uses whatever `X` is currently bound in the notebook; when the cells run
# top to bottom that is the 100000 x 20 matrix from the mean benchmark.
%time cov_naive(X)
%time cov(X)
pass

Alternatively, we can also see how running time increases as we increase the size of our dataset.
In the following cell, we run `mean`, `mean_naive` and `cov`, `cov_naive` many times on different sizes of
the dataset and collect their running times. If you are less familiar with Python, you may want to spend
some time understanding what the code does. __Understanding how your code scales with the size of your dataset (or dimensionality of the dataset) is crucial__ when you want to apply your algorithm to larger datasets. This is really important when we propose alternative methods, such as more efficient algorithms, to solve the same problem. We will use these techniques again later in this course to analyze the running time of our code.

In [None]:
def time(f, repeat=100):
    """Measure how long a callable takes over several runs.

    Note that this definition rebinds the name `time`, which the notebook
    also imports as a module.

    Arguments
    ---------
    f: a zero-argument callable to be timed.
    repeat: the number of times `f` is executed.

    Returns
    -------
    the mean and standard deviation of the per-call execution times,
    measured with `timeit.default_timer`.
    """
    def _one_run():
        # Only the call to f() lies between the two clock readings.
        began = timeit.default_timer()
        f()
        return timeit.default_timer() - began

    durations = [_one_run() for _ in range(repeat)]
    return np.mean(durations), np.std(durations)

In [None]:
# Collect (size, mean runtime, std of runtime) for both mean implementations.
fast_time, slow_time = [], []

for size in np.arange(100, 5000, step=100):
    X = np.random.randn(size, 20)
    # Time the vectorised version first, then the naive one, on the same X.
    for records, implementation in ((fast_time, mean), (slow_time, mean_naive)):
        f = lambda: implementation(X)
        mu, sigma = time(f)
        records.append((size, mu, sigma))

# One row per dataset size, so the columns can be sliced for plotting.
fast_time, slow_time = np.array(fast_time), np.array(slow_time)

In [None]:
fig, ax = plt.subplots()
# Columns of each series: dataset size, mean runtime, runtime std (error bar).
for series, label in ((fast_time, 'fast mean'), (slow_time, 'naive mean')):
    ax.errorbar(series[:, 0], series[:, 1], series[:, 2], label=label, linewidth=2)
ax.set_xlabel('size of dataset')
ax.set_ylabel('running time')
plt.legend();

In [None]:
# Same benchmark as for the mean, now for the two covariance implementations.
fast_time_cov, slow_time_cov = [], []

for size in np.arange(100, 5000, step=100):
    X = np.random.randn(size, 20)
    # Time the vectorised version first, then the naive one, on the same X.
    for records, implementation in ((fast_time_cov, cov), (slow_time_cov, cov_naive)):
        f = lambda: implementation(X)
        mu, sigma = time(f)
        records.append((size, mu, sigma))

# One row per dataset size: (size, mean runtime, std of runtime).
fast_time_cov, slow_time_cov = np.array(fast_time_cov), np.array(slow_time_cov)

In [None]:
fig, ax = plt.subplots()
# Columns of each series: dataset size, mean runtime, runtime std (error bar).
for series, label in ((fast_time_cov, 'fast covariance'), (slow_time_cov, 'naive covariance')):
    ax.errorbar(series[:, 0], series[:, 1], series[:, 2], label=label, linewidth=2)
ax.set_xlabel('size of dataset')
ax.set_ylabel('running time')
plt.legend();

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import scipy

In [None]:
%matplotlib inline 
#ipympl
%config InlineBackend.figure_format = 'svg'

In [None]:
import sklearn
from sklearn.datasets import fetch_mldata
from ipywidgets import interact
# NOTE(review): `fetch_mldata` was deprecated in scikit-learn 0.20 and removed in
# 0.22, so on current versions this call (and the import above) fails.
# `fetch_openml('mnist_784')` is the documented replacement, but its return
# types and sample ordering should be verified before switching.
MNIST = fetch_mldata('MNIST original', data_home='../../_data/MNIST')

In [None]:
# Select the rows labelled 0, view each as a 28x28 image, and show the first one.
plt.imshow(MNIST.data[MNIST.target==0].reshape(-1, 28, 28)[0], cmap='gray');

In [None]:
def distance(x, y):
    """Compute distance between two vectors x, y using the dot product

    Arguments
    ---------
    x, y: array-likes with the same number of elements; each is converted
          to float and flattened before the computation.

    Returns
    -------
    the Euclidean distance sqrt((x - y) . (x - y)) as a float scalar.
    """
    # Builtin `float` replaces the `np.float` alias, which NumPy removed in 1.24.
    # Casting before subtracting also avoids wrap-around on unsigned integers.
    x = np.array(x, dtype=float).ravel() # ravel() "flattens" the ndarray
    y = np.array(y, dtype=float).ravel()
    diff = x - y
    return np.sqrt(diff @ diff)

def angle(x, y):
    """Compute the angle between two vectors x, y using the dot product

    Arguments
    ---------
    x, y: array-likes of the same length; converted to float so that
          integer input (e.g. uint8 pixels) cannot overflow in the products.

    Returns
    -------
    the angle in radians, in [0, pi]. If either vector has zero norm the
    cosine is 0/0 and the result is nan.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # Normalise by the norms of x AND y.
    cosine = x.T @ y / (np.sqrt(x.T @ x) * np.sqrt(y.T @ y))
    # Rounding can push the cosine marginally outside [-1, 1], where arccos
    # would return nan; clip it back into the valid domain.
    return np.arccos(np.clip(cosine, -1.0, 1.0))

def pairwise_distance_matrix(X, Y):
    """Compute the pairwise distance between rows of X and rows of Y

    Arguments
    ----------
    X: ndarray of size (N, D)
    Y: ndarray of size (M, D)

    Returns
    --------
    distance_matrix: float ndarray of shape (N, M); entry [i, j] is the
    distance between X[i,:] and Y[j,:] as computed by `distance`.
    """
    # Only the row counts are needed; the feature dimension is left to `distance`.
    N, _ = X.shape
    M, _ = Y.shape
    # Builtin `float` replaces the `np.float` alias, which NumPy removed in 1.24.
    distance_matrix = np.zeros((N, M), dtype=float)
    for i in range(N):
        for j in range(M):
            distance_matrix[i, j] = distance(X[i, :], Y[j, :])
    return distance_matrix

## Double looping (i,j) vs. np.ndenumerate() vs. pairwise_distances

In [None]:
%%timeit
# Baseline: explicit double Python loop over (at most) the first 500 samples,
# calling distance() once for every ordered pair (i, j).
distances = []
R = range(len(MNIST.data[:500]))  
for i in R:
    for j in R:
        distances.append(distance(MNIST.data[i], MNIST.data[j]))

In [None]:
def pairwise_map(data, fn, N=0):
    """Apply `fn` to every ordered pair among the first N items of `data`.

    Arguments
    ---------
    data: indexable collection of items.
    fn: function of two items returning a scalar.
    N: number of leading items to use; 0 (the default) means all of `data`.

    Returns
    -------
    Z: (N, N) float ndarray with Z[i, j] = fn(data[i], data[j]).
    """
    if N == 0:
        N = len(data)
    # Allocate (N, N) directly: zeros(N*N).reshape(-1, N) raised ValueError
    # for empty data, because the -1 axis cannot be inferred when N == 0.
    Z = np.zeros((N, N))
    for i, j in np.ndindex(N, N):
        Z[i, j] = fn(data[i], data[j])
    return Z

In [None]:
%%timeit
# Same pairwise computation on the first 500 samples, via the pairwise_map helper.
# NOTE(review): names assigned inside a %%timeit cell are presumably not kept in
# the notebook namespace -- confirm before relying on Z in later cells.
Z = pairwise_map(MNIST.data, distance, 500)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
%%timeit
# Library implementation for comparison; the .round() call is part of the timed code.
pairwise_distances(MNIST.data[:500]).round()

In [None]:
# NOTE(review): the only visible assignment to Z is inside a %%timeit cell; confirm
# that Z is actually defined in the notebook namespace when this cell runs.
Z.round()

### TODO

In [None]:
# Mean image of each class, keyed by the integer label.
means = {}
# Builtin `int` replaces the `np.int` alias, which NumPy removed in 1.24.
for n in np.unique(MNIST.target).astype(int):
    means[n] = np.mean(MNIST.data[MNIST.target==n], axis=0)
means[0].shape

In [None]:
# dict.values() is a view object in Python 3, which NumPy cannot treat as a
# numeric array; stack the per-class means into one ndarray (one row per
# class) so that ufuncs applied to mean_v operate on numbers.
mean_v = np.array(list(means.values()))

In [None]:
# Outer difference: every element of the first operand minus every element of the second.
np.subtract.outer(mean_v, mean_v)

In [None]:
# Small sanity example: entry [i, j] of the 10 x 10 result is i - j.
np.subtract.outer(np.arange(10), np.arange(10))

In [None]:
X = MNIST.data[:50]
# (X - X).T @ (X - X)
# ufunc.outer returns an array of shape X.shape + X.shape, i.e. every element
# against every element rather than row-against-row differences.
# NOTE(review): for 50 x 784 input that is ~1.5e9 elements -- confirm this is intended.
np.subtract.outer(X, X).shape