# Problem Set 4: Visualization, $k$-Means/E-M, Spectral Clustering

We're back to using `scikit-learn` now. 

In [None]:
# Boilerplate

import numpy as np
import pandas as pd

from sklearn.utils import check_random_state
from sklearn.utils.extmath import row_norms
# Some scaling functions
from sklearn.preprocessing import robust_scale, minmax_scale, maxabs_scale, scale
# Clustering
from sklearn.cluster import KMeans
from sklearn.mixture import GMM
# Iris dataset
from sklearn.datasets import load_iris
# Random datasets
from sklearn.datasets import make_blobs
from sklearn.datasets import make_spd_matrix
# For visualization
from sklearn.manifold import TSNE, MDS, SpectralEmbedding, spectral_embedding
from sklearn.decomposition import PCA
# Distances and kernels
from sklearn.metrics.pairwise import cosine_distances, manhattan_distances, rbf_kernel, laplacian_kernel, euclidean_distances
    

import matplotlib.pyplot as plt
import seaborn as sns

# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# Enable high resolution PNGs
%config InlineBackend.figure_formats = {'png', 'retina'}

# Notebook-wide plot styling: line width, axis label/title sizes and the
# axes background colour, layered on top of seaborn's "darkgrid" style.
rc = {
    'lines.linewidth': 2,
    'axes.labelsize': 18,
    'axes.titlesize': 18,
    'axes.facecolor': '#DFDFE5',
}
sns.set(context='notebook', style='darkgrid', rc=rc)

def plot_clusters(df, hue='ypred', vars=None):
    """Our familiar pair plot of `df`, coloured by cluster label.

    hue:  the labels we want to use; should be a column name of `df`.
    vars: the dimensions that we want to plot; should be a list of column
          names from `df`. It is handed to `sns.pairplot` unchanged.
    """
    sns.pairplot(data=df, hue=hue, palette='Dark2', vars=vars)
    plt.show()
    
def plot_clusters2d(df, hue='ypred', vars=('T0', 'T1')):
    """Scatter-plot two columns of `df`, coloured by cluster label.

    hue:  the labels we want to use; should be a column name of `df`.
    vars: a pair of column names from `df`; vars[0] is plotted on the
          x-axis and vars[1] on the y-axis. The default is an immutable
          tuple (not a list) so it cannot be mutated between calls; lists
          passed by callers work exactly as before.
    """
    # NOTE(review): later seaborn releases renamed `size=` to `height=`;
    # confirm against the installed seaborn version before changing it.
    grid = sns.FacetGrid(data=df, size=5, hue=hue, palette='Dark2',
                         subplot_kws=dict(aspect='equal'))
    grid.map(plt.scatter, vars[0], vars[1], edgecolor="w")
    plt.show()

# Section 1: Visualization

We'll use the Iris data since we're already familiar with that set from a couple of assignments ago. Let's load it into `df_iris` the same way as in PS2 and then plot it with the real labels (`real_labels`) from the dataset:

In [None]:
# Fetch the iris dataset that ships with scikit-learn
iris = load_iris()

# Work with the measurements as a NumPy array: one row per sample,
# one column per feature
X_iris = np.array(iris.data)
iris_samples, iris_features = X_iris.shape

# Shuffle the rows once, up front, with a fixed seed. The same permutation
# is applied to the labels below so that rows and labels stay aligned.
order = check_random_state(201610270).permutation(iris_samples)
X_iris = X_iris[order]

# One DataFrame column per feature, named after that feature
df_iris = pd.DataFrame(X_iris, columns=iris['feature_names'])

# Attach the actual labels from the dataset (in the shuffled order)
# for comparison
df_iris['real_labels'] = np.array(iris['target'])[order]

plot_clusters(
    df_iris,
    hue='real_labels',
    vars=['sepal length (cm)', 'sepal width (cm)',
          'petal length (cm)', 'petal width (cm)'],
)


## Problem 1 (PCA)

First we're going to mess around with some visualizations. In the first problem, we're going to build a PCA visualization by coding it, and use it to take a look at the iris dataset.

Fill in the following function's missing pieces to get it to produce PCA components as discussed in class (the test will see if the function returns reasonably-close values and will also produce a visualization using `plot_clusters`):

In [None]:
def pca_components(X):
    """Return the PCA components of X (exercise template).

    X is expected to have shape (n_examples, n_features). The intended
    flow is: subtract the per-feature mean, take a thin SVD of the centred
    data, and build the component matrix C of shape
    (n_examples, n_features) from the SVD factors.

    As given, the function raises NotImplementedError at the first
    "YOUR CODE HERE" placeholder; both placeholders must be replaced
    before it returns anything.
    """
    
    # Find the mean of all of the examples in X, if X has 
    # shape (n_examples, n_features). The shape of mean_X
    # should either be (n_features,) or (1, n_features).
    # Remember: there is an easy way to do this!
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Take the SVD of the *centred* data. The shapes of the variables are
    # (assuming n_examples >= n_features):
    # U    : (n_examples, n_features)
    # Sigma: (n_features,)
    # V    : (n_features, n_features)
    #
    # In this version of the SVD, the following matrix equation is true:
    # X - mean_X = U . diag(Sigma) . V      (matrix products)
    # (within a certain amount of tolerance)
    U, Sigma, V = np.linalg.svd(X-mean_X, full_matrices=False)

    # Construct a variable C of shape (n_examples, n_features) that 
    # contains the components of the PCA:
    #
    # Sigma is a diagonal matrix, so it only needs to be represented
    # by a vector. To make it a diagonal matrix, use np.diag(Sigma).
    # (But technically you don't need to do this.)
    # YOUR CODE HERE
    raise NotImplementedError()
    
    return C

In [None]:
C = pca_components(X_iris)

df_iris['C0'] = C[:,0]
df_iris['C1'] = C[:,1]
plot_clusters2d(df_iris, hue='real_labels', vars=['C0','C1'])

# Compare with what sklearn.decomposition.PCA gives us.
# Each principal component is only defined up to its sign, and sklearn
# applies its own sign convention after the SVD, so a correct answer may
# differ from the reference by a factor of -1 in some columns. Flip every
# reference column to point the same way as ours before comparing.
C_ref = PCA().fit_transform(X_iris)
signs = np.sign(np.sum(C * C_ref, axis=0))
assert np.allclose(C, C_ref * signs)

PCA gives us a fairly nice-looking visualization that separates the real labels of the dataset pretty well. Recall also that we didn't feed the real labels into the visualization -- the labels are already nicely separated within the data. 

## Problem 2 (MDS)

### Part 1:
Now let's try with MDS, using mostly default parameters and an arbitrary random seed. We pass MDS a precomputed distance matrix so that we can swap the metric later: let's start out with Euclidean distance and then change the metric to try for something better. The `n_init` parameter tells us how many attempts at the best embedding we'll try:

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
# MDS is handed a precomputed matrix of pairwise Euclidean distances and
# makes n_init=50 attempts at the embedding
mds = MDS(dissimilarity='precomputed', n_init=50, random_state=201610270)
M = mds.fit_transform(euclidean_distances(X_iris))
df_iris['M0'], df_iris['M1'] = M[:, 0], M[:, 1]
plot_clusters2d(df_iris, hue='real_labels', vars=['M0', 'M1'])

This doesn't do any better than PCA, and in fact produces a nearly identical embedding (why? not graded, just think about it).

Let's try with other metrics, to inject some nonlinearity into the mix. Try some of them out:

In [None]:
def other_metrics(X, metric=None):
    """Return the pairwise dissimilarity matrix between the rows of X.

    metric selects the dissimilarity and must be one of:
      'euclidean' (or 'euclidean_distances'), 'cosine', 'manhattan',
      'laplacian' (1 - Laplacian kernel) or 'gaussian' (1 - RBF kernel).

    Raises ValueError if metric is None or is not one of the names above.
    An unrecognised name used to fall off the end and return None, which
    only surfaced later as a confusing error inside MDS.
    """
    if metric is None:
        raise ValueError('You need to pick something')

    # 'euclidean' is accepted as an alias so the name follows the same
    # pattern as the other options; the original spelling still works.
    if metric in ('euclidean', 'euclidean_distances'):
        return euclidean_distances(X)
    if metric == 'cosine':
        return cosine_distances(X)
    if metric == 'manhattan':
        return manhattan_distances(X)
    # The kernels measure similarity, so subtract them from 1 to obtain
    # a dissimilarity.
    if metric == 'laplacian':
        return 1 - laplacian_kernel(X)
    if metric == 'gaussian':
        return 1 - rbf_kernel(X)

    raise ValueError('Unknown metric: {!r}'.format(metric))

# Set `metric` to be one of 'cosine', 'manhattan', 'laplacian', or 'gaussian':
# (for example: metric = 'cosine'). The value is passed to other_metrics()
# by the MDS cell that follows.
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Reuse the `mds` object configured above; only the precomputed
# dissimilarity matrix changes
M = mds.fit_transform(other_metrics(X_iris, metric=metric))
df_iris['M0'], df_iris['M1'] = M[:, 0], M[:, 1]
plot_clusters2d(df_iris, hue='real_labels', vars=['M0', 'M1'])

### Part 2:
Which is the best for visualizing the structure of the data? Expand on your opinion.

YOUR ANSWER HERE

## Problem 3 (t-SNE)

### Part 1:
Now let's try with t-SNE, using the default parameters, and an arbitrary random seed: 

In [None]:
# Fixed seed and two output dimensions; every other TSNE parameter is
# left at its default
tsne_params = {'random_state': 201610278, 'n_components': 2}
tsne = TSNE(**tsne_params)
T = tsne.fit_transform(X_iris)
df_iris['T0'], df_iris['T1'] = T[:, 0], T[:, 1]
# With `vars` omitted, plot_clusters2d plots 'T0' against 'T1'
plot_clusters2d(df_iris, hue='real_labels')

This...well, this sucks. I thought that t-SNE was supposed to produce these awesome embeddings! Try to play with the parameters to get something better (this is one of the problems with t-SNE that balances its advantages).

You're welcome to do this by hand or automate the process. Just make sure that you change `tsne_params` and that it is still a `dict` object, using [the descriptions of the parameters of `TSNE`](http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html). You will probably want to set the `random_state` parameter so that we see what you see.

You can change `tsne_params` in one of a couple ways:

    tsne_params['parameter1'] = new_value1
    tsne_params['parameter2'] = new_value2
    tsne_params.update(parameter1=new_value1, parameter2=new_value2)

The latter is useful for updating several at a time.

In [None]:
# You can also change random_state; the above is just an initial value.
# The seed will also affect the outcome of the process.
#
# Update `tsne_params` here (it must remain a dict), for example with
# tsne_params.update(parameter1=new_value1, parameter2=new_value2).

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Rebuild the estimator so that the updated `tsne_params` take effect
tsne = TSNE(**tsne_params)
T = tsne.fit_transform(X_iris)
df_iris['T0'], df_iris['T1'] = T[:, 0], T[:, 1]
plot_clusters2d(df_iris, hue='real_labels', vars=['T0', 'T1'])

### Part 2:
What parameters were critical in generating a good visualization? Why do these parameters work compared to the ones before?

YOUR ANSWER HERE

## Problem 4 (more dimensions, more clusters)

Now we're going to generate some random blobs to use in the clustering part of the assignment. We'll generate the blobs, put them in a `DataFrame`, and plot them:

In [None]:
n_features = 5
n_clusters = 8

# 1000 random points drawn around n_clusters centers in n_features dimensions
X_blobs, y_blobs = make_blobs(
    n_samples=1000,
    n_features=n_features,
    centers=n_clusters,
    random_state=201610276,
)

# Columns are named B0, B1, ... (one per feature); the generating blob of
# each point is kept in 'blob_labels'
df_blobs = pd.DataFrame(
    X_blobs,
    columns=['B{:d}'.format(i) for i in range(n_features)],
)
df_blobs['blob_labels'] = y_blobs
plot_clusters(df_blobs, hue='blob_labels',
              vars=['B0', 'B1', 'B2', 'B3', 'B4'])

Let's go ahead and try PCA. PCA isn't going to help us much (at least for visualization) because the data is just too intrinsically high-dimensional, and it's also too "spherical" of a dataset -- that is, there are no "dominant" directions like in the Iris dataset:

In [None]:
# Keep only the first two principal components for plotting
U = PCA(n_components=2).fit_transform(X_blobs)
df_blobs['U0'], df_blobs['U1'] = U[:, 0], U[:, 1]
plot_clusters2d(df_blobs, hue='blob_labels', vars=['U0', 'U1'])

PCA many times fails as a visualization tool simply because it's only a linear projection. If there are more than two intrinsic dimensions in the data, then PCA won't display them in the most significant components.

### Part 1:
Let's try with MDS first. Again, play with the value of `metric`, to see if it gives you a visualization that you like (each run will take a bit of time, since we have more points than the last set of problems):

In [None]:
# Set `metric` here; it is passed to other_metrics() by the MDS cell
# that follows.
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Fresh MDS object with fewer attempts (n_init=4) than the iris run
mds = MDS(dissimilarity='precomputed', n_init=4, random_state=201610272)
M = mds.fit_transform(other_metrics(X_blobs, metric=metric))
df_blobs['M0'], df_blobs['M1'] = M[:, 0], M[:, 1]
plot_clusters2d(df_blobs, hue='blob_labels', vars=['M0', 'M1'])

### Part 2:
Let's now try t-SNE, with the parameters you used above. If this gives you a good visualization, just change the code in the following cell to `pass`. If not, update the parameters like above until you get a good visualization. 

Once you get a good visualization of this data, we'll use it for the problems below.

In [None]:
# Either update `tsne_params` again here, or replace the line below with
# `pass` if the earlier parameters already give a good visualization.
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# t-SNE embedding of the blobs using the current `tsne_params`
T = TSNE(**tsne_params).fit_transform(X_blobs)
df_blobs['T0'], df_blobs['T1'] = T[:, 0], T[:, 1]
plot_clusters2d(df_blobs, hue='blob_labels', vars=['T0', 'T1'])

# Section 2: $k$-Means and EM

## Problem 5 ($k$-Means)

We would expect $k$-means to do a good job at clustering our blobs dataset. All the blobs are well-separated from one another, and they're roughly circular. We'll use both our MDS and our t-SNE features to visualize the data:

In [None]:
# Cluster the raw blob features with k-means, one cluster per blob
df_blobs['kmeans_labels'] = KMeans(
    n_clusters=n_clusters, random_state=201610273
).fit_predict(X_blobs)
# Show the same labels in both 2-D embeddings (MDS first, then t-SNE)
plot_clusters2d(df_blobs, hue='kmeans_labels', vars=['M0', 'M1'])
plot_clusters2d(df_blobs, hue='kmeans_labels', vars=['T0', 'T1'])

This is pretty much like we expected. Now let's mess with it a bit. 

We're going to generate a random, symmetric, positive-definite matrix $P$. Most of the time, $P$ will be a nice matrix that stretches inputs by some positive amount in random, orthogonal directions (that is, without any skew). In three dimensions the effect is similar to taking a perfect ball of putty and either squashing it or stretching it.

We'll still use our visualization embeddings from before, since we have only affected the data with a linear transformation:

In [None]:
# Random symmetric positive-definite matrix, used to stretch the blobs
P = make_spd_matrix(n_dim=n_features, random_state=201610275)
X_blobs_stretch = np.dot(X_blobs, P)

# Re-run k-means on the stretched data; the earlier embedding columns
# (M0/M1 and T0/T1) are reused for plotting
df_blobs['kmeans_labels'] = KMeans(
    n_clusters=n_clusters, random_state=201610273
).fit_predict(X_blobs_stretch)
plot_clusters2d(df_blobs, hue='kmeans_labels', vars=['M0', 'M1'])
plot_clusters2d(df_blobs, hue='kmeans_labels', vars=['T0', 'T1'])

What has happened to the clusters? Given what you know about $k$-means, why do we see this effect?

YOUR ANSWER HERE

## Problem 6 (EM)

Now let's try with EM. This is implemented in the GMM (Gaussian mixture model) class. Let's use it on the same stretched input and see what happens.

(Unfortunately GMM doesn't provide a `fit_predict` shortcut, so we have to do this the long way.)

In [None]:
# Fit a Gaussian mixture to the stretched data, then label the same points.
# NOTE(review): `GMM` was deprecated in favour of `GaussianMixture` in later
# scikit-learn releases, and the two have different default covariance
# types -- confirm against the installed version before switching.
gmm = GMM(n_components=n_clusters, random_state=201610274)
gmm.fit(X_blobs_stretch)
df_blobs['gmm_labels'] = gmm.predict(X_blobs_stretch)
plot_clusters2d(df_blobs, hue='gmm_labels', vars=['M0', 'M1'])
plot_clusters2d(df_blobs, hue='gmm_labels', vars=['T0', 'T1'])

1. How is this different from what we saw in $k$-means? 
2. Why does EM perform better on the squashed data?
3. How would you change this example to break EM (if you fiddled with the data above, what did you change)?

YOUR ANSWER HERE

# Section 3: Spectral Clustering

In this section, let's make the problem a little harder. We'll generate the blobs as before, except we're going to force the centers to be just a bit closer together, so it's harder to tease them apart, but there are still obvious blobs that are separate:

In [None]:
# Same recipe as the earlier blobs, but the centers are drawn from the
# box (-5, 5) so the blobs end up closer together
X_blobs_close, y_blobs_close = make_blobs(
    n_samples=1000,
    n_features=n_features,
    centers=n_clusters,
    center_box=(-5, 5),
    random_state=201610278,
)
df_blobs_close = pd.DataFrame(
    X_blobs_close,
    columns=['B{:d}'.format(i) for i in range(n_features)],
)
df_blobs_close['blob_labels'] = y_blobs_close
plot_clusters(df_blobs_close, hue='blob_labels',
              vars=['B0', 'B1', 'B2', 'B3', 'B4'])

## Problem 7 (Spectral Embedding as a Visualization Tool)

In order to use spectral clustering, we need a graph. The `SpectralEmbedding` tool builds that graph, computes the Laplacian, and then generates an embedding that we can use for visualization. We'll use $k$-nearest-neighbors, and set up the graph to use the 300 nearest neighbors. We'll also compare the embedding to MDS (using the metric you selected) and t-SNE (using the parameters you settled on):

In [None]:
# 1) Spectral embedding built with n_neighbors=300
spe = SpectralEmbedding(n_components=2, n_neighbors=300, random_state=201610277)
S = spe.fit_transform(X_blobs_close)
df_blobs_close['S0'], df_blobs_close['S1'] = S[:, 0], S[:, 1]
plot_clusters2d(df_blobs_close, hue='blob_labels', vars=['S0', 'S1'])

# 2) MDS on the dissimilarity selected earlier through `metric`
mds = MDS(dissimilarity='precomputed', n_init=4, random_state=201610272)
M = mds.fit_transform(other_metrics(X_blobs_close, metric=metric))
df_blobs_close['M0'], df_blobs_close['M1'] = M[:, 0], M[:, 1]
plot_clusters2d(df_blobs_close, hue='blob_labels', vars=['M0', 'M1'])

# 3) t-SNE with the current `tsne_params`
T = TSNE(**tsne_params).fit_transform(X_blobs_close)
df_blobs_close['T0'], df_blobs_close['T1'] = T[:, 0], T[:, 1]
plot_clusters2d(df_blobs_close, hue='blob_labels', vars=['T0', 'T1'])

If you found good settings for MDS or t-SNE, you probably notice that the spectral embedding isn't quite as nice. Play with the value of `n_neighbors`, between 125 and 1000 to see if you can improve it:

In [None]:
# -1 is only a placeholder: assign your chosen value to `n_neighbors` below
# (the text suggests trying values between 125 and 1000).
n_neighbors = -1
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Recompute the spectral embedding with the chosen `n_neighbors`
spe = SpectralEmbedding(n_components=2, n_neighbors=n_neighbors, random_state=201610277)
S = spe.fit_transform(X_blobs_close)
df_blobs_close['S0'], df_blobs_close['S1'] = S[:, 0], S[:, 1]
plot_clusters2d(df_blobs_close, hue='blob_labels', vars=['S0', 'S1'])

## Problem 8 (Spectral Clustering)

Now we want to see how well things work if we try to use spectral embeddings to cluster. We don't need to limit ourselves to two components now, since we're clustering. Let's try with thirty to start, and we'll just use $k$-means to cluster the data:

In [None]:
# Embed into 30 spectral components, then cluster the embedded points
spe_clust = SpectralEmbedding(random_state=201610279, n_components=30, n_neighbors=300)
SE = spe_clust.fit_transform(X_blobs_close)
# Ask for `n_clusters` clusters explicitly, as the other KMeans calls in
# this notebook do, instead of relying on KMeans' default cluster count
# happening to match the number of blobs.
spe_km = KMeans(random_state=201610278, n_clusters=n_clusters)
df_blobs_close['spe_labels'] = spe_km.fit_predict(SE)
# Show the resulting labels in each of the three 2-D embeddings
plot_clusters2d(df_blobs_close, hue='spe_labels', vars=['S0','S1'])
plot_clusters2d(df_blobs_close, hue='spe_labels', vars=['M0','M1'])
plot_clusters2d(df_blobs_close, hue='spe_labels', vars=['T0','T1'])

This is ok, but not great. Play with the value of `n_components` to see if you can improve the clustering:

In [None]:
# -1 is only a placeholder: assign your chosen number of spectral
# components to `n_components` below.
n_components = -1
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Embed with the chosen number of spectral components, then cluster
spe_clust = SpectralEmbedding(random_state=201610279, n_components=n_components, n_neighbors=300)
SE = spe_clust.fit_transform(X_blobs_close)
# Ask for `n_clusters` clusters explicitly, as the other KMeans calls in
# this notebook do, instead of relying on KMeans' default cluster count
# happening to match the number of blobs.
spe_km = KMeans(random_state=201610278, n_clusters=n_clusters)
df_blobs_close['spe_labels'] = spe_km.fit_predict(SE)
# Show the resulting labels in each of the three 2-D embeddings
plot_clusters2d(df_blobs_close, hue='spe_labels', vars=['S0','S1'])
plot_clusters2d(df_blobs_close, hue='spe_labels', vars=['M0','M1'])
plot_clusters2d(df_blobs_close, hue='spe_labels', vars=['T0','T1'])