In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from pca import pca
from gmm import *
import copy

In [None]:
# Read the synthetic dataset from the MATLAB data file.
# loadmat returns a dict keyed by the names stored in the .mat file;
# the array used by the rest of the notebook lives under 'data'.

variables = loadmat('synthData.mat')

data = variables['data']

In [None]:
# Apply PCA to the loaded data (pca is provided by the local `pca` module).

new_data, variances, eigenvectors = pca(data)

# Figure 1: stem plot of the flattened `variances` array returned by pca,
# one stem per dimension.
variance_fig, variance_ax = plt.subplots()
variance_ax.stem(variances.ravel())
variance_ax.set_xlabel('Dimension')
variance_ax.set_ylabel('Captured Variance')

# Figure 2: scatter of row 0 against row 1 of the transformed data.
scatter_fig, scatter_ax = plt.subplots()
scatter_ax.plot(new_data[0, :], new_data[1, :], 'x')
scatter_ax.set_title('Transformed Data After PCA')

In [None]:
# truncate dimensions to just the first two
small_data = new_data[:2, :]

# split data for validation
# d = number of retained dimensions (rows), n = number of columns in small_data
d, n = small_data.shape

# Each column is assigned to the training set independently with probability
# train_fraction, so the exact split sizes vary with the seed. A dedicated,
# seeded generator keeps the split identical across "Restart & Run All"
# without touching NumPy's global random state used by other cells.
train_fraction = 0.5
split_seed = 0
split_rng = np.random.default_rng(split_seed)

train_inds = split_rng.random(n) < train_fraction

train_data = small_data[:, train_inds]
val_data = small_data[:, ~train_inds]

# Sanity checks: the two subsets partition the columns and neither is empty.
assert train_data.shape[1] + val_data.shape[1] == n
assert train_inds.any() and not train_inds.all(), "train/validation split left one side empty"

In [None]:
# Run k-means on the 2-row data, plotting the clustering at every iteration.

num_clusters = 5
num_iterations = 10

# Random initial cluster centers, one column per cluster.
means = np.random.randn(d, num_clusters)

# NOTE(review): `symbols` is not used anywhere in this cell; it is kept in
# case another cell reads it — confirm and remove if it is dead.
symbols = ['xr', 'og', 'sb', '*m', '+c']


def squared_distances(points, centers):
    """Return squared Euclidean distances between every center and every point.

    Uses the expansion ||x - m||^2 = ||x||^2 + ||m||^2 - 2 * m.x so that the
    whole matrix is computed with one matrix product.

    Args:
        points: array with one point per column, shape (d, n).
        centers: array with one center per column, shape (d, k).

    Returns:
        Array of shape (k, n); entry [i, j] is the squared distance between
        center i and point j.
    """
    return (np.sum(points ** 2, 0, keepdims=True)
            + np.sum(centers ** 2, 0, keepdims=True).T
            - 2 * centers.T.dot(points))


# The loop variable is deliberately not called `iter`: at notebook top level
# that would shadow the builtin iter() for every cell executed afterwards.
for iteration in range(num_iterations):
    # compute memberships: index of the closest mean for each column
    distance = squared_distances(small_data, means)
    memberships = np.argmin(distance, 0)

    plot_k_means(small_data, means, memberships)
    plt.title("K-Means Iteration %d" % iteration)
    plt.show()

    # update means; a cluster with no members keeps its previous mean
    for i in range(num_clusters):
        in_cluster = memberships == i
        if np.any(in_cluster):
            means[:, i] = np.mean(small_data[:, in_cluster], 1)

# Re-assign points after the last mean update so that the final plot (and the
# `memberships` left in the namespace) match the final `means`, exactly as the
# per-iteration plots above pair each set of means with its own assignment.
distance = squared_distances(small_data, means)
memberships = np.argmin(distance, 0)

plot_k_means(small_data, means, memberships)
plt.title("K-Means Iteration %d" % num_iterations)

In [None]:
# Fit Gaussian mixture models of several sizes on the training split and
# score each one on the validation split.

# Candidate numbers of Gaussians.
# NOTE(review): 7 is absent from this list — confirm whether that is intentional.
num_clusters = [1, 2, 3, 4, 5, 6, 8, 9, 10]

# One entry per candidate, in the same order as num_clusters.
val_likelihood = []
all_means = []
all_sigmas = []
all_clust_probs = []

for k, mixture_size in enumerate(num_clusters):
    # Open a fresh figure before each fit; gmm is called with plot='final'
    # (presumably it draws the fitted model into the current figure — verify in gmm).
    plt.figure()
    means, sigmas, clust_probs = gmm(train_data, mixture_size, plot='final')

    # Keep the fitted parameters of this model.
    all_means.append(means)
    all_sigmas.append(sigmas)
    all_clust_probs.append(clust_probs)

    # Score the fitted model on the held-out validation columns.
    val_likelihood.append(gmm_ll(val_data, means, sigmas, clust_probs))

# plot likelihoods: validation score as a function of the number of Gaussians

likelihood_fig, likelihood_ax = plt.subplots()

likelihood_ax.plot(num_clusters, val_likelihood)
likelihood_ax.set_xlabel('Number of Gaussians')
likelihood_ax.set_ylabel('Log Likelihood of Val. Data')