# Soft K-means
## Load data and plot it

In [None]:
import numpy as np
import scipy.io as sio
%matplotlib inline
import matplotlib.pyplot as plt

# Load synthetic data [dataset1.mat, dataset2.mat, dataset3.mat]
data = sio.loadmat('datasets/dataset1.mat')

# The 'x' and 'y' entries hold the two ground-truth clusters. Each one is
# transposed so that columns 0 and 1 can be used as plot coordinates below.
data1_true, data2_true = (np.array(data[key].T) for key in ('x', 'y'))
print("Cluster1: {}, Cluster2: {}".format(data1_true.shape, data2_true.shape))

# Plot the data with true cluster label (cluster 1 in red, cluster 2 in blue)
for points, color in ((data1_true, 'r'), (data2_true, 'b')):
    plt.scatter(points[:, 0], points[:, 1], c=color, marker='.')

## Train

In [None]:
# Merge the data to one matrix
data = np.vstack((data1_true, data2_true))

K = 2
N = data.shape[0]

# Initialize centers with K distinct, randomly selected data points.
# random.sample() only accepts sequences (passing the ndarray itself raises
# TypeError on Python 3), so sample K row indices and index the matrix.
import random
centers = data[random.sample(range(N), K)]

# Initialize responsibilities with zeros (N, K)
resp = np.zeros((N, K))

# Stiffness: resp[n, k] is proportional to exp(-beta * squared distance),
# so a larger beta makes the assignments sharper.
beta = 0.1
max_iter = 2000
for step in range(max_iter):
    # Assign: squared distance from every point to every center -> (N, K)
    x_minus_mu = np.expand_dims(data, 1) - np.expand_dims(centers, 0)
    sq_dist = np.sum(x_minus_mu ** 2, 2)
    # Shift each row by its smallest distance before exponentiating. The
    # common factor cancels in the normalization below, but the shift
    # guarantees at least one term per row equals exp(0) = 1, so a point far
    # from every center cannot produce a 0 / 0 = NaN responsibility.
    sq_dist -= np.min(sq_dist, 1, keepdims=True)
    numerator = np.exp(-beta * sq_dist)
    resp = numerator / np.sum(numerator, 1, keepdims=True)
    # Update: each center becomes the responsibility-weighted mean of the data
    centers = (np.matmul(data.T, resp) / np.sum(resp, 0, keepdims=True)).T

# Hard label for each point: the cluster with the largest responsibility
predicted_cluster = np.argmax(resp, axis=1)

## Compare true clusters and predicted clusters

In [None]:
# Split the merged data by predicted label (0 -> data1, 1 -> data2)
data1, data2 = (data[predicted_cluster == label] for label in (0, 1))
print("Prediction | Cluster1: {}, Cluster2: {}".format(data1.shape, data2.shape))
print("True       | Cluster1: {}, Cluster2: {}".format(data1_true.shape, data2_true.shape))

# Plot the data and compare it with true labeled data
f, axarr = plt.subplots(1, 2, sharey=True, figsize=(14, 5))
pred_ax, true_ax = axarr

# Left panel: the two learned centers (black crosses) and the predicted split
for center in centers[:2]:
    pred_ax.scatter(center[0], center[1], c='k', marker='x', s=70, linewidth=3)
for points, color in ((data1, 'r'), (data2, 'b')):
    pred_ax.scatter(points[:, 0], points[:, 1], c=color, marker='.')
pred_ax.set_title('Prediction')

# Right panel: the ground-truth split
for points, color in ((data1_true, 'r'), (data2_true, 'b')):
    true_ax.scatter(points[:, 0], points[:, 1], c=color, marker='.')
true_ax.set_title('True');

## Evaluate how well soft K-means clustered the data

In [None]:
# Evaluate clustering performance using Normalized Mutual Information
def _entropy(probabilities):
    """Return the Shannon entropy (in nats) of a discrete distribution."""
    # Zero-probability outcomes contribute nothing (p * log p -> 0 as p -> 0);
    # dropping them avoids evaluating log(0).
    nonzero = probabilities[probabilities > 0]
    return float(-np.sum(nonzero * np.log(nonzero)))


def get_NMI(true_cluster, predicted_cluster, K):
    """
    Get Normalized Mutual Information

    NMI = I(true; predicted) / ((H(true) + H(predicted)) / 2). The value is
    invariant to a permutation of the cluster labels.

    Args:
        true_cluster: a vector of shape (N) that has elements assigned true cluster for each data point
        predicted_cluster: a vector of shape (N) that has elements assigned predicted cluster for each data point
        K: the number of clusters; every label must be one of 0, 1, ..., K - 1
    Returns:
        nmi: Normalized mutual information. 1.0 is returned when both
            labelings put every point in a single cluster (both entropies
            are zero, so the ratio would otherwise be 0 / 0).
    Raises:
        ValueError: if the inputs are empty, differ in length, or contain a
            label outside 0 .. K - 1
    """
    true_cluster = np.asarray(true_cluster).ravel()
    predicted_cluster = np.asarray(predicted_cluster).ravel()
    if true_cluster.shape != predicted_cluster.shape:
        raise ValueError("true_cluster and predicted_cluster must have the same length")

    # Define the number of data from the arguments, not from a global
    N = float(true_cluster.shape[0])
    if N == 0:
        raise ValueError("cannot compute NMI of empty labelings")

    # Contingency table: counts[i, j] = number of points whose true label is
    # i and whose predicted label is j.
    labels = np.arange(K)
    true_onehot = (true_cluster[:, None] == labels).astype(float)
    predicted_onehot = (predicted_cluster[:, None] == labels).astype(float)
    counts = np.matmul(true_onehot.T, predicted_onehot)
    if counts.sum() != N:
        raise ValueError("cluster labels must be integers in [0, K)")

    joint = counts / N
    p_true = joint.sum(axis=1)
    p_predicted = joint.sum(axis=0)

    # Compute entropy for ground truth clusters and predicted clusters
    true_entropy = _entropy(p_true)
    predicted_entropy = _entropy(p_predicted)
    denominator = (true_entropy + predicted_entropy) / 2.

    # Both labelings are a single cluster: they agree trivially.
    if denominator == 0:
        return 1.0

    # Compute mutual information between ground truth clusters and predicted
    # clusters; empty cells of the joint table contribute nothing.
    independent = np.outer(p_true, p_predicted)
    occupied = joint > 0
    mi = float(np.sum(joint[occupied] * np.log(joint[occupied] / independent[occupied])))

    nmi = mi / denominator

    return nmi

# Ground-truth labels in the same order the clusters were stacked into `data`:
# 0 for every row of data1_true, then 1 for every row of data2_true.
true_cluster = np.concatenate((np.zeros(data1_true.shape[0]), np.ones(data2_true.shape[0])))
nmi = get_NMI(true_cluster, predicted_cluster, K)
print("Normalized Mutual Information: {}".format(nmi))