In [1]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
import tensorflow.compat.v1 as tf 
tf.disable_v2_behavior()
import numpy as np
import matplotlib.pyplot as plt
import helper as hlp

Instructions for updating:
non-resource variables are not supported in the long term


In [3]:
# Load the dataset from disk. Rows are treated as points and columns as
# dimensions (see the unpacking below).
# To run on the 2-D dataset instead, load 'data2D.npy' here.
data = np.load('data100D.npy')
num_pts, dim = data.shape

In [4]:
# Number of cluster centres: used as the first dimension of MU and as the
# range of cluster ids in the per-cluster summary cells further down.
K = 15

In [5]:
# Toggle for holding out a validation set.
is_valid = True

# When enabled, one third of the points (chosen by a seeded shuffle) go to
# `val_data` and the remaining two thirds replace `data`.
# NOTE(review): `data` is overwritten here, so re-running this cell without
# reloading the dataset splits the already-reduced training set again.
if is_valid:
  valid_batch = int(num_pts / 3.0)
  np.random.seed(45689)  # fixed seed -> same split on every fresh run
  rnd_idx = np.arange(num_pts)
  np.random.shuffle(rnd_idx)
  shuffled_data = data[rnd_idx]
  val_data, data = shuffled_data[:valid_batch], shuffled_data[valid_batch:]

In [6]:
# Training points as a float64 tensor with a leading singleton axis: (1, N, D).
X = tf.expand_dims(tf.convert_to_tensor(data, dtype=tf.float64), 0)

# Trainable centres, drawn from a standard normal, with a singleton middle
# axis: (K, 1, D). After this line `MU` names the expanded tensor, not the
# tf.Variable itself; gradients still flow back to the variable.
MU = tf.expand_dims(tf.Variable(tf.random_normal([K, dim], dtype=X.dtype)), 1)

In [7]:
# Squared Euclidean distances between every point and every cluster centre.
def distanceFunc(X, MU):
    """Return the N x K matrix of squared distances between points and centres.

    The operands are subtracted directly, so they must broadcast to a
    rank-3 result of shape (K, N, D). In this notebook X is passed either as
    a (1, N, D) tensor or as a plain (N, D) array, and MU as (K, 1, D).

    Inputs
        X:  the observations (N points, D dimensions)
        MU: the cluster centres (K centres, D dimensions)
    Outputs
        pair_dist: squared pairwise distance matrix, shape (N, K)
    """
    squared_diff = tf.square(X - MU)
    # Summing over the feature axis (axis 2) leaves one row per centre.
    dist_per_centre = tf.reduce_sum(squared_diff, axis=2)
    # Flip to one row per point, one column per centre.
    return tf.transpose(dist_per_centre)

In [8]:
def loss_function(X, MU):
    """K-means loss: total squared distance from each point to its nearest centre.

    X and MU are forwarded unchanged to distanceFunc; the result is a scalar
    (summed over all points, not averaged).
    """
    # Row-wise minimum picks, for every point, the distance to its closest centre.
    nearest_sq_dist = tf.reduce_min(distanceFunc(X, MU), axis=1)
    return tf.reduce_sum(nearest_sq_dist)

In [9]:
def cluster_assignments(X, MU):
    """Return, for every point, the index of its nearest cluster centre.

    X and MU are forwarded unchanged to distanceFunc; the argmin is taken
    along the centre axis (axis 1) of the resulting distance matrix.
    """
    return tf.argmin(distanceFunc(X, MU), axis=1)

In [10]:
# Graph nodes fetched by the training cell: the point-to-centre distance
# matrix and the scalar K-means loss on the training data.
dist = distanceFunc(X, MU)
loss = loss_function(X, MU)

# `optimizer` is the Adam training op (the result of minimize), not the
# optimizer object; running it applies one update step to the centres.
adam = tf.train.AdamOptimizer(learning_rate=0.1, beta1=0.9, beta2=0.99, epsilon=1e-5)
optimizer = adam.minimize(loss)

In [None]:
# Number of full-batch Adam steps to run.
NUM_EPOCHS = 150

# Build the validation ops ONCE, before the loop. The centres are supplied
# through a placeholder so each epoch can evaluate the exact `new_MU` array
# fetched from the training step, without adding new nodes to the graph on
# every iteration or opening a second session.
if is_valid:
  val_X = tf.convert_to_tensor(val_data, dtype=MU.dtype)
  mu_feed = tf.placeholder(MU.dtype, shape=MU.shape)
  v_loss = loss_function(val_X, mu_feed)
  v_dist = distanceFunc(val_X, mu_feed)

with tf.Session() as training_loop:
  tf.initializers.global_variables().run()

  training_loss = []    # average training loss per epoch
  validation_loss = []  # average validation loss per epoch (empty if is_valid is False)
  for epoch in range(NUM_EPOCHS):
    # One optimisation step; also fetch the centres, the total training loss
    # and the training distance matrix (used by the summary cells below).
    new_MU, train_loss, new_dist, _ = training_loop.run([MU, loss, dist, optimizer])
    training_loss.append(train_loss / len(data))

    if is_valid:
      # Evaluate the held-out points against the centres fetched above.
      valid_loss, valid_dist = training_loop.run(
          [v_loss, v_dist], feed_dict={mu_feed: new_MU})
      validation_loss.append(valid_loss / len(val_data))

    print("Epoch: ", epoch+1)
    print("Total training loss: ", train_loss)
    print("Average training loss:", train_loss / len(data))
    if is_valid:
      # Only report validation when it was computed; printing it
      # unconditionally raised NameError when is_valid was False.
      print("Average validation loss:", valid_loss / len(val_data))

Epoch:  1
Total training loss:  1109862.926897976
Average training loss: 166.47111547892246
Average validation loss: 149.48062177023618
Epoch:  2
Total training loss:  1002838.7003079114
Average training loss: 150.4182841319801
Average validation loss: 134.5284908645837
Epoch:  3
Total training loss:  902771.3925369619
Average training loss: 135.4089384336226
Average validation loss: 120.88537266365053
Epoch:  4
Total training loss:  811387.3407120749
Average training loss: 121.70201600601094
Average validation loss: 108.73580472005726
Epoch:  5
Total training loss:  730075.2186835317
Average training loss: 109.50580751215415
Average validation loss: 98.03405356031084
Epoch:  6
Total training loss:  658405.7272627228
Average training loss: 98.75592129334375
Average validation loss: 88.64640733970566
Epoch:  7
Total training loss:  595499.9168534249
Average training loss: 89.32052150193863
Average validation loss: 80.3812060369705
Epoch:  8
Total training loss:  540113.7890096959
Averag

In [None]:
# Training-loss curve: average loss per point against the epoch index.
fig, ax = plt.subplots()
ax.plot(training_loss, label='Training Loss')

ax.set_xlabel('Number of Epochs')
ax.set_ylabel('Average Loss')
ax.set_xlim(0, len(training_loss))
ax.set_ylim(0, max(training_loss) + 1)  # one unit of headroom above the peak

plot_title = 'Training Data Loss of K-means Clustering w/ ' + str(K) + ' Cluster Center(s) for dim = ' + str(dim)
ax.set_title(plot_title)
ax.legend()
plt.show()

In [None]:
# Hard cluster assignment of every training point: the index of the nearest
# centre according to `new_dist`, the distance matrix fetched in the last
# training epoch.
pred = np.argmin(new_dist, axis=1)

# Training points with their cluster id appended as one extra, LAST column.
combined_data = np.concatenate((data, pred.reshape((len(pred), 1))), axis=1)

# For each of the K clusters: the share of training points assigned to it
# (a fraction in [0, 1], despite the variable name) and the rows belonging
# to it.
cluster_distrib_percentage = []
final_data = []
for center in range(K):
  distrib_percentage = (pred == center).sum() / data.shape[0]
  cluster_distrib_percentage.append(distrib_percentage)
  # The label lives in the last column. The previous hard-coded column 2 was
  # only correct for 2-D data and compared a feature value for any other dim.
  d = combined_data[combined_data[:, -1] == center]
  final_data.append(d)

In [None]:
# Report the share of training points per cluster, numbering clusters from 1.
for cluster_no in range(1, K + 1):
  print("Percentage of data in Cluster {}: ".format(cluster_no), cluster_distrib_percentage[cluster_no - 1])

In [None]:
# Training and validation loss curves (average loss per point) on one plot.

plt.plot(training_loss, label='Training Loss')
plt.plot(validation_loss, label='Validation Loss')
plt.xlabel('Number of Epochs')
plt.xlim(0, len(training_loss))
plt.ylabel('Average Loss')
# Size the y-axis from BOTH series so a validation curve that rises above the
# training curve is not clipped; an empty validation_loss is handled too.
plt.ylim(0, max(training_loss + validation_loss) + 1)
plt.title('Training and Validation Loss of K-means Clustering w/ '+str(K)+' Cluster Center(s) for dim = ' + str(dim))
plt.legend()
plt.show()

In [None]:
# Final validation error of the model: total K-means loss of the held-out
# points against `new_MU`, the centres fetched in the last training epoch.

v_loss = loss_function(val_data, new_MU)
with tf.Session() as get_loss:
  # Both inputs are NumPy arrays, so `v_loss` depends only on constants.
  # No variable initialisation is needed (the old initializer call merely
  # re-randomised the centre variable in this throwaway session).
  valid_loss = get_loss.run(v_loss)

In [None]:
valid_loss # Total validation loss: summed squared distance of each validation point to its nearest centre

In [None]:
valid_loss/len(val_data) # Average validation loss: the total divided by the number of validation points

In [None]:
# Hard cluster assignment of every validation point: the index of the nearest
# centre according to `valid_dist`, the validation distance matrix produced
# in the last training epoch.
v_pred = np.argmin(valid_dist, axis=1)

# Validation points with their cluster id appended as one extra, LAST column.
v_combined_data = np.concatenate((val_data, v_pred.reshape((len(v_pred), 1))), axis=1)

# For each of the K clusters: the share of validation points assigned to it
# (a fraction in [0, 1], despite the variable name) and the rows belonging
# to it.
v_cluster_distrib_percentage = []
v_final_data = []
for center in range(K):
  v_distrib_percentage = (v_pred == center).sum() / val_data.shape[0]
  v_cluster_distrib_percentage.append(v_distrib_percentage)
  # The label lives in the last column. The previous hard-coded column 2 was
  # only correct for 2-D data and compared a feature value for any other dim.
  v_d = v_combined_data[v_combined_data[:, -1] == center]
  v_final_data.append(v_d)

In [None]:
# Report the share of validation points per cluster, numbering clusters from 1.
for cluster_no in range(1, K + 1):
  print("Percentage of vdata in Cluster {}: ".format(cluster_no), v_cluster_distrib_percentage[cluster_no - 1])