# In [None]:
import numpy as np
from utils import plot_cluster_centers, load_images_labels

def compute_dist(X, centers):
    """
    Compute the Euclidean distance between each point in X and each center in 'centers'.

    Inputs are promoted to a floating-point dtype (at least float32) before the
    subtraction, so integer data such as raw uint8 pixel values cannot wrap
    around and produce wrong distances. float32 and float64 inputs keep the
    dtype they would have had under normal NumPy promotion.

    X: N x D
    centers: K x D
    Output: N x K matrix of distances
    Raises: ValueError if X or centers is not 2-dimensional
    """
    X = np.asarray(X)
    centers = np.asarray(centers)
    if X.ndim != 2 or centers.ndim != 2:
        raise ValueError("X and centers must both be 2-D arrays (N x D and K x D)")

    # Unsigned/integer arithmetic would overflow on (a - b) and on the square.
    work_dtype = np.result_type(X.dtype, centers.dtype, np.float32)
    points = X.astype(work_dtype, copy=False)
    refs = centers.astype(work_dtype, copy=False)

    # Broadcast to N x K x D, then reduce over the feature axis.
    diff = points[:, np.newaxis, :] - refs[np.newaxis, :, :]
    return np.sqrt(np.sum(diff ** 2, axis=2))

def mykmeans(digits, K, max_iter=100):
    """
    Perform K-means clustering on 'digits' using K clusters (Lloyd's algorithm).

    Centers are initialised as K distinct data points drawn with the global
    np.random state, so call np.random.seed(...) beforehand for reproducible
    results. Iteration stops as soon as the assignments no longer change, or
    after max_iter rounds.

    digits: (N, D) - N data points, each of dimension D = 784
    K: number of clusters (1 <= K <= N)
    max_iter: maximum number of iterations (>= 1)

    Returns:
      - cluster_assignments: (N,) array of cluster indices [0..K-1]
      - centers: (K, D) array of cluster centers

    Raises:
      ValueError if digits is not 2-D, K is outside [1, N], or max_iter < 1
    """
    X = np.asarray(digits)
    if X.ndim != 2:
        raise ValueError("digits must be a 2-D array of shape (N, D)")
    # Means of integer data must not be truncated back to integers.
    if not np.issubdtype(X.dtype, np.floating):
        X = X.astype(np.float64)

    N = X.shape[0]
    if not 1 <= K <= N:
        raise ValueError("K must satisfy 1 <= K <= N")
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Fancy indexing copies, so updating centers never touches the input.
    centers = X[np.random.choice(N, size=K, replace=False)]
    cluster_assignments = np.full(N, -1, dtype=int)

    for _ in range(max_iter):
        # Assignment step: nearest center for every point.
        dist = compute_dist(X, centers)
        new_assignments = np.argmin(dist, axis=1)
        if np.array_equal(new_assignments, cluster_assignments):
            break  # converged: centers already match these assignments
        cluster_assignments = new_assignments

        # Distance of every point to its own center, used to re-seed empty clusters.
        own_dist = dist[np.arange(N), cluster_assignments]

        # Update step: move each center to the mean of its members.
        for k in range(K):
            members = X[cluster_assignments == k]
            if len(members) > 0:
                centers[k] = members.mean(axis=0)
            else:
                # An empty cluster (possible with duplicate points) is re-seeded
                # at the worst-fitted point; mark it used so that two empty
                # clusters never pick the same point.
                farthest = np.argmax(own_dist)
                centers[k] = X[farthest]
                own_dist[farthest] = -1.0

    return cluster_assignments, centers

def mykmedoids(digits, K, max_iter=100):
    """
    Perform K-medoids clustering on 'digits' using K clusters.

    Alternates between assigning every point to its nearest medoid and
    replacing each medoid with the cluster member that minimises the summed
    Euclidean distance to the other members. Medoids are initialised as K
    distinct data points drawn with the global np.random state, so call
    np.random.seed(...) beforehand for reproducible results. Iteration stops
    as soon as the assignments no longer change, or after max_iter rounds.

    digits: (N, D) - N data points, each of dimension D = 784
    K: number of clusters (1 <= K <= N)
    max_iter: maximum number of iterations (>= 1)

    Returns:
      - cluster_assignments: (N,) array of cluster indices [0..K-1]
      - centers: (K, D) array of medoids (a subset of the original points)

    Raises:
      ValueError if digits is not 2-D, K is outside [1, N], or max_iter < 1
    """
    X = np.asarray(digits)
    if X.ndim != 2:
        raise ValueError("digits must be a 2-D array of shape (N, D)")

    N = X.shape[0]
    if not 1 <= K <= N:
        raise ValueError("K must satisfy 1 <= K <= N")
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    medoid_idx = np.random.choice(N, size=K, replace=False)
    cluster_assignments = np.full(N, -1, dtype=int)

    for _ in range(max_iter):
        # Assignment step: nearest medoid for every point.
        dist = compute_dist(X, X[medoid_idx])
        new_assignments = np.argmin(dist, axis=1)
        if np.array_equal(new_assignments, cluster_assignments):
            break  # converged: medoids already match these assignments
        cluster_assignments = new_assignments

        # Update step: pick the member with the smallest total distance.
        for k in range(K):
            member_idx = np.flatnonzero(cluster_assignments == k)
            if member_idx.size == 0:
                # Only possible when duplicate points tie; keep the old medoid.
                continue

            # Pairwise distances via ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a.b, which
            # needs only an n x n matrix instead of an n x n x D intermediate.
            members = X[member_idx].astype(np.float64, copy=False)
            sq_norms = np.sum(members ** 2, axis=1)
            sq_dist = (
                sq_norms[:, np.newaxis]
                + sq_norms[np.newaxis, :]
                - 2.0 * (members @ members.T)
            )
            # Round-off can push tiny distances slightly below zero.
            np.maximum(sq_dist, 0.0, out=sq_dist)
            total_cost = np.sqrt(sq_dist).sum(axis=1)
            medoid_idx[k] = member_idx[np.argmin(total_cost)]

    return cluster_assignments, X[medoid_idx]

def evaluate_clustering(cluster_assignments, labels, K):
    """
    Evaluate the clustering result by assigning each cluster to the most frequent
    true label in that cluster, then compute the overall accuracy.

    Labels may be any values np.unique can count (negative integers and strings
    included). Points whose cluster index is outside [0..K-1] never count as
    correct but still count towards the total.

    cluster_assignments: (N,) array-like of cluster indices
    labels: (N,) array-like of true digit labels
    K: number of clusters
    Returns:
      accuracy (float); 0.0 when there are no points
    Raises:
      ValueError if cluster_assignments and labels differ in shape
    """
    cluster_assignments = np.asarray(cluster_assignments)
    labels = np.asarray(labels)
    if cluster_assignments.shape != labels.shape:
        raise ValueError("cluster_assignments and labels must have the same shape")

    N = labels.shape[0] if labels.ndim else 0
    if N == 0:
        return 0.0  # avoid dividing by zero on empty input

    correct = 0
    for k in range(K):
        cluster_labels = labels[cluster_assignments == k]
        if cluster_labels.size == 0:
            continue
        # Size of the majority label group; unlike np.bincount this accepts
        # negative and non-integer labels.
        _, counts = np.unique(cluster_labels, return_counts=True)
        correct += int(counts.max())

    return correct / N


def main():
    """
    Cluster a subset of the loaded images with K-Medoids and K-Means, plot the
    resulting centers and print each method's accuracy, then repeat the
    clustering and scoring after appending random outlier images.
    """
    # Load the data
    images, labels = load_images_labels("data")

    # Lower sample_count if the run exhausts memory
    sample_count = 1000
    clean_images = images[:sample_count].astype(np.float32)
    clean_labels = labels[:sample_count]

    K = 5  # Number of clusters

    # --- Clean data -------------------------------------------------------
    print("Running K-Medoids on clean data...")
    medoid_assign, medoid_centers = mykmedoids(clean_images, K)

    print("Running K-Means on clean data...")
    mean_assign, mean_centers = mykmeans(clean_images, K)
    print(mean_centers.shape)

    # Show the centers found by each method
    plot_cluster_centers(mean_centers, "K-Means Cluster Centers (Clean Data)", K)
    plot_cluster_centers(medoid_centers, "K-Medoids Cluster Centers (Clean Data)", K)

    # Score both methods against the true labels
    for method, assignments in (("K-Medoids", medoid_assign), ("K-Means", mean_assign)):
        score = evaluate_clustering(assignments, clean_labels, K)
        print(f"{method} accuracy (clean data): {score:.4f}")

    # --- Contaminated data ------------------------------------------------
    # Append random-valued outlier images, labelled -1 so they can be masked out
    print("Adding contaminanted datapoints to the data...")
    num_outliers = 100
    pixel_count = clean_images.shape[1]
    noise = np.random.randint(0, 256, size=(num_outliers, pixel_count)).astype(np.float32)
    noisy_images = np.vstack([clean_images, noise])
    noisy_labels = np.hstack([clean_labels, np.full(num_outliers, -1, dtype=int)])

    print("Running K-Medoids on contaminated data...")
    medoid_assign_noisy, medoid_centers_noisy = mykmedoids(noisy_images, K)
    print("Running K-Means on contaminated data...")
    mean_assign_noisy, mean_centers_noisy = mykmeans(noisy_images, K)

    # Score on the genuine points only; the injected outliers have no true label
    inlier_mask = noisy_labels != -1
    inlier_labels = noisy_labels[inlier_mask]
    for method, assignments in (
        ("K-Medoids", medoid_assign_noisy),
        ("K-Means", mean_assign_noisy),
    ):
        score = evaluate_clustering(assignments[inlier_mask], inlier_labels, K)
        print(f"{method} accuracy (contaminated data): {score:.4f}")

# Run the experiment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()