In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import accuracy_score
import random as rand
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import time
import seaborn as sb

In [2]:
data = pd.read_csv("data.csv")

In [3]:
def dist_euclidean(x1, x2):
    return np.linalg.norm(x1 - x2)

def sim_cosine(x1, x2):
    dot_prod = np.dot(x1, x2)
    norm_x1 = np.linalg.norm(x1)
    norm_x2 = np.linalg.norm(x2)
    sim = dot_prod / (norm_x1 * norm_x2)
    return 1 - sim

def sim_jaccard(x1, x2):
    intersection = np.sum(np.minimum(x1, x2))
    union = np.sum(np.maximum(x1, x2))
    sim = intersection / union
    return 1 - sim

def kmeans_custom(data, k, dist_func, max_iters=100):
    n_samples, n_features = data.shape
    centr = data[np.random.choice(n_samples, k, replace=False)]
    labels = np.zeros(n_samples)

    for _ in range(max_iters):
        for i in range(n_samples):
            dists = [dist_func(data[i], c) for c in centr]
            labels[i] = np.argmin(dists)

        for j in range(k):
            mask = labels == j
            centr[j] = np.mean(data[mask], axis=0)

    sse = sum([dist_func(data[i], centr[int(labels[i])])**2 for i in range(n_samples)])

    return labels, centr, sse

data = pd.read_csv("data.csv", header=None).to_numpy()
labels = pd.read_csv("label.csv", header=None).to_numpy()
# Set the number of clusters (k)
K = len(np.unique(labels))  # Number of categories in label

dist_funcs = [dist_euclidean, sim_cosine, sim_jaccard]
dist_func_names = ['Euclidean', 'Cosine', 'Jaccard']

results = {}

for dist_func, dist_func_name in zip(dist_funcs, dist_func_names):
    cluster_labels, cluster_centroids, sse = kmeans_custom(data, K, dist_func)
    accuracy = np.sum(cluster_labels == labels.squeeze()) / len(labels)
    results[dist_func_name] = {'SSE': sse, 'Accuracy': accuracy}

for method, result in results.items():
    print(f"{method} - SSE: {result['SSE']:.2f}, Accuracy: {result['Accuracy'] * 100:.2f}%")


Euclidean - SSE: 25405440131.00, Accuracy: 3.30%
Cosine - SSE: 686.73, Accuracy: 6.23%
Jaccard - SSE: 3652.66, Accuracy: 18.73%


In [6]:
data = pd.read_csv("data.csv", header=None).to_numpy()
labels = pd.read_csv("label.csv", header=None).to_numpy().flatten()

def majority_vote_kmeans(data, k, dist_metric, max_iters=100):
    num_samples, num_features = data.shape
    centr = data[np.random.choice(num_samples, k, replace=False)]
    cluster_assignments = np.zeros(num_samples)

    for _ in range(max_iters):
        for i in range(num_samples):
            dists = [dist_metric(data[i], c) for c in centr]
            cluster_assignments[i] = np.argmin(dists)

        for j in range(k):
            cluster_mask = cluster_assignments == j
            centr[j] = np.mean(data[cluster_mask], axis=0)

    cluster_labels = np.zeros(k)
    for j in range(k):
        cluster_mask = cluster_assignments == j
        majority_label = np.argmax(np.bincount(labels[cluster_mask].astype(int)))
        cluster_labels[j] = majority_label

    predicted_labels = np.array([cluster_labels[int(label)] for label in cluster_assignments])

    accuracy = np.sum(predicted_labels == labels.squeeze()) / len(labels)

    return accuracy


dist_metrics = [dist_euclidean, sim_cosine, sim_jaccard]
dist_metric_names = ['Euclidean', 'Cosine', 'Jaccard']


accuracy_results = {}

for dist_metric, dist_metric_name in zip(dist_metrics, dist_metric_names):
    accuracy = majority_vote_kmeans(data, K, dist_metric)
    accuracy_results[dist_metric_name] = accuracy

for method, accuracy in accuracy_results.items():
    print(f"{method} K-means with Majority Vote - Accuracy: {accuracy * 100:.2f}%")


Euclidean K-means with Majority Vote - Accuracy: 55.65%
Cosine K-means with Majority Vote - Accuracy: 61.43%
Jaccard K-means with Majority Vote - Accuracy: 62.37%


In [7]:
def kmeans_with_stop_criteria(data, k, dist_metric, max_iters=500, conv_threshold=1e-4):
    num_samples, num_features = data.shape
    centr = data[np.random.choice(num_samples, k, replace=False)]
    cluster_assignments = np.zeros(num_samples)

    for iter_count in range(max_iters):
        for i in range(num_samples):
            dists = [dist_metric(data[i], c) for c in centr]
            cluster_assignments[i] = np.argmin(dists)

        updated_centr = np.array([np.mean(data[cluster_assignments == j], axis=0) for j in range(k)])

        if np.all(np.abs(updated_centr - centr) < conv_threshold):
            break
        centr = updated_centr

    sum_sq_errors = sum([dist_metric(data[i], centr[int(cluster_assignments[i])])**2 for i in range(num_samples)])

    return cluster_assignments, centr, iter_count + 1, sum_sq_errors


dist_metrics = [dist_euclidean, sim_cosine, sim_jaccard]
dist_metric_names = ['Euclidean', 'Cosine', 'Jaccard']
kmeans_results = {}

for dist_metric, dist_metric_name in zip(dist_metrics, dist_metric_names):
    start_time = time.time()

    cluster_labels, cluster_centroids, num_iterations, sse = kmeans_with_stop_criteria(data, K, dist_metric)

    end_time = time.time()

    kmeans_results[dist_metric_name] = {
        'Cluster Labels': cluster_labels,
        'Centroids': cluster_centroids,
        'Num Iterations': num_iterations,
        'SSE': sse,
        'Time to Converge': end_time - start_time
    }

for method, result in kmeans_results.items():
    print(f"{method} - Iterations: {result['Num Iterations']}, SSE: {result['SSE']:.2f}, Time taken to Converge: {result['Time to Converge']:.4f} seconds")


Euclidean - Iterations: 32, SSE: 25588697827.99, Time taken to Converge: 11.9261 seconds
Cosine - Iterations: 65, SSE: 697.21, Time taken to Converge: 41.4066 seconds
Jaccard - Iterations: 104, SSE: 3661.31, Time taken to Converge: 72.5497 seconds


In [8]:
def kmeans_with_conv_check(data, k, dist_metric, max_iters=100, conv_threshold=1e-4):
    num_samples, num_features = data.shape
    centr = data[np.random.choice(num_samples, k, replace=False)]
    cluster_assignments = np.zeros(num_samples)

    for iteration in range(max_iters):
        for i in range(num_samples):
            dists = [dist_metric(data[i], c) for c in centr]
            cluster_assignments[i] = np.argmin(dists)

        updated_centr = np.array([np.mean(data[cluster_assignments == j], axis=0) for j in range(k)])

        if np.all(np.abs(updated_centr - centr) < conv_threshold):
            break

        centr = updated_centr

    sum_sq_errors = sum([dist_metric(data[i], centr[int(cluster_assignments[i])])**2 for i in range(num_samples)])

    return cluster_assignments, centr, iteration + 1, sum_sq_errors


dist_metrics = [dist_euclidean, sim_cosine, sim_jaccard]
dist_metric_names = ['Euclidean', 'Cosine', 'Jaccard']


sum_sq_errors_results = {}

for dist_metric, dist_metric_name in zip(dist_metrics, dist_metric_names):
    cluster_labels, cluster_centroids, num_iterations, sse = kmeans_with_conv_check(data, K, dist_metric)

    sum_sq_errors_results[dist_metric_name] = sse

for method, sse in sum_sq_errors_results.items():
    print(f"{method} K-means - SSE: {sse:.2f}")


Euclidean K-means - SSE: 25458240667.60
Cosine K-means - SSE: 695.36
Jaccard K-means - SSE: 3660.70
