In [4]:
import numpy as np
import matplotlib.pyplot as plt
import time

def initialize_centroids(X, k):
    """
    Pick k distinct rows of X as the initial centroids.

    The rows are chosen from a permutation drawn by a private
    ``RandomState`` seeded with 42, so the selection is reproducible and
    the process-wide NumPy random state is left untouched.

    Parameters:
        X: 2-D array of samples, one row per data point.
        k: number of centroids to pick.

    Returns:
        Array holding the k selected rows of X (a copy, not a view).

    Raises:
        ValueError: if k exceeds the number of rows in X, since fewer
            than k centroids would otherwise be returned silently.
    """
    n_samples = X.shape[0]
    if k > n_samples:
        raise ValueError(
            f"Cannot pick {k} centroids from only {n_samples} data points."
        )
    # A local generator yields the same permutation as seeding the global
    # one with 42, without resetting randomness for the rest of the notebook.
    rng = np.random.RandomState(42)
    random_indices = rng.permutation(n_samples)
    return X[random_indices[:k]]

def compute_distances(X, centroids):
    """
    Build the table of Euclidean distances from every point to every centroid.

    Returns an array with one row per row of X and one column per
    centroid; entry [r, c] is the norm of ``X[r] - centroids[c]``.
    """
    n_centroids = len(centroids)
    table = np.zeros((X.shape[0], n_centroids))
    for col in range(n_centroids):
        # Broadcasting subtracts this centroid from every row of X at once.
        offsets = X - centroids[col]
        table[:, col] = np.linalg.norm(offsets, axis=1)
    return table

def assign_clusters(distances):
    """
    Label each data point with the index of its nearest centroid.

    ``distances`` has one row per point and one column per centroid; the
    result holds, for each row, the column index of the smallest entry.
    """
    nearest = np.asarray(distances).argmin(axis=1)
    return nearest

def update_centroids(X, labels, k, previous_centroids=None):
    """
    Update centroids as the mean of all points assigned to each cluster.

    A cluster with no assigned points has no mean (NumPy would yield NaN,
    which then corrupts later distance and convergence computations), so
    such a cluster receives a finite fallback instead.

    Parameters:
        X: 2-D array of samples, one row per data point.
        labels: cluster index of each row of X.
        k: number of clusters.
        previous_centroids: optional array of the centroids from the
            previous iteration. When given, an empty cluster keeps its
            previous centroid; when omitted, an empty cluster is placed
            at the mean of all rows of X.

    Returns:
        Array of shape (k, number of columns of X) with the new centroids.
    """
    labels = np.asarray(labels)
    centroids = np.zeros((k, X.shape[1]))
    overall_mean = None  # computed lazily, only if some cluster is empty
    for i in range(k):
        points = X[labels == i]
        if points.shape[0] > 0:
            centroids[i] = points.mean(axis=0)
        elif previous_centroids is not None:
            centroids[i] = previous_centroids[i]
        else:
            if overall_mean is None:
                overall_mean = X.mean(axis=0)
            centroids[i] = overall_mean
    return centroids

def kmeans(X, k, max_iters=20, tolerance=1e-4):
    """
    Perform the K-Means clustering algorithm.

    Each iteration assigns every point to its nearest centroid and then
    moves each centroid to the mean of its assigned points. Iteration
    stops early once no centroid coordinate moves by ``tolerance`` or more.

    Parameters:
        X: 2-D array of samples, one row per data point.
        k: number of clusters.
        max_iters: maximum number of assign/update iterations.
        tolerance: per-coordinate movement threshold for convergence.

    Returns:
        (centroids, labels): the final centroids and the cluster index of
        each row of X. The labels come from the last assignment step, i.e.
        they were computed against the centroids in effect before the
        final update. If ``max_iters`` is not positive, no update happens
        and the labels are the assignment to the initial centroids.
    """
    centroids = initialize_centroids(X, k)
    labels = None
    for i in range(max_iters):
        iteration_time = time.time()
        # update_centroids returns a fresh array, so keeping a reference
        # to the current one is enough to compare against afterwards.
        old_centroids = centroids
        distances = compute_distances(X, centroids)
        labels = assign_clusters(distances)
        centroids = update_centroids(X, labels, k)

        print(f"Iteration: {i}\ttime taken: {time.time() - iteration_time}")

        # Check for convergence
        if np.all(np.abs(centroids - old_centroids) < tolerance):
            print(f"K-Means converged after {i+1} iterations.")
            break

    if labels is None:
        # The loop body never ran (max_iters <= 0); still return a valid
        # labelling instead of failing on an unbound variable.
        labels = assign_clusters(compute_distances(X, centroids))
    return centroids, labels

In [5]:
import os
import urllib.request

A3_DATASET_URL = "https://cs.joensuu.fi/sipu/datasets/a3.txt"
DATA_FOLDER = "/home/jovyan/work/data"
A3_LOCAL_PATH = os.path.join(DATA_FOLDER, "a3.txt")

k = 50  # Number of clusters
s = 10  # Number of centroid groups
max_iterations = 20
num_partitions = 10


# Download Data
if not os.path.exists(A3_LOCAL_PATH):
    os.makedirs(DATA_FOLDER, exist_ok=True)
    # Fetch the payload before creating the local file: a failed request
    # must not leave an empty file behind that later runs would mistake
    # for a cached copy. urlopen raises on HTTP error statuses.
    with urllib.request.urlopen(A3_DATASET_URL, timeout=30) as response:
        payload = response.read()
    with open(A3_LOCAL_PATH, 'wb') as file:
        file.write(payload)

# The file is whitespace-separated numbers, one data point per row.
# ndmin=2 keeps the result 2-D even if the file has a single row.
data = np.loadtxt(A3_LOCAL_PATH, ndmin=2)

In [7]:
# Use the iteration cap from the configuration cell rather than silently
# relying on the function's default value.
centroids, labels = kmeans(data, k, max_iters=max_iterations)

K-Means converged after 17 iterations.


In [9]:
from sklearn.metrics import silhouette_score

# Score the clustering produced above; the bare name on the last line
# makes the notebook display the value.
silhouette = silhouette_score(X=data, labels=labels)
silhouette

np.float64(0.5244219454601392)