In [7]:
import numpy as np
from threading import Thread, Lock

# Thread-safe centroid update
class ThreadSafeCentroids:
    def __init__(self, centroids):
        self.centroids = centroids
        self.lock = Lock()

    def update(self, updates):
        with self.lock:
            for i, (new_sum, count) in updates.items():
                if count > 0:
                    self.centroids[i] = new_sum / count

# K-means clustering with threading
def kmeans_threaded(data, k, init_centroids = None, 
                    max_iterations=100, num_threads=1):
    n_samples, n_features = data.shape
    if init_centroids is None:
        centroids = data[np.random.choice(n_samples, k, replace=False)]
    else:
        centroids = init_centroids
    thread_safe_centroids = ThreadSafeCentroids(centroids)

    def assign_and_update(chunk, thread_id, chunk_updates):
        chunk_centroids = np.zeros((k, n_features))
        counts = np.zeros(k)

        distances = np.linalg.norm(chunk[:, None] - centroids[None, :], axis=2)
        clusters = np.argmin(distances, axis=1)

        for i, cluster in enumerate(clusters):
            chunk_centroids[cluster] += chunk[i]
            counts[cluster] += 1

        chunk_updates[thread_id] = {i: (chunk_centroids[i], counts[i]) for i in range(k)}

    for _ in range(max_iterations):
        # Step 1: Divide data into chunks
        chunk_size = n_samples // num_threads
        threads = []
        chunk_updates = [None] * num_threads

        # Step 2: Create and start threads
        for i in range(num_threads):
            start = i * chunk_size
            end = (i + 1) * chunk_size if i != num_threads - 1 else n_samples
            chunk = data[start:end]
            thread = Thread(target=assign_and_update, args=(chunk, i, chunk_updates))
            threads.append(thread)
            thread.start()

        # Step 3: Wait for all threads to finish
        for thread in threads:
            thread.join()

        # Step 4: Update centroids
        global_updates = {}
        for updates in chunk_updates:
            for cluster_id, (new_sum, count) in updates.items():
                if cluster_id not in global_updates:
                    global_updates[cluster_id] = (new_sum, count)
                else:
                    current_sum, current_count = global_updates[cluster_id]
                    global_updates[cluster_id] = (current_sum + new_sum, current_count + count)

        thread_safe_centroids.update(global_updates)

        # Check for convergence
        if np.allclose(centroids, thread_safe_centroids.centroids):
            break
        centroids = np.copy(thread_safe_centroids.centroids)

    return centroids

In [14]:
import numpy as np
from multiprocessing import Process, Manager, Array
import ctypes
from scipy.spatial.distance import cdist

# K-means clustering with multiprocessing
def kmeans_multiprocess(data, k, init_centroids=None,
                        max_iterations=100, num_processes=4):
    n_samples, n_features = data.shape
    data = np.array(data, dtype=np.float64)

    # Initialize centroids
    if init_centroids is None:
        centroids = data[np.random.choice(n_samples, k, replace=False)]
    else:
        centroids = init_centroids    
    shared_centroids = Array(ctypes.c_double, centroids.flatten(), lock=False)

    def assign_and_update(chunk, return_dict, process_id):
        local_centroids = np.frombuffer(shared_centroids).reshape((k, n_features))
        chunk_centroids = np.zeros((k, n_features))
        counts = np.zeros(k)

        # Compute distances and assign clusters
        distances = cdist(chunk, local_centroids)
        clusters = np.argmin(distances, axis=1)

        # Compute chunk centroids
        for i, cluster in enumerate(clusters):
            chunk_centroids[cluster] += chunk[i]
            counts[cluster] += 1

        # Store updates in a shared dictionary
        return_dict[process_id] = (chunk_centroids, counts)

    for _ in range(max_iterations):
        # Divide data into chunks
        chunk_size = n_samples // num_processes
        processes = []
        manager = Manager()
        return_dict = manager.dict()

        # Start processes
        for i in range(num_processes):
            start = i * chunk_size
            end = (i + 1) * chunk_size if i != num_processes - 1 else n_samples
            chunk = data[start:end]
            process = Process(target=assign_and_update, args=(chunk, return_dict, i))
            processes.append(process)
            process.start()

        # Wait for all processes to complete
        for process in processes:
            process.join()

        # Aggregate updates
        global_centroids = np.zeros((k, n_features))
        global_counts = np.zeros(k)
        for chunk_centroids, counts in return_dict.values():
            global_centroids += chunk_centroids
            global_counts += counts

        # Update centroids
        for i in range(k):
            if global_counts[i] > 0:
                centroids[i] = global_centroids[i] / global_counts[i]

        # Update shared centroids
        np.copyto(np.frombuffer(shared_centroids).reshape((k, n_features)), centroids)

        # Check for convergence
        if np.allclose(centroids, np.frombuffer(shared_centroids).reshape((k, n_features))):
            break

    return np.array(centroids)

In [9]:
data = np.random.rand(10_000_000, 10)
k = 3
init_centroids = centroids = data[np.random.choice(data.shape[0], k, replace=False)]

In [10]:
import os
import time

In [11]:
st = time.perf_counter()
centroids = kmeans_threaded(data, k, 
                            init_centroids.copy(), 
                            max_iterations=100, 
                            num_threads=1)
et = time.perf_counter()
print(f"Execute in {et-st} seconds")

Execute in 12.636538830585778 seconds


In [12]:
centroids

array([[0.49395107, 0.57331377, 0.52462958, 0.3068108 , 0.38209598,
        0.60287094, 0.35794831, 0.67290944, 0.49404066, 0.34783139],
       [0.60510198, 0.4776329 , 0.60299206, 0.47767964, 0.55563296,
        0.26324388, 0.36862252, 0.53048049, 0.44516577, 0.63539886],
       [0.45112466, 0.48017385, 0.43905172, 0.59177912, 0.52225121,
        0.57350185, 0.62429002, 0.41273632, 0.52935063, 0.49725408]])

In [23]:
st = time.perf_counter()
centroids = kmeans_multiprocess(data, k, 
                            init_centroids.copy(), 
                            max_iterations=100, 
                            num_processes=3)
et = time.perf_counter()
print(f"Execute in {et-st} seconds")

Execute in 4.03635769803077 seconds


In [24]:
centroids

array([[0.49395107, 0.57331377, 0.52462958, 0.3068108 , 0.38209598,
        0.60287094, 0.35794831, 0.67290944, 0.49404066, 0.34783139],
       [0.60510198, 0.4776329 , 0.60299206, 0.47767964, 0.55563296,
        0.26324388, 0.36862252, 0.53048049, 0.44516577, 0.63539886],
       [0.45112466, 0.48017385, 0.43905172, 0.59177912, 0.52225121,
        0.57350185, 0.62429002, 0.41273632, 0.52935063, 0.49725408]])

In [18]:
from sklearn.cluster import KMeans

In [19]:
st = time.perf_counter()
kmeans = KMeans(n_clusters=k, 
                init=init_centroids.copy(), 
                max_iter=100, tol=1e-8).fit(data)
et = time.perf_counter()
print(f"Execute in {et-st} seconds")

Execute in 9.78438676521182 seconds


In [20]:
kmeans.cluster_centers_

array([[0.50046925, 0.50049585, 0.49996786, 0.49906113, 0.50097786,
        0.68321648, 0.23204047, 0.49996228, 0.49980606, 0.50041033],
       [0.50017562, 0.4993498 , 0.49960967, 0.50060317, 0.49988174,
        0.19631078, 0.49942716, 0.50026178, 0.50031572, 0.49963537],
       [0.49975323, 0.5001165 , 0.50050785, 0.49996848, 0.49949139,
        0.68230043, 0.76893009, 0.50001688, 0.49964111, 0.50011759]])