In [17]:
from sklearn.datasets import make_blobs
import time

# Synthetic benchmark data: 2,000,000 two-dimensional points scattered around
# 5 blob centers. The per-point blob labels returned by make_blobs are discarded.
X_np, _ = make_blobs(
    n_samples=2_000_000,
    n_features=2,
    centers=5,
    random_state=42,
)

In [18]:
import numpy as np

def kmeans(X, k, max_iters=100, tol=1e-7, verbose=False, seed=42):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's k-means (NumPy).

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        Points to cluster.
    k : int
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    max_iters : int, default 100
        Maximum number of assignment/update iterations.
    tol : float, default 1e-7
        Stop as soon as the Frobenius norm of the centroid displacement
        between two consecutive iterations drops below this value.
    verbose : bool, default False
        Print the centroid shift of every iteration.
    seed : int, default 42
        Seed of the generator that picks the initial centroids. The default
        reproduces the previously hard-coded initialization.

    Returns
    -------
    centroids : numpy.ndarray of shape (k, n_features)
        Final cluster centers.
    labels : numpy.ndarray of shape (n_samples,)
        Index of the nearest returned centroid for every row of ``X``.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``k`` is outside ``[1, n_samples]``.
    """
    # Bind NumPy locally: a later cell of this notebook rebinds the global
    # alias `np` to dpnp, which would silently change what this function uses
    # if it is called again afterwards.
    import numpy as np

    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(f"k must be between 1 and n_samples={n_samples}, got {k}")

    # Pick k distinct rows of X as the initial centroids.
    rng = np.random.default_rng(seed)
    centroids = X[rng.choice(n_samples, size=k, replace=False)]

    def assign(centers):
        # Distance matrix (n_samples x k); each point goes to its closest center.
        distances = np.linalg.norm(X[:, np.newaxis] - centers, axis=2)
        return np.argmin(distances, axis=1)

    for iteration in range(max_iters):
        labels = assign(centroids)

        # Recompute every centroid as the mean of its members. A cluster that
        # lost all of its points has no mean (NumPy would return NaN and poison
        # all later iterations), so it keeps its previous position instead.
        updated = []
        for i in range(k):
            members = X[labels == i]
            updated.append(members.mean(axis=0) if len(members) else centroids[i])
        new_centroids = np.array(updated)

        # Convergence check on the total centroid displacement.
        shift = np.linalg.norm(centroids - new_centroids)
        if verbose:
            print(f"Iter {iteration}, shift = {shift:.6f}")
        if shift < tol:
            break

        centroids = new_centroids
    else:
        # Iteration budget exhausted (or max_iters == 0): the centroids moved
        # after the last assignment, so refresh the labels to match them.
        labels = assign(centroids)

    return centroids, labels

In [19]:
# Time one full k-means run with the NumPy implementation.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
centroids, labels = kmeans(X_np, k=5, verbose=False)
print("Time to Kmeans with Numpy:", time.perf_counter() - start)
print("Centroids:\n", centroids)



Time to Kmeans with Numpy: 90.00986742973328
Centroids:
 [[-2.49178137  8.99316232]
 [-7.36719593 -6.24135696]
 [-6.40000796 -7.51111283]
 [-8.83723681  7.32405197]
 [ 3.3425889   3.05661113]]


In [None]:
import numpy as npy
import dpnp as np 

def kmeans_dpnp(X, k, max_iters=100, tol=1e-7, verbose=False, seed=42):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's k-means (dpnp).

    Parameters
    ----------
    X : dpnp.ndarray of shape (n_samples, n_features)
        Points to cluster, already allocated on the target SYCL device.
    k : int
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    max_iters : int, default 100
        Maximum number of assignment/update iterations.
    tol : float, default 1e-7
        Stop as soon as the norm of the centroid displacement between two
        consecutive iterations drops below this value.
    verbose : bool, default False
        Print the centroid shift of every iteration.
    seed : int, default 42
        Seed of the host-side generator that picks the initial centroids. The
        default reproduces the previously hard-coded initialization.

    Returns
    -------
    centroids : dpnp.ndarray of shape (k, n_features)
        Final cluster centers.
    labels : dpnp.ndarray of shape (n_samples,)
        Index of the nearest returned centroid for every row of ``X``.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``k`` is outside ``[1, n_samples]``.
    """
    # Import under unambiguous local names: the global alias `np` means dpnp in
    # this cell but NumPy in earlier cells, so re-running either cell would
    # otherwise change which library this function ends up calling.
    import numpy as host_np
    import dpnp as dev_np

    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(f"k must be between 1 and n_samples={n_samples}, got {k}")

    # Pick k distinct rows as initial centroids. The indices are drawn on the
    # host and then copied to the device that owns X.
    rng = host_np.random.default_rng(seed)
    random_indices = rng.choice(n_samples, size=k, replace=False)
    random_indices_dp = dev_np.asarray(random_indices, device=X.sycl_device)
    centroids = X[random_indices_dp]

    def assign(centers):
        # Distance matrix (n_samples x k); each point goes to its closest center.
        distances = dev_np.linalg.norm(X[:, dev_np.newaxis] - centers, axis=2)
        return dev_np.argmin(distances, axis=1)

    for iteration in range(max_iters):
        labels = assign(centroids)

        # Recompute every centroid as the mean of its members. A cluster that
        # lost all of its points has no mean (the result would be NaN and
        # poison all later iterations), so it keeps its previous position.
        updated = []
        for i in range(k):
            members = X[labels == i]
            updated.append(members.mean(axis=0) if members.shape[0] else centroids[i])
        new_centroids = dev_np.stack(updated)

        # Bring the scalar shift to the host once and reuse it for both the
        # log line and the convergence test.
        shift = dev_np.linalg.norm(centroids - new_centroids)
        shift_host = float(shift.asnumpy())
        if verbose:
            print(f"Iter {iteration}, shift = {shift_host:.6f}")
        if shift_host < tol:
            break

        centroids = new_centroids
    else:
        # Iteration budget exhausted (or max_iters == 0): the centroids moved
        # after the last assignment, so refresh the labels to match them.
        labels = assign(centroids)

    return centroids, labels

# Copy the host data to the GPU once, outside the timed region.
# In this cell the alias `np` refers to dpnp (see the imports above).
X_dpnp = np.asarray(X_np, device="gpu")

# Time one full k-means run with the dpnp implementation.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
# NOTE(review): the first call may include one-time device/kernel warm-up cost;
# confirm before comparing against the NumPy timing.
start = time.perf_counter()
centroids, labels = kmeans_dpnp(X_dpnp, k=5, verbose=False)
print("Time to Kmeans with dpnp:", time.perf_counter() - start)
print("Centroids:\n", centroids.asnumpy())
