In [11]:
import numpy as np
from sklearn.datasets import make_blobs
import os

def generate_and_save_datasets(
    dims=(2, 10, 50),
    n_samples=1000,
    n_clusters=5,
    cluster_std=1.0,
    seed=42,
    out_dir="kmeans_datasets"
):
    """
    For each dimensionality in `dims`, generate a Gaussian‑blob dataset
    with a fixed seed so everyone gets the same files, and save to .npz.
    """
    # 1) Fix NumPy's global RNG
    np.random.seed(seed)

    os.makedirs(out_dir, exist_ok=True)

    for d in dims:
        # 2) Also pass the same seed into make_blobs
        X, y = make_blobs(
            n_samples=n_samples,
            n_features=d,
            centers=n_clusters,
            cluster_std=cluster_std,
            random_state=seed
        )
        fname = os.path.join(out_dir, f"blobs_{n_samples}x{d}d_{n_clusters}c_seed{seed}.npz")
        np.savez_compressed(fname, X=X, y=y)
        print(f"Saved {X.shape} → {fname}")


generate_and_save_datasets(
    dims=[4, 16, 32, 64, 128],
    n_samples=1000,
    n_clusters=5,
    cluster_std=0.8,
    seed=1234,              # ← every teammate uses this same seed
    out_dir="kmeans_datasets"
)

generate_and_save_datasets(
    dims=[4, 16, 32, 64, 128],
    n_samples=1000,
    n_clusters=10,
    cluster_std=0.8,
    seed=1234,              # ← every teammate uses this same seed
    out_dir="kmeans_datasets"
)

generate_and_save_datasets(
    dims=[4, 16, 32, 64, 128],
    n_samples=1000,
    n_clusters=15,
    cluster_std=0.8,
    seed=1234,              # ← every teammate uses this same seed
    out_dir="kmeans_datasets"
)

Saved (1000, 4) → kmeans_datasets/blobs_1000x4d_5c_seed1234.npz
Saved (1000, 16) → kmeans_datasets/blobs_1000x16d_5c_seed1234.npz
Saved (1000, 32) → kmeans_datasets/blobs_1000x32d_5c_seed1234.npz
Saved (1000, 64) → kmeans_datasets/blobs_1000x64d_5c_seed1234.npz
Saved (1000, 128) → kmeans_datasets/blobs_1000x128d_5c_seed1234.npz
Saved (1000, 4) → kmeans_datasets/blobs_1000x4d_10c_seed1234.npz
Saved (1000, 16) → kmeans_datasets/blobs_1000x16d_10c_seed1234.npz
Saved (1000, 32) → kmeans_datasets/blobs_1000x32d_10c_seed1234.npz
Saved (1000, 64) → kmeans_datasets/blobs_1000x64d_10c_seed1234.npz
Saved (1000, 128) → kmeans_datasets/blobs_1000x128d_10c_seed1234.npz
Saved (1000, 4) → kmeans_datasets/blobs_1000x4d_15c_seed1234.npz
Saved (1000, 16) → kmeans_datasets/blobs_1000x16d_15c_seed1234.npz
Saved (1000, 32) → kmeans_datasets/blobs_1000x32d_15c_seed1234.npz
Saved (1000, 64) → kmeans_datasets/blobs_1000x64d_15c_seed1234.npz
Saved (1000, 128) → kmeans_datasets/blobs_1000x128d_15c_seed1234.npz


In [8]:
data = np.load("kmeans_datasets/blobs_1000x10d_5c.npz")
X, y = data["X"], data["y"]
print(X, y)

[[ 2.72807458 -6.9097581  -4.02305094 ... -0.4392695   0.40265826
  -9.29086799]
 [-2.24449344  9.6811093   3.04489032 ...  7.65873812  1.45829075
   4.11683629]
 [-1.86887     9.61771919  5.59100953 ...  6.61802319  1.89184666
   3.56552944]
 ...
 [-2.54876859  9.55414172  3.74170122 ...  9.28776303  1.51210825
   3.73665399]
 [ 2.33926167 -9.54020976 -9.34508859 ... -6.85053347  2.63428571
  -0.88176994]
 [-9.26186784  9.8388351   7.00467186 ... -1.17405349 -1.23270793
  -2.40539236]] [2 0 0 3 0 4 1 3 0 4 1 0 3 0 0 1 4 3 3 1 1 2 2 3 4 2 4 2 1 2 0 1 0 1 3 1 2
 0 2 2 3 0 0 2 2 2 4 1 1 4 2 3 0 4 4 0 0 2 4 1 0 1 0 4 3 3 3 2 1 0 1 1 1 3
 1 4 3 2 4 3 3 2 1 0 2 1 2 3 2 0 4 0 1 3 0 0 0 0 1 2 1 3 1 4 1 4 3 3 3 4 4
 2 2 0 4 4 4 0 0 2 1 1 0 0 4 4 1 2 4 0 3 4 4 3 3 0 2 2 0 1 3 2 2 3 0 1 1 0
 0 4 4 0 3 3 1 4 1 3 3 4 1 4 4 1 2 2 1 2 1 1 4 0 2 1 3 4 2 1 3 4 0 3 2 3 1
 3 4 3 2 0 0 4 4 4 3 4 3 1 2 0 2 2 0 1 1 0 1 0 2 4 1 3 2 2 3 1 2 3 2 1 0 3
 2 3 3 3 1 3 2 4 3 0 1 4 1 1 2 4 2 1 2 4 0 4 4 3 0 4 3 4 1

In [12]:
# Try loading the dataset and Run the standard sklean k-means on it
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# 1) Load the dataset
data = np.load("kmeans_datasets/blobs_1000x4d_15c_seed1234.npz")
X, y_true = data["X"], data["y"]
print(f"Loaded X shape = {X.shape}, y shape = {y_true.shape}")

# 2) Fit KMeans
k = len(np.unique(y_true))  # here 5
km = KMeans(n_clusters=k, random_state=1234, n_init=10).fit(X)
labels = km.labels_

# 3) Print diagnostics
print(f"Inertia (sum of squared distances to centers): {km.inertia_:.2f}")
print(f"Adjusted Rand Index vs. true labels: {adjusted_rand_score(y_true, labels):.3f}")
print("Cluster centers:")
print(km.cluster_centers_)

Loaded X shape = (1000, 4), y shape = (1000,)
Inertia (sum of squared distances to centers): 2441.24
Adjusted Rand Index vs. true labels: 0.994
Cluster centers:
[[-3.50027355  0.15319232 -7.88932624  2.0205975 ]
 [ 6.16869426 -6.97520434  4.06583079  4.10744249]
 [ 9.08231678  7.47247188 -2.73078921  0.10647892]
 [-6.08336754  2.5191089  -1.26621076  5.72669514]
 [ 5.64678199  9.78206178  9.08379511  5.88615085]
 [-3.70303593  1.44541859  7.3691199  -1.25040835]
 [ 1.32473922 -9.70346035  2.50953436  8.22277332]
 [-5.53560785  8.65018089 -1.28120247  8.30197841]
 [ 5.58775272 -4.56457793 -4.38233409  5.96089037]
 [ 8.55349654  3.06979102 -2.07752082  5.92970114]
 [-8.79432031 -6.20045176 -9.01896447  3.36056381]
 [-2.70417265  2.57170718 -8.48745928 -2.67715779]
 [ 3.59888359  4.2108529  -2.56126322  1.03184002]
 [ 1.79063942  0.77364445 -9.11669671  1.2239335 ]
 [ 0.19055982 -9.5843783   5.50249584  7.6392874 ]]
