In [1]:
import numpy as np
from sklearn.datasets import make_blobs
import os

def generate_and_save_datasets_csv(
    dims=(4, 16, 32, 64, 128),
    n_samples=1000,
    n_clusters=5,
    cluster_std=0.8,
    seed=1234,
    out_dir="kmeans_datasets_csv"
):
    """
    Generate Gaussian-blob datasets and save each one as a single CSV file.

    For every dimensionality in `dims`, `make_blobs` is called with
    `random_state=seed`; the features and the cluster label are stacked
    into one table and written under `out_dir` as
    ``blobs_{n_samples}x{d}d_{n_clusters}c_seed{seed}.csv`` with the header
    line ``f0,f1,...,f{d-1},label`` (no '#' prefix).

    Parameters
    ----------
    dims : iterable of int
        Feature dimensionalities; one CSV is written per entry.
    n_samples : int
        Number of rows passed to `make_blobs`.
    n_clusters : int
        Number of blob centers passed to `make_blobs`.
    cluster_std : float
        Standard deviation passed to `make_blobs`.
    seed : int
        Used for NumPy's global RNG, for `make_blobs` and in the file name.
    out_dir : str
        Output directory; created if it does not exist.

    Returns
    -------
    list of str
        Paths of the CSV files written, in the order of `dims`.
    """
    # Seed NumPy's global RNG as well; make_blobs itself is seeded through
    # the random_state argument below.
    np.random.seed(seed)
    os.makedirs(out_dir, exist_ok=True)

    written = []
    for d in dims:
        # 1) sample from Gaussians
        X, y = make_blobs(
            n_samples=n_samples,
            n_features=d,
            centers=n_clusters,
            cluster_std=cluster_std,
            random_state=seed
        )

        # 2) stack features + label into one array, shape (n_samples, d+1)
        data = np.hstack([X, y.reshape(-1, 1)])

        # 3) CSV filename
        fname = os.path.join(
            out_dir,
            f"blobs_{n_samples}x{d}d_{n_clusters}c_seed{seed}.csv"
        )

        # 4) write with a header: f0,f1,...,f{d-1},label
        header = ",".join([f"f{i}" for i in range(d)] + ["label"])

        # Features keep savetxt's default full-precision format. The label is
        # written as a plain integer: with the default format a label such as
        # 12 becomes "1.200000000000000000e+01", which an integer parser that
        # stops at the '.' would read as 1.
        column_formats = ["%.18e"] * d + ["%d"]
        np.savetxt(
            fname,
            data,
            fmt=column_formats,
            delimiter=",",
            header=header,
            comments=""          # avoid '#' prefix on header line
        )

        print(f"Saved {X.shape} + labels → {fname}")
        written.append(fname)

    return written

if __name__ == "__main__":
    # Settings shared by every run; only the number of clusters varies.
    shared_kwargs = dict(
        dims=[4, 16, 32, 64, 128],
        n_samples=1000,
        cluster_std=0.8,
        seed=1234,
        out_dir="kmeans_datasets_csv",
    )

    # Generate the whole family of datasets for three cluster counts.
    for k in (5, 10, 15):
        generate_and_save_datasets_csv(n_clusters=k, **shared_kwargs)




Saved (1000, 4) + labels → kmeans_datasets_csv/blobs_1000x4d_5c_seed1234.csv
Saved (1000, 16) + labels → kmeans_datasets_csv/blobs_1000x16d_5c_seed1234.csv
Saved (1000, 32) + labels → kmeans_datasets_csv/blobs_1000x32d_5c_seed1234.csv
Saved (1000, 64) + labels → kmeans_datasets_csv/blobs_1000x64d_5c_seed1234.csv
Saved (1000, 128) + labels → kmeans_datasets_csv/blobs_1000x128d_5c_seed1234.csv
Saved (1000, 4) + labels → kmeans_datasets_csv/blobs_1000x4d_10c_seed1234.csv
Saved (1000, 16) + labels → kmeans_datasets_csv/blobs_1000x16d_10c_seed1234.csv
Saved (1000, 32) + labels → kmeans_datasets_csv/blobs_1000x32d_10c_seed1234.csv
Saved (1000, 64) + labels → kmeans_datasets_csv/blobs_1000x64d_10c_seed1234.csv
Saved (1000, 128) + labels → kmeans_datasets_csv/blobs_1000x128d_10c_seed1234.csv
Saved (1000, 4) + labels → kmeans_datasets_csv/blobs_1000x4d_15c_seed1234.csv
Saved (1000, 16) + labels → kmeans_datasets_csv/blobs_1000x16d_15c_seed1234.csv
Saved (1000, 32) + labels → kmeans_datasets_csv

In [3]:
import numpy as np
import os
from glob import glob

def generate_and_save_inits_csv(
    data_dir="kmeans_datasets_csv",
    init_seed=999,
    init_out_dir="kmeans_inits",
    replace=False
):
    """
    Pick k initial centroids for every dataset CSV and save them to disk.

    For each ``*.csv`` in `data_dir`, k is parsed from the second-to-last
    underscore-separated token of the file name (e.g. "10c" -> 10), k rows of
    the feature matrix are drawn at random, and the result is saved in
    `init_out_dir` both as ``{base}_init_seed{init_seed}.npz`` (arrays
    ``centers`` and ``indices``) and as ``{base}_init_seed{init_seed}.csv``
    (header ``f0,f1,...,f{d-1}``).

    Parameters
    ----------
    data_dir : str
        Directory containing the dataset CSVs (header row, features, then
        the label in the last column).
    init_seed : int
        Seed of the RandomState used for all draws; also part of the
        output file names.
    init_out_dir : str
        Output directory; created if it does not exist.
    replace : bool
        If True, indices are drawn with replacement (duplicates possible);
        if False, the k indices are distinct.

    Raises
    ------
    ValueError
        If the file name token cannot be parsed as an integer, or if
        `replace` is False and k exceeds the number of rows.
    """
    rng = np.random.RandomState(init_seed)
    os.makedirs(init_out_dir, exist_ok=True)

    # One RandomState is consumed file after file, so the draw for a given
    # file depends on the iteration order. glob's order is filesystem
    # dependent; sorting makes the result reproducible across machines.
    for csv_path in sorted(glob(os.path.join(data_dir, "*.csv"))):
        base = os.path.basename(csv_path).rsplit(".", 1)[0]
        parts = base.split("_")
        k = int(parts[-2][:-1])  # e.g. "10c" → 10 clusters

        # Load data: last column is the label. ndmin=2 keeps a single-row
        # file two-dimensional so the column slice below still works.
        data = np.loadtxt(csv_path, delimiter=",", skiprows=1, ndmin=2)
        X = data[:, :-1]  # drop the label column

        # Pick k initial indices (distinct only when replace is False)
        if replace:
            idx = rng.randint(0, X.shape[0], size=k)
        else:
            idx = rng.choice(X.shape[0], size=k, replace=False)

        centers = X[idx]  # shape (k, d)

        # Prepare output names
        npz_name = f"{base}_init_seed{init_seed}.npz"
        csv_name = f"{base}_init_seed{init_seed}.csv"
        npz_path = os.path.join(init_out_dir, npz_name)
        csv_path_out = os.path.join(init_out_dir, csv_name)

        # 1) Save as .npz: the centroids plus the row indices they came from
        np.savez(npz_path, centers=centers, indices=idx)
        print(f"Saved NPZ init → {npz_path}")

        # 2) Save as .csv with header f0,f1,...,f{d-1}
        d = centers.shape[1]
        header = ",".join(f"f{i}" for i in range(d))
        np.savetxt(
            csv_path_out,
            centers,
            delimiter=",",
            header=header,
            comments=""  # no '#' prefix
        )
        print(f"Saved CSV init → {csv_path_out}")

if __name__ == "__main__":
    # Draw initial centroids, without replacement, for every dataset CSV
    # found in kmeans_datasets_csv.
    init_config = {
        "data_dir": "kmeans_datasets_csv",
        "init_seed": 1234,
        "init_out_dir": "kmeans_inits",
        "replace": False,
    }
    generate_and_save_inits_csv(**init_config)


Saved NPZ init → kmeans_inits/blobs_1000x128d_15c_seed1234_init_seed1234.npz
Saved CSV init → kmeans_inits/blobs_1000x128d_15c_seed1234_init_seed1234.csv
Saved NPZ init → kmeans_inits/blobs_1000x128d_10c_seed1234_init_seed1234.npz
Saved CSV init → kmeans_inits/blobs_1000x128d_10c_seed1234_init_seed1234.csv
Saved NPZ init → kmeans_inits/blobs_1000x16d_10c_seed1234_init_seed1234.npz
Saved CSV init → kmeans_inits/blobs_1000x16d_10c_seed1234_init_seed1234.csv
Saved NPZ init → kmeans_inits/blobs_1000x16d_15c_seed1234_init_seed1234.npz
Saved CSV init → kmeans_inits/blobs_1000x16d_15c_seed1234_init_seed1234.csv
Saved NPZ init → kmeans_inits/blobs_1000x4d_10c_seed1234_init_seed1234.npz
Saved CSV init → kmeans_inits/blobs_1000x4d_10c_seed1234_init_seed1234.csv
Saved NPZ init → kmeans_inits/blobs_1000x4d_15c_seed1234_init_seed1234.npz
Saved CSV init → kmeans_inits/blobs_1000x4d_15c_seed1234_init_seed1234.csv
Saved NPZ init → kmeans_inits/blobs_1000x32d_5c_seed1234_init_seed1234.npz
Saved CSV ini

In [12]:
# Load one generated dataset and run the standard scikit-learn k-means on it
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# 1) Load the dataset from the CSV written by generate_and_save_datasets_csv:
#    one header row, then the feature columns followed by the label column.
data = np.loadtxt(
    "kmeans_datasets_csv/blobs_1000x4d_15c_seed1234.csv",
    delimiter=",",
    skiprows=1,
)
X, y_true = data[:, :-1], data[:, -1].astype(int)
print(f"Loaded X shape = {X.shape}, y shape = {y_true.shape}")

# 2) Fit KMeans with as many clusters as there are distinct true labels
k = len(np.unique(y_true))  # 15 for this "15c" file
km = KMeans(n_clusters=k, random_state=1234, n_init=10).fit(X)
labels = km.labels_

# 3) Print diagnostics
print(f"Inertia (sum of squared distances to centers): {km.inertia_:.2f}")
print(f"Adjusted Rand Index vs. true labels: {adjusted_rand_score(y_true, labels):.3f}")
print("Cluster centers:")
print(km.cluster_centers_)

Loaded X shape = (1000, 4), y shape = (1000,)
Inertia (sum of squared distances to centers): 2441.24
Adjusted Rand Index vs. true labels: 0.994
Cluster centers:
[[-3.50027355  0.15319232 -7.88932624  2.0205975 ]
 [ 6.16869426 -6.97520434  4.06583079  4.10744249]
 [ 9.08231678  7.47247188 -2.73078921  0.10647892]
 [-6.08336754  2.5191089  -1.26621076  5.72669514]
 [ 5.64678199  9.78206178  9.08379511  5.88615085]
 [-3.70303593  1.44541859  7.3691199  -1.25040835]
 [ 1.32473922 -9.70346035  2.50953436  8.22277332]
 [-5.53560785  8.65018089 -1.28120247  8.30197841]
 [ 5.58775272 -4.56457793 -4.38233409  5.96089037]
 [ 8.55349654  3.06979102 -2.07752082  5.92970114]
 [-8.79432031 -6.20045176 -9.01896447  3.36056381]
 [-2.70417265  2.57170718 -8.48745928 -2.67715779]
 [ 3.59888359  4.2108529  -2.56126322  1.03184002]
 [ 1.79063942  0.77364445 -9.11669671  1.2239335 ]
 [ 0.19055982 -9.5843783   5.50249584  7.6392874 ]]
