In [None]:
import numpy as np
import pandas as pd


In [None]:
# Read the wholesale-customers table from the notebook's working directory.
data = pd.read_csv("Wholesale customers data.csv")

# Keep only the feature columns: 'Channel' and 'Region' are removed when they
# exist (errors='ignore' turns a missing column into a no-op), and the remaining
# frame is converted straight to a NumPy array.
X = data.drop(columns=['Channel', 'Region'], errors='ignore').values

In [None]:
# Display the feature matrix as loaded (before the standardization cell below runs).
X

array([[12669,  9656,  7561,   214,  2674,  1338],
       [ 7057,  9810,  9568,  1762,  3293,  1776],
       [ 6353,  8808,  7684,  2405,  3516,  7844],
       ...,
       [14531, 15488, 30243,   437, 14841,  1867],
       [10290,  1981,  2232,  1038,   168,  2125],
       [ 2787,  1698,  2510,    65,   477,    52]])

In [None]:
def standardize(X):
    """Scale every column of ``X`` to zero mean and unit standard deviation.

    Parameters
    ----------
    X : array-like
        Data with observations along axis 0.

    Returns
    -------
    numpy.ndarray
        ``(X - mean) / std`` computed per column (population std, ``ddof=0``).
        Columns whose standard deviation is exactly zero are only centered,
        so they come out as all zeros instead of NaN/inf.
    """
    X = np.asarray(X)
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # A constant column has std == 0; dividing by it would yield NaN (0/0).
    # Using 1 as the divisor leaves such a column at its centered value (0).
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std

# Rebind X to its standardized version; the unscaled values are no longer
# reachable through this name after this line runs.
X = standardize(X)


In [None]:
# Display X again; at this point it holds the output of standardize().
X

array([[ 0.05293319,  0.52356777, -0.04111489, -0.58936716, -0.04356873,
        -0.06633906],
       [-0.39130197,  0.54445767,  0.17031835, -0.27013618,  0.08640684,
         0.08915105],
       [-0.44702926,  0.40853771, -0.0281571 , -0.13753572,  0.13323164,
         2.24329255],
       ...,
       [ 0.20032554,  1.31467078,  2.34838631, -0.54337975,  2.51121768,
         0.12145607],
       [-0.13538389, -0.51753572, -0.60251388, -0.41944059, -0.56977032,
         0.21304614],
       [-0.72930698, -0.5559243 , -0.57322717, -0.62009417, -0.50488752,
        -0.52286938]])

In [None]:
def initialize_centroids(X, k, rng=None):
    """Pick ``k`` distinct rows of ``X`` as the starting centroids.

    Parameters
    ----------
    X : array-like
        Data with one observation per row.
    k : int
        Number of centroids to draw; must not exceed ``len(X)``.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, the global ``np.random`` state is
        used (so ``np.random.seed`` still controls the draw, as before).

    Returns
    -------
    numpy.ndarray
        The ``k`` selected rows (a copy, produced by integer-array indexing).

    Raises
    ------
    ValueError
        Raised by ``choice`` when ``k`` is larger than the number of rows,
        because sampling is done without replacement.
    """
    X = np.asarray(X)
    # Both the np.random module and Generator/RandomState objects expose a
    # compatible ``choice(a, size, replace=...)`` method.
    sampler = np.random if rng is None else rng
    indices = sampler.choice(len(X), k, replace=False)
    return X[indices]


In [None]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The element-wise difference is squared, summed over all elements and the
    square root of that total is returned.
    """
    delta = x1 - x2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)


In [None]:
def assign_clusters(X, centroids):
    """Label every observation with the index of its nearest centroid.

    Parameters
    ----------
    X : array-like
        Observations along axis 0.
    centroids : array-like
        Centroids along axis 0, each with the same shape as one observation.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``len(X)``; entry ``i`` is the index of the
        centroid closest to ``X[i]`` in Euclidean distance. Ties go to the
        lowest centroid index. An empty ``X`` yields an empty integer array.

    Notes
    -----
    All point/centroid differences are computed in one broadcasted operation,
    which allocates a temporary of ``len(X) * len(centroids) * n_features``
    elements.
    """
    X = np.asarray(X)
    centroids = np.asarray(centroids)
    if len(X) == 0:
        return np.empty(0, dtype=np.intp)

    # Flatten each observation/centroid so 1-D inputs (scalar points) and
    # higher-dimensional points are handled the same way.
    points = X.reshape(len(X), -1)
    centers = centroids.reshape(len(centroids), -1)

    diffs = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
    # The square root is monotonic, so the nearest centroid can be found from
    # squared distances directly.
    squared_distances = np.sum(diffs ** 2, axis=-1)
    return np.argmin(squared_distances, axis=1)


In [None]:
def update_centroids(X, clusters, k, prev_centroids=None):
    """Recompute each centroid as the mean of the observations assigned to it.

    Parameters
    ----------
    X : array-like
        Observations along axis 0.
    clusters : array-like of int
        Cluster label for every observation in ``X``.
    k : int
        Number of clusters; labels ``0 .. k-1`` are processed.
    prev_centroids : array-like, optional
        Centroids from the previous iteration. If given, a cluster with no
        members keeps its previous centroid.

    Returns
    -------
    numpy.ndarray
        Array with ``k`` centroids along axis 0.

    Notes
    -----
    A cluster with no members has no mean (``np.mean`` of an empty slice is
    NaN). Such a cluster takes its row from ``prev_centroids`` when provided,
    otherwise it is placed at the mean of all observations.
    """
    X = np.asarray(X)
    clusters = np.asarray(clusters)

    new_centroids = []
    for i in range(k):
        cluster_points = X[clusters == i]
        if len(cluster_points) > 0:
            centroid = np.mean(cluster_points, axis=0)
        elif prev_centroids is not None:
            centroid = np.asarray(prev_centroids[i], dtype=float)
        else:
            centroid = np.mean(X, axis=0)
        new_centroids.append(centroid)
    return np.array(new_centroids)


In [None]:
def kmeans(X, k, max_iters=100):
    """Cluster the observations in ``X`` into ``k`` groups.

    Starting from centroids chosen by ``initialize_centroids``, the function
    alternates between assigning every observation to its nearest centroid
    and recomputing the centroids, until the centroids stop moving
    (``np.allclose``) or ``max_iters`` iterations have run.

    Parameters
    ----------
    X : numpy.ndarray
        Observations along axis 0.
    k : int
        Number of clusters.
    max_iters : int, default 100
        Upper bound on the number of assign/update iterations.

    Returns
    -------
    clusters : numpy.ndarray
        Cluster index for every observation, computed against the returned
        centroids.
    centroids : numpy.ndarray
        Final centroids, one per cluster.
    """
    centroids = initialize_centroids(X, k)

    for _ in range(max_iters):
        clusters = assign_clusters(X, centroids)
        new_centroids = update_centroids(X, clusters, k)

        # A cluster that received no points has no mean; keep its previous
        # centroid so a NaN row cannot spread through later iterations.
        counts = np.bincount(np.asarray(clusters, dtype=np.intp), minlength=k)
        empty = counts == 0
        if empty.any():
            new_centroids[empty] = centroids[empty]

        if np.allclose(centroids, new_centroids):
            break

        centroids = new_centroids
    else:
        # Reached when the loop was never broken out of (including
        # max_iters == 0): the last centroid update has not been followed by
        # an assignment yet, so label the points against the final centroids.
        clusters = assign_clusters(X, centroids)

    return clusters, centroids


In [None]:
k = 3  # choose number of clusters

# The starting centroids are drawn with np.random.choice, so the global NumPy
# RNG is seeded here to make the labels and centroids reproducible when the
# notebook is re-run from a fresh kernel.
SEED = 42
np.random.seed(SEED)

clusters, centroids = kmeans(X, k)

print("Cluster labels:", np.unique(clusters))


Cluster labels: [0 1 2]


### K-Means Unsupervised Learning Results

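We applied the K-Means algorithm with `k=3` clusters to the standardized features. The cell below shows the cluster assignments of the first 10 data points and the coordinates of the final centroid of each of the 3 clusters. Because the features were standardized before clustering, the centroid coordinates are expressed in standard deviations from each feature's mean, not in the original units.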

In [None]:
# Show the cluster index assigned to the first 10 rows of X.
print("First 10 cluster assignments:")
display(clusters[:10])

# Show the centroid array returned by kmeans (one row per cluster).
print("\nFinal cluster centroids:")
display(centroids)

First 10 cluster assignments:


array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1])


Final cluster centroids:


array([[ 0.1281259 , -0.35989961, -0.42871402,  0.08230684, -0.42553086,
        -0.11812109],
       [-0.50380376,  0.57492137,  0.82322505, -0.33291267,  0.81607777,
         0.07552425],
       [ 1.09004412,  3.98320348,  3.58457916,  0.77799282,  3.56664129,
         2.25618179]])