# Generate and Explore Data

In [1]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Seed NumPy's global generator so every run draws identical points
np.random.seed(42)

# 2D data: three blobs of 50 standard-normal points, each shifted to its own centre
centers_2d = ([2, 2], [-2, -2], [0, 4])
cluster_1_2d, cluster_2_2d, cluster_3_2d = (
    np.random.randn(50, 2) + center for center in centers_2d
)
X_2d = np.vstack((cluster_1_2d, cluster_2_2d, cluster_3_2d))

# 3D data: same recipe with three features per point (drawn after the 2D
# blobs, so the random stream is consumed in a fixed order)
centers_3d = ([2, 2, 2], [-2, -2, -2], [0, 4, 0])
cluster_1_3d, cluster_2_3d, cluster_3_3d = (
    np.random.randn(50, 3) + center for center in centers_3d
)
X_3d = np.vstack((cluster_1_3d, cluster_2_3d, cluster_3_3d))

# Raw 2D points, shown before any clustering is applied
axis_labels_2d = {'x': 'Feature 1', 'y': 'Feature 2'}
fig_2d = px.scatter(
    x=X_2d[:, 0],
    y=X_2d[:, 1],
    title="2D Synthetic Data (Unclustered)",
    labels=axis_labels_2d,
)
fig_2d.show()

# Raw 3D points, shown before any clustering is applied
axis_labels_3d = {'x': 'Feature 1', 'y': 'Feature 2', 'z': 'Feature 3'}
fig_3d = px.scatter_3d(
    x=X_3d[:, 0],
    y=X_3d[:, 1],
    z=X_3d[:, 2],
    title="3D Synthetic Data (Unclustered)",
    labels=axis_labels_3d,
)
fig_3d.show()

# Build K-Means from Scratch

In [2]:
class KMeans:
    """K-Means clustering (Lloyd's algorithm) implemented with NumPy.

    Parameters
    ----------
    k : int, default 3
        Number of clusters to form.
    max_iters : int, default 100
        Maximum number of assign/update rounds performed by ``fit``.
    random_state : int or None, default None
        Seed for choosing the initial centroids. When ``None`` the global
        NumPy random state is used, so ``np.random.seed`` still controls
        the outcome.

    Attributes
    ----------
    centroids : ndarray of shape (k, n_features) or None
        Cluster centres as floats; ``None`` until ``fit`` is called.
    labels : ndarray of shape (n_samples,) or None
        Index of the nearest centroid for every training point; ``None``
        until ``fit`` is called.
    """

    def __init__(self, k=3, max_iters=100, random_state=None):
        self.k = k
        self.max_iters = max_iters
        self.random_state = random_state
        self.centroids = None
        self.labels = None

    def fit(self, X):
        """Cluster the rows of ``X`` and store ``centroids`` and ``labels``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Points to cluster.

        Returns
        -------
        KMeans
            The fitted instance, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or ``k`` is not in
            ``[1, n_samples]``.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-dimensional (n_samples, n_features), got ndim={X.ndim}"
            )
        n_samples = X.shape[0]
        if not 1 <= self.k <= n_samples:
            raise ValueError(
                f"k must be between 1 and n_samples={n_samples}, got k={self.k}"
            )

        # Initialise centroids from k distinct data points.
        if self.random_state is None:
            random_idx = np.random.choice(n_samples, self.k, replace=False)
        else:
            rng = np.random.default_rng(self.random_state)
            random_idx = rng.choice(n_samples, self.k, replace=False)

        # Cast to float: with integer X the centroid array would otherwise be
        # integer-typed and every mean written into it would be truncated.
        self.centroids = X[random_idx].astype(float)

        for _ in range(self.max_iters):
            old_centroids = self.centroids.copy()

            # Assignment step: nearest centroid for every point.
            self.labels = np.argmin(self._compute_distances(X), axis=1)

            # Update step: move each centroid to the mean of its members.
            for i in range(self.k):
                cluster_points = X[self.labels == i]
                if len(cluster_points) > 0:  # empty cluster keeps its old centroid
                    self.centroids[i] = cluster_points.mean(axis=0)

            # Exact comparison is sufficient: once assignments stop changing,
            # the means are recomputed from identical inputs.
            if np.array_equal(old_centroids, self.centroids):
                break

        # Re-assign once more so labels always match the final centroids,
        # including when max_iters is exhausted (or is 0).
        self.labels = np.argmin(self._compute_distances(X), axis=1)
        return self

    def _compute_distances(self, X):
        """Return the (n_samples, k) matrix of Euclidean distances to each centroid."""
        X = np.asarray(X)
        # Broadcast (n_samples, 1, n_features) against (1, k, n_features).
        diffs = X[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
        return np.linalg.norm(diffs, axis=2)

# One model per dataset, both with k=3
kmeans_2d, kmeans_3d = KMeans(k=3), KMeans(k=3)

# Fit the 2D model first, then the 3D one, so the random centroid
# initialisations are drawn in a fixed order
kmeans_2d.fit(X_2d)
kmeans_3d.fit(X_3d)

# 2D Visualization

In [3]:
# 2D scatter coloured by assigned cluster; labels are cast to strings so
# they are passed to Plotly as text rather than as numbers
cluster_ids_2d = kmeans_2d.labels.astype(str)
fig_2d = px.scatter(
    x=X_2d[:, 0],
    y=X_2d[:, 1],
    color=cluster_ids_2d,
    title="2D K-Means Clustering (k=3)",
    labels={'x': 'Feature 1', 'y': 'Feature 2'},
)

# Overlay the fitted centroids as black crosses
centroids_2d = kmeans_2d.centroids
centroid_marker_2d = dict(size=15, color='black', symbol='x')
fig_2d.add_scatter(
    x=centroids_2d[:, 0],
    y=centroids_2d[:, 1],
    mode='markers',
    marker=centroid_marker_2d,
    name='Centroids',
)
fig_2d.show()

# 3D Visualization

In [4]:
# 3D scatter coloured by assigned cluster; labels are cast to strings so
# they are passed to Plotly as text rather than as numbers
cluster_ids_3d = kmeans_3d.labels.astype(str)
fig_3d = px.scatter_3d(
    x=X_3d[:, 0],
    y=X_3d[:, 1],
    z=X_3d[:, 2],
    color=cluster_ids_3d,
    title="3D K-Means Clustering (k=3)",
    labels={'x': 'Feature 1', 'y': 'Feature 2', 'z': 'Feature 3'},
)

# Overlay the fitted centroids as black crosses
centroids_3d = kmeans_3d.centroids
centroid_marker_3d = dict(size=10, color='black', symbol='x')
fig_3d.add_scatter3d(
    x=centroids_3d[:, 0],
    y=centroids_3d[:, 1],
    z=centroids_3d[:, 2],
    mode='markers',
    marker=centroid_marker_3d,
    name='Centroids',
)
fig_3d.show()