Feature Normalization

In [3]:
def standardize(data):
    """Rescale every feature (column) of ``data`` to zero mean and unit spread.

    ``data`` is a sequence of rows, each row a sequence of numbers. The spread
    of a feature is its population standard deviation (squared deviations
    summed and divided by the number of rows). A spread smaller than 1e-7 is
    replaced by 1e-7 so that constant features do not cause a division by zero.

    Returns a new list of lists; the rows of ``data`` are not modified.
    """
    floor = 1e-7  # smallest divisor allowed for any feature
    columns = list(zip(*data))

    centers = []
    spreads = []
    for column in columns:
        count = len(column)
        center = sum(column) / count
        variance = sum((value - center) ** 2 for value in column) / count
        centers.append(center)
        spreads.append(max(floor, variance ** 0.5))

    scaled_rows = []
    for row in data:
        scaled_rows.append(
            [(value - center) / spread for value, center, spread in zip(row, centers, spreads)]
        )
    return scaled_rows

Initialization

In [4]:
import random

def initialize_centroids(data, k, rng=None):
    """Choose ``k`` distinct rows of ``data`` as the starting centroids.

    Parameters:
        data: sequence of points (rows) to draw from.
        k: number of centroids to pick.
        rng: optional ``random.Random`` instance (or any object with a
            compatible ``sample`` method). Pass a seeded instance to make the
            initialisation, and therefore the whole clustering, reproducible.
            When omitted, the module-level ``random`` generator is used.

    Returns:
        A list of ``k`` rows taken from ``data`` (the same row objects, not
        copies).

    Raises:
        ValueError: from ``sample`` when ``k`` is negative or larger than
            ``len(data)``.
    """
    sampler = random if rng is None else rng
    return sampler.sample(data, k)

Assignment Step

In [5]:
def euclidean_distance(point1, point2):
    """Return the straight-line (L2) distance between two points.

    Coordinates are paired positionally with ``zip``, so if the points differ
    in length only the leading coordinates common to both are compared.
    """
    squared_gaps = [(a - b) ** 2 for a, b in zip(point1, point2)]
    return sum(squared_gaps) ** 0.5

def assign_points_to_centroids(data, centroids):
    """Group every point under the index of its nearest centroid.

    Returns one list per centroid, in centroid order, holding the points of
    ``data`` (in their original order) that are closest to that centroid by
    ``euclidean_distance``. A point equally close to several centroids goes to
    the one with the lowest index. A centroid that attracts no points keeps an
    empty list.
    """
    groups = [[] for _ in range(len(centroids))]
    for sample in data:
        gaps = [euclidean_distance(sample, center) for center in centroids]
        # min() keeps the first index that reaches the smallest distance.
        nearest = min(range(len(gaps)), key=gaps.__getitem__)
        groups[nearest].append(sample)
    return groups

Update Step

In [7]:
def calculate_new_centroids(clusters, previous_centroids=None):
    """Compute the mean point of each cluster.

    Parameters:
        clusters: list of clusters, each a list of points (rows of numbers).
        previous_centroids: optional list with one centroid per cluster. When
            given, a cluster that contains no points keeps (a copy of) its
            previous centroid instead of producing an empty one.

    Returns:
        A list with one centroid per cluster, in cluster order. Each centroid
        is the per-feature mean of the cluster's points. Without
        ``previous_centroids`` an empty cluster yields ``[]``, because there
        are no points to average.
    """
    centroids = []
    for index, cluster in enumerate(clusters):
        if not cluster and previous_centroids is not None:
            # Nothing to average: an empty centroid would measure distance 0
            # to every point under a zip-based distance, so reuse the old one.
            centroids.append(list(previous_centroids[index]))
            continue
        centroids.append([sum(feature) / len(feature) for feature in zip(*cluster)])
    return centroids

KMeans Algorithm

In [8]:
def kmeans(data, k, max_iterations=100):
    """Cluster ``data`` into ``k`` groups by alternating assignment and update.

    The input is standardized first, so the returned centroids and cluster
    members are expressed in standardized units rather than the units of
    ``data``. Starting centroids are chosen by ``initialize_centroids``.

    Parameters:
        data: sequence of points (rows of numbers).
        k: number of clusters.
        max_iterations: upper bound on centroid updates; iteration stops
            earlier once an update leaves every centroid unchanged.

    Returns:
        ``(centroids, clusters)`` where ``clusters[i]`` holds the standardized
        points assigned to ``centroids[i]``. The clusters are always the
        assignment for the returned centroids.
    """
    data = standardize(data)
    centroids = initialize_centroids(data, k)
    # Assign up front so ``clusters`` is defined even when max_iterations <= 0.
    clusters = assign_points_to_centroids(data, centroids)
    for _ in range(max_iterations):
        updated = calculate_new_centroids(clusters)
        # A cluster with no points produces an empty centroid, which would
        # measure distance 0 to every point and capture the whole data set on
        # the next assignment. Keep such a centroid where it was instead.
        new_centroids = [new if new else old for new, old in zip(updated, centroids)]
        if new_centroids == centroids:
            break
        centroids = new_centroids
        # Re-assign after each move so the clusters match the centroids.
        clusters = assign_points_to_centroids(data, centroids)
    return centroids, clusters

Example Usage

In [9]:
# Example data: six 2-D points, three with x = 1 and three with x = 10.
data = [[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]

# Number of clusters to fit.
k = 2

# Run k-means. kmeans() standardizes its input before clustering, so the
# centroids and cluster members it returns are in standardized units, not the
# raw units of `data`. The starting centroids are drawn at random, so the
# result can differ from one run to the next.
centroids, clusters = kmeans(data, k)

print("Centroids:", centroids)
print("Clusters:", clusters)

Centroids: [[0.0, -1.224744871391589], [0.0, 0.6123724356957945]]
Clusters: [[[-1.0, -1.224744871391589], [1.0, -1.224744871391589]], [[-1.0, 0.0], [-1.0, 1.224744871391589], [1.0, 0.0], [1.0, 1.224744871391589]]]
