In [1]:
import numpy as np
import pandas as pd

def initialize_centroids(data, k):
    """Randomly initialize centroids."""
    indices = np.random.choice(data.shape[0], k, replace=False)
    return data.iloc[indices].values

def assign_clusters(data, centroids):
    """Assign data points to the nearest centroid."""
    distances = np.sqrt(((data.values[:, np.newaxis, :] - centroids[np.newaxis, :, :])**2).sum(axis=2))
    return np.argmin(distances, axis=1)

def update_centroids(data, assignments, k):
    """Update centroids to the mean of assigned data points."""
    return np.array([data[assignments == i].mean().values for i in range(k)])

def k_means_clustering(data, k, max_iters=100):
    """K-means clustering algorithm."""
    centroids = initialize_centroids(data, k)
    for _ in range(max_iters):
        old_centroids = centroids
        assignments = assign_clusters(data, centroids)
        centroids = update_centroids(data, assignments, k)
        
        if np.allclose(centroids, old_centroids):
            break

    return centroids, assignments

# Example usage
data = pd.DataFrame({
    'Feature1': [1, 1, 1, 10, 10, 10],
    'Feature2': [2, 4, 0, 2, 4, 0]
})
k = 2  # number of clusters
centroids, assignments = k_means_clustering(data, k)
print("Centroids:\n", centroids)
print("Assignments:\n", assignments)


Centroids:
 [[5.5 3. ]
 [5.5 0. ]]
Assignments:
 [0 0 1 0 0 1]


In [2]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

def jaccard_distance(point1, point2):
    M11 = np.sum(np.logical_and(point1, point2))
    min_A_B = min(np.sum(point1), np.sum(point2))
    return 1 - (M11 / min_A_B) if min_A_B != 0 else 1

def assign_clusters(data, medoids, distance_func):
    distances = cdist(data, medoids, metric=distance_func)
    return np.argmin(distances, axis=1)

def find_medoids(data, assignments, k):
    medoids = np.zeros((k, data.shape[1]))
    for i in range(k):
        cluster_points = data[assignments == i]
        if len(cluster_points) == 0:
            continue
        distances = cdist(cluster_points, cluster_points, metric='euclidean')
        medoid_index = np.argmin(np.sum(distances, axis=1))
        medoids[i] = cluster_points[medoid_index]
    return medoids

def k_medoids_clustering(data, k, distance_func, max_iters=100):
    # Step 1: Initialize medoids
    indices = np.random.choice(data.shape[0], k, replace=False)
    medoids = data[indices]

    for _ in range(max_iters):
        old_medoids = np.copy(medoids)
        
        # Step 2 & 3: Assign clusters
        assignments = assign_clusters(data, medoids, distance_func)
        
        # Step 4: Find new medoids
        medoids = find_medoids(data, assignments, k)

        if np.all(medoids == old_medoids):
            break

    return medoids, assignments

# Example usage
data = np.array([
    # example binary data for Hist1 region
])

k = 3  # number of clusters
medoids, assignments = k_medoids_clustering(data, k, jaccard_distance)


ValueError: a must be greater than 0 unless no samples are taken