In [4]:
import os
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors
import numpy as np
import random

## Reference
Li, Chen, et al. "Mux-Kmeans: Multiplex Kmeans for Clustering Large-Scale Data Set." *Proceedings of the 5th ACM Workshop on Scientific Cloud Computing (ScienceCloud'14)*, 23–27 June 2014, Vancouver, BC, Canada, ACM, 2014, pp. 1-7. https://doi.org/10.1145/2608029.2608033

## Adaptation
1. The algorithm was converted from map-reduce to spark
2. The algorithm was adapted to parallelize all the heavy computations including the cluster assignment and twcv
3. The TWCV is calculated in parallel utilizing the rdd

# TWCV Calculation Function

### What is TWCV?
The **Total Within-Cluster Variation (TWCV)** is a measure of the clustering quality. It calculates the sum of squared Euclidean distances between each data point and the centroid of its assigned cluster. A lower TWCV indicates better clustering as the points are closer to their respective centroids.

### Formula
$$
\text{TWCV} = \sum_{j=1}^K \sum_{x_i \in C_j} \| x_i - c_j \|^2
$$
Where:
- $( K )$: Number of clusters.
- $( C_j)$: Set of data points assigned to the $j$-th cluster.
- $( x_i)$: Data point in cluster $( C_j )$.
- $( c_j)$: Centroid of the $( j )$-th cluster.
- $( | x_i - c_j |^2 )$: Squared Euclidean distance between the data point $( x_i )$ and the centroid $( c_j $).

In [16]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Mux-Kmeans") \
    .getOrCreate()

sc = spark.sparkContext

# Total Within-Cluster Variation
def calculate_twcv(points, centroids):
    twcv = 0
    for point in points:
        distances = [np.linalg.norm(point - centroid) for centroid in centroids]
        twcv += min(distances) ** 2
    return twcv

# Function for RSDS strategy
def rsds(centroids, cluster_points):
    new_centroids = []
    for cluster in cluster_points:
        if len(cluster) > 0:
            dense_center = np.mean(cluster, axis=0)
            radius = np.min([np.linalg.norm(dense_center - c) for c in centroids])
            random_point = dense_center + np.random.uniform(-radius, radius, size=dense_center.shape)
            new_centroids.append(random_point)
    return new_centroids

# Function for ADGP strategy
def adgp(centroids):
    new_centroids = []
    for i, c1 in enumerate(centroids):
        for j, c2 in enumerate(centroids):
            if i < j:  # Avoid duplicate pairings
                midpoint = (c1 + c2) / 2
                new_centroids.append(midpoint)
    return random.sample(new_centroids, len(centroids))

In [13]:
A3_DATASET_URL = "https://cs.joensuu.fi/sipu/datasets/a3.txt"
DATA_FOLDER = "/home/jovyan/work/data"
A3_LOCAL_PATH = os.path.join(DATA_FOLDER, "a3.txt")

k = 50  # Number of clusters
s = 10  # Number of centroid groups
max_iterations = 10
num_partitions = 10


# Download Data
if not os.path.exists(A3_LOCAL_PATH):
    with open(A3_LOCAL_PATH, 'wb') as file:
        response = requests.get(A3_DATASET_URL)
        file.write(response.content)

# Load clean data into spark
data = sc.textFile(A3_LOCAL_PATH)
parsed_data = data.map(lambda row: np.array([float(x) for x in row.strip().split()])).repartition(num_partitions).cache()

parsed_data.take(5)

[array([53769., 43786.]),
 array([52883., 41365.]),
 array([54448., 42846.]),
 array([53358., 40498.]),
 array([54626., 43461.])]

In [17]:
# Initialize multiple centroid groups
initial_centroids_groups = [np.random.rand(k, len(parsed_data.take(1)[0])) for _ in range(s)]

# Main Mux-Kmeans Loop
for iteration in range(max_iterations):
    twcv_scores = []
    cluster_assignments = []

    # Evaluate all centroid groups
    for centroids in initial_centroids_groups:
        # Assign points to the nearest centroids
        cluster_points = parsed_data.map(
            lambda point: (np.argmin([np.linalg.norm(point - c) for c in centroids]), point)
        ).groupByKey().mapValues(list).collect()

        # Calculate TWCV for the current group
        twcv = calculate_twcv(parsed_data.collect(), centroids)
        twcv_scores.append((centroids, twcv, cluster_points))
    
    # Prune: Retain top-performing centroid groups
    twcv_scores.sort(key=lambda x: x[1])  # Sort by TWCV (lower is better)
    best_groups = twcv_scores[:s // 2]  # Retain top s/2 groups
    
    # Permute and Incubate
    new_centroids_groups = []
    for centroids, _, cluster_points in best_groups:
        if iteration % 2 == 0:  # Alternate between RSDS and ADGP
            new_centroids = rsds(centroids, cluster_points)
        else:
            new_centroids = adgp(centroids)
        new_centroids_groups.append(new_centroids)
    
    # Prepare centroid groups for the next iteration
    initial_centroids_groups = [group[0] for group in best_groups] + new_centroids_groups

# Final Output: Select the best centroid group
final_group = min(twcv_scores, key=lambda x: x[1])
print(f"Best TWCV: {final_group[1]}")

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.