In [4]:
import os
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors
import numpy as np
import random

In [6]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Mux-Kmeans") \
    .getOrCreate()

sc = spark.sparkContext

# Function to calculate TWCV
def calculate_twcv(clusters, centroids):
    twcv = 0
    for point, cluster_id in clusters:
        twcv += np.linalg.norm(point - centroids[cluster_id])**2
    return twcv

# Function for RSDS strategy
def rsds(centroids, clusters):
    new_centroids = []
    for cluster_points in clusters:
        dense_center = np.mean(cluster_points, axis=0)
        radius = np.min([np.linalg.norm(dense_center - c) for c in centroids])
        random_point = dense_center + np.random.uniform(-radius, radius, size=len(dense_center))
        new_centroids.append(random_point)
    return new_centroids

# Function for ADGP strategy
def adgp(centroids):
    new_centroids = []
    for i, c1 in enumerate(centroids):
        for j, c2 in enumerate(centroids):
            if i != j:
                midpoint = (c1 + c2) / 2
                new_centroids.append(midpoint)
    return random.sample(new_centroids, len(centroids))

In [13]:
A3_DATASET_URL = "https://cs.joensuu.fi/sipu/datasets/a3.txt"
DATA_FOLDER = "/home/jovyan/work/data"
A3_LOCAL_PATH = os.path.join(DATA_FOLDER, "a3.txt")

k = 50  # Number of clusters
s = 10  # Number of centroid groups
max_iterations = 10
num_partitions = 10


# Download Data
if not os.path.exists(A3_LOCAL_PATH):
    with open(A3_LOCAL_PATH, 'wb') as file:
        response = requests.get(A3_DATASET_URL)
        file.write(response.content)

# Load clean data into spark
data = sc.textFile(A3_LOCAL_PATH)
parsed_data = data.map(lambda row: np.array([float(x) for x in row.strip().split()])).repartition(num_partitions).cache()

parsed_data.take(5)

[array([53769., 43786.]),
 array([52883., 41365.]),
 array([54448., 42846.]),
 array([53358., 40498.]),
 array([54626., 43461.])]

In [14]:


# Initialize multiple centroid groups
initial_centroids_groups = [np.random.rand(k, parsed_data.first().size) for _ in range(s)]

# Main Mux-Kmeans loop
for iteration in range(max_iterations):
    twcv_scores = []
    new_centroids_groups = []

    # Step 1: Run KMeans for all centroid groups
    for centroids in initial_centroids_groups:
        kmeans = KMeans(k=k, initMode="k-means||").setInitialModel(centroids)
        model = kmeans.fit(data)
        clusters = model.transform(data).collect()
        twcv = calculate_twcv(clusters, centroids)
        twcv_scores.append((centroids, twcv))
    
    # Step 2: Prune half of the centroid groups
    twcv_scores.sort(key=lambda x: x[1])
    best_centroid_groups = [x[0] for x in twcv_scores[:s // 2]]
    
    # Step 3: Permute and Incubate
    for centroids in best_centroid_groups:
        # Choose RSDS or ADGP
        if iteration % 2 == 0:  # Alternate between strategies
            new_centroids = rsds(centroids, clusters)
        else:
            new_centroids = adgp(centroids)
        new_centroids_groups.append(new_centroids)
    
    # Update centroids for the next iteration
    initial_centroids_groups = best_centroid_groups + new_centroids_groups

# Select the best centroid group after iterations
final_model = min(twcv_scores, key=lambda x: x[1])
print(f"Best TWCV: {final_model[1]}")

AttributeError: 'KMeans' object has no attribute 'setInitialModel'