
# Parallel Multiobjective PSO Weighted Average Clustering Algorithm Implementation
Based on Ling, Huidong, et al. "A Parallel Multiobjective PSO Weighted Average Clustering Algorithm Based on Apache Spark." Entropy 25.2 (2023): 259.

## Algorithm Outline

 

## Adaptations made to suggested implementation of the algorithm:
1. 

## Description of production cluster:



Datasets: https://cs.joensuu.fi/sipu/datasets/

In [1]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import split, col, size, trim, lit
from sklearn.cluster import KMeans


# Initialize a Spark session
spark = SparkSession.builder \
    .appName("FirstSparkJob") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/26 20:23:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [28]:
# Cluster and data Parameters
np.random.seed(301191)
num_clusters = 50

In [31]:
A3_DATASET_URL = "https://cs.joensuu.fi/sipu/datasets/a3.txt"
DATA_FOLDER = "/home/jovyan/work/data"
A3_LOCAL_PATH = os.path.join(DATA_FOLDER, "a3.txt")

# Download Data
response = requests.get(A3_DATASET_URL)
if not os.path.exists(A3_LOCAL_PATH):
    with open(A3_LOCAL_PATH, 'wb') as file:
        file.write(response.content)

# Load clean data into spark
data = sc.textFile(A3_LOCAL_PATH)
parsed_data = data.map(lambda row: np.array([float(x) for x in row.strip().split()])).cache()

parsed_data.take(5)

                                                                                

[array([53920., 42968.]),
 array([52019., 42206.]),
 array([52570., 42476.]),
 array([54220., 42081.]),
 array([54268., 43420.])]

In [None]:
# Algorithm parameters, taken from the paper page 9
c1 = c2 = 1.49445    # Learning parameters
w = 3                # Weight factor
num_particles = 50   # Particles number
archive_depth = 15   # Number of items in the archive

# Get Data dimensions
dimensions = len(parsed_data.take(1)[0])

# Particle Initialization
def initialize_particles(num_particles, num_clusters, num_features):
    particles = []
    for _ in range(num_particles):
        particle = {
            'position': np.random.rand(num_clusters, num_features),
            'velocity': np.random.rand(num_clusters, num_features),
            'pbest_position': None,
            'pbest_value': float('inf')
        }
        particles.append(particle)
    return particles

particles = initialize_particles(num_particles, num_clusters, num_features)
particles_bc = sc.broadcast(particles)



In [2]:
# Fitness Evaluation Function
def compute_fitness(data_partition, particle_position):
    fitness_compactness = 0
    fitness_connectivity = 0
    cluster_assignments = []

    for point in data_partition:
        distances = [euclidean(point, centroid) for centroid in particle_position]
        assigned_cluster = np.argmin(distances)
        cluster_assignments.append((assigned_cluster, point))
        fitness_compactness += distances[assigned_cluster]

    # Connectivity fitness calculation (pairwise distance within clusters)
    for i, (cluster, point) in enumerate(cluster_assignments):
        if i < len(cluster_assignments) - 1:
            next_point = cluster_assignments[i + 1][1]
            fitness_connectivity += euclidean(point, next_point)

    return fitness_compactness, fitness_connectivity

# Local Fitness Calculation in Partitions with K-means
def calculate_local_fitness(data_partition, particles_bc: Broadcast):
    particles = particles_bc.value
    local_fitness = []

    # Convert partition to a list to be compatible with KMeans (since it's an RDD partition)
    data_partition = list(data_partition)

    for particle in particles:
        # Using particle's position as the initial cluster centers in K-means
        kmeans = KMeans(n_clusters=num_clusters, init=particle['position'], n_init=1, max_iter=10)
        
        # Fit the KMeans model to the data in this partition
        kmeans.fit(data_partition)
        
        # Calculate compactness (within-cluster distances)
        compactness = 0
        for i, center in enumerate(kmeans.cluster_centers_):
            # Get points assigned to this cluster
            points_in_cluster = [point for idx, point in enumerate(data_partition) if kmeans.labels_[idx] == i]
            compactness += sum(euclidean(point, center) for point in points_in_cluster)

        # Calculate connectivity (pairwise distance within each cluster)
        connectivity = 0
        for i, point in enumerate(data_partition[:-1]):
            # Distance to the next point in the same partition
            connectivity += euclidean(point, data_partition[i + 1])

        # Append fitness (compactness and connectivity) for this particle in this partition
        local_fitness.append((compactness, connectivity))

    return local_fitness

# Aggregate and Weighted Average Fitness Calculation
def aggregate_fitness(local_fitness_rdd, num_samples):
    fitness_values = local_fitness_rdd.collect()
    weighted_fitness = np.average(fitness_values, axis=0, weights=[partition_size / num_samples for partition_size in [len(f) for f in fitness_values]])
    return weighted_fitness

# Update Particle Position and Velocity
def update_particles(particles, global_best_position):
    for particle in particles:
        for cluster in range(num_clusters):
            inertia = w * particle['velocity'][cluster]
            cognitive = c1 * np.random.rand() * (particle['pbest_position'][cluster] - particle['position'][cluster])
            social = c2 * np.random.rand() * (global_best_position[cluster] - particle['position'][cluster])
            particle['velocity'][cluster] = inertia + cognitive + social
            particle['position'][cluster] += particle['velocity'][cluster]

    return particles

# External Archive and Crowding Distance Calculation
def update_archive(particles):
    # Use non-dominated sorting or other Pareto-based method to keep top solutions
    archive = []
    for particle in particles:
        archive.append((particle['position'], particle['pbest_value']))
    # Sort and prune archive if needed (e.g., keep unique non-dominated solutions)
    archive = sorted(archive, key=lambda x: x[1])[:num_particles]
    return archive

# Main Optimization Loop
for iteration in range(num_iterations):
    # Local Fitness Calculation
    local_fitness_rdd = data_rdd.mapPartitions(lambda partition: calculate_local_fitness(partition, particles_bc))
    global_fitness = aggregate_fitness(local_fitness_rdd, num_samples=data_rdd.count())

    # Update pbest and gbest
    for particle in particles:
        if particle['pbest_value'] > global_fitness:
            particle['pbest_position'] = particle['position']
            particle['pbest_value'] = global_fitness

    global_best_position = min(particles, key=lambda p: p['pbest_value'])['pbest_position']
    particles = update_particles(particles, global_best_position)

    # Broadcast updated particles
    particles_bc = sc.broadcast(particles)

# Final Archive of Best Solutions
archive = update_archive(particles)
print("Final Archive:", archive)

# Stopping Spark Context
sc.stop()

NameError: name 'num_particles' is not defined