### Download and prepare KDD1999 10% dataset

In [1]:
import os
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
import zipfile
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row

def load_kddcup_data(filepath):
    columns = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
        'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
        'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
        'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
        'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
        'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
    ]
    
    data = pd.read_csv(filepath, header=None, names=columns)
    return data

def preprocess_kddcup_data(data):
    # Separate features and labels
    X = data.drop(columns=['label'])
    
    # Define categorical and numerical columns
    categorical_features = ['protocol_type', 'service', 'flag']
    numerical_features = [col for col in X.columns if col not in categorical_features]
    
    # Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            # Convert all numerical features to standard normal destribution
            ('num', StandardScaler(), numerical_features),
            # One hot encode categorical features
            # ('cat', OneHotEncoder(), categorical_features)
        ]
    )
    
    return preprocessor.fit_transform(X)

def download_kddcup99(url, destination_file_name, destination_folder="/home/jovyan/work/data"):
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
        
    compressed_file = os.path.join(destination_folder, destination_file_name + ".gz")
    extracted_file = os.path.join(destination_folder, destination_file_name)
    
    # Download the dataset if not already downloaded
    if not os.path.exists(compressed_file):
        print("Downloading dataset...")
        urlretrieve(url, compressed_file)
        print("Download complete.")
    
    # Extract the dataset if not already extracted
    if not os.path.exists(extracted_file):
        print("Extracting dataset...")
        os.system(f"gunzip {compressed_file}")
        print("Extraction complete.")
    
    return extracted_file

In [2]:
url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
data_file_path = download_kddcup99(url, "kdd_10_percent")


In [3]:
k = 5  # Number of clusters
max_iterations = 20
num_partitions = 10

In [4]:
data = load_kddcup_data(data_file_path)
processed_data = preprocess_kddcup_data(data)

### Simple KMeans

In [8]:
import time
import random
import math

def initialize_centroids(X, k, seed=314):
    """
    Randomly initialize centroids from the dataset with a fixed seed for reproducibility.
    """
    random.seed(seed)
    random_indices = random.sample(range(len(X)), k)
    centroids = [X[idx][:] for idx in random_indices]  # Copy selected points
    return centroids

def euclidean_distance(point1, point2):
    """Compute the Euclidean distance between two points."""
    return math.sqrt(sum((p1 - p2) ** 2 for p1, p2 in zip(point1, point2)))

def compute_distances(X, centroids):
    """
    Compute the distance between each data point and each centroid.
    """
    distances = []
    for point in X:
        distances.append([euclidean_distance(point, centroid) for centroid in centroids])
    return distances

def assign_clusters(distances):
    """
    Assign each data point to the closest centroid.
    """
    labels = []
    for distance_list in distances:
        min_distance_index = distance_list.index(min(distance_list))
        labels.append(min_distance_index)
    return labels

def update_centroids(X, labels, k):
    """
    Update centroids as the mean of all points assigned to each cluster.
    """
    centroids = [[0] * len(X[0]) for _ in range(k)]
    counts = [0] * k
    
    # Sum points for each cluster
    for idx, label in enumerate(labels):
        for dim in range(len(X[idx])):
            centroids[label][dim] += X[idx][dim]
        counts[label] += 1
    
    # Divide by the count to compute the mean
    for i in range(k):
        if counts[i] > 0:  # Avoid division by zero
            centroids[i] = [val / counts[i] for val in centroids[i]]
    return centroids

def has_converged(new_centroids, old_centroids, tolerance):
    """
    Check if centroids have converged based on a given tolerance.
    """
    for nc, oc in zip(new_centroids, old_centroids):
        if any(abs(n - o) > tolerance for n, o in zip(nc, oc)):
            return False
    return True

def simple_kmeans(X, k, max_iters=150, tolerance=0):
    """
    Perform the K-Means clustering algorithm.
    """
    start_time = time.time()
    centroids = initialize_centroids(X, k)
    
    for i in range(max_iters):
        iteration_time = time.time()
        old_centroids = [c[:] for c in centroids]  # Deep copy
        distances = compute_distances(X, centroids)
        labels = assign_clusters(distances)
        centroids = update_centroids(X, labels, k)
        
        print(f"Iteration: {i}\ttime taken: {time.time() - iteration_time:.4f} seconds")
        
        # Check for convergence
        if has_converged(centroids, old_centroids, tolerance):
            print(f"K-Means converged after {i+1} iterations.")
            break
    
    print(f"Total time: {time.time() - start_time:.4f} seconds")
    return centroids, labels


In [9]:
centroids, labels = simple_kmeans(processed_data, k)

Iteration: 0	time taken: 19.4609 seconds
Iteration: 1	time taken: 18.6644 seconds
Iteration: 2	time taken: 18.4806 seconds
Iteration: 3	time taken: 18.4261 seconds
Iteration: 4	time taken: 18.2458 seconds
Iteration: 5	time taken: 18.3331 seconds
Iteration: 6	time taken: 18.4883 seconds
Iteration: 7	time taken: 18.1828 seconds
Iteration: 8	time taken: 18.4302 seconds
Iteration: 9	time taken: 18.2313 seconds
Iteration: 10	time taken: 18.1955 seconds
Iteration: 11	time taken: 18.4371 seconds
Iteration: 12	time taken: 18.2244 seconds
Iteration: 13	time taken: 18.4230 seconds
K-Means converged after 14 iterations.
Total time: 258.2253 seconds


In [5]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row




In [None]:
# Create Spark session
spark = SparkSession.builder \
    .appName("DockerClusterApp") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

In [6]:
def kmeans_to_spark_df(X, labels):
    """
    Convert NumPy array and cluster labels into a PySpark DataFrame with features and predictions.
    """
    # Convert NumPy data to a list of Rows with features and predictions
    rows = [Row(features=Vectors.dense(X[i]), prediction=int(labels[i])) for i in range(len(labels))]
    return spark.createDataFrame(rows)

In [11]:
spark_df = kmeans_to_spark_df(processed_data, labels)

evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="prediction", metricName="silhouette")
silhouette_score = evaluator.evaluate(spark_df)

print(f"Silhouette Score: {silhouette_score}")

spark.stop()

24/12/17 18:50:07 WARN TaskSetManager: Stage 0 contains a task of very large size (18932 KiB). The maximum recommended task size is 1000 KiB.
24/12/17 18:50:13 WARN TaskSetManager: Stage 1 contains a task of very large size (18932 KiB). The maximum recommended task size is 1000 KiB.
24/12/17 18:50:17 WARN TaskSetManager: Stage 3 contains a task of very large size (18932 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Silhouette Score: 0.6579106198643335


# PKMEANS

In [7]:
def assign_clusters_to_data(X, centroids):
    labels = []
    for point in X:
        distances = np.linalg.norm(centroids - point, axis=1)
        labels.append(np.argmin(distances))
    return np.array(labels)

In [10]:
import random 
import time
import math
import logging
import itertools
import typing
from numpy import array
import numpy as np
import pandas as pd
from scipy import spatial
import matplotlib.pyplot as plt
from pprint import pprint
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.clustering import KMeans, KMeansModel
from itertools import groupby, compress

from typing import Tuple, Sequence

def euclidean_dist(v1, v2):
    return np.sqrt(np.sum((v1 - v2) ** 2, axis=1))

def closest_centroid(points, centroids):
    distances = np.sqrt(((points[:, None] - centroids[None, :]) ** 2).sum(axis=2))
    closest_centroids_indices = np.argmin(distances, axis=1)
    return closest_centroids_indices

def calc_partition_centroid_means(partition, centroids):
    points = np.array(list(partition)) 
    if len(points) == 0:  # Handle empty partitions
        return []

    closest_indices = closest_centroid(points, centroids.value)
    
    # Combine points with their respective closest centroids
    data = pd.DataFrame({
        "Centroid": closest_indices,
        "Point": list(points)
    })

    # Expand Point into multiple dimensions
    point_df = pd.DataFrame(data['Point'].tolist(), index=data.index)
    combined_df = pd.concat([data['Centroid'], point_df], axis=1)

    # Group by Centroid and calculate mean for each group
    means = combined_df.groupby('Centroid').mean().reset_index().values
    return means

def closest_centroids(data, centroids):
    return data.mapPartitions(lambda partition: [calc_partition_centroid_means(partition, centroids)])

def aggregate_means(rdd):
    partition_means = np.concatenate(rdd.collect(), axis=0)

    num_dimensions = partition_means.shape[1] - 1  # Subtract 1 for 'Centroid'
    columns = ['Centroid'] + [f'dim_{i}' for i in range(num_dimensions)]

    df = pd.DataFrame(partition_means, columns=columns)

    grouped_means = df.groupby('Centroid').mean()
    return grouped_means

def handle_missing_centorids(aggregated_centroids, old_centroids):
    num_centorids = len(old_centroids)
    missing_centroids = set(range(num_centorids)) - set(aggregated_centroids.index)
    for ix in missing_centroids:
        # Copy old centroids to replace the missing
        aggregated_centroids.loc[ix] = old_centroids[ix]
    return aggregated_centroids.sort_index().to_numpy()

def calc_error(new_centroids, old_centroids):
    return np.sum(euclidean_dist(new_centroids, old_centroids))

def pkmeans(data, n, max_iterations=150, stop_distance=0.001):
    print(time.asctime(), "Started")
    start_time = time.time()
    init_centroids = np.array(data.takeSample(False, n, seed=42)) 
    centroids = sc.broadcast(init_centroids)

    iteration = 1
    error = float("inf")
    while error > stop_distance and iteration <= max_iterations:
        loop_start = time.time()
        closest_centroids_rdd = closest_centroids(data, centroids)
        aggregated_centroids = aggregate_means(closest_centroids_rdd)
        new_centroids = handle_missing_centorids(aggregated_centroids, centroids.value)
        error = calc_error(new_centroids, centroids.value)
        print("{3} Iteration #{0}\tDistance between old and new centroids: {1:.4f}\tIteration took: {2:.4f} sec".format(
            iteration, error, time.time() - loop_start, time.asctime()))
        
        # Update centroids
        centroids = sc.broadcast(new_centroids) 
        iteration += 1

    print(f"Total time: {time.time() - start_time}")

    return centroids.value
    


In [11]:
np.savetxt("data.csv", processed_data, delimiter=",")

In [20]:
spark = SparkSession.builder \
    .appName("PKmeans") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

sc = spark.sparkContext

rdd = sc.textFile("data.csv")

parsed_data = rdd.map(lambda line: [float(x) for x in line.split(",")])
parsed_data = parsed_data.cache()

24/12/17 19:56:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [13]:
final_centorids = pkmeans(parsed_data, k)

Mon Dec 16 10:47:02 2024 Started


                                                                                

Mon Dec 16 10:47:07 2024 Iteration #1	Distance between old and new centroids: 11.1520	Iteration took: 2.0140 sec


                                                                                

Mon Dec 16 10:47:09 2024 Iteration #2	Distance between old and new centroids: 13.1178	Iteration took: 1.6981 sec


                                                                                

Mon Dec 16 10:47:11 2024 Iteration #3	Distance between old and new centroids: 13.4210	Iteration took: 1.6940 sec


                                                                                

Mon Dec 16 10:47:12 2024 Iteration #4	Distance between old and new centroids: 19.5009	Iteration took: 1.7703 sec


                                                                                

Mon Dec 16 10:47:14 2024 Iteration #5	Distance between old and new centroids: 31.9940	Iteration took: 1.7832 sec


                                                                                

Mon Dec 16 10:47:16 2024 Iteration #6	Distance between old and new centroids: 38.0124	Iteration took: 1.6682 sec


                                                                                

Mon Dec 16 10:47:18 2024 Iteration #7	Distance between old and new centroids: 13.4152	Iteration took: 1.6994 sec


                                                                                

Mon Dec 16 10:47:19 2024 Iteration #8	Distance between old and new centroids: 5.1324	Iteration took: 1.7395 sec


                                                                                

Mon Dec 16 10:47:21 2024 Iteration #9	Distance between old and new centroids: 2.2159	Iteration took: 1.6679 sec


                                                                                

Mon Dec 16 10:47:23 2024 Iteration #10	Distance between old and new centroids: 0.3901	Iteration took: 1.7016 sec


                                                                                

Mon Dec 16 10:47:24 2024 Iteration #11	Distance between old and new centroids: 0.4805	Iteration took: 1.6348 sec


                                                                                

Mon Dec 16 10:47:26 2024 Iteration #12	Distance between old and new centroids: 0.9735	Iteration took: 1.5997 sec


                                                                                

Mon Dec 16 10:47:28 2024 Iteration #13	Distance between old and new centroids: 1.0434	Iteration took: 1.6163 sec


                                                                                

Mon Dec 16 10:47:29 2024 Iteration #14	Distance between old and new centroids: 0.5726	Iteration took: 1.6341 sec


                                                                                

Mon Dec 16 10:47:31 2024 Iteration #15	Distance between old and new centroids: 0.2786	Iteration took: 1.6155 sec


                                                                                

Mon Dec 16 10:47:32 2024 Iteration #16	Distance between old and new centroids: 0.1388	Iteration took: 1.5776 sec


                                                                                

Mon Dec 16 10:47:34 2024 Iteration #17	Distance between old and new centroids: 0.0383	Iteration took: 1.5980 sec


                                                                                

Mon Dec 16 10:47:36 2024 Iteration #18	Distance between old and new centroids: 0.0235	Iteration took: 1.9376 sec


                                                                                

Mon Dec 16 10:47:38 2024 Iteration #19	Distance between old and new centroids: 0.0025	Iteration took: 1.7273 sec


                                                                                

Mon Dec 16 10:47:39 2024 Iteration #20	Distance between old and new centroids: 0.0020	Iteration took: 1.6290 sec




Mon Dec 16 10:47:41 2024 Iteration #21	Distance between old and new centroids: 0.0000	Iteration took: 1.9044 sec
Total time: 39.3495032787323


                                                                                

In [14]:
labels = assign_clusters_to_data(processed_data, final_centorids)
spark_df = kmeans_to_spark_df(processed_data, labels)
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="prediction", metricName="silhouette")
silhouette_score = evaluator.evaluate(spark_df)

print(f"Silhouette Score: {silhouette_score}")

24/12/16 10:47:55 WARN TaskSetManager: Stage 23 contains a task of very large size (18932 KiB). The maximum recommended task size is 1000 KiB.
24/12/16 10:47:59 WARN TaskSetManager: Stage 24 contains a task of very large size (18932 KiB). The maximum recommended task size is 1000 KiB.
24/12/16 10:48:02 WARN TaskSetManager: Stage 26 contains a task of very large size (18932 KiB). The maximum recommended task size is 1000 KiB.

Silhouette Score: 0.615598344068801


                                                                                

In [8]:
import os
import random
import time
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors
import numpy as np

def calculate_similarity(group1, group2):
    """
    Calculate the similarity score between two centroid groups.
    The similarity is the inverse of the sum of distances between corresponding centroids.
    """
    distances = [np.linalg.norm(c1 - c2) for c1, c2 in zip(group1, group2)]
    return 1 / sum(distances) if sum(distances) != 0 else float('inf')


def find_most_dissimilar_group(target_group, groups):
    """
    Find the most dissimilar group to the target group.
    """
    max_dissimilarity = -float('inf')
    most_dissimilar = None
    
    for group in groups:
        if np.array_equal(target_group, group):
            continue  # Skip self-comparison
        
        similarity = calculate_similarity(target_group, group)
        if similarity > max_dissimilarity:
            max_dissimilarity = similarity
            most_dissimilar = group
    
    return most_dissimilar


def adgp(groups):
    """
    Generate new centroid groups using the Average of Dissimilar Group Pairs (ADGP).
    """
    new_groups = []
    group_count = len(groups)
    
    for i, group1 in enumerate(groups):
        group2 = find_most_dissimilar_group(group1, groups)
        
        # Compute the average of corresponding centroids to form a new group
        new_group = [(c1 + c2) / 2 for c1, c2 in zip(group1, group2)]
        new_groups.append(new_group)
    
    return new_groups

# Permutation function to align centroids across groups
def permute_centroids(centroid_groups):
    base_group = centroid_groups[0]
    permuted_groups = []

    for group in centroid_groups[1:]:
        # Track matched indices to prevent duplication
        matched = set()
        permuted_group = []
        for base_c in base_group:
            # Find the closest unmatched centroid
            distances = [(i, np.linalg.norm(base_c - c)) for i, c in enumerate(group) if i not in matched]
            if distances:
                closest_idx = min(distances, key=lambda x: x[1])[0]
                permuted_group.append(group[closest_idx])
                matched.add(closest_idx)
            else:
                # Handle unmatched cases by using a default
                permuted_group.append(base_c)
        permuted_groups.append(permuted_group)
    return [base_group] + permuted_groups

def initialize_centroid_groups(parsed_data, k, s):
    # Collect a small subset of the data for initialization
    sample_data = parsed_data.takeSample(False, k * s, seed=1)
    centroid_groups = [
        sample_data[i * k:(i + 1) * k] for i in range(s)
    ]
    return np.array(centroid_groups)

def mux_kmeans(data, k, s, max_iterations=20):
    start_time = time.time()
    
    # Initialize centroid groups
    initial_centroids_groups = initialize_centroid_groups(data, k, s)
    
    # Mux-Kmeans main loop
    for iteration in range(max_iterations):
        iteration_time = time.time()
        twcv_scores = []
        
        # Evaluate centroid groups
        for centroids in initial_centroids_groups:
            centroids_broadcast = sc.broadcast(centroids)
    
            # Assign points to clusters
            clustered_rdd = data.map(
                lambda p: (
                    np.argmin([np.linalg.norm(np.subtract(p, c)) for c in centroids_broadcast.value]),
                    (p, 1)
                )
            ) # (Cluster index, (point, 1))

            # Recalculate centroids
            new_centroids = (
                clustered_rdd
                .reduceByKey(lambda x, y: (np.add(x[0], y[0]), x[1] + y[1]))  # Sum points and count
                .map(lambda x: (x[0], x[1][0] / x[1][1]))  # Compute new centroids
                .collectAsMap()
            )

            new_centroids_arr = np.array(
                    [new_centroids[j] if j in new_centroids else centroids[j] for j in range(len(centroids))]
            )
    
            # Update the centroids to the new centroids
            centroids_broadcast = sc.broadcast(new_centroids_arr)
            
            # Assign points to the nearest new centroids
            centroid_to_point = data.map(
                lambda point: (
                    np.argmin([np.linalg.norm(point - c) for c in centroids_broadcast.value]), # cluster_id
                    point # original point
                )
            )
    
            # Calculate TWCV for the current group
            # Row is (cluster_id, point)
            twcv = centroid_to_point.map(lambda row: np.linalg.norm(row[1] - centroids_broadcast.value[row[0]]) ** 2).sum()
            twcv_scores.append((new_centroids_arr, twcv))
    
        # Prune and incubate
        twcv_scores.sort(key=lambda x: x[1])
        best_groups = twcv_scores[:s // 2]
        best_centroids = [x[0] for x in best_groups]
    
        # Permutation
        aligned_centroid_groups = permute_centroids(best_centroids)
        
        # Incubate new groups
        new_centroid_groups = []
        for group in aligned_centroid_groups:
            new_centroid_groups.append(adgp(group))
    
        # Prepare for the next iteration
        initial_centroids_groups = best_centroids + new_centroid_groups
        
        best_twcv = min(twcv_scores, key=lambda x: x[1])[1]

        print(f"Iteration: {iteration}\ttime taken: {time.time() - iteration_time:.4f} seconds\tBest TWCV: {best_twcv}")

    # Select the final best group
    final_group = min(twcv_scores, key=lambda x: x[1])
    print(f"Total time: {time.time() - start_time:.4f} seconds")
    return final_group[0]

In [9]:
spark = SparkSession.builder \
    .appName("Mux-KMeans") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

sc = spark.sparkContext

rdd = sc.textFile("data.csv")

parsed_data = rdd.map(lambda line: [float(x) for x in line.split(",")])
parsed_data = parsed_data.cache()
final_centorids = mux_kmeans(parsed_data, k, s=6)
spark.stop()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/21 14:36:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

Iteration: 0	time taken: 23.8191 seconds	Best TWCV: 10361162.584906505


                                                                                

Iteration: 1	time taken: 23.7854 seconds	Best TWCV: 9891506.035477528


                                                                                

Iteration: 2	time taken: 23.4082 seconds	Best TWCV: 9847454.353318198


                                                                                

Iteration: 3	time taken: 23.9537 seconds	Best TWCV: 9846769.03706399


                                                                                

Iteration: 4	time taken: 23.7534 seconds	Best TWCV: 9846736.705185255


                                                                                

Iteration: 5	time taken: 22.6463 seconds	Best TWCV: 9846731.241614208


                                                                                

Iteration: 6	time taken: 22.6670 seconds	Best TWCV: 9846731.061911369


                                                                                

Iteration: 7	time taken: 25.4484 seconds	Best TWCV: 9846731.03228543


                                                                                

Iteration: 8	time taken: 23.3847 seconds	Best TWCV: 9846731.03228543


                                                                                

Iteration: 9	time taken: 24.2897 seconds	Best TWCV: 9846731.03228543


                                                                                

Iteration: 10	time taken: 22.5967 seconds	Best TWCV: 9846731.03228543


                                                                                

Iteration: 11	time taken: 22.5518 seconds	Best TWCV: 9846731.03228543


                                                                                

Iteration: 12	time taken: 23.1161 seconds	Best TWCV: 9835561.933573037


                                                                                

Iteration: 13	time taken: 22.6853 seconds	Best TWCV: 9830718.41686887


                                                                                

Iteration: 14	time taken: 22.6829 seconds	Best TWCV: 9827567.668052265


                                                                                

Iteration: 15	time taken: 22.9280 seconds	Best TWCV: 9827407.692029335


                                                                                

Iteration: 16	time taken: 24.5223 seconds	Best TWCV: 9827406.47899581


                                                                                

Iteration: 17	time taken: 22.8960 seconds	Best TWCV: 9827406.469603414


                                                                                

Iteration: 18	time taken: 22.5367 seconds	Best TWCV: 9827406.456729176


24/12/21 14:44:08 WARN Dispatcher: Message RemoteProcessDisconnected(172.18.0.5:45342) dropped. Could not find OutputCommitCoordinator.


Iteration: 19	time taken: 22.4149 seconds	Best TWCV: 9827406.451064702
Total time: 470.5373 seconds


In [11]:
spark = SparkSession.builder \
    .appName("Mux-KMeans-Silhouette") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

labels = assign_clusters_to_data(processed_data, final_centorids)
spark_df = kmeans_to_spark_df(processed_data, labels)
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="prediction", metricName="silhouette")
silhouette_score = evaluator.evaluate(spark_df)

print(f"Silhouette Score: {silhouette_score}")
spark.stop()

24/12/21 14:45:49 WARN TaskSetManager: Stage 0 contains a task of very large size (21431 KiB). The maximum recommended task size is 1000 KiB.
24/12/21 14:45:54 WARN TaskSetManager: Stage 1 contains a task of very large size (21431 KiB). The maximum recommended task size is 1000 KiB.
24/12/21 14:45:58 WARN TaskSetManager: Stage 3 contains a task of very large size (21431 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Silhouette Score: 0.657910619864277
