### Download and prepare KDD1999 10% dataset

In [1]:
import os
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
import zipfile
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score

def load_kddcup_data(filepath):
    columns = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
        'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
        'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
        'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
        'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
        'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
    ]
    
    data = pd.read_csv(filepath, header=None, names=columns)
    return data

def preprocess_kddcup_data(data):
    # Separate features and labels
    X = data.drop(columns=['label'])
    
    # Define categorical and numerical columns
    categorical_features = ['protocol_type', 'service', 'flag']
    numerical_features = [col for col in X.columns if col not in categorical_features]
    
    # Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            # Convert all numerical features to standard normal destribution
            ('num', StandardScaler(), numerical_features),
            # One hot encode categorical features
            # ('cat', OneHotEncoder(), categorical_features)
        ]
    )
    
    return preprocessor.fit_transform(X)

def download_kddcup99(url, destination_file_name, destination_folder="/home/jovyan/work/data"):
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
        
    compressed_file = os.path.join(destination_folder, destination_file_name + ".gz")
    extracted_file = os.path.join(destination_folder, destination_file_name)
    
    # Download the dataset if not already downloaded
    if not os.path.exists(compressed_file):
        print("Downloading dataset...")
        urlretrieve(url, compressed_file)
        print("Download complete.")
    
    # Extract the dataset if not already extracted
    if not os.path.exists(extracted_file):
        print("Extracting dataset...")
        os.system(f"gunzip {compressed_file}")
        print("Extraction complete.")
    
    return extracted_file

In [2]:
url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
data_file_path = download_kddcup99(url, "kdd_10_percent")


In [3]:
k = 5  # Number of clusters
max_iterations = 20
num_partitions = 10

In [4]:
data = load_kddcup_data(data_file_path)
processed_data = preprocess_kddcup_data(data)

### Simple KMeans

In [5]:
import time
import numpy as np
import matplotlib.pyplot as plt

def initialize_centroids(X, k, seed=314):
    """
    Randomly initialize centroids from the dataset with a fixed seed for reproducibility.
    """
    np.random.seed(seed)
    random_indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[random_indices]
    return centroids

def compute_distances(X, centroids):
    """
    Compute the distance between each data point and each centroid.
    """
    distances = np.zeros((X.shape[0], len(centroids)))
    for i, centroid in enumerate(centroids):
        distances[:, i] = np.linalg.norm(X - centroid, axis=1)
    return distances

def assign_clusters(distances):
    """
    Assign each data point to the closest centroid.
    """
    return np.argmin(distances, axis=1)

def update_centroids(X, labels, k):
    """
    Update centroids as the mean of all points assigned to each cluster.
    """
    centroids = np.zeros((k, X.shape[1]))
    for i in range(k):
        points = X[labels == i]
        if points.shape[0] > 0:
            centroids[i] = points.mean(axis=0)
    return centroids

def simple_kmeans(X, k, max_iters=150, tolerance=0):
    """
    Perform the K-Means clustering algorithm.
    """
    start_time = time.time()
    centroids = initialize_centroids(X, k)
    for i in range(max_iters):
        iteration_time = time.time()
        old_centroids = centroids
        distances = compute_distances(X, centroids)
        labels = assign_clusters(distances)
        centroids = update_centroids(X, labels, k)

        print(f"Iteration: {i}\ttime taken: {time.time() - iteration_time}")
        
        # Check for convergence
        if np.all(np.abs(centroids - old_centroids) <= tolerance):
            print(f"K-Means converged after {i+1} iterations.")
            break
    print(f"Total time: {time.time() - start_time}")
    return centroids, labels

In [6]:
centroids, labels = simple_kmeans(processed_data, k)

Iteration: 0	time taken: 1.1453568935394287
Iteration: 1	time taken: 1.1440613269805908
Iteration: 2	time taken: 1.1610233783721924
Iteration: 3	time taken: 1.1426568031311035
Iteration: 4	time taken: 1.1440942287445068
Iteration: 5	time taken: 1.143282413482666
Iteration: 6	time taken: 1.145038366317749
Iteration: 7	time taken: 1.1507244110107422
Iteration: 8	time taken: 1.1338989734649658
Iteration: 9	time taken: 1.135542631149292
Iteration: 10	time taken: 1.1392748355865479
Iteration: 11	time taken: 1.1363136768341064
K-Means converged after 12 iterations.
Total time: 13.729131937026978


In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row

# Create Spark session
spark = SparkSession.builder \
    .appName("DockerClusterApp") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

def kmeans_to_spark_df(X, labels):
    """
    Convert NumPy array and cluster labels into a PySpark DataFrame with features and predictions.
    """
    # Convert NumPy data to a list of Rows with features and predictions
    rows = [Row(features=Vectors.dense(X[i]), prediction=int(labels[i])) for i in range(len(labels))]
    return spark.createDataFrame(rows)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/16 10:46:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
spark_df = kmeans_to_spark_df(processed_data, labels)

evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="prediction", metricName="silhouette")
silhouette_score = evaluator.evaluate(spark_df)

print(f"Silhouette Score: {silhouette_score}")

spark.stop()

24/12/16 10:46:32 WARN TaskSetManager: Stage 0 contains a task of very large size (18932 KiB). The maximum recommended task size is 1000 KiB.
24/12/16 10:46:37 WARN TaskSetManager: Stage 1 contains a task of very large size (18932 KiB). The maximum recommended task size is 1000 KiB.
24/12/16 10:46:41 WARN TaskSetManager: Stage 3 contains a task of very large size (18932 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Silhouette Score: 0.6279357756715856


# PKMEANS

In [9]:
def assign_clusters_to_data(X, centroids):
    labels = []
    for point in X:
        distances = np.linalg.norm(centroids - point, axis=1)
        labels.append(np.argmin(distances))
    return np.array(labels)

In [10]:
import random 
import time
import math
import logging
import itertools
import typing
from numpy import array
import numpy as np
import pandas as pd
from scipy import spatial
import matplotlib.pyplot as plt
from pprint import pprint
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.clustering import KMeans, KMeansModel
from itertools import groupby, compress

from typing import Tuple, Sequence

spark = SparkSession.builder \
    .appName("PKmeans") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

sc = spark.sparkContext

def euclidean_dist(v1, v2):
    return np.sqrt(np.sum((v1 - v2) ** 2, axis=1))

def closest_centroid(points, centroids):
    distances = np.sqrt(((points[:, None] - centroids[None, :]) ** 2).sum(axis=2))
    closest_centroids_indices = np.argmin(distances, axis=1)
    return closest_centroids_indices

def calc_partition_centroid_means(partition, centroids):
    points = np.array(list(partition)) 
    if len(points) == 0:  # Handle empty partitions
        return []

    closest_indices = closest_centroid(points, centroids.value)
    
    # Combine points with their respective closest centroids
    data = pd.DataFrame({
        "Centroid": closest_indices,
        "Point": list(points)
    })

    # Expand Point into multiple dimensions
    point_df = pd.DataFrame(data['Point'].tolist(), index=data.index)
    combined_df = pd.concat([data['Centroid'], point_df], axis=1)

    # Group by Centroid and calculate mean for each group
    means = combined_df.groupby('Centroid').mean().reset_index().values
    return means

def closest_centroids(data, centroids):
    return data.mapPartitions(lambda partition: [calc_partition_centroid_means(partition, centroids)])

def aggregate_means(rdd):
    partition_means = np.concatenate(rdd.collect(), axis=0)

    num_dimensions = partition_means.shape[1] - 1  # Subtract 1 for 'Centroid'
    columns = ['Centroid'] + [f'dim_{i}' for i in range(num_dimensions)]

    df = pd.DataFrame(partition_means, columns=columns)

    grouped_means = df.groupby('Centroid').mean()
    return grouped_means

def handle_missing_centorids(aggregated_centroids, old_centroids):
    num_centorids = len(old_centroids)
    missing_centroids = set(range(num_centorids)) - set(aggregated_centroids.index)
    for ix in missing_centroids:
        # Copy old centroids to replace the missing
        aggregated_centroids.loc[ix] = old_centroids[ix]
    return aggregated_centroids.sort_index().to_numpy()

def calc_error(new_centroids, old_centroids):
    return np.sum(euclidean_dist(new_centroids, old_centroids))

def pkmeans(data, n, max_iterations=150, stop_distance=0.001):
    print(time.asctime(), "Started")
    start_time = time.time()
    init_centroids = np.array(data.takeSample(False, n, seed=42)) 
    centroids = sc.broadcast(init_centroids)

    iteration = 1
    error = float("inf")
    while error > stop_distance and iteration <= max_iterations:
        loop_start = time.time()
        closest_centroids_rdd = closest_centroids(data, centroids)
        aggregated_centroids = aggregate_means(closest_centroids_rdd)
        new_centroids = handle_missing_centorids(aggregated_centroids, centroids.value)
        error = calc_error(new_centroids, centroids.value)
        print("{3} Iteration #{0}\tDistance between old and new centroids: {1:.4f}\tIteration took: {2:.4f} sec".format(
            iteration, error, time.time() - loop_start, time.asctime()))
        
        # Update centroids
        centroids = sc.broadcast(new_centroids) 
        iteration += 1

    print(f"Total time: {time.time() - start_time}")

    return centroids.value
    


In [11]:
np.savetxt("data.csv", processed_data, delimiter=",")

In [12]:
rdd = sc.textFile("data.csv")

parsed_data = rdd.map(lambda line: [float(x) for x in line.split(",")])
parsed_data = parsed_data.cache()

In [13]:
final_centorids = pkmeans(parsed_data, k)

Mon Dec 16 10:47:02 2024 Started


                                                                                

Mon Dec 16 10:47:07 2024 Iteration #1	Distance between old and new centroids: 11.1520	Iteration took: 2.0140 sec


                                                                                

Mon Dec 16 10:47:09 2024 Iteration #2	Distance between old and new centroids: 13.1178	Iteration took: 1.6981 sec


                                                                                

Mon Dec 16 10:47:11 2024 Iteration #3	Distance between old and new centroids: 13.4210	Iteration took: 1.6940 sec


                                                                                

Mon Dec 16 10:47:12 2024 Iteration #4	Distance between old and new centroids: 19.5009	Iteration took: 1.7703 sec


                                                                                

Mon Dec 16 10:47:14 2024 Iteration #5	Distance between old and new centroids: 31.9940	Iteration took: 1.7832 sec


                                                                                

Mon Dec 16 10:47:16 2024 Iteration #6	Distance between old and new centroids: 38.0124	Iteration took: 1.6682 sec


                                                                                

Mon Dec 16 10:47:18 2024 Iteration #7	Distance between old and new centroids: 13.4152	Iteration took: 1.6994 sec


                                                                                

Mon Dec 16 10:47:19 2024 Iteration #8	Distance between old and new centroids: 5.1324	Iteration took: 1.7395 sec


                                                                                

Mon Dec 16 10:47:21 2024 Iteration #9	Distance between old and new centroids: 2.2159	Iteration took: 1.6679 sec


                                                                                

Mon Dec 16 10:47:23 2024 Iteration #10	Distance between old and new centroids: 0.3901	Iteration took: 1.7016 sec


                                                                                

Mon Dec 16 10:47:24 2024 Iteration #11	Distance between old and new centroids: 0.4805	Iteration took: 1.6348 sec


                                                                                

Mon Dec 16 10:47:26 2024 Iteration #12	Distance between old and new centroids: 0.9735	Iteration took: 1.5997 sec


                                                                                

Mon Dec 16 10:47:28 2024 Iteration #13	Distance between old and new centroids: 1.0434	Iteration took: 1.6163 sec


                                                                                

Mon Dec 16 10:47:29 2024 Iteration #14	Distance between old and new centroids: 0.5726	Iteration took: 1.6341 sec


                                                                                

Mon Dec 16 10:47:31 2024 Iteration #15	Distance between old and new centroids: 0.2786	Iteration took: 1.6155 sec


                                                                                

Mon Dec 16 10:47:32 2024 Iteration #16	Distance between old and new centroids: 0.1388	Iteration took: 1.5776 sec


                                                                                

Mon Dec 16 10:47:34 2024 Iteration #17	Distance between old and new centroids: 0.0383	Iteration took: 1.5980 sec


                                                                                

Mon Dec 16 10:47:36 2024 Iteration #18	Distance between old and new centroids: 0.0235	Iteration took: 1.9376 sec


                                                                                

Mon Dec 16 10:47:38 2024 Iteration #19	Distance between old and new centroids: 0.0025	Iteration took: 1.7273 sec


                                                                                

Mon Dec 16 10:47:39 2024 Iteration #20	Distance between old and new centroids: 0.0020	Iteration took: 1.6290 sec




Mon Dec 16 10:47:41 2024 Iteration #21	Distance between old and new centroids: 0.0000	Iteration took: 1.9044 sec
Total time: 39.3495032787323


                                                                                

In [14]:
labels = assign_clusters_to_data(processed_data, final_centorids)
spark_df = kmeans_to_spark_df(processed_data, labels)
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="prediction", metricName="silhouette")
silhouette_score = evaluator.evaluate(spark_df)

print(f"Silhouette Score: {silhouette_score}")

24/12/16 10:47:55 WARN TaskSetManager: Stage 23 contains a task of very large size (18932 KiB). The maximum recommended task size is 1000 KiB.
24/12/16 10:47:59 WARN TaskSetManager: Stage 24 contains a task of very large size (18932 KiB). The maximum recommended task size is 1000 KiB.
24/12/16 10:48:02 WARN TaskSetManager: Stage 26 contains a task of very large size (18932 KiB). The maximum recommended task size is 1000 KiB.

Silhouette Score: 0.615598344068801


                                                                                