### Download and prepare the KDD Cup 1999 10% dataset

In [1]:
import gzip
import os
import shutil
import zipfile
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def load_kddcup_data(filepath):
    """
    Read a header-less KDD Cup CSV file into a DataFrame.

    The file is expected to carry 42 comma-separated fields per record; they are
    given the names listed below, in order, with 'label' as the last column.
    """
    # Column names in file order, one whitespace-separated token per field.
    column_names = """
        duration protocol_type service flag src_bytes dst_bytes
        land wrong_fragment urgent hot num_failed_logins logged_in
        num_compromised root_shell su_attempted num_root num_file_creations
        num_shells num_access_files num_outbound_cmds is_host_login is_guest_login
        count srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate
        same_srv_rate diff_srv_rate srv_diff_host_rate dst_host_count dst_host_srv_count
        dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate
        dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate
        dst_host_rerror_rate dst_host_srv_rerror_rate label
    """.split()

    return pd.read_csv(filepath, names=column_names, header=None)

def preprocess_kddcup_data(data):
    """
    Turn the raw KDD Cup frame into a standardized numeric feature matrix.

    The 'label' column is removed, and every remaining column except
    'protocol_type', 'service' and 'flag' is scaled with StandardScaler.
    Those three categorical columns are not handed to any transformer, so they
    are absent from the output (ColumnTransformer drops unlisted columns by default).

    Returns whatever ColumnTransformer.fit_transform produces for the scaled columns.
    """
    features = data.drop(columns=['label'])

    categorical_features = ['protocol_type', 'service', 'flag']
    numerical_features = [
        name for name in features.columns if name not in categorical_features
    ]

    # Only the numeric scaler (zero mean, unit variance per column) is active.
    # One-hot encoding of the categorical columns is currently disabled; to
    # enable it, add: ('cat', OneHotEncoder(), categorical_features)
    scaling_step = ('num', StandardScaler(), numerical_features)
    preprocessor = ColumnTransformer(transformers=[scaling_step])

    return preprocessor.fit_transform(features)

def download_kddcup99(url, destination_file_name, destination_folder="/home/jovyan/work/data"):
    """
    Make sure the extracted dataset file exists locally and return its path.

    Parameters:
        url: location of the gzip-compressed dataset.
        destination_file_name: name of the extracted file; the archive is stored
            next to it under the same name with a ".gz" suffix.
        destination_folder: directory holding both files (created if missing).

    Returns:
        Path of the extracted file.

    Behaviour:
        - If the extracted file already exists, nothing is downloaded or extracted.
        - Otherwise the archive is downloaded (unless it is already present) and
          decompressed in-process with the gzip module, so paths containing spaces
          or shell metacharacters are handled safely and failures raise exceptions
          instead of being silently ignored.
        - The archive is kept after extraction.
        - Download and extraction both write to a ".part" file that is renamed on
          success, so an interrupted run never leaves a truncated file that a
          later run would mistake for a complete one.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(destination_folder, exist_ok=True)

    compressed_file = os.path.join(destination_folder, destination_file_name + ".gz")
    extracted_file = os.path.join(destination_folder, destination_file_name)

    # The extracted file is the only artefact callers need; once it is there,
    # a missing archive must not trigger another download.
    if os.path.exists(extracted_file):
        return extracted_file

    # Download the dataset if not already downloaded
    if not os.path.exists(compressed_file):
        print("Downloading dataset...")
        partial_download = compressed_file + ".part"
        urlretrieve(url, partial_download)
        os.replace(partial_download, compressed_file)
        print("Download complete.")

    # Extract the dataset
    print("Extracting dataset...")
    partial_extract = extracted_file + ".part"
    with gzip.open(compressed_file, "rb") as archive, open(partial_extract, "wb") as target:
        shutil.copyfileobj(archive, target)
    os.replace(partial_extract, extracted_file)
    print("Extraction complete.")

    return extracted_file

In [2]:
# Location of the gzip-compressed 10% subset of the KDD Cup 1999 data.
url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
# Download and extract into the default data folder (each step is skipped when its
# output file already exists); the returned value is the path of the extracted file.
data_file_path = download_kddcup99(url, "kdd_10_percent")


In [3]:
# Clustering configuration shared by the cells below.
k = 5  # Number of clusters
max_iterations = 20  # Intended cap on K-Means iterations
num_partitions = 10  # Presumably a Spark partition count; not used in the cells shown here - confirm

In [4]:
# Read the extracted file into a DataFrame with named columns.
data = load_kddcup_data(data_file_path)
# Drop the label and standardize the numeric columns; this matrix is what gets clustered.
processed_data = preprocess_kddcup_data(data)

### Simple KMeans

In [5]:
import time
import numpy as np
import matplotlib.pyplot as plt

def initialize_centroids(X, k, seed=314):
    """
    Pick k distinct rows of X as the initial centroids, reproducibly.

    Parameters:
        X: 2-D array of samples (rows are indexed with an integer array).
        k: number of centroids to draw; must not exceed the number of rows,
           otherwise NumPy raises ValueError because sampling is without replacement.
        seed: seed for the random draw; the same seed always selects the same rows.

    Returns:
        Array holding the k selected rows of X.
    """
    # A private RandomState yields exactly the indices that
    # np.random.seed(seed) + np.random.choice(...) would, but leaves the global
    # NumPy random stream untouched, so calling this function no longer changes
    # the random numbers drawn by unrelated code.
    rng = np.random.RandomState(seed)
    random_indices = rng.choice(X.shape[0], k, replace=False)
    centroids = X[random_indices]
    return centroids

def compute_distances(X, centroids):
    """
    Build the sample-to-centroid Euclidean distance matrix.

    Entry [r, c] of the result is the L2 norm of (X[r] - centroids[c]); the
    result has one row per sample in X and one column per centroid.
    """
    n_centroids = len(centroids)
    dist_matrix = np.empty((X.shape[0], n_centroids))
    # One column at a time keeps the temporary (X - centroid) at the size of X.
    for col in range(n_centroids):
        offsets = X - centroids[col]
        dist_matrix[:, col] = np.linalg.norm(offsets, axis=1)
    return dist_matrix

def assign_clusters(distances):
    """
    Map every row of the distance matrix to the index of its smallest entry.

    With one column per centroid, the returned 1-D array holds the index of the
    nearest centroid for each sample.
    """
    nearest_centroid = np.argmin(distances, axis=1)
    return nearest_centroid

def update_centroids(X, labels, k):
    """
    Recompute each centroid as the mean of the samples currently assigned to it.

    Returns a (k, n_features) array. A cluster with no assigned samples keeps
    the all-zero row it was initialised with.
    """
    n_features = X.shape[1]
    new_centroids = np.zeros((k, n_features))
    for cluster_id in range(k):
        members = X[labels == cluster_id]
        if members.shape[0] == 0:
            # Empty cluster: leave its row at zero.
            continue
        new_centroids[cluster_id] = members.mean(axis=0)
    return new_centroids

def simple_kmeans(X, k, max_iters=20, tolerance=1e-4, seed=314):
    """
    Run K-Means on X and report per-iteration and total timings.

    Parameters:
        X: 2-D array of samples.
        k: number of clusters.
        max_iters: maximum number of assign/update iterations.
        tolerance: the loop stops early once every centroid coordinate moved by
            less than this amount in an iteration.
        seed: seed forwarded to initialize_centroids (default matches its default).

    Returns:
        (centroids, labels). The labels are the assignment computed in the last
        executed iteration, i.e. against the centroids as they were before that
        iteration's update. If max_iters is zero or negative, the initial
        centroids are returned together with the assignment against them.
    """
    # perf_counter is monotonic, so elapsed times are not distorted by
    # system clock adjustments.
    start_time = time.perf_counter()
    centroids = initialize_centroids(X, k, seed=seed)
    labels = None
    for i in range(max_iters):
        iteration_start = time.perf_counter()
        old_centroids = centroids
        distances = compute_distances(X, centroids)
        labels = assign_clusters(distances)
        centroids = update_centroids(X, labels, k)

        print(f"Iteration: {i}\ttime taken: {time.perf_counter() - iteration_start}")

        # Check for convergence
        if np.all(np.abs(centroids - old_centroids) < tolerance):
            print(f"K-Means converged after {i+1} iterations.")
            break

    if labels is None:
        # The loop body never ran (max_iters <= 0); without this the return
        # statement would fail because no assignment was ever computed.
        labels = assign_clusters(compute_distances(X, centroids))

    print(f"Total time: {time.perf_counter() - start_time}")
    return centroids, labels

In [6]:
# Pass the configured iteration cap explicitly rather than relying on the
# function's default happening to match it.
centroids, labels = simple_kmeans(processed_data, k, max_iters=max_iterations)

Iteration: 0	time taken: 1.1470954418182373
Iteration: 1	time taken: 1.1386845111846924
Iteration: 2	time taken: 1.1381573677062988
Iteration: 3	time taken: 1.1244182586669922
Iteration: 4	time taken: 1.131394386291504
Iteration: 5	time taken: 1.1390714645385742
Iteration: 6	time taken: 1.1399877071380615
Iteration: 7	time taken: 1.1340887546539307
Iteration: 8	time taken: 1.1325044631958008
Iteration: 9	time taken: 1.1323182582855225
Iteration: 10	time taken: 1.132812261581421
Iteration: 11	time taken: 1.1314568519592285
K-Means converged after 12 iterations.
Total time: 13.63569450378418


In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row

# Obtain a Spark session (an existing one is reused if present) for the
# silhouette evaluation below; it is stopped at the end of this cell.
spark = SparkSession.builder \
    .appName("SilhouetteEvaluation") \
    .getOrCreate()

def kmeans_to_spark_df(X, labels):
    """
    Package samples and their cluster assignments as a PySpark DataFrame.

    Each output row has a 'features' column (dense vector built from the
    matching row of X) and an integer 'prediction' column (the cluster label).
    Uses the module-level `spark` session.
    """
    records = []
    for position, cluster_label in enumerate(labels):
        feature_vector = Vectors.dense(X[position])
        records.append(Row(features=feature_vector, prediction=int(cluster_label)))
    return spark.createDataFrame(records)

try:
    spark_df = kmeans_to_spark_df(processed_data, labels)

    evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="prediction", metricName="silhouette")
    # Stored as `silhouette` so the result does not overwrite the
    # `silhouette_score` function imported from sklearn.metrics.
    silhouette = evaluator.evaluate(spark_df)

    print(f"Silhouette Score: {silhouette}")
finally:
    # Release the Spark session even when conversion or evaluation fails.
    spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/30 20:35:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Iteration: 0	time taken: 1.2025256156921387
Iteration: 1	time taken: 1.1561334133148193
Iteration: 2	time taken: 1.148763656616211
Iteration: 3	time taken: 1.1356165409088135
Iteration: 4	time taken: 1.1407718658447266
Iteration: 5	time taken: 1.1479098796844482
Iteration: 6	time taken: 1.1337993144989014
Iteration: 7	time taken: 1.1233820915222168
Iteration: 8	time taken: 1.1338443756103516
Iteration: 9	time taken: 1.1287212371826172
Iteration: 10	time taken: 1.1295108795166016
Iteration: 11	time taken: 1.1210384368896484
K-Means converged after 12 iterations.
Total time: 13.70983099937439


24/11/30 20:36:02 WARN TaskSetManager: Stage 0 contains a task of very large size (17146 KiB). The maximum recommended task size is 1000 KiB.
24/11/30 20:36:07 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 0 (TID 0): Attempting to kill Python Worker
24/11/30 20:36:07 WARN TaskSetManager: Stage 1 contains a task of very large size (17146 KiB). The maximum recommended task size is 1000 KiB.
24/11/30 20:36:08 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/11/30 20:36:08 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/11/30 20:36:09 WARN TaskSetManager: Stage 3 contains a task of very large size (17146 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Silhouette Score: 0.6279357756715807
