In [9]:
import os
import random 
import time
import requests

from numpy import array
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from pprint import pprint
from itertools import groupby, compress
from typing import Tuple, Sequence


# For reproducability of results
RANDOM_SEED = 30111991

%matplotlib inline

In [10]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import split, col, size, trim, lit
from pyspark.ml.linalg import Vectors, DenseVector

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("BetterKMeans") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/14 18:30:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [30]:
def euclidean_dist(v1, v2):
    return np.sqrt(np.sum((v1 - v2) ** 2, axis=1))

def closest_centroid(points, centroids):
    distances = np.sqrt(((points[:, None] - centroids[None, :]) ** 2).sum(axis=2))
    closest_centroids_indices = np.argmin(distances, axis=1)
    return closest_centroids_indices

def calc_partition_centroid_means(partition, centroids):
    points = np.array(list(partition)) 
    if len(points) == 0:  # Handle empty partitions
        return []

    closest_indices = closest_centroid(points, centroids.value)
    
    # Combine points with their respective closest centroids
    data = pd.DataFrame({
        "Centroid": closest_indices,
        "Point": list(points)
    })

    # Expand Point into multiple dimensions
    point_df = pd.DataFrame(data['Point'].tolist(), index=data.index)
    combined_df = pd.concat([data['Centroid'], point_df], axis=1)

    # Group by Centroid and calculate mean for each group
    means = combined_df.groupby('Centroid').mean().reset_index().values
    return means

def closest_centroids(data, centroids):
    return data.mapPartitions(lambda partition: [calc_partition_centroid_means(partition, centroids)])

def aggregate_means(rdd):
    partition_means = np.concatenate(rdd.collect(), axis=0)
    num_dimensions = partition_means.shape[1] - 1  # Subtract 1 for the 'Centroid' column    
    columns = ['Centroid'] + [f'dim_{i}' for i in range(num_dimensions)]
    
    df = pd.DataFrame(partition_means, columns=columns)
    
    # Group by Centroid and calculate the mean for each dimension
    new_centroids = df.groupby('Centroid').mean().reset_index().values
    return new_centroids

def calc_error(new_centroids, old_centroids):
    return np.sum(euclidean_dist(new_centroids, old_centroids))

def pkmeans(data, n, max_iterations=150, stop_distance=0.001):
    print(time.asctime(), "Started")
    init_centroids = np.array(data.takeSample(False, n, seed=42)) 
    centroids = sc.broadcast(init_centroids)

    iteration = 1
    error = float("inf")
    while error > stop_distance and iteration <= max_iterations:
        loop_start = time.time()
        closest_centroids_rdd = closest_centroids(data, centroids)
        new_centroids = aggregate_means(closest_centroids_rdd)
        error = calc_error(new_centroids[:, 1:], centroids.value)
        print("{3} Iteration #{0}\tDistance between old and new centroids: {1:.4f}\tIteration took: {2:.4f} sec".format(
            iteration, error, time.time() - loop_start, time.asctime()))
        
        # Update centroids
        centroids = sc.broadcast(new_centroids[:, 1:])  # Use the new centroids
        iteration += 1

    return centroids.value

In [11]:
A3_DATASET_URL = "https://cs.joensuu.fi/sipu/datasets/a3.txt"
DATA_FOLDER = "/home/jovyan/work/data"
A3_LOCAL_PATH = os.path.join(DATA_FOLDER, "a3.txt")

# Download Data
response = requests.get(A3_DATASET_URL)
if not os.path.exists(A3_LOCAL_PATH):
    with open(A3_LOCAL_PATH, 'wb') as file:
        file.write(response.content)

# Load clean data into spark
data = sc.textFile(A3_LOCAL_PATH)
parsed_data = data.map(lambda row: array(tuple(map(float, row.strip().split())))).cache()

parsed_data.take(5)

24/12/14 18:30:52 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
24/12/14 18:31:07 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
24/12/14 18:31:22 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
24/12/14 18:31:37 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
                                                                                

[array([53920., 42968.]),
 array([52019., 42206.]),
 array([52570., 42476.]),
 array([54220., 42081.]),
 array([54268., 43420.])]

In [31]:
centroids = pkmeans(parsed_data, 50)

Sat Dec 14 19:15:57 2024 Started
Sat Dec 14 19:15:57 2024 Iteration #1	Distance between old and new centroids: 112307.3063	Iteration took: 0.1176 sec
Sat Dec 14 19:15:58 2024 Iteration #2	Distance between old and new centroids: 44519.8717	Iteration took: 0.1157 sec
Sat Dec 14 19:15:58 2024 Iteration #3	Distance between old and new centroids: 31843.3779	Iteration took: 0.1090 sec
Sat Dec 14 19:15:58 2024 Iteration #4	Distance between old and new centroids: 21192.3485	Iteration took: 0.1050 sec
Sat Dec 14 19:15:58 2024 Iteration #5	Distance between old and new centroids: 9646.1662	Iteration took: 0.1113 sec
Sat Dec 14 19:15:58 2024 Iteration #6	Distance between old and new centroids: 6134.1732	Iteration took: 0.1042 sec
Sat Dec 14 19:15:58 2024 Iteration #7	Distance between old and new centroids: 5262.5803	Iteration took: 0.1100 sec
Sat Dec 14 19:15:58 2024 Iteration #8	Distance between old and new centroids: 2496.2124	Iteration took: 0.1101 sec
Sat Dec 14 19:15:58 2024 Iteration #9	Dist

24/12/14 19:17:56 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
24/12/14 19:17:56 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:981)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce

In [27]:
centroids

array([[0.00000000e+00, 6.09482470e+03, 4.10024568e+04],
       [1.00000000e+00, 2.55802081e+04, 2.80568926e+04],
       [2.00000000e+00, 8.98714493e+03, 4.99091014e+04],
       [3.00000000e+00, 9.48408850e+03, 3.80838761e+04],
       [4.00000000e+00, 4.84747623e+04, 2.38995396e+04],
       [5.00000000e+00, 2.79017810e+04, 4.51022025e+04],
       [6.00000000e+00, 6.12914925e+04, 4.51113881e+04],
       [7.00000000e+00, 9.64706383e+03, 6.13325745e+04],
       [8.00000000e+00, 1.75801867e+04, 2.53175267e+04],
       [9.00000000e+00, 1.76587190e+04, 5.63090868e+04],
       [1.00000000e+01, 2.72420135e+04, 1.06863378e+04],
       [1.10000000e+01, 1.12779425e+04, 5.04366897e+04],
       [1.20000000e+01, 3.08898481e+04, 1.88458165e+04],
       [1.30000000e+01, 4.39303021e+04, 3.24957292e+04],
       [1.40000000e+01, 5.76577326e+04, 1.41785382e+04],
       [1.50000000e+01, 3.83125836e+04, 4.58775358e+04],
       [1.60000000e+01, 3.85681861e+04, 6.57083550e+03],
       [1.70000000e+01, 2.91537