In [50]:
# Jayson Francis - Kmeans clustering with Spark

import numpy as np

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.mllib.random import RandomRDDs

In [51]:
# Build three synthetic 2-D clusters: normally distributed noise shifted to a chosen center

def _shifted_normal_cluster(center, n_points, seed):
    """Return an RDD of `n_points` random 2-D normal vectors, each translated by `center`.

    A helper function (rather than a loop with an inline lambda) is used so that
    every cluster's lambda closes over its own `center` value.
    """
    noise = RandomRDDs.normalVectorRDD(sc, n_points, 2, numPartitions=2, seed=seed)
    return noise.map(lambda v: np.add(center, v))

c1_v = _shifted_normal_cluster([1, 5], 20, 1)
c2_v = _shifted_normal_cluster([5, 1], 16, 2)
c3_v = _shifted_normal_cluster([4, 6], 12, 3)

In [52]:
# Merge the first two clusters; `+` on RDDs is shorthand for rdd.union(other)
c12 = c1_v + c2_v

# Append the third cluster to obtain the complete data set
my_data = c12 + c3_v

In [65]:
# Fit a k-means model on the combined data set; KMeans.train returns a KMeansModel.
#
# k                  - number of clusters to fit. k=1 here, so the single centroid can be
#                      compared with the overall statistics of my_data in the cells below.
# maxIterations      - upper bound on the number of iterations.
# initializationMode - 'k-means||' is the parallel (scalable) variant of k-means++ seeding;
#                      it picks good initial centers so few restarts are needed.
# seed               - fixed so the fit is reproducible.
#
# NOTE: the former `runs` argument is intentionally not passed. It was deprecated in
# Spark 1.6, is ignored by Spark 2.x and is no longer accepted by Spark 3.x (passing it
# raises a TypeError). Its default was 1, so omitting it does not change older results.

my_kmmodel = KMeans.train(my_data, k=1, maxIterations=20, initializationMode='k-means||', seed=10)

In [70]:
# Display the fitted cluster centers (a list with one array per cluster; k=1 gives one entry)
my_kmmodel.clusterCenters

[array([ 3.12801059,  3.92940785])]

In [74]:
# Display per-dimension summary statistics of the full data set (count, mean, stdev, max, min)
my_data.stats()

(count: 48, mean: [ 3.12801059  3.92940785], stdev: [ 2.11814298  2.25624918], max: [ 6.36840832  8.04523732], min: [-1.33872715 -0.32867964])

In [86]:
# Observation

# With a single cluster (k=1), the mean squared distance of the points to the
# centroid equals the sum of the per-dimension variances.

# Per-dimension stdev values copied from the my_data.stats() output
stdev_x, stdev_y = 2.11814298, 2.25624918
var_x, var_y = stdev_x**2, stdev_y**2

print(var_x, var_y)
print('SUM: ' + str(var_x + var_y))

4.486529683723281 5.090660362250674
SUM: 9.577190045973953


In [87]:
# Squared error of a single point
# Squared Euclidean distance from a point to the center of the cluster it is assigned to

def getsse(point, model=None):
    """Return the squared Euclidean distance from `point` to its assigned cluster center.

    Parameters
    ----------
    point : array-like of numbers
        The coordinates of one observation.
    model : object, optional
        Any object exposing `predict(point)` (returning a cluster index) and
        `clusterCenters` (indexable by that cluster index). Defaults to the
        notebook-level `my_kmmodel`, so existing one-argument calls such as
        `my_data.map(getsse)` keep working.

    Returns
    -------
    float
        Sum of squared coordinate differences between `point` and its center.
    """
    if model is None:
        # Fall back to the model fitted earlier in the notebook.
        model = my_kmmodel

    # Use the public `clusterCenters` accessor instead of the undocumented `centers` attribute.
    this_center = np.asarray(model.clusterCenters[model.predict(point)], dtype=float)
    diff = np.asarray(point, dtype=float) - this_center

    # dot(diff, diff) is the sum of squared differences, without a Python-level loop.
    return float(np.dot(diff, diff))

In [88]:
# Compute each point's squared distance to its center, then collect the values into a local list
sse_rdd = my_data.map(getsse)
my_sse = sse_rdd.collect()

In [89]:
# Average of the per-point squared distances
mean_sse = np.mean(my_sse)
print(mean_sse)

9.57719005617
