In [7]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session under the application name 'kmeans1'.
# getOrCreate() hands back an already-active session when there is one, so
# re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName('kmeans1')
    .getOrCreate()
)

In [9]:
# Bare expression: the notebook renders the session object as this cell's output,
# which serves as a quick check that the session was created.
spark

In [69]:
# Load the sample points from a LIBSVM-formatted text file located next to the
# notebook. The numFeatures option fixes the size of each feature vector at 3.
data = (
    spark.read
    .format('libsvm')
    .option('numFeatures', 3)
    .load('sample_kmeans_data.txt')
)

### Create a KMeans model trained with my data

In [70]:
from pyspark.ml.clustering import KMeans
# Configure the estimator: ask for two clusters and pin the random seed so the
# centroid initialisation does not change from one run to the next.
kmeans = KMeans(
    k=2,
    seed=1,
)

# Fit on the loaded DataFrame; the fitted model is what later cells query.
model = kmeans.fit(data)

### Which cluster is each sample in?

In [66]:
# Apply the fitted model to the same data it was trained on. The returned
# DataFrame keeps `label` and `features` and gains a `prediction` column
# holding the index of the cluster assigned to each row.
results = model.transform(data)

In [67]:
# Print the assignments as a text table (long feature vectors are truncated with "...").
results.show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|           (3,[],[])|         1|
|  1.0|(3,[0,1,2],[0.1,0...|         1|
|  2.0|(3,[0,1,2],[0.2,0...|         1|
|  3.0|(3,[0,1,2],[9.0,9...|         0|
|  4.0|(3,[0,1,2],[9.1,9...|         0|
|  5.0|(3,[0,1,2],[9.2,9...|         0|
+-----+--------------------+----------+



### Evaluate my model: silhouette coefficient

- +1 means perfect clustering (small intra-cluster distance, large inter-cluster distance)
- -1 means horrible clustering (large intra-cluster distance, small inter-cluster distance)

In [68]:
from pyspark.ml.evaluation import ClusteringEvaluator
# Build an evaluator that scores a clustering by its silhouette coefficient,
# measured with squared Euclidean distance.
evaluator = ClusteringEvaluator(
    metricName='silhouette',
    distanceMeasure='squaredEuclidean',
)

# Score the predictions produced by the fitted model.
silhouette = evaluator.evaluate(results)
print(f"Silhouette with squared euclidean distance = {silhouette}")

# List the learned centroids, one per line.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Silhouette with squared euclidean distance = 0.9997530305375207
Cluster Centers: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]
