In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session that runs on the local master for this notebook.
spark = (
    SparkSession.builder
    .master('local')
    .appName('clustering-basics')
    .getOrCreate()
)

In [2]:
from pyspark.ml.clustering import KMeans

In [3]:
# Read the sample k-means dataset, which is stored in LIBSVM format.
data = (
    spark.read
    .format('libsvm')
    .load('D:/learn-ab/learning-PySpark/sample-data/sample-kmeans-data.txt')
)

In [4]:
# Print the loaded rows as a text table (label and features columns).
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [5]:
# Print the column names, types and nullability of the loaded DataFrame.
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [6]:
# Keep only the feature vectors; the label column is dropped before clustering.
final_data = data.select('features')
final_data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [7]:
# IPython help query: shows the KMeans constructor signature and docstring.
?KMeans

[1;31mInit signature:[0m
[0mKMeans[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mfeaturesCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'features'[0m[1;33m,[0m[1;33m
[0m    [0mpredictionCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'prediction'[0m[1;33m,[0m[1;33m
[0m    [0mk[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;36m2[0m[1;33m,[0m[1;33m
[0m    [0minitMode[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'k-means||'[0m[1;33m,[0m[1;33m
[0m    [0minitSteps[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mtol[0m[1;33m:[0m [0mfloat[0m [1;33m=[0m [1;36m0.0001[0m[1;33m,[0m[1;33m
[0m    [0mmaxIter[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;36m20[0m[1;33m,[0m[1;33m
[0m    [0mseed[0m[1;33m:[0m [0mOptional[0m[1;33m[[0m[0mint[0m[1;33m][0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mdistanceMeasure[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;3

In [8]:
# Two-cluster estimator with a fixed seed so the initialization is repeatable.
kmeans = KMeans(k=2, seed=1)

In [9]:
# Fit the estimator on the feature-only DataFrame to obtain the trained model.
kmeans_model = kmeans.fit(final_data)

In [10]:
# Apply the fitted model to the same data; the result carries a 'prediction' column.
kmeans_pred = kmeans_model.transform(final_data)

In [11]:
# Print each feature vector next to its assigned cluster index.
kmeans_pred.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



In [12]:
# Retrieve the fitted cluster centers (one array per cluster) and display them.
cluster_centers = kmeans_model.clusterCenters()
cluster_centers

[array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]

In [13]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [14]:
# IPython help query: shows the ClusteringEvaluator signature and docstring.
?ClusteringEvaluator

[1;31mInit signature:[0m
[0mClusteringEvaluator[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mpredictionCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'prediction'[0m[1;33m,[0m[1;33m
[0m    [0mfeaturesCol[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'features'[0m[1;33m,[0m[1;33m
[0m    [0mmetricName[0m[1;33m:[0m [1;34m'ClusteringEvaluatorMetricType'[0m [1;33m=[0m [1;34m'silhouette'[0m[1;33m,[0m[1;33m
[0m    [0mdistanceMeasure[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'squaredEuclidean'[0m[1;33m,[0m[1;33m
[0m    [0mweightCol[0m[1;33m:[0m [0mOptional[0m[1;33m[[0m[0mstr[0m[1;33m][0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Evaluator for Clustering results, which expects two input
columns: prediction and features. The metric computes the Silhouette
measure using the squared Euclidean distance.

The Silhouette is a measure for 

In [15]:
# Evaluator with default settings (silhouette metric, squared Euclidean distance).
kmeans_eval = ClusteringEvaluator()

In [16]:
# Score the current cluster assignment with the evaluator and report the result.
silhouette = kmeans_eval.evaluate(kmeans_pred)
print(f'Silhouette with squared euclidean distance :{silhouette}')

Silhouette with squared euclidean distance :0.9997530305375207


In [17]:
# Refit with k=3 (same seed as before), show the assignments and score them.
kmeans = KMeans().setK(3).setSeed(1)
kmeans_model = kmeans.fit(final_data)
kmeans_pred = kmeans_model.transform(final_data)
print('Clustering Results : \n')
kmeans_pred.show()
cluster_centers = kmeans_model.clusterCenters()
# Must be an f-string: a plain literal prints the text "{cluster_centers}"
# instead of the fitted centers.
print(f'Cluster Centers : \n{cluster_centers}')
silhouette = kmeans_eval.evaluate(kmeans_pred)
print(f'Silhouette with squared euclidean distance :{silhouette}')

Clustering Results : 

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         2|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+

Cluster Centers : 
{cluster_centers}
Silhouette with squared euclidean distance :0.6248737134600261


In [18]:
# Refit with k=4 (same seed as before), show the assignments and score them.
kmeans = KMeans().setK(4).setSeed(1)
kmeans_model = kmeans.fit(final_data)
kmeans_pred = kmeans_model.transform(final_data)
print('Clustering Results : \n')
kmeans_pred.show()
cluster_centers = kmeans_model.clusterCenters()
# Must be an f-string: a plain literal prints the text "{cluster_centers}"
# instead of the fitted centers.
print(f'Cluster Centers : \n{cluster_centers}')
silhouette = kmeans_eval.evaluate(kmeans_pred)
print(f'Silhouette with squared euclidean distance :{silhouette}')

Clustering Results : 

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         2|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         3|
+--------------------+----------+

Cluster Centers : 
{cluster_centers}
Silhouette with squared euclidean distance :0.25000000000146066


In [19]:
# Refit with k=5 (same seed as before), show the assignments and score them.
kmeans = KMeans().setK(5).setSeed(1)
kmeans_model = kmeans.fit(final_data)
kmeans_pred = kmeans_model.transform(final_data)
print('Clustering Results : \n')
kmeans_pred.show()
cluster_centers = kmeans_model.clusterCenters()
# Must be an f-string: a plain literal prints the text "{cluster_centers}"
# instead of the fitted centers.
print(f'Cluster Centers : \n{cluster_centers}')
silhouette = kmeans_eval.evaluate(kmeans_pred)
print(f'Silhouette with squared euclidean distance :{silhouette}')

Clustering Results : 

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         2|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         4|
|(3,[0,1,2],[9.2,9...|         3|
+--------------------+----------+

Cluster Centers : 
{cluster_centers}
Silhouette with squared euclidean distance :0.12500000000000008


In [20]:
# Refit with k=6 (same seed as before), show the assignments and score them.
kmeans = KMeans().setK(6).setSeed(1)
kmeans_model = kmeans.fit(final_data)
kmeans_pred = kmeans_model.transform(final_data)
print('Clustering Results : \n')
kmeans_pred.show()
cluster_centers = kmeans_model.clusterCenters()
# Must be an f-string: a plain literal prints the text "{cluster_centers}"
# instead of the fitted centers.
print(f'Cluster Centers : \n{cluster_centers}')
silhouette = kmeans_eval.evaluate(kmeans_pred)
print(f'Silhouette with squared euclidean distance :{silhouette}')

Clustering Results : 

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         2|
|(3,[0,1,2],[0.2,0...|         3|
|(3,[0,1,2],[9.0,9...|         4|
|(3,[0,1,2],[9.1,9...|         5|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+

Cluster Centers : 
{cluster_centers}
Silhouette with squared euclidean distance :0.0
