In [1]:
!pip install pyspark
!pip install pyarrow
!pip install -q findspark
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("cluster").getOrCreate()



In [2]:
data=spark.read.csv("/content/seeds_dataset.csv",inferSchema=True,header=True)

In [3]:
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [4]:
data.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|             5.175|
|14.38|    14.21|     0.8951|             5.386|             3.312|   2.4619999999999997|             4.956|
|14.69|    14.49|  

In [5]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator

In [6]:
assembler=VectorAssembler(inputCols=data.columns,outputCol="features")

In [7]:
output=assembler.transform(data)

In [8]:
final_data=output.select("features")

In [9]:
final_data.show()

+--------------------+
|            features|
+--------------------+
|[15.26,14.84,0.87...|
|[14.88,14.57,0.88...|
|[14.29,14.09,0.90...|
|[13.84,13.94,0.89...|
|[16.14,14.99,0.90...|
|[14.38,14.21,0.89...|
|[14.69,14.49,0.87...|
|[14.11,14.1,0.891...|
|[16.63,15.46,0.87...|
|[16.44,15.25,0.88...|
|[15.26,14.85,0.86...|
|[14.03,14.16,0.87...|
|[13.89,14.02,0.88...|
|[13.78,14.06,0.87...|
|[13.74,14.05,0.87...|
|[14.59,14.28,0.89...|
|[13.99,13.83,0.91...|
|[15.69,14.75,0.90...|
|[14.7,14.21,0.915...|
|[12.72,13.57,0.86...|
+--------------------+
only showing top 20 rows



In [10]:
def findBestKvalue(k,dt,fc):
  kmeans=KMeans(featuresCol=fc).setK(k)
  model=kmeans.fit(dt)
  test=model.transform(dt)
  evaluator=ClusteringEvaluator()
  silhouette = evaluator.evaluate(test)
  print("k = "+ str(k) + " --> "+"Silhouette with squared euclidean distance = " + str(silhouette))


In [11]:
for i in range(2,11):
  findBestKvalue(i,final_data,"features")

k = 2 --> Silhouette with squared euclidean distance = 0.7107643222232611
k = 3 --> Silhouette with squared euclidean distance = 0.6583884755012417
k = 4 --> Silhouette with squared euclidean distance = 0.5883232624783482
k = 5 --> Silhouette with squared euclidean distance = 0.5094235292838772
k = 6 --> Silhouette with squared euclidean distance = 0.47882367461145425
k = 7 --> Silhouette with squared euclidean distance = 0.521117593890403
k = 8 --> Silhouette with squared euclidean distance = 0.4953765139129011
k = 9 --> Silhouette with squared euclidean distance = 0.5108553824163148
k = 10 --> Silhouette with squared euclidean distance = 0.4883863926302419


In [12]:
from pyspark.ml.feature import StandardScaler

In [13]:
scalar=StandardScaler(inputCol="features",outputCol="scaledfeatures")

In [14]:
scaled_data=scalar.fit(final_data).transform(final_data)

In [15]:
scaled_data.show()

+--------------------+--------------------+
|            features|      scaledfeatures|
+--------------------+--------------------+
|[15.26,14.84,0.87...|[5.24452795332028...|
|[14.88,14.57,0.88...|[5.11393027165175...|
|[14.29,14.09,0.90...|[4.91116018695588...|
|[13.84,13.94,0.89...|[4.75650503761158...|
|[16.14,14.99,0.90...|[5.54696468981581...|
|[14.38,14.21,0.89...|[4.94209121682475...|
|[14.69,14.49,0.87...|[5.04863143081749...|
|[14.11,14.1,0.891...|[4.84929812721816...|
|[16.63,15.46,0.87...|[5.71536696354628...|
|[16.44,15.25,0.88...|[5.65006812271202...|
|[15.26,14.85,0.86...|[5.24452795332028...|
|[14.03,14.16,0.87...|[4.82180387844584...|
|[13.89,14.02,0.88...|[4.77368894309428...|
|[13.78,14.06,0.87...|[4.73588435103234...|
|[13.74,14.05,0.87...|[4.72213722664617...|
|[14.59,14.28,0.89...|[5.01426361985209...|
|[13.99,13.83,0.91...|[4.80805675405968...|
|[15.69,14.75,0.90...|[5.39230954047151...|
|[14.7,14.21,0.915...|[5.05206821191403...|
|[12.72,13.57,0.86...|[4.3715855

In [16]:
#after scaled
for i in range(2,11):
  findBestKvalue(i,scaled_data,"scaledfeatures")

k = 2 --> Silhouette with squared euclidean distance = 0.709845635070088
k = 3 --> Silhouette with squared euclidean distance = 0.6300001033389961
k = 4 --> Silhouette with squared euclidean distance = 0.4468230108282874
k = 5 --> Silhouette with squared euclidean distance = 0.34803991948020657
k = 6 --> Silhouette with squared euclidean distance = 0.3449139587798853
k = 7 --> Silhouette with squared euclidean distance = 0.24680354845004945
k = 8 --> Silhouette with squared euclidean distance = 0.23082516186861857
k = 9 --> Silhouette with squared euclidean distance = 0.17218686711111328
k = 10 --> Silhouette with squared euclidean distance = 0.25702587974284263
