In [1]:
import findspark

# Point findspark at the local Spark installation.
# The path is a raw string so the Windows backslashes stay literal: in a
# normal string '\S' and '\s' are invalid escape sequences and trigger a
# warning on current Python versions.
findspark.init(r'C:\Spark\spark-3.0.1-bin-hadoop2.7')
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by the rest of the notebook.
spark = SparkSession.builder.appName('cluster').getOrCreate()

In [2]:
from pyspark.ml.clustering import KMeans

In [3]:
# Load the seeds CSV: the first row supplies the column names and the
# column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
dataset = csv_reader.csv("seeds_dataset.csv")

In [4]:
# Preview the first five rows of the loaded data.
dataset.show(n=5)

+-----+---------+-----------+------------------+------------------+---------------------+----------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|            5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|           4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|           4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|           4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|           5.175|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+
only showing top 5 rows



In [5]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [6]:
# Display the column names read from the CSV header; all of them are passed
# to the VectorAssembler below as input columns.
dataset.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [7]:
# Combine every column of the dataset into a single 'features' vector column.
vec_assembler = VectorAssembler(outputCol='features', inputCols=dataset.columns)

In [8]:
# Apply the assembler: the result is the dataset plus the 'features' column.
final_data = vec_assembler.transform(dataset=dataset)

In [9]:
# Scale the Data 
from pyspark.ml.feature import StandardScaler

In [10]:
# Configure the scaler: divide by the standard deviation (withStd) but do
# not subtract the mean (withMean=False).
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

In [11]:
# Fit the StandardScaler on the assembled data to compute the summary
# statistics it needs for scaling.
scalerModel = scaler.fit(dataset=final_data)

In [12]:
# Normalize each feature to have unit standard deviation.
# `final_data` is reassigned from itself, so re-running this cell would fail
# because the scaler's output column already exists. Dropping that column
# first keeps the cell idempotent; `drop` is a no-op when the column is absent.
final_data = scalerModel.transform(
    final_data.drop(scalerModel.getOutputCol())
)

In [13]:
# Trains a k-means model with 3 clusters on the scaled features.
# K-means initialization is random; an explicit seed keeps the cluster
# centers and cluster ids reproducible across kernel restarts.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=1)
model = kmeans.fit(final_data)

In [14]:
# Print the fitted cluster centers, one per line, under a heading.
centers = model.clusterCenters()
print("Cluster Centers: ", *centers, sep="\n")

Cluster Centers: 
[ 4.91309043 10.92012526 37.32658724 12.37724251  8.59393872  1.8226071
 10.35389957]
[ 4.0648023  10.14242485 35.82143905 11.81918014  7.51855717  3.19361875
 10.40520609]
[ 6.32636687 12.38115343 37.39222755 13.9206997   9.75485787  2.41428142
 12.28078861]


In [15]:
# Assign every row to a cluster and display the predicted cluster ids.
predictions = model.transform(final_data)
predictions.select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
+----------+
only showing top 20 rows

