In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook (created on first use, reused
# afterwards) under the application name 'clustering'.
spark = (
    SparkSession.builder
    .appName('clustering')
    .getOrCreate()
)

In [3]:
from pyspark.sql.types import StructType, StructField, FloatType, IntegerType, DoubleType

In [4]:
# Read the raw seeds file with the plain-text source; the fields are split
# out of the 'value' column in a later cell.
df = spark.read.format('text').load('seeds_dataset.txt')

In [5]:
from pyspark.sql.functions import split

In [6]:
# Column expression that splits the 'value' column on the pattern '\t+'
# (a literal tab followed by '+'), so a run of one or more tabs counts as a
# single separator. Nothing is evaluated here; the expression is used inside
# df.select in a later cell.
sp_f = split('value', '\t+')

In [7]:
# Project the split fields into named, typed columns: items 0-6 become the
# seven float measurements and item 7 the integer `type` code. The alias is
# attached before the cast, exactly as in the original expression.
data = df.select(
    *[
        sp_f.getItem(idx).alias(name).cast(FloatType())
        for idx, name in enumerate([
            'area',
            'perimeter',
            'compactness',
            'kernel_length',
            'kernel_width',
            'asymmetry',
            'groove_length',
        ])
    ],
    sp_f.getItem(7).alias('type').cast(IntegerType()),
)

In [8]:
from pyspark.ml.feature import VectorAssembler

In [9]:
# Combine the seven measurement columns into one vector column named
# 'features'. The `type` column is not among the inputs.
assembler = VectorAssembler(
    inputCols=[
        'area',
        'perimeter',
        'compactness',
        'kernel_length',
        'kernel_width',
        'asymmetry',
        'groove_length',
    ],
    outputCol='features',
)

In [10]:
# Sanity check: print the column names and types produced by the select above.
data.printSchema()

root
 |-- area: float (nullable = true)
 |-- perimeter: float (nullable = true)
 |-- compactness: float (nullable = true)
 |-- kernel_length: float (nullable = true)
 |-- kernel_width: float (nullable = true)
 |-- asymmetry: float (nullable = true)
 |-- groove_length: float (nullable = true)
 |-- type: integer (nullable = true)



In [11]:
# Run the assembler over the typed frame and rebind `data` to the result.
# NOTE(review): because `data` is overwritten, re-running this cell feeds the
# already-assembled frame back into the assembler — presumably that fails or
# misbehaves; re-run from the select cell instead. TODO confirm.
data = assembler.transform(data)

In [12]:
# Narrow the frame to the assembled vector and the `type` column; the
# individual measurement columns are dropped from `data` at this point.
data = data.select(['features', 'type'])

In [13]:
from pyspark.ml.clustering import KMeans

In [14]:
# K-means estimator with k=3 over the assembled 'features' vectors.
# The seed is pinned explicitly: centroid initialisation is randomised, and
# without a fixed seed nothing in this notebook guarantees that the cluster
# centers and silhouette score printed further down can be reproduced after a
# kernel restart.
kmeans = KMeans(featuresCol='features', k=3, seed=42)

In [15]:
# Fit the K-means estimator on the assembled frame.
model = kmeans.fit(data)

In [16]:
# Apply the fitted model to the same frame it was trained on; `results` is
# what the evaluator scores further down.
results = model.transform(data)

In [17]:
# Show the fitted centroids: one array per cluster, each with one entry per
# assembled feature (three arrays of seven values in the output below).
print(model.clusterCenters())

[array([11.98865852, 13.28439023,  0.85273658,  5.22742682,  2.88008536,
        4.58392681,  5.0742439 ]), array([14.81910445, 14.53716423,  0.88052239,  5.59101494,  3.29935819,
        2.70658506,  5.21753732]), array([18.72180316, 16.29737705,  0.88508688,  6.20893442,  3.72267215,
        3.60359019,  6.06609837])]


In [18]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [19]:
# Silhouette evaluator, with the library defaults written out so it is clear
# which columns of `results` are read and which metric is reported.
evaluator = ClusteringEvaluator(
    featuresCol='features',
    predictionCol='prediction',
    metricName='silhouette',
)

In [20]:
# Score the clustered frame with the evaluator built above.
silhouette = evaluator.evaluate(results)

In [21]:
# Display the evaluator's score for this run.
print(silhouette)

0.658388541526433
