In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('seedfinder').getOrCreate()

### Explore Data

In [2]:
data = spark.read.csv('seeds_dataset.csv', inferSchema=True, header=True)

In [3]:
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [4]:
data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [5]:
data.count()

210

In [6]:
# we know there are 3 different kinds of wheat, so we use K = 3

In [7]:
from pyspark.ml.clustering import KMeans

### Create Feature Set

In [8]:
from pyspark.ml.feature import VectorAssembler

In [9]:
assembler = VectorAssembler(inputCols=data.columns,outputCol='features')

In [10]:
with_features = assembler.transform(data).select('features')

### Feature Scaling

In [11]:
from pyspark.ml.feature import StandardScaler

In [12]:
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')

In [13]:
scalar_model = scaler.fit(with_features)

In [14]:
scaled_data = scalar_model.transform(with_features)

In [15]:
scaled_data.select('scaled_features').head(1)

[Row(scaled_features=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

### Train KMeans Model

In [16]:
kmeans = KMeans(featuresCol='scaled_features', k=3)
model = kmeans.fit(scaled_data)

### Get clustering results

In [19]:
results = model.transform(scaled_data)

In [20]:
results.select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
+----------+
only showing top 20 rows



### Interpret Cluster Results

In [21]:
from pyspark.ml.evaluation import ClusteringEvaluator
evaluator = ClusteringEvaluator()

In [22]:
evaluator.evaluate(results)

0.616267393520126

In [23]:
model.clusterCenters()

[array([ 4.93382436, 10.94691274, 37.30542404, 12.41332714,  8.60366812,
         1.82917353, 10.40106154]),
 array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
         2.41585013, 12.29286107]),
 array([ 4.06660859, 10.14191893, 35.84098009, 11.81592066,  7.52397236,
         3.1823335 , 10.39801233])]