In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get (or lazily create) the Spark session used by the cells in this notebook.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [3]:
from pyspark.ml.clustering import KMeans

In [4]:
dataset = spark.read.format('libsvm').load('FileStore/tables/sample_kmeans_data.txt')

In [5]:
dataset.show()

In [6]:
final_data = dataset.select('features')

In [7]:
final_data.show()

In [8]:
kmeans = KMeans().setK(2).setSeed(1)

In [9]:
model = kmeans.fit(final_data) 
# kmeans only expects feature column(no label column because it is unsupervised learning)

In [10]:
# wssse: within set sum of squared errors
wssse = model.computeCost(final_data)

In [11]:
print(wssse)

In [12]:
# Inspect the centroids of the clusters found by the fitted model.
centers = model.clusterCenters()

In [13]:
# Bare expression so the notebook displays the centroids as the cell output.
centers

In [14]:
# Run the fitted model over the data to see which cluster each row is assigned to.
results = model.transform(final_data)

In [15]:
# Display the rows together with their cluster assignment.
results.show()

In [16]:
# KMeans with a more realistic problem
# classify seed types
spark = SparkSession.builder.appName('cluster').getOrCreate()

In [17]:
dataset = spark.read.csv('FileStore/tables/seeds_dataset.csv', header = True, inferSchema = True)

In [18]:
dataset.printSchema()

In [19]:
dataset.head(1)
# features are all numerical

In [20]:
from pyspark.ml.feature import VectorAssembler

In [21]:
dataset.columns

In [22]:
assembler = VectorAssembler(inputCols = dataset.columns, outputCol = 'features')

In [23]:
final_data = assembler.transform(dataset)

In [24]:
final_data.printSchema()

In [25]:
from pyspark.ml.feature import StandardScaler

In [26]:
scaler = StandardScaler(inputCol = 'features', outputCol = 'scaledFeatures')

In [27]:
scaler_model = scaler.fit(final_data)

In [28]:
final_data = scaler_model.transform(final_data)

In [29]:
final_data.head(1)

In [30]:
kmeans = KMeans(featuresCol = 'scaledFeatures', k = 3)

In [31]:
model = kmeans.fit(final_data)

In [32]:
# Report the within-set sum of squared errors for the fitted model.
# KMeansModel.computeCost was deprecated in Spark 2.4 and removed in Spark 3.0.
# Keep using it where it still exists; otherwise read the training cost from
# the model summary.
print('WSSSE')
if hasattr(model, 'computeCost'):
    print(model.computeCost(final_data))
else:
    print(model.summary.trainingCost)

In [33]:
# Centroids of the fitted clusters. The model was fitted on 'scaledFeatures',
# so these are in the scaled feature space.
centers = model.clusterCenters()
centers

In [34]:
# Show only the cluster index assigned to each row.
(
    model.transform(final_data)
    .select('prediction')
    .show()
)