In [1]:
try:
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
except ImportError as e:
    printmd('<<<<<!!!!! Please restart your kernel after installing Apache Spark !!!!!>>>>>')

In [2]:
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

spark = SparkSession \
    .builder \
    .getOrCreate()

In [3]:
df = spark.read.parquet('hmp.parquet')

# register a corresponding query table
df.createOrReplaceTempView('df')

In [8]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline

indexer = StringIndexer(inputCol="class", outputCol="classIndex")
encoder = OneHotEncoder(inputCol="classIndex", outputCol="categoryVec")
vectorAssembler = VectorAssembler(inputCols=["x","y","z"],
                                  outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)

pipeline = Pipeline(stages=[indexer, encoder, vectorAssembler, normalizer])
model = pipeline.fit(df)
prediction = model.transform(df)
prediction.show()

+---+---+---+--------------------+-----------+----------+--------------+----------------+--------------------+
|  x|  y|  z|              source|      class|classIndex|   categoryVec|        features|       features_norm|
+---+---+---+--------------------+-----------+----------+--------------+----------------+--------------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.20754716981132...|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.20754716981132...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.20183486238532...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.20183486238532...|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[21.0,52.0,34.0]|[0.19626168224299...|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,51.0,34.0]|[0.20560747663551...|
|

Now let’s create a new pipeline for kmeans.


In [6]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

kmeans = KMeans(featuresCol="features").setK(14).setSeed(1)
pipeline = Pipeline(stages=[vectorAssembler, kmeans])
model = pipeline.fit(df)
predictions = model.transform(df)

evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.41244594513295846




[https://en.wikipedia.org/wiki/Silhouette\_(clustering)](https://en.wikipedia.org/wiki/Silhouette_\(clustering\)?utm_medium=Exinfluencer&utm_source=Exinfluencer&utm_content=000026UJ&utm_term=10006555&utm_id=NA-SkillsNetwork-Channel-SkillsNetworkCoursesIBMDeveloperSkillsNetworkML0201ENSkillsNetwork20647446-2022-01-01)


In [10]:
for k in range(2,14):
    kmeans = KMeans(featuresCol="features").setK(k).setSeed(1)
    pipeline = Pipeline(stages=[vectorAssembler, kmeans])
    model = pipeline.fit(df)
    predictions = model.transform(df)

    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print("Silhouette squared euclidean distance with k = {} is {}".format(k,str(silhouette)))

Silhouette squared euclidean distance with k = 2 is 0.6875664014387497
Silhouette squared euclidean distance with k = 3 is 0.6147915951361759
Silhouette squared euclidean distance with k = 4 is 0.6333227654128869
Silhouette squared euclidean distance with k = 5 is 0.5937447997439024
Silhouette squared euclidean distance with k = 6 is 0.592463658820136
Silhouette squared euclidean distance with k = 7 is 0.5484627422401509
Silhouette squared euclidean distance with k = 8 is 0.46686489256383346
Silhouette squared euclidean distance with k = 9 is 0.48034893889849645
Silhouette squared euclidean distance with k = 10 is 0.47370428136987536
Silhouette squared euclidean distance with k = 11 is 0.4819049717562352
Silhouette squared euclidean distance with k = 12 is 0.40964155503229643
Silhouette squared euclidean distance with k = 13 is 0.4153293521373778


In [12]:
for k in range(2,15):
    kmeans = KMeans(featuresCol="features_norm").setK(k).setSeed(1)
    pipeline = Pipeline(stages=[vectorAssembler, normalizer, kmeans])
    model = pipeline.fit(df)

    predictions = model.transform(df)

    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print("Silhouette squared euclidean distance with k = {} is {}".format(k,str(silhouette)))


Silhouette squared euclidean distance with k = 2 is 0.6462988404434188
Silhouette squared euclidean distance with k = 3 is 0.5801675525747375
Silhouette squared euclidean distance with k = 4 is 0.5795128843318075
Silhouette squared euclidean distance with k = 5 is 0.5378473434364454
Silhouette squared euclidean distance with k = 6 is 0.3616039650566081
Silhouette squared euclidean distance with k = 7 is 0.3388334984297795
Silhouette squared euclidean distance with k = 8 is 0.35346131260617686
Silhouette squared euclidean distance with k = 9 is 0.3320686157150071
Silhouette squared euclidean distance with k = 10 is 0.31921981409325373
Silhouette squared euclidean distance with k = 11 is 0.3166261086889984
Silhouette squared euclidean distance with k = 12 is 0.2524553751769574
Silhouette squared euclidean distance with k = 13 is 0.2811747980314105
Silhouette squared euclidean distance with k = 14 is 0.2668998965895519


Sometimes, inflating the dataset helps, here we multiply x by 10, let’s see if the performance inceases.


In [17]:
from pyspark.sql.functions import col
df_denormalized = df.select([col('*'),(col('x')*10)]).drop('x').withColumnRenamed('(x * 10)','x')

In [18]:
df_denormalized.show()

+---+---+--------------------+-----------+---+
|  y|  z|              source|      class|  x|
+---+---+--------------------+-----------+---+
| 49| 35|Accelerometer-201...|Brush_teeth|220|
| 49| 35|Accelerometer-201...|Brush_teeth|220|
| 52| 35|Accelerometer-201...|Brush_teeth|220|
| 52| 35|Accelerometer-201...|Brush_teeth|220|
| 52| 34|Accelerometer-201...|Brush_teeth|210|
| 51| 34|Accelerometer-201...|Brush_teeth|220|
| 50| 35|Accelerometer-201...|Brush_teeth|200|
| 52| 34|Accelerometer-201...|Brush_teeth|220|
| 50| 34|Accelerometer-201...|Brush_teeth|220|
| 51| 35|Accelerometer-201...|Brush_teeth|220|
| 51| 33|Accelerometer-201...|Brush_teeth|210|
| 50| 34|Accelerometer-201...|Brush_teeth|200|
| 49| 33|Accelerometer-201...|Brush_teeth|210|
| 49| 33|Accelerometer-201...|Brush_teeth|210|
| 51| 35|Accelerometer-201...|Brush_teeth|200|
| 49| 34|Accelerometer-201...|Brush_teeth|180|
| 48| 34|Accelerometer-201...|Brush_teeth|190|
| 53| 34|Accelerometer-201...|Brush_teeth|160|
| 52| 35|Acce

In [20]:
for k in range(2,15):
    kmeans = KMeans(featuresCol="features").setK(k).setSeed(1)
    pipeline = Pipeline(stages=[vectorAssembler, kmeans])
    model = pipeline.fit(df_denormalized)
    predictions = model.transform(df_denormalized)

    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print("Silhouette squared euclidean distance with k = {} is {}".format(k,str(silhouette)))

Silhouette squared euclidean distance with k = 2 is 0.8263613179673074
Silhouette squared euclidean distance with k = 3 is 0.7815106788368535
Silhouette squared euclidean distance with k = 4 is 0.7233881535353035
Silhouette squared euclidean distance with k = 5 is 0.7127939562980274
Silhouette squared euclidean distance with k = 6 is 0.6749934663717159
Silhouette squared euclidean distance with k = 7 is 0.6632024338998548
Silhouette squared euclidean distance with k = 8 is 0.6592948134738338
Silhouette squared euclidean distance with k = 9 is 0.6124942597802276
Silhouette squared euclidean distance with k = 10 is 0.6473390776478479
Silhouette squared euclidean distance with k = 11 is 0.6295384296417095
Silhouette squared euclidean distance with k = 12 is 0.6032474280054309
Silhouette squared euclidean distance with k = 13 is 0.5741691916361906
Silhouette squared euclidean distance with k = 14 is 0.5709023393004293


In [21]:
from pyspark.ml.clustering import GaussianMixture

for k in range(2,15):
    gmm = GaussianMixture(featuresCol="features").setK(k).setSeed(1)
    pipeline = Pipeline(stages=[vectorAssembler, gmm])

    model = pipeline.fit(df)

    predictions = model.transform(df)

    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print("Silhouette squared euclidean distance with k = {} is {}".format(k,str(silhouette)))

Silhouette squared euclidean distance with k = 2 is 0.3528427198750997
Silhouette squared euclidean distance with k = 3 is 0.41320597955203525
Silhouette squared euclidean distance with k = 4 is 0.4572038185796801
Silhouette squared euclidean distance with k = 5 is 0.4542839848166631
Silhouette squared euclidean distance with k = 6 is 0.3574269800636112
Silhouette squared euclidean distance with k = 7 is 0.3304041087315892
Silhouette squared euclidean distance with k = 8 is 0.2521027765850466
Silhouette squared euclidean distance with k = 9 is 0.3679609180820608
Silhouette squared euclidean distance with k = 10 is 0.27317277887797353
Silhouette squared euclidean distance with k = 11 is 0.14477229176774098
Silhouette squared euclidean distance with k = 12 is 0.22676395644566671
Silhouette squared euclidean distance with k = 13 is 0.24957429197367767
Silhouette squared euclidean distance with k = 14 is 0.15906267433367427
