In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.6.tgz
!tar xvf spark-2.4.4-bin-hadoop2.6.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.6"
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext(appName="PySpark_dataframe")

# Clustering

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

# Obtain the notebook's SparkSession (an already running one is reused),
# requesting a local master with two worker threads.
spark = (
    SparkSession.builder
    .master('local[2]')
    .appName('clustering_app')
    .getOrCreate()
)

In [None]:
# Start from a clean slate: remove any copy of the data file left behind by a
# previous run (the glob also catches suffixed duplicates of the file name).
!rm -f hmp.parquet*

# Download the data set, stored in Parquet format, into the working directory.
!wget https://github.com/IBM/coursera/raw/master/hmp.parquet

# Load the Parquet file into a Spark DataFrame.
df = spark.read.parquet('hmp.parquet')

# Register the DataFrame as temporary view 'df' so it can be queried via spark.sql.
df.createOrReplaceTempView('df')

--2020-01-08 08:58:34--  https://github.com/IBM/coursera/raw/master/hmp.parquet
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet [following]
--2020-01-08 08:58:34--  https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 932997 (911K) [application/octet-stream]
Saving to: ‘hmp.parquet’


2020-01-08 08:58:34 (12.1 MB/s) - ‘hmp.parquet’ saved [932997/932997]



In [None]:
# Print the first rows of the loaded data to check columns and values.
df.show()

+---+---+---+--------------------+-----------+
|  x|  y|  z|              source|      class|
+---+---+---+--------------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|
| 21| 51| 33|Accelerometer-201...|Brush_teeth|
| 20| 50| 34|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 20| 51| 35|Accelerometer-201...|Brush_teeth|
| 18| 49| 34|Accelerometer-201...|Brush_teeth|
| 19| 48| 34|Accelerometer-201...|Brush_teeth|
| 16| 53| 34|Accelerometer-201...|Brush_teeth|
| 18| 52| 35|

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Combine the numeric x, y and z columns into a single vector column named
# 'features', which is the input format the clustering estimators consume.
feature_columns = ['x', 'y', 'z']
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [None]:
from pyspark.ml.clustering import KMeans

# K-means estimator with 13 clusters; the fixed seed makes the run repeatable.
kmeans = KMeans(k=13, seed=1)

In [None]:
from pyspark.ml import Pipeline

# Chain feature assembly and clustering so both are fitted in one call.
clustering_stages = [vectorAssembler, kmeans]
pipeline = Pipeline(stages=clustering_stages)


In [None]:
# Fit every pipeline stage on the full data set; the result is a fitted PipelineModel.
model = pipeline.fit(df)

In [None]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Run the fitted pipeline over the data to attach a cluster assignment to each row.
predictions = model.transform(df)

# Score the clustering with the evaluator's default metric (Silhouette).
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)

print(f"Silhouette with squared euclidean distance = {silhouette}")

Silhouette with squared euclidean distance = 0.4153293521373778


The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. If most objects have a high value, then the clustering configuration is appropriate. If many points have a low or negative value, then the clustering configuration may have too many or too few clusters. (Source: [Silhouette](https://en.wikipedia.org/wiki/Silhouette))

Here we use only two classes: `Climb_stairs` and `Brush_teeth`.

In [None]:
# Register the full data set as SQL view 'df_table' and query that same view,
# so this cell does not depend on a view registered by an earlier cell.
df.createOrReplaceTempView('df_table')

# Keep only the rows belonging to the two activities compared below.
df2 = spark.sql("select * from df_table where class in ('Climb_stairs', 'Brush_teeth')")

Don't forget to change the K value

In [None]:
from pyspark.ml.clustering import KMeans

# Two clusters, one per remaining class; the seed keeps the result repeatable.
kmeans = KMeans(k=2, seed=1)

In [None]:
from pyspark.ml import Pipeline

# Rebuild the pipeline so it picks up the K-means estimator defined just above.
two_class_stages = [vectorAssembler, kmeans]
pipeline = Pipeline(stages=two_class_stages)

In [None]:
# Fit the pipeline on the two-class subset only.
model2 = pipeline.fit(df2)

In [None]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Attach cluster assignments to the two-class subset.
predictions = model2.transform(df2)

# Score the clustering with the evaluator's default metric (Silhouette).
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)

print(f"Silhouette with squared euclidean distance = {silhouette}")

Silhouette with squared euclidean distance = 0.6233363868811865


## Exercise

### Let's do the Data Engineering

In [None]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline

# Label side: map the class string to a numeric index, then one-hot encode it.
indexer = StringIndexer(inputCol="class", outputCol="classIndex")
encoder = OneHotEncoder(inputCol="classIndex", outputCol="categoryVec")

# Feature side: assemble x/y/z into one vector, then scale each row by its
# L1 norm (p=1.0) into 'features_norm'.
vectorAssembler = VectorAssembler(inputCols=["x", "y", "z"], outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)

# Fit and apply all four stages, then print the enriched rows.
engineering_stages = [indexer, encoder, vectorAssembler, normalizer]
pipeline = Pipeline(stages=engineering_stages)
model = pipeline.fit(df)
prediction = model.transform(df)
prediction.show()

+---+---+---+--------------------+-----------+----------+--------------+----------------+--------------------+
|  x|  y|  z|              source|      class|classIndex|   categoryVec|        features|       features_norm|
+---+---+---+--------------------+-----------+----------+--------------+----------------+--------------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.20754716981132...|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.20754716981132...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.20183486238532...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.20183486238532...|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[21.0,52.0,34.0]|[0.19626168224299...|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,51.0,34.0]|[0.20560747663551...|
|

Now let’s create a new pipeline for kmeans.

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# K-means on the raw 'features' column with 14 clusters and a fixed seed.
kmeans = KMeans(featuresCol="features", k=14, seed=1)
pipeline = Pipeline(stages=[vectorAssembler, kmeans])

# Fit on the full data set and assign each row to a cluster.
model = pipeline.fit(df)
predictions = model.transform(df)

# Score the clustering with the evaluator's default metric (Silhouette).
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)

print(f"Silhouette with squared euclidean distance = {silhouette}")

Silhouette with squared euclidean distance = 0.41244594513295846


We have 14 different movement patterns in the dataset, so setting K of KMeans to 14 is a good idea. But please experiment with different values for K, do you find a sweet spot? The closer Silhouette gets to 1, the better.

In [None]:
# please change the pipeline to check performance for different K, feel free to use a loop

In [None]:
# Sweep K from 10 to 16 and report the Silhouette score for each value.
# The evaluator has no per-iteration configuration, so it is created once.
evaluator = ClusteringEvaluator()

for i in range(10, 17):
    # Fresh estimator and pipeline per K; the seed is kept fixed throughout.
    kmeans = KMeans(featuresCol="features", k=i, seed=1)
    pipeline = Pipeline(stages=[vectorAssembler, kmeans])

    model = pipeline.fit(df)
    predictions = model.transform(df)
    silhouette = evaluator.evaluate(predictions)

    print("The value of K is: ", i)
    print(f"Silhouette with squared euclidean distance = {silhouette}")

The value of K is:  10
Silhouette with squared euclidean distance = 0.47370428136987536
The value of K is:  11
Silhouette with squared euclidean distance = 0.4819049717562352
The value of K is:  12
Silhouette with squared euclidean distance = 0.40964155503229643
The value of K is:  13
Silhouette with squared euclidean distance = 0.4153293521373778
The value of K is:  14
Silhouette with squared euclidean distance = 0.41244594513295846
The value of K is:  15
Silhouette with squared euclidean distance = 0.41771495579360896
The value of K is:  16
Silhouette with squared euclidean distance = 0.39594610810727193


Now please extend the pipeline to work on the normalized features. You need to tell KMeans to use the normalized feature column and change the pipeline in order to contain the normalizer stage as well.

In [None]:
# EXERCISE TEMPLATE: the `$$` markers are placeholders to be filled in, so this
# cell is not runnable as written.
#   - first `$$`: tell KMeans which feature column to cluster on
#   - second `$$`: build a Pipeline that also contains the normalizer stage
kmeans = KMeans($$).setK(14).setSeed(1)
pipeline = $$
model = pipeline.fit(df)

# Assign each row to a cluster using the fitted pipeline.
predictions = model.transform(df)

# Score the clustering with the evaluator's default metric (Silhouette).
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

In [None]:
# K-means on the L1-normalized features: the estimator reads 'features_norm',
# so the normalizer stage must run between the assembler and K-means.
kmeans = KMeans(featuresCol='features_norm').setK(14).setSeed(1)

# The new pipeline has to be bound to `pipeline`; otherwise the fit below
# silently reuses whichever pipeline an earlier cell left in that variable.
pipeline = Pipeline(stages=[vectorAssembler, normalizer, kmeans])
model = pipeline.fit(df)

# Assign each row to a cluster using the fitted pipeline.
predictions = model.transform(df)

# Score the clustering with the evaluator's default metric (Silhouette).
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))


Silhouette with squared euclidean distance = 0.39594610810727193


In [None]:
from pyspark.sql.functions import col

# Build a copy of the data in which x is multiplied by 10, putting it on a
# different scale from y and z. The rescaled column is appended after the
# untouched columns, so the resulting order is y, z, source, class, x.
untouched_columns = [name for name in df.columns if name != 'x']
df_denormalized = df.select(*untouched_columns, (col('x') * 10).alias('x'))

In [None]:
# Print the first rows to confirm that x has been scaled by 10.
df_denormalized.show()

+---+---+--------------------+-----------+---+
|  y|  z|              source|      class|  x|
+---+---+--------------------+-----------+---+
| 49| 35|Accelerometer-201...|Brush_teeth|220|
| 49| 35|Accelerometer-201...|Brush_teeth|220|
| 52| 35|Accelerometer-201...|Brush_teeth|220|
| 52| 35|Accelerometer-201...|Brush_teeth|220|
| 52| 34|Accelerometer-201...|Brush_teeth|210|
| 51| 34|Accelerometer-201...|Brush_teeth|220|
| 50| 35|Accelerometer-201...|Brush_teeth|200|
| 52| 34|Accelerometer-201...|Brush_teeth|220|
| 50| 34|Accelerometer-201...|Brush_teeth|220|
| 51| 35|Accelerometer-201...|Brush_teeth|220|
| 51| 33|Accelerometer-201...|Brush_teeth|210|
| 50| 34|Accelerometer-201...|Brush_teeth|200|
| 49| 33|Accelerometer-201...|Brush_teeth|210|
| 49| 33|Accelerometer-201...|Brush_teeth|210|
| 51| 35|Accelerometer-201...|Brush_teeth|200|
| 49| 34|Accelerometer-201...|Brush_teeth|180|
| 48| 34|Accelerometer-201...|Brush_teeth|190|
| 53| 34|Accelerometer-201...|Brush_teeth|160|
| 52| 35|Acce

In [None]:
# Same K-means setup as before (14 clusters, fixed seed), this time fitted on
# the data set whose x column was scaled by 10.
kmeans = KMeans(featuresCol="features", k=14, seed=1)
pipeline = Pipeline(stages=[vectorAssembler, kmeans])

model = pipeline.fit(df_denormalized)
predictions = model.transform(df_denormalized)

# Score the clustering with the evaluator's default metric (Silhouette).
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)

print(f"Silhouette with squared euclidean distance = {silhouette}")

Silhouette with squared euclidean distance = 0.5709023393004293


Apache SparkML can be used to try many different algorithms and parametrizations using the same pipeline. Please change the code below to use GaussianMixture over KMeans. Please use the following link for your reference.
[GaussianMixture](https://spark.apache.org/docs/latest/ml-clustering.html#gaussian-mixture-model-gmm)


In [None]:
from pyspark.ml.clustering import GaussianMixture

# EXERCISE TEMPLATE: the `$$` markers are placeholders to be filled in, so this
# cell is not runnable as written.
#   - first `$$`: create a GaussianMixture estimator in place of KMeans
#   - second `$$`: build a Pipeline that ends with that estimator
gmm = $$
pipeline = $$

# Fit the pipeline on the full data set.
model = pipeline.fit(df)

# Assign each row to a cluster using the fitted pipeline.
predictions = model.transform(df)

# Score the clustering with the evaluator's default metric (Silhouette).
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

In [None]:
from pyspark.ml.clustering import GaussianMixture

# Swap K-means for a Gaussian mixture model: 14 components, fixed seed,
# fitted on the raw 'features' column.
gmm = GaussianMixture(featuresCol="features", k=14, seed=1)
pipeline = Pipeline(stages=[vectorAssembler, gmm])

model = pipeline.fit(df)
predictions = model.transform(df)

# Score the clustering with the evaluator's default metric (Silhouette).
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)

print(f"Silhouette with squared euclidean distance = {silhouette}")

Silhouette with squared euclidean distance = 0.15906267433367427


In [None]:
from pyspark.ml.clustering import GaussianMixture

# Gaussian mixture on the L1-normalized features: the estimator reads
# 'features_norm', so the normalizer stage must precede it in the pipeline.
gmm = GaussianMixture(featuresCol='features_norm').setK(14).setSeed(1)

# The new pipeline has to be bound to `pipeline`; otherwise the fit below
# silently reuses whichever pipeline an earlier cell left in that variable.
pipeline = Pipeline(stages=[vectorAssembler, normalizer, gmm])

model = pipeline.fit(df)

# Assign each row to a mixture component using the fitted pipeline.
predictions = model.transform(df)

# Score the clustering with the evaluator's default metric (Silhouette).
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.15906267433367427
