2. Build a Clustering Model with Spark Using a Dataset of Your Choice

Initialize Spark Session

In [15]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession used by every later cell in this notebook.
try:
    spark = (
        SparkSession.builder
        .appName("ClusteringModel")
        .getOrCreate()
    )
    # Limit Spark's log output to ERROR so cell outputs stay readable.
    spark.sparkContext.setLogLevel("ERROR")
    print("Spark session initialized successfully.")
except Exception as e:
    print(f"Error initializing Spark session: {e}")
    # Re-raise rather than calling exit(1): exit() is an interactive helper
    # and does not cleanly abort a notebook run, whereas re-raising stops
    # "Run All" at this cell and preserves the full traceback.
    raise

Spark session initialized successfully.


Load Dataset (Iris, without target column)

In [16]:
import pandas as pd
from sklearn.datasets import load_iris

# Load the Iris measurements from scikit-learn. Only the feature matrix is
# used, so no label/target column is carried into the DataFrame.
iris = load_iris()
iris_df = pd.DataFrame(iris["data"], columns=iris["feature_names"])

# Convert the pandas frame into a Spark DataFrame and preview the first rows.
df = spark.createDataFrame(iris_df)
df.show(n=5)

+-----------------+----------------+-----------------+----------------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|
+-----------------+----------------+-----------------+----------------+
|              5.1|             3.5|              1.4|             0.2|
|              4.9|             3.0|              1.4|             0.2|
|              4.7|             3.2|              1.3|             0.2|
|              4.6|             3.1|              1.5|             0.2|
|              5.0|             3.6|              1.4|             0.2|
+-----------------+----------------+-----------------+----------------+
only showing top 5 rows



Feature Assembling for Clustering

In [17]:
from pyspark.ml.feature import VectorAssembler

# Combine the individual measurement columns into a single vector column
# named "features", and keep only that column for clustering.
feature_cols = iris.feature_names
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
df_assembled = (
    assembler
    .transform(df)
    .select("features")
)
df_assembled.show(n=5)

+-----------------+
|         features|
+-----------------+
|[5.1,3.5,1.4,0.2]|
|[4.9,3.0,1.4,0.2]|
|[4.7,3.2,1.3,0.2]|
|[4.6,3.1,1.5,0.2]|
|[5.0,3.6,1.4,0.2]|
+-----------------+
only showing top 5 rows



KMeans Clustering Model

In [18]:
from pyspark.ml.clustering import KMeans

# Configure KMeans with 3 clusters (chosen for the Iris data) and a fixed
# seed, reading input vectors from the "features" column.
kmeans = KMeans(
    k=3,
    seed=1,
    featuresCol="features",
)
# Fit the model on the assembled feature vectors.
model = kmeans.fit(df_assembled)

Predictions (Cluster Assignments)

In [19]:
# Apply the fitted model to the same DataFrame it was trained on, then
# preview the feature vectors alongside their assigned cluster index.
predictions = model.transform(df_assembled)
predictions.select(["features", "prediction"]).show(n=10)

+-----------------+----------+
|         features|prediction|
+-----------------+----------+
|[5.1,3.5,1.4,0.2]|         1|
|[4.9,3.0,1.4,0.2]|         1|
|[4.7,3.2,1.3,0.2]|         1|
|[4.6,3.1,1.5,0.2]|         1|
|[5.0,3.6,1.4,0.2]|         1|
|[5.4,3.9,1.7,0.4]|         1|
|[4.6,3.4,1.4,0.3]|         1|
|[5.0,3.4,1.5,0.2]|         1|
|[4.4,2.9,1.4,0.2]|         1|
|[4.9,3.1,1.5,0.1]|         1|
+-----------------+----------+
only showing top 10 rows



Evaluate Clustering (Silhouette Score)

In [20]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Spell out the evaluator's documented defaults so it is clear which columns
# and which metric produce the score reported below.
evaluator = ClusteringEvaluator(
    featuresCol="features",
    predictionCol="prediction",
    metricName="silhouette",
)

# Score the cluster assignments and report the result to four decimals.
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette Score: {silhouette:.4f}")

Silhouette Score: 0.7344


Cluster Centers

In [21]:
# Fetch the fitted cluster centers and print one per line, labelled by the
# center's position in the returned list.
centers = model.clusterCenters()
for cluster_id, centroid in enumerate(centers):
    print(f"Cluster {cluster_id} Center: {centroid}")

Cluster 0 Center: [6.85384615 3.07692308 5.71538462 2.05384615]
Cluster 1 Center: [5.006 3.428 1.462 0.246]
Cluster 2 Center: [5.88360656 2.74098361 4.38852459 1.43442623]


Stop Spark Session

In [22]:
# Stop the Spark session used by this notebook, then print a confirmation.
spark.stop()
print("Spark session stopped successfully.")


Spark session stopped successfully.
