In [3]:
# Bootstrap PySpark for this kernel. findspark.init() puts the local Spark
# installation's Python bindings on sys.path, so it has to run before
# `import pyspark` -- keep these three lines in this order.
import findspark
findspark.init()
import pyspark

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

# Create the Spark session for this notebook, or reuse one that already exists.
# NOTE(review): "spark.some.config.option" looks like the placeholder key from
# the Spark SQL getting-started example rather than a real setting -- confirm
# whether it can be dropped.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Read the student scores; the header row names the columns and Spark infers
# the column types from the data.
raw = spark.read.csv("StudentsPerformance.csv", header=True, inferSchema=True)

# Tag every row with a unique, increasing id. The ids are not guaranteed to be
# consecutive when the data spans more than one partition.
df = raw.withColumn("id", monotonically_increasing_id())
df.show()

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+---+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score| id|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+---+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|  0|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|  1|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|  2|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|  3|
|  male|       group C|               some college|    standard|            

In [13]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Keep only the row id, the prep-course label and the three scores. The output
# recorded for this cell contains exactly these columns, so the selection is
# spelled out here rather than depending on leftover kernel state; without it
# a fresh Restart & Run All would carry gender, race/ethnicity, parental
# education and lunch through every later stage.
student_scores = df.select(
    "id",
    "test preparation course",
    "math score",
    "reading score",
    "writing score",
)

# Encode the prep-course label as a numeric index. StringIndexer orders labels
# by descending frequency by default; in the recorded output "none" maps to
# 0.0 and "completed" to 1.0.
indexer = StringIndexer(inputCol="test preparation course", outputCol="preparation")
indexed = indexer.fit(student_scores).transform(student_scores)
indexed.show()

# Pack the encoded flag and the three scores into the single vector column
# that the clustering stage reads.
assembler = VectorAssembler(
    inputCols=["preparation", "math score", "reading score", "writing score"],
    outputCol="features",
)
preprocessed = assembler.transform(indexed)
preprocessed.show()

+---+-----------------------+----------+-------------+-------------+-----------+
| id|test preparation course|math score|reading score|writing score|preparation|
+---+-----------------------+----------+-------------+-------------+-----------+
|  0|                   none|        72|           72|           74|        0.0|
|  1|              completed|        69|           90|           88|        1.0|
|  2|                   none|        90|           95|           93|        0.0|
|  3|                   none|        47|           57|           44|        0.0|
|  4|                   none|        76|           78|           75|        0.0|
|  5|                   none|        71|           83|           78|        0.0|
|  6|              completed|        88|           95|           92|        1.0|
|  7|                   none|        40|           43|           39|        0.0|
|  8|              completed|        64|           64|           67|        1.0|
|  9|                   none

In [32]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Six clusters, with a fixed seed so centroid initialisation is repeatable.
# NOTE(review): the feature vector mixes a 0/1 flag with 0-100 scores and is
# not scaled, so the flag presumably carries very little weight in the
# distance -- confirm this is intended.
kmeans = KMeans(k=6, seed=1)
model = kmeans.fit(preprocessed)

# Append each row's assigned cluster as the `prediction` column.
y = model.transform(preprocessed)
y.show()

+---+-----------------------+----------+-------------+-------------+-----------+--------------------+----------+
| id|test preparation course|math score|reading score|writing score|preparation|            features|prediction|
+---+-----------------------+----------+-------------+-------------+-----------+--------------------+----------+
|  0|                   none|        72|           72|           74|        0.0|[0.0,72.0,72.0,74.0]|         3|
|  1|              completed|        69|           90|           88|        1.0|[1.0,69.0,90.0,88.0]|         4|
|  2|                   none|        90|           95|           93|        0.0|[0.0,90.0,95.0,93.0]|         1|
|  3|                   none|        47|           57|           44|        0.0|[0.0,47.0,57.0,44.0]|         2|
|  4|                   none|        76|           78|           75|        0.0|[0.0,76.0,78.0,75.0]|         4|
|  5|                   none|        71|           83|           78|        0.0|[0.0,71.0,83.0,7

In [33]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Score the clustering with the evaluator's default settings.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(y)
print("Silhouette with squared euclidean distance = ", silhouette, sep="")

# List the fitted centroid of every cluster, one per line.
centers = model.clusterCenters()
print("\n\nCluster Centers: ")
for centroid in centers:
    print(centroid)

Silhouette with squared euclidean distance = 0.4574073166675022


Cluster Centers: 
[ 0.24019608 59.19607843 61.39215686 60.07352941]
[ 0.53333333 88.275      91.325      91.13333333]
[ 0.21153846 49.21794872 51.42307692 49.28205128]
[ 0.38223938 67.16602317 70.67953668 70.27799228]
[ 0.48660714 76.59375    80.47767857 79.15178571]
[ 0.10810811 32.13513514 35.97297297 33.59459459]
