# Import modules and create Spark session


In [1]:
#import modules
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

#create session
appName = "Clustering in Spark"
spark = SparkSession \
    .builder \
    .appName(appName) \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [2]:
# Read file into dataFrame
# Description for each column data:
# 
# CustomerName: name of customer
# Age: age of customer (in year)/
# MaritalStatus: (1=married, 0=not married)
# IncomeRange: income per year (in USD)
# Gender: (1=female, 2=male)
# TotalChildren: number of children customer has
# ChildrenAtHome: number of children livin/g with customer (in the same home)
# Education: (1=high school, 2=bachelor, 3=master, 4=PhD, 5=Post-doc)
# Occupation: (0=unskilled manual work until 5=professional)
# HomeOwner: (1=owning a home, 0=not owning a home)
# Cars: number of car customer has

In [3]:

#read csv file using automatically inferred schema
customers = spark.read.csv(
    'dataset/customers.csv', inferSchema=True, header=True)
customers.show(3)

+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
|   CustomerName|Age|MaritalStatus|IncomeRange|Gender|TotalChildren|ChildrenAtHome|Education|Occupation|HomeOwner|Cars|
+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
|    Aaron Adams| 42|            0|      50000|     0|            0|             0|        3|         2|        1|   1|
|Aaron Alexander| 40|            1|      50000|     0|            0|             0|        2|         2|        1|   2|
|    Aaron Allen| 63|            0|      25000|     0|            2|             1|        2|         1|        1|   2|
+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
only showing top 3 rows



# Create Assembler

In [4]:
#define assembler
assembler = VectorAssembler(inputCols = [
    "Age", "MaritalStatus", "IncomeRange", "Gender", "TotalChildren", 
    "ChildrenAtHome", "Education", "Occupation", "HomeOwner", "Cars"], 
                            outputCol="features")


# Transform Data

In [13]:
#data = assembler.transform(customers)
data = assembler.transform(customers).select('CustomerName', 'features')
data.show(truncate = False, n=3)

+---------------+----------------------------------------------+
|CustomerName   |features                                      |
+---------------+----------------------------------------------+
|Aaron Adams    |[42.0,0.0,50000.0,0.0,0.0,0.0,3.0,2.0,1.0,1.0]|
|Aaron Alexander|[40.0,1.0,50000.0,0.0,0.0,0.0,2.0,2.0,1.0,2.0]|
|Aaron Allen    |[63.0,0.0,25000.0,0.0,2.0,1.0,2.0,1.0,1.0,2.0]|
+---------------+----------------------------------------------+
only showing top 3 rows



# Create k-Means clustering model¶


In [21]:
#define kMeans clustering algorithm
kmeans = KMeans(
    featuresCol=assembler.getOutputCol(), 
    predictionCol="clustering", k=8)
model = kmeans.fit(data)
print ("Model is successfully trained!")

Model is successfully trained!


# Print centroid for each cluster¶


In [22]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[5.60711289e+01 5.83804487e-01 7.50000000e+04 5.03921211e-01
 2.17308043e+00 8.16706183e-01 3.73244574e+00 3.92759438e+00
 7.23326646e-01 1.38063104e+00]
[6.77522388e+01 6.95522388e-01 1.50000000e+05 4.86567164e-01
 2.97611940e+00 4.17313433e+00 3.20746269e+00 4.43880597e+00
 6.55223881e-01 3.25223881e+00]
[5.31013005e+01 4.17180014e-01 2.50000000e+04 4.80492813e-01
 1.41512663e+00 6.08487337e-01 2.31622177e+00 1.45448323e+00
 5.93086927e-01 1.11464750e+00]
[6.64743041e+01 6.81477516e-01 5.00000000e+04 4.95182013e-01
 2.22805139e+00 9.15417559e-02 3.03104925e+00 3.32334047e+00
 6.80406852e-01 1.48768737e+00]
[6.42970464e+01 5.93248945e-01 1.00000000e+05 4.93670886e-01
 3.19156118e+00 8.70886076e-01 3.15105485e+00 3.85400844e+00
 7.24894515e-01 1.85738397e+00]
[4.49289987e+01 4.51755527e-01 5.00000000e+04 4.93368010e-01
 9.16775033e-01 6.95968791e-01 3.32717815e+00 2.51495449e+00
 6.54096229e-01 9.80234070e-01]
[5.16544885e+01 5.72025052e-01 1.50000000e+05 4.74947808e-

# Cluster the data¶


In [25]:
prediction = model.transform(data)#cluster given data
prediction.show(5)
prediction.groupBy("clustering").count().orderBy("clustering").show()#count members in each cluster
prediction.select('CustomerName', 'clustering').show(5)#show several clustered data

+---------------+--------------------+----------+
|   CustomerName|            features|clustering|
+---------------+--------------------+----------+
|    Aaron Adams|[42.0,0.0,50000.0...|         5|
|Aaron Alexander|[40.0,1.0,50000.0...|         5|
|    Aaron Allen|[63.0,0.0,25000.0...|         2|
|    Aaron Baker|[56.0,1.0,50000.0...|         3|
|   Aaron Bryant|[72.0,0.0,75000.0...|         0|
+---------------+--------------------+----------+
only showing top 5 rows

+----------+-----+
|clustering|count|
+----------+-----+
|         0| 5483|
|         1|  670|
|         2| 2922|
|         3| 1868|
|         4| 1185|
|         5| 3845|
|         6|  958|
|         7| 1577|
+----------+-----+

+---------------+----------+
|   CustomerName|clustering|
+---------------+----------+
|    Aaron Adams|         5|
|Aaron Alexander|         5|
|    Aaron Allen|         2|
|    Aaron Baker|         3|
|   Aaron Bryant|         0|
+---------------+----------+
only showing top 5 rows

