# Mengimport library dan package machine learning

In [1]:
import findspark
findspark.init()
import pyspark

In [2]:
# mengimport modul yang dibutuhkan
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# membuat session
appName = "Clustering di Apache Spark"
spark = SparkSession \
.builder \
.appName(appName) \
.config("spark.some.config.option", "some-value") \
.getOrCreate()

# Memuat data kostumer dari file

In [3]:
# memuat data dari file ke DataFrame dengan infer skema
customers = spark.read.csv('customers.csv', inferSchema=True, header=True)
customers.show(3)

+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
|   CustomerName|Age|MaritalStatus|IncomeRange|Gender|TotalChildren|ChildrenAtHome|Education|Occupation|HomeOwner|Cars|
+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
|    Aaron Adams| 42|            0|      50000|     0|            0|             0|        3|         2|        1|   1|
|Aaron Alexander| 40|            1|      50000|     0|            0|             0|        2|         2|        1|   2|
|    Aaron Allen| 63|            0|      25000|     0|            2|             1|        2|         1|        1|   2|
+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
only showing top 3 rows



# Menyiapkan data training

In [4]:
# membuat assembler untuk mengubah fitur menjadi satu kolom fitur
assembler = VectorAssembler(inputCols = [
"Age", "MaritalStatus", "IncomeRange", "Gender", "TotalChildren",
"ChildrenAtHome", "Education", "Occupation", "HomeOwner", "Cars"],
outputCol="features")
train = assembler.transform(customers).select('CustomerName', 'features')
train.show(truncate = False, n=3)

+---------------+----------------------------------------------+
|CustomerName   |features                                      |
+---------------+----------------------------------------------+
|Aaron Adams    |[42.0,0.0,50000.0,0.0,0.0,0.0,3.0,2.0,1.0,1.0]|
|Aaron Alexander|[40.0,1.0,50000.0,0.0,0.0,0.0,2.0,2.0,1.0,2.0]|
|Aaron Allen    |[63.0,0.0,25000.0,0.0,2.0,1.0,2.0,1.0,1.0,2.0]|
+---------------+----------------------------------------------+
only showing top 3 rows



# Membuat model k-Means Clustering

In [5]:
# mendefinisikan algoritma clustering
kmeans = KMeans(
featuresCol = assembler.getOutputCol(), predictionCol="cluster",
k=5, seed=0)

# mentraining model dengan perintah ".fit()"
model = kmeans.fit(train)
print("Model selesai dibuat!")

Model selesai dibuat!


# Mencari nilai titik tengah dari setiap cluster

In [6]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[5.31013005e+01 4.17180014e-01 2.50000000e+04 4.80492813e-01
 1.41512663e+00 6.08487337e-01 2.31622177e+00 1.45448323e+00
 5.93086927e-01 1.11464750e+00]
[5.53417813e+01 5.72411296e-01 1.00000000e+05 4.97103548e-01
 2.54380883e+00 1.54272266e+00 3.46198407e+00 4.19116582e+00
 7.16509776e-01 1.94532947e+00]
[5.19737441e+01 5.26868545e-01 5.00000000e+04 4.93961141e-01
 1.34552774e+00 4.98337126e-01 3.23035183e+00 2.77927534e+00
 6.62699107e-01 1.14615789e+00]
[5.60711289e+01 5.83804487e-01 7.50000000e+04 5.03921211e-01
 2.17308043e+00 8.16706183e-01 3.73244574e+00 3.92759438e+00
 7.23326646e-01 1.38063104e+00]
[5.82794840e+01 6.22850123e-01 1.50000000e+05 4.79729730e-01
 2.07248157e+00 3.20638821e+00 3.41461916e+00 4.34705160e+00
 6.48648649e-01 3.10995086e+00]


# Memprediksi cluster

In [7]:
prediction = model.transform(train) # melakukan prediksi klaster
prediction.groupBy("cluster").count().orderBy("cluster").show()
prediction.select('CustomerName', 'cluster').show(5)

+-------+-----+
|cluster|count|
+-------+-----+
|      0| 2922|
|      1| 2762|
|      2| 5713|
|      3| 5483|
|      4| 1628|
+-------+-----+

+---------------+-------+
|   CustomerName|cluster|
+---------------+-------+
|    Aaron Adams|      2|
|Aaron Alexander|      2|
|    Aaron Allen|      0|
|    Aaron Baker|      2|
|   Aaron Bryant|      3|
+---------------+-------+
only showing top 5 rows

