In [1]:
import urllib
import zipfile

try:
    from urllib.request import urlretrieve  # Python 3 location
except ImportError:
    from urllib import urlretrieve  # Python 2 location

DATA_URL = "http://malifauzi.lecture.ub.ac.id/files/2019/05/ccdata2.zip"
ARCHIVE_NAME = "ccdata2.zip"

# Download the zipped dataset into the working directory. `f` keeps the
# (local_filename, headers) tuple returned by urlretrieve, as before.
f = urlretrieve(DATA_URL, ARCHIVE_NAME)

# The loading cell reads "ccdata2.csv", so the archive has to be unpacked
# before the notebook can run top to bottom on a fresh kernel.
# NOTE(review): assumes the archive holds ccdata2.csv at its top level -- confirm.
with zipfile.ZipFile(ARCHIVE_NAME) as archive:
    archive.extractall()

### Membaca data dari file

In [12]:
# Configure a CSV reader that treats the first row as column names and lets
# Spark infer the column types, then load the credit-card data file.
csv_reader = spark.read.format("csv").options(header='true', inferschema='true')
dataset = csv_reader.load("ccdata2.csv")

# Preview the loaded rows.
dataset.show()

+-------+-----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+-----------+----------------+----------------+------+
|CUST_ID|    BALANCE|BALANCE_FREQUENCY|PURCHASES|ONEOFF_PURCHASES|INSTALLMENTS_PURCHASES|CASH_ADVANCE|PURCHASES_FREQUENCY|ONEOFF_PURCHASES_FREQUENCY|PURCHASES_INSTALLMENTS_FREQUENCY|CASH_ADVANCE_FREQUENCY|CASH_ADVANCE_TRX|PURCHASES_TRX|CREDIT_LIMIT|   PAYMENTS|MINIMUM_PAYMENTS|PRC_FULL_PAYMENT|TENURE|
+-------+-----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+-----------+----------------+----------------+------+
| C10001|  40.900749|         0.818182|     95.4|             0.0|                  95.4|  

### Menentukan Fitur

In [13]:
from pyspark.ml.feature import VectorAssembler

# Columns that are packed into the single vector column used for clustering.
# NOTE(review): these columns are on very different scales in the preview
# output and are not standardized here -- confirm that is intended.
feature_columns = ["BALANCE", "BALANCE_FREQUENCY", "PURCHASES"]

vecAssembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Append the assembled "features" column to the full dataset.
new_dataset = vecAssembler.transform(dataset)

# Keep only the customer id and its feature vector for the clustering step.
new_dataset_f = new_dataset.select("CUST_ID", "features")
new_dataset_f.show()

+-------+--------------------+
|CUST_ID|            features|
+-------+--------------------+
| C10001|[40.900749,0.8181...|
| C10002|[3202.467416,0.90...|
| C10003|[2495.148862,1.0,...|
| C10004|[1666.670542,0.63...|
| C10005|[817.714335,1.0,1...|
| C10006|[1809.828751,1.0,...|
| C10007|[627.260806,1.0,7...|
| C10008|[1823.652743,1.0,...|
| C10009|[1014.926473,1.0,...|
| C10010|[152.225975,0.545...|
| C10011|[1293.124939,1.0,...|
| C10012|[630.794744,0.818...|
| C10013|[1516.92862,1.0,3...|
| C10014|[921.693369,1.0,2...|
| C10015|[2772.772734,1.0,...|
| C10016|[6886.213231,1.0,...|
| C10017|[2072.074354,0.87...|
| C10018|[41.089489,0.4545...|
| C10019|[1989.072228,1.0,...|
| C10020|[3577.970933,1.0,...|
+-------+--------------------+
only showing top 20 rows



### Memilih algoritma dan menentukan parameter

In [14]:
from pyspark.ml.clustering import KMeans
# K-means estimator configured for 4 clusters; the fixed seed keeps the
# centroid initialization repeatable between runs.
kmeans = KMeans(k=4, seed=1)

### Membuat model

In [15]:
# Fit the configured K-means estimator on the (CUST_ID, features) frame.
model = kmeans.fit(dataset=new_dataset_f)

### Melakukan clustering

In [16]:
# Apply the fitted model to the same frame it was trained on; the result
# gains a "prediction" column holding each customer's cluster index.
clustering_result = model.transform(dataset=new_dataset_f)

# Preview the cluster assignments.
clustering_result.show()

+-------+--------------------+----------+
|CUST_ID|            features|prediction|
+-------+--------------------+----------+
| C10001|[40.900749,0.8181...|         1|
| C10002|[3202.467416,0.90...|         1|
| C10003|[2495.148862,1.0,...|         1|
| C10004|[1666.670542,0.63...|         1|
| C10005|[817.714335,1.0,1...|         1|
| C10006|[1809.828751,1.0,...|         1|
| C10007|[627.260806,1.0,7...|         0|
| C10008|[1823.652743,1.0,...|         1|
| C10009|[1014.926473,1.0,...|         1|
| C10010|[152.225975,0.545...|         1|
| C10011|[1293.124939,1.0,...|         1|
| C10012|[630.794744,0.818...|         1|
| C10013|[1516.92862,1.0,3...|         0|
| C10014|[921.693369,1.0,2...|         1|
| C10015|[2772.772734,1.0,...|         1|
| C10016|[6886.213231,1.0,...|         2|
| C10017|[2072.074354,0.87...|         1|
| C10018|[41.089489,0.4545...|         1|
| C10019|[1989.072228,1.0,...|         1|
| C10020|[3577.970933,1.0,...|         1|
+-------+--------------------+----

### Menyimpan hasil pada file

In [17]:
# Keep only the customer id and its assigned cluster for the export.
new_prediction = clustering_result.select('CUST_ID', 'prediction')
new_prediction.show()

# Destination directory for the exported CSV part files.
# NOTE(review): absolute, machine-specific path kept from the original cell;
# consider making it relative or configurable.
OUTPUT_DIR = "/home/hduser/Documents/clusteringresult"

# mode="overwrite" replaces the output of a previous run. With the default
# save mode the write raises an error once the directory exists, so the cell
# could not be re-run.
new_prediction.write.csv(OUTPUT_DIR, mode="overwrite", header='true')

+-------+----------+
|CUST_ID|prediction|
+-------+----------+
| C10001|         1|
| C10002|         1|
| C10003|         1|
| C10004|         1|
| C10005|         1|
| C10006|         1|
| C10007|         0|
| C10008|         1|
| C10009|         1|
| C10010|         1|
| C10011|         1|
| C10012|         1|
| C10013|         0|
| C10014|         1|
| C10015|         1|
| C10016|         2|
| C10017|         1|
| C10018|         1|
| C10019|         1|
| C10020|         1|
+-------+----------+
only showing top 20 rows



### Evaluasi hasil clustering

In [18]:
from pyspark.ml.evaluation import ClusteringEvaluator
# Evaluate the clustering by computing the Silhouette score.
evaluator = ClusteringEvaluator()

# Score the frame produced by model.transform(); it carries both the
# "features" and "prediction" columns. The original cell referenced
# `predictions`, a name no cell in this notebook defines, so it raised a
# NameError on a fresh kernel.
silhouette = evaluator.evaluate(clustering_result)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.704114226048


In [19]:
# Print the centroid of every cluster found by the fitted model, one per line.
centers = model.clusterCenters()
heading = "Cluster Centers: "
print(heading)
for position in range(len(centers)):
    center = centers[position]
    print(center)

Cluster Centers: 
[  2.02384007e+03   9.75757600e-01   4.93597533e+03]
[  1.19163583e+03   8.39585094e-01   5.80991990e+02]
[  6.93374045e+03   1.00000000e+00   1.55772000e+03]
[  8.52187991e+03   1.00000000e+00   1.45875312e+04]
