# Clustering
## Platform: Spark, colab.research.google.com

In [0]:
# install Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
# init Spark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.1-bin-hadoop2.7"
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
import pandas as pd
import time

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

In [13]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [14]:
customers = spark.read.csv("/content/gdrive/My Drive/Colab Notebooks/SparkAzureTutorial/data/customers.csv", header=True, inferSchema=True)
customers.show(5, False)

+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
|CustomerName   |Age|MaritalStatus|IncomeRange|Gender|TotalChildren|ChildrenAtHome|Education|Occupation|HomeOwner|Cars|
+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
|Aaron Adams    |42 |0            |50000      |0     |0            |0             |3        |2         |1        |1   |
|Aaron Alexander|40 |1            |50000      |0     |0            |0             |2        |2         |1        |2   |
|Aaron Allen    |63 |0            |25000      |0     |2            |1             |2        |1         |1        |2   |
|Aaron Baker    |56 |1            |50000      |0     |4            |2             |2        |2         |1        |2   |
|Aaron Bryant   |72 |0            |75000      |0     |4            |0             |4        |4         |1        |2   |
+---------------+---+-------------+-----

In [0]:
assembler = VectorAssembler(inputCols = ["Age", "MaritalStatus", "IncomeRange", "Gender", "TotalChildren", "ChildrenAtHome", "Education", "Occupation", "HomeOwner", "Cars"], outputCol="features")
train = assembler.transform(customers)

kmeans = KMeans(featuresCol=assembler.getOutputCol(), predictionCol="cluster", k=5, seed=0)
model = kmeans.fit(train)

In [16]:
prediction = model.transform(train)
prediction.select("CustomerName", "cluster").show(50)

+----------------+-------+
|    CustomerName|cluster|
+----------------+-------+
|     Aaron Adams|      0|
| Aaron Alexander|      0|
|     Aaron Allen|      4|
|     Aaron Baker|      0|
|    Aaron Bryant|      3|
|    Aaron Butler|      3|
|  Aaron Campbell|      3|
|    Aaron Carter|      0|
|      Aaron Chen|      3|
|   Aaron Coleman|      0|
|   Aaron Collins|      1|
|      Aaron Diaz|      2|
|   Aaron Edwards|      1|
|     Aaron Evans|      3|
|    Aaron Flores|      3|
|    Aaron Foster|      3|
|  Aaron Gonzales|      3|
|  Aaron Gonzalez|      0|
|     Aaron Green|      0|
|     Aaron Green|      0|
|   Aaron Griffin|      4|
|      Aaron Hall|      0|
|     Aaron Hayes|      2|
| Aaron Henderson|      0|
| Aaron Hernandez|      0|
|      Aaron Hill|      2|
|    Aaron Hughes|      2|
|       Aaron Jai|      3|
|   Aaron Jenkins|      0|
|      Aaron King|      3|
|     Aaron Kumar|      3|
|       Aaron Lal|      0|
|        Aaron Li|      3|
|  Aaron McDonald|      0|
|

In [17]:
prediction.groupBy("cluster").count().orderBy("cluster").show()

+-------+-----+
|cluster|count|
+-------+-----+
|      0| 5713|
|      1| 1628|
|      2| 2762|
|      3| 5483|
|      4| 2922|
+-------+-----+

