In [5]:
import findspark

In [6]:
# Let findspark locate the Spark installation and add pyspark to sys.path.
# This is meant to run before the `pyspark` imports in the following cells.
findspark.init()

In [3]:
import pyspark

In [9]:
import os

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

In [10]:
# Start (or reuse) the SparkSession that every later cell works through.
appName = "Clustering in Spark"
# NOTE(review): "spark.some.config.option" looks like the placeholder from the
# Spark getting-started example rather than a real setting -- confirm and drop.
spark = (
    SparkSession.builder
    .appName(appName)
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [11]:
# Column reference for the customers dataset. This cell only evaluates (and
# therefore displays) the description string; the CSV itself is read in the
# next cell.
# NOTE(review): the sample rows displayed after loading show Gender=0, which
# the "1=female, 2=male" coding below does not cover -- confirm the encoding.
"""
Description for each column data
CustomerName: name of customer
Age: age of customer(in year)
MaritalStatus: (1=married,0=not married)
IncomeRange: income per year in usd
Gender: (1=female, 2=male)
Totalchildren: number of children customer has
ChildrenAtHome: number of children living with customer(in the same home)
Education: (1=high school, 2=bachelor, 3=master, 4=PhD, 5=Postdoc)
Occupation: (0=unskilled manual work util, 5=professional)
Homeowner:(1=owning home, 0=not owning home)
Cars:number of car customer has
"""


'\nDescription for each column data\nCustomerName: name of customer\nAge: age of customer(in year)\nMaritalStatus: (1=married,0=not married)\nIncomeRange: income per year in usd\nGender: (1=female, 2=male)\nTotalchildren: number of children customer has\nChildrenAtHome: number of children living with customer(in the same home)\nEducation: (1=high school, 2=bachelor, 3=master, 4=PhD, 5=Postdoc)\nOccupation: (0=unskilled manual work util, 5=professional)\nHomeowner:(1=owning home, 0=not owning home)\nCars:number of car customer has\n'

In [12]:
# Read the customers CSV into a DataFrame, using the header row for column
# names and letting Spark infer the column types.
# The file location can be overridden with the CUSTOMERS_CSV environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original path is used unchanged.
customers_path = os.environ.get(
    "CUSTOMERS_CSV",
    'C:/Users/aayushi srivastava/Documents/AayushiSrivastavaJobSearch/PySparkProjects/dataset/customers.csv',
)
customers = spark.read.csv(customers_path, inferSchema=True, header=True)
customers.show(3)

+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
|   CustomerName|Age|MaritalStatus|IncomeRange|Gender|TotalChildren|ChildrenAtHome|Education|Occupation|HomeOwner|Cars|
+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
|    Aaron Adams| 42|            0|      50000|     0|            0|             0|        3|         2|        1|   1|
|Aaron Alexander| 40|            1|      50000|     0|            0|             0|        2|         2|        1|   2|
|    Aaron Allen| 63|            0|      25000|     0|            2|             1|        2|         1|        1|   2|
+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
only showing top 3 rows



In [13]:
# Prepare the data: pack the numeric customer attributes into one vector
# column, which is the input format the clustering estimator consumes.
# NOTE(review): the columns are assembled unscaled. IncomeRange is in the tens
# of thousands while every other column is well below 100, so it will dominate
# the distance computation -- consider standardising the features first.
feature_columns = [
    "Age",
    "MaritalStatus",
    "IncomeRange",
    "Gender",
    "TotalChildren",
    "ChildrenAtHome",
    "Education",
    "Occupation",
    "HomeOwner",
    "Cars",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Keep only the customer name and the assembled vector for modelling.
assembled = assembler.transform(customers)
data = assembled.select('CustomerName', 'features')
data.show(n=3, truncate=False)

+---------------+----------------------------------------------+
|CustomerName   |features                                      |
+---------------+----------------------------------------------+
|Aaron Adams    |[42.0,0.0,50000.0,0.0,0.0,0.0,3.0,2.0,1.0,1.0]|
|Aaron Alexander|[40.0,1.0,50000.0,0.0,0.0,0.0,2.0,2.0,1.0,2.0]|
|Aaron Allen    |[63.0,0.0,25000.0,0.0,2.0,1.0,2.0,1.0,1.0,2.0]|
+---------------+----------------------------------------------+
only showing top 3 rows



In [14]:
# Create and fit the k-means clustering model (5 clusters) on the assembled
# feature vectors; predictions will be written to a "cluster" column.
# An explicit seed pins the random centroid initialisation so that the cluster
# ids, sizes and centroids reported in later cells can be reproduced on re-run.
kmeans = KMeans(
    featuresCol=assembler.getOutputCol(),
    predictionCol="cluster",
    k=5,
    seed=42,
)

model = kmeans.fit(data)
print("Model is successfully trained")

Model is successfully trained


In [15]:
# Display the fitted centroid of every cluster, one vector per cluster.
# The components of each vector follow the column order given to the assembler.
centers = model.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

Cluster Centers: 
[5.82794840e+01 6.22850123e-01 1.50000000e+05 4.79729730e-01
 2.07248157e+00 3.20638821e+00 3.41461916e+00 4.34705160e+00
 6.48648649e-01 3.10995086e+00]
[5.31013005e+01 4.17180014e-01 2.50000000e+04 4.80492813e-01
 1.41512663e+00 6.08487337e-01 2.31622177e+00 1.45448323e+00
 5.93086927e-01 1.11464750e+00]
[5.60711289e+01 5.83804487e-01 7.50000000e+04 5.03921211e-01
 2.17308043e+00 8.16706183e-01 3.73244574e+00 3.92759438e+00
 7.23326646e-01 1.38063104e+00]
[5.19737441e+01 5.26868545e-01 5.00000000e+04 4.93961141e-01
 1.34552774e+00 4.98337126e-01 3.23035183e+00 2.77927534e+00
 6.62699107e-01 1.14615789e+00]
[5.53417813e+01 5.72411296e-01 1.00000000e+05 4.97103548e-01
 2.54380883e+00 1.54272266e+00 3.46198407e+00 4.19116582e+00
 7.16509776e-01 1.94532947e+00]


In [16]:
# Cluster the data: label every customer with the id of its assigned cluster.
prediction = model.transform(data)

# Number of customers per cluster, ordered by cluster id.
cluster_sizes = prediction.groupBy("cluster").count().orderBy("cluster")
cluster_sizes.show()

# Peek at the first few customer-to-cluster assignments.
prediction.select("CustomerName", "cluster").show(5)

+-------+-----+
|cluster|count|
+-------+-----+
|      0| 1628|
|      1| 2922|
|      2| 5483|
|      3| 5713|
|      4| 2762|
+-------+-----+

+---------------+-------+
|   CustomerName|cluster|
+---------------+-------+
|    Aaron Adams|      3|
|Aaron Alexander|      3|
|    Aaron Allen|      1|
|    Aaron Baker|      3|
|   Aaron Bryant|      2|
+---------------+-------+
only showing top 5 rows

