<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-data" data-toc-modified-id="Load-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Get-vectorized-data" data-toc-modified-id="Get-vectorized-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Get vectorized data</a></span></li><li><span><a href="#Feature-scaling" data-toc-modified-id="Feature-scaling-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Feature scaling</a></span></li><li><span><a href="#Modelling" data-toc-modified-id="Modelling-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Modelling</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

# Report the versions of the core libraries this notebook was run with.
print([(lib.__name__, lib.__version__) for lib in (np, pd, pyspark)])

# Create (or reuse) the Spark session and expose the legacy entry points
# (SparkContext / SQLContext) derived from it.
spark = SparkSession.builder.appName('bhishan').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)  # e.g. spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel("INFO")

[('numpy', '1.17.1'), ('pandas', '0.25.1'), ('pyspark', '2.4.4')]


In [2]:
from pyspark.ml.clustering import KMeans

# Load data

In [3]:
# Read the seeds dataset; the first line holds the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../data/seeds_dataset.txt')
)

# Row count, column count, then the inferred schema.
print(df.count(), len(df.columns), sep='\n')
df.printSchema()

# Preview the first rows.
df.show()

210
7
root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length of kernel: double (nullable = true)
 |-- width of kernel: double (nullable = true)
 |-- asymmetry coefficient: double (nullable = true)
 |-- length of groove: double (nullable = true)

+-----+---------+-----------+------------------+------------------+---------------------+------------------+
| area|perimeter|compactness|  length of kernel|   width of kernel|asymmetry coefficient|  length of groove|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|
|13.84|    13.94| 

# Get vectorized data

In [4]:
from pyspark.ml.feature import VectorAssembler

In [5]:
# List the column names; all of them are passed to the VectorAssembler below as input features.
df.columns

['area',
 'perimeter',
 'compactness',
 'length of kernel',
 'width of kernel',
 'asymmetry coefficient',
 'length of groove']

In [6]:
# Combine every column of `df` into a single vector column named 'features'.
assembler = (
    VectorAssembler()
    .setInputCols(df.columns)
    .setOutputCol('features')
)
final_data = assembler.transform(df)

In [7]:
# Check that the assembler appended the 'features' vector column to the original columns.
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length of kernel: double (nullable = true)
 |-- width of kernel: double (nullable = true)
 |-- asymmetry coefficient: double (nullable = true)
 |-- length of groove: double (nullable = true)
 |-- features: vector (nullable = true)



# Feature scaling

In [8]:
from pyspark.ml.feature import StandardScaler

In [9]:
# Scale each feature to unit standard deviation. withMean/withStd are spelled
# out at their library defaults: no centering, divide by the standard deviation.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol='features',
    outputCol='scaledFeatures',
)

In [10]:
# Fit the scaler on the assembled data so it learns the statistics of the 'features' column.
scaler_model = scaler.fit(final_data)

In [11]:
# Add the 'scaledFeatures' column. Any 'scaledFeatures' column left over from a
# previous execution of this cell is dropped first, so the cell can be re-run
# without failing on an already-existing output column; drop() is a no-op when
# the column is absent (i.e. on the first run).
final_data = scaler_model.transform(final_data.drop('scaledFeatures'))

In [13]:
# Peek at the first row to compare the raw 'features' vector with 'scaledFeatures'.
final_data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length of kernel=5.763, width of kernel=3.312, asymmetry coefficient=2.221, length of groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

# Modelling

In [14]:
# K-means with 3 clusters on the scaled feature vectors.
# The seed is pinned because the initial centers are chosen randomly: without
# it, the resulting clusters (and their label numbering) can differ between
# kernel sessions, making the outputs below non-reproducible.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=42)

In [15]:
# Fit k-means; the estimator reads the 'scaledFeatures' column of final_data.
model = kmeans.fit(final_data)

In [16]:
# Within-set sum of squared errors on the training data.
# KMeansModel.computeCost() is deprecated since Spark 2.4 and is no longer
# available in Spark 3, so the cost is read from the training summary instead
# (available because `model` was fitted in this session).
print('WSSSE')
print(model.summary.trainingCost)

WSSSE
429.11538100236294


In [17]:
# Cluster centroids; they live in the scaled feature space because the model was fit on 'scaledFeatures'.
centers = model.clusterCenters()

In [18]:
# Display the centroids: one array per cluster, one value per input feature.
centers

[array([ 4.90993613, 10.92295738, 37.28032496, 12.38401355,  8.5873381 ,
         1.7739463 , 10.35147469]),
 array([ 6.32636687, 12.38115343, 37.39222755, 13.9206997 ,  9.75485787,
         2.41428142, 12.28078861]),
 array([ 4.06818854, 10.13938448, 35.87110297, 11.81191124,  7.52564313,
         3.24585755, 10.40780927])]

In [21]:
# Attach a cluster label ('prediction') to every row and preview the labels (show() prints the first 20 rows).
model.transform(final_data).select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
+----------+
only showing top 20 rows

