In [1]:
from pyspark.sql import functions as F
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler
from pyspark.ml.evaluation import ClusteringEvaluator


In [2]:
# Load the iris measurements from GCS. The file has no header row, so the
# columns come back with Spark's default names (_c0 .. _c4).
df = spark.read.csv(
    'gs://buck_561/ass4/iris.data',
    header=False,
    inferSchema=True,
)
df.show(2, truncate=False)

                                                                                

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|_c4        |
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
+---+---+---+---+-----------+
only showing top 2 rows



In [3]:
# Read the dataset description file as plain text (one row per line of the
# file) and print up to 100 lines of it without truncation.
dfn = spark.read.text('gs://buck_561/ass4/iris.names')
dfn.show(n=100, truncate=False)

[Stage 3:>                                                          (0 + 1) / 1]

+------------------------------------------------------------------------------+
|value                                                                         |
+------------------------------------------------------------------------------+
|1. Title: Iris Plants Database                                                |
|	Updated Sept 21 by C.Blake - Added discrepency information                   |
|                                                                              |
|2. Sources:                                                                   |
|     (a) Creator: R.A. Fisher                                                 |
|     (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)               |
|     (c) Date: July, 1988                                                     |
|                                                                              |
|3. Past Usage:                                                                |
|   - Publications: too many

                                                                                

In [4]:
# Data prep: replace Spark's default column names with descriptive ones.
column_names = {
    '_c0': 'sepal_length',
    '_c1': 'sepal_width',
    '_c2': 'petal_length',
    '_c3': 'petal_width',
    '_c4': 'class',
}
for old_name, new_name in column_names.items():
    # withColumnRenamed is a no-op when old_name is absent, so re-running
    # this cell after the rename is harmless.
    df = df.withColumnRenamed(old_name, new_name)
df.show(2, truncate=False)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|class      |
+------------+-----------+------------+-----------+-----------+
|5.1         |3.5        |1.4         |0.2        |Iris-setosa|
|4.9         |3.0        |1.4         |0.2        |Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 2 rows



In [5]:
# Number of rows in the loaded DataFrame.
df.count()

150

In [6]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- class: string (nullable = true)



In [7]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# displayed without truncating the values.
df.describe().show(truncate=False)

[Stage 8:>                                                          (0 + 1) / 1]

+-------+------------------+-------------------+------------------+------------------+--------------+
|summary|sepal_length      |sepal_width        |petal_length      |petal_width       |class         |
+-------+------------------+-------------------+------------------+------------------+--------------+
|count  |150               |150                |150               |150               |150           |
|mean   |5.843333333333335 |3.0540000000000007 |3.7586666666666693|1.1986666666666672|null          |
|stddev |0.8280661279778637|0.43359431136217375|1.764420419952262 |0.7631607417008414|null          |
|min    |4.3               |2.0                |1.0               |0.1               |Iris-setosa   |
|max    |7.9               |4.4                |6.9               |2.5               |Iris-virginica|
+-------+------------------+-------------------+------------------+------------------+--------------+



                                                                                

In [8]:
# Names of the four numeric measurement columns that are assembled into the
# feature vector used for clustering.
fcols=["sepal_length", "sepal_width","petal_length","petal_width"]

In [9]:
# Combine the columns listed in `fcols` into a single vector column
# called "features".
assembler = VectorAssembler(inputCols=fcols, outputCol="features")

# transform() returns the input table with the new vector column appended.
df_vec = assembler.transform(df)
df_vec.show(5, truncate=False)

+------------+-----------+------------+-----------+-----------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|class      |features         |
+------------+-----------+------------+-----------+-----------+-----------------+
|5.1         |3.5        |1.4         |0.2        |Iris-setosa|[5.1,3.5,1.4,0.2]|
|4.9         |3.0        |1.4         |0.2        |Iris-setosa|[4.9,3.0,1.4,0.2]|
|4.7         |3.2        |1.3         |0.2        |Iris-setosa|[4.7,3.2,1.3,0.2]|
|4.6         |3.1        |1.5         |0.2        |Iris-setosa|[4.6,3.1,1.5,0.2]|
|5.0         |3.6        |1.4         |0.2        |Iris-setosa|[5.0,3.6,1.4,0.2]|
+------------+-----------+------------+-----------+-----------+-----------------+
only showing top 5 rows



In [10]:
# Function for trying different values of K
def trials(df, k_values=None, seed=1000):
    """Fit k-means for several values of k and return the best-scoring one.

    For each candidate k, a ``KMeans`` model is fitted on ``df`` with the
    given seed, ``df`` is transformed into cluster predictions, and those
    predictions are scored with a default-configured ``ClusteringEvaluator``.
    One "K = ... Silhouette Score = ..." line is printed per candidate.

    Args:
        df: DataFrame to cluster. ``KMeans`` and ``ClusteringEvaluator`` are
            used with their default column settings, so ``df`` is expected to
            carry its vectors in a column named "features".
        k_values: Iterable of cluster counts to try. Defaults to
            ``range(2, 15)``, i.e. k = 2..14.
        seed: Seed passed to ``KMeans.setSeed``. Defaults to 1000.

    Returns:
        A tuple ``(k_best, silhouette_max)``: the candidate k with the highest
        score and that score. On ties the earliest candidate wins.

    Raises:
        ValueError: If ``k_values`` is empty.
    """
    candidates = list(range(2, 15) if k_values is None else k_values)
    if not candidates:
        raise ValueError("k_values must contain at least one value")

    # The evaluator does not depend on k, so build it once for the whole sweep.
    evaluator = ClusteringEvaluator()

    # Silhouette scores can be negative. Starting the running maximum at 0
    # would discard every non-positive score and report k = 0, so start
    # from -inf instead.
    k_best = None
    silhouette_max = float("-inf")

    for k in candidates:
        # Train a k-means model and assign every row to a cluster.
        model = KMeans().setK(k).setSeed(seed).fit(df)
        predictions = model.transform(df)

        # Evaluate the clustering by computing its Silhouette score.
        silhouette_score = evaluator.evaluate(predictions)
        print("K = " + str(k) + " Silhouette Score = " + str(silhouette_score))

        if silhouette_score > silhouette_max:
            silhouette_max = silhouette_score
            k_best = k

    return k_best, silhouette_max

# Without scaling

In [11]:
# Fit k-means with k = 3 (the dataset has 3 classes) on the unscaled vectors.
kmeans = KMeans(k=3, seed=1000)
model = kmeans.fit(df_vec)
predictions = model.transform(df_vec)

# Score the resulting clustering with the Silhouette measure.
evaluator = ClusteringEvaluator()
silhouette_score = evaluator.evaluate(predictions)
print(f"Silhouette Score = {silhouette_score}")

22/04/19 04:30:01 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/04/19 04:30:01 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


Silhouette Score = 0.7342113066202725


In [12]:
# Sweep k on the unscaled feature vectors and keep the best k and its score.
k_best, silhouette_max= trials(df_vec)

K = 2 Silhouette Score = 0.8501515983265806
K = 3 Silhouette Score = 0.7342113066202725
K = 4 Silhouette Score = 0.6720731409257744
K = 5 Silhouette Score = 0.6040089708606194
K = 6 Silhouette Score = 0.6451259820011412
K = 7 Silhouette Score = 0.5315232212372373
K = 8 Silhouette Score = 0.6219498862385187
K = 9 Silhouette Score = 0.5135866324495382
K = 10 Silhouette Score = 0.5814902500936365
K = 11 Silhouette Score = 0.48596018325090645
K = 12 Silhouette Score = 0.42835659035393897
K = 13 Silhouette Score = 0.4269620576600722
K = 14 Silhouette Score = 0.4272877809180372


# With StandardScaler

In [13]:
# Fit a StandardScaler on the assembled vectors, apply it, and then expose
# the scaled vectors under the name "features" in place of the raw ones.
scale = StandardScaler(inputCol='features', outputCol='standardized')
df_scale = (
    scale.fit(df_vec)
    .transform(df_vec)
    .drop('features')
    .withColumnRenamed('standardized', 'features')
)

# Fit k-means with k = 3 (the dataset has 3 classes) on the scaled vectors.
kmeans = KMeans(k=3, seed=1000)
model = kmeans.fit(df_scale)
predictions = model.transform(df_scale)

# Score the resulting clustering with the Silhouette measure.
evaluator = ClusteringEvaluator()
silhouette_score = evaluator.evaluate(predictions)
print(f"Silhouette Score = {silhouette_score}")

Silhouette Score = 0.6441275819793945


In [14]:
# Sweep k on df_scale (the StandardScaler output when cells run in order)
# and keep the best k and its score.
k_best, silhouette_max= trials(df_scale)

K = 2 Silhouette Score = 0.7714149126311811
K = 3 Silhouette Score = 0.6441275819793945
K = 4 Silhouette Score = 0.5825498311859583
K = 5 Silhouette Score = 0.5534565400833028
K = 6 Silhouette Score = 0.5096142911786772
K = 7 Silhouette Score = 0.4643188770246435
K = 8 Silhouette Score = 0.47525309874105537
K = 9 Silhouette Score = 0.4753171390967383
K = 10 Silhouette Score = 0.4504353962133759
K = 11 Silhouette Score = 0.5416971314712636
K = 12 Silhouette Score = 0.5358515908942113
K = 13 Silhouette Score = 0.44135450301087276
K = 14 Silhouette Score = 0.42048234273967866


# With MinMaxScaler

In [15]:
# Fit a MinMaxScaler on the assembled vectors, apply it, and then expose
# the rescaled vectors under the name "features" in place of the raw ones.
scale = MinMaxScaler(inputCol='features', outputCol='standardized')
df_scale = (
    scale.fit(df_vec)
    .transform(df_vec)
    .drop('features')
    .withColumnRenamed('standardized', 'features')
)

# Fit k-means with k = 3 (the dataset has 3 classes) on the rescaled vectors.
kmeans = KMeans(k=3, seed=1000)
model = kmeans.fit(df_scale)
predictions = model.transform(df_scale)

# Score the resulting clustering with the Silhouette measure.
evaluator = ClusteringEvaluator()
silhouette_score = evaluator.evaluate(predictions)
print(f"Silhouette Score = {silhouette_score}")

Silhouette Score = 0.6959387676166829


In [16]:
# Sweep k on df_scale (the MinMaxScaler output when cells run in order)
# and keep the best k and its score.
k_best, silhouette_max= trials(df_scale)

K = 2 Silhouette Score = 0.8092838624745782
K = 3 Silhouette Score = 0.6959387676166829
K = 4 Silhouette Score = 0.619911942661107
K = 5 Silhouette Score = 0.5894686905948959
K = 6 Silhouette Score = 0.4751302682240594
K = 7 Silhouette Score = 0.4502753277038834
K = 8 Silhouette Score = 0.489959385428164
K = 9 Silhouette Score = 0.5078379665570838
K = 10 Silhouette Score = 0.46864364402220027
K = 11 Silhouette Score = 0.5096132892440227
K = 12 Silhouette Score = 0.4738223627992889
K = 13 Silhouette Score = 0.4865619119020438
K = 14 Silhouette Score = 0.42823729740858624


# Final model, without scaling

In [17]:
# Re-run the sweep on the unscaled vectors and report the winning k.
best = trials(df_vec)
k_best, silhouette_max = best
print(f"\n \n \n k = {k_best} gives the best performance, Silhouette = {silhouette_max}")


K = 2 Silhouette Score = 0.8501515983265806
K = 3 Silhouette Score = 0.7342113066202725
K = 4 Silhouette Score = 0.6720731409257744
K = 5 Silhouette Score = 0.6040089708606194
K = 6 Silhouette Score = 0.6451259820011412
K = 7 Silhouette Score = 0.5315232212372373
K = 8 Silhouette Score = 0.6219498862385187
K = 9 Silhouette Score = 0.5135866324495382
K = 10 Silhouette Score = 0.5814902500936365
K = 11 Silhouette Score = 0.48596018325090645
K = 12 Silhouette Score = 0.42835659035393897
K = 13 Silhouette Score = 0.4269620576600722
K = 14 Silhouette Score = 0.4272877809180372

 
 
 k = 2 gives the best performance, Silhouette = 0.8501515983265806
