In [44]:
from pyspark.sql import SparkSession

In [45]:
spark = SparkSession.builder.appName('clusters').getOrCreate()

In [46]:
dataset = spark.read.csv('hack_data.csv',header=True,inferSchema=True)
dataset.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

In [47]:
dataset.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [48]:
dataset.count()

334

In [49]:
len(dataset.columns)

7

In [50]:
dataset.describe().show()

+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|summary|Session_Connection_Time| Bytes Transferred|   Kali_Trace_Used|Servers_Corrupted|   Pages_Corrupted|   Location|  WPM_Typing_Speed|
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|  count|                    334|               334|               334|              334|               334|        334|               334|
|   mean|     30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413|       null|57.342395209580864|
| stddev|     14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697|  3.06352633036022|       null| 13.41106336843464|
|    min|                    1.0|              10.0|                 0|              1.0|               6.0|Afghanistan|              40.0|
|    max|           

In [51]:
dataset.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [52]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

In [53]:
feat_cols = ['Session_Connection_Time', 'Bytes Transferred', 'Kali_Trace_Used',
             'Servers_Corrupted', 'Pages_Corrupted','WPM_Typing_Speed']

In [54]:
assembler = VectorAssembler(inputCols=feat_cols,outputCol='features')

In [55]:
final_data = assembler.transform(dataset=dataset)

In [56]:
from pyspark.ml.feature import StandardScaler

In [57]:
scaler = StandardScaler(inputCol='features',outputCol='scaledFeatures',withStd=True,withMean=False)

In [58]:
scaler_model = scaler.fit(dataset=final_data)

In [59]:
cluster_final_data = scaler_model.transform(dataset=final_data)

In [60]:
cluster_final_data.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]), scaledFeatures=DenseVector([0.5679, 1.3658, 1.9976, 1.2859, 2.2849, 5.3963]))]

In [64]:
cluster_final_data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



In [65]:
kmeans3 = KMeans(featuresCol='scaledFeatures',k=3)
kmeans2 = KMeans(featuresCol='scaledFeatures',k=2)

In [67]:
model_k3 = kmeans3.fit(dataset=cluster_final_data)
model_k2 = kmeans2.fit(dataset=cluster_final_data)

In [68]:
wssse3 = model_k3.summary.trainingCost
wssse3

434.1492898715821

In [69]:
wssse2 = model_k2.summary.trainingCost
wssse2

601.7707512676691

In [71]:
print("With K=3")
print("Within Set Sum of Squared Errors = " + str(wssse3))
print('--'*30)
print("With K=2")
print("Within Set Sum of Squared Errors = " + str(wssse2))

With K=3
Within Set Sum of Squared Errors = 434.1492898715821
------------------------------------------------------------
With K=2
Within Set Sum of Squared Errors = 601.7707512676691


In [74]:
model_k3.transform(cluster_final_data).groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   83|
|         2|   84|
|         0|  167|
+----------+-----+



In [75]:
model_k2.transform(cluster_final_data).groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [73]:
for i in range(2,9):
    kmeans = KMeans(featuresCol='scaledFeatures',k=i)
    model = kmeans.fit(cluster_final_data)
    wssse = model.summary.trainingCost
    print("With K={}".format(i))
    print("Within Set Sum of Squared Errors = " + str(wssse))
    print('--'*30)

With K=2
Within Set Sum of Squared Errors = 601.7707512676691
------------------------------------------------------------
With K=3
Within Set Sum of Squared Errors = 434.1492898715821
------------------------------------------------------------
With K=4
Within Set Sum of Squared Errors = 412.9214065360449
------------------------------------------------------------
With K=5
Within Set Sum of Squared Errors = 246.62403145571344
------------------------------------------------------------
With K=6
Within Set Sum of Squared Errors = 227.26195642543374
------------------------------------------------------------
With K=7
Within Set Sum of Squared Errors = 213.03083199553384
------------------------------------------------------------
With K=8
Within Set Sum of Squared Errors = 199.6114894560158
------------------------------------------------------------
