In [5]:
from pyspark.sql import SparkSession

In [6]:
# Obtain the Spark session for this notebook (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName('hackers')
    .getOrCreate()
)

In [7]:
# Load the CSV: the first row supplies the column names and Spark infers each column's type.
data = spark.read.csv(
    'data/Clustering/hack_data.csv',
    header=True,
    inferSchema=True,
)

In [8]:
# Print the column names, inferred types and nullability of the loaded DataFrame.
data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [9]:
# Peek at the first record to sanity-check the parsed values.
data.take(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)]

In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
# List the available column names (the assembler cell below picks its inputs from these).
data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [14]:
# Columns packed into the feature vector, in this order.
# 'Location' is a string column and is deliberately left out.
feature_columns = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [17]:
# Build the 'features' vector column, then keep only that column for the later steps.
assembled = assembler.transform(data)
output = assembled.select('features')

In [18]:
# Preview the assembled feature vectors (first 20 rows, long cells truncated).
output.show(n=20, truncate=True)

+--------------------+
|            features|
+--------------------+
|[8.0,391.09,1.0,2...|
|[20.0,720.99,0.0,...|
|[31.0,356.32,1.0,...|
|[2.0,228.08,1.0,2...|
|[20.0,408.5,0.0,3...|
|[1.0,390.69,1.0,2...|
|[18.0,342.97,1.0,...|
|[22.0,101.61,1.0,...|
|[15.0,275.53,1.0,...|
|[12.0,424.83,1.0,...|
|[15.0,249.09,1.0,...|
|[32.0,242.48,0.0,...|
|[23.0,514.54,0.0,...|
|[9.0,284.77,0.0,3...|
|[27.0,779.25,1.0,...|
|[12.0,307.31,1.0,...|
|[21.0,355.94,1.0,...|
|[10.0,372.65,0.0,...|
|[20.0,347.23,1.0,...|
|[22.0,456.57,0.0,...|
+--------------------+
only showing top 20 rows



### Feature scaling

In [19]:
from pyspark.ml.feature import StandardScaler

In [20]:
# Scale every feature to unit standard deviation without centring on the mean
# (withMean/withStd are spelled out here; they match the library defaults).
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withMean=False,
    withStd=True,
)

In [24]:
# Fit the scaler on the assembled features, then apply it to the same data
# and keep only the scaled vector column for clustering.
scaler_model = scaler.fit(output)
final_data = scaler_model.transform(output).select('scaledFeatures')

In [25]:
# Preview the scaled feature vectors (first 20 rows, long cells truncated).
final_data.show(n=20, truncate=True)

+--------------------+
|      scaledFeatures|
+--------------------+
|[0.56785108466505...|
|[1.41962771166263...|
|[2.20042295307707...|
|[0.14196277116626...|
|[1.41962771166263...|
|[0.07098138558313...|
|[1.27766494049636...|
|[1.56159048282889...|
|[1.06472078374697...|
|[0.85177662699757...|
|[1.06472078374697...|
|[2.27140433866020...|
|[1.63257186841202...|
|[0.63883247024818...|
|[1.91649741074455...|
|[0.85177662699757...|
|[1.49060909724576...|
|[0.70981385583131...|
|[1.41962771166263...|
|[1.56159048282889...|
+--------------------+
only showing top 20 rows



### Model fitting

In [26]:
from pyspark.ml.clustering import KMeans

In [28]:
# Two candidate clusterings of the scaled features: k=2 and k=3.
# K-means starts from randomly initialised centroids; pinning the seed makes the
# resulting clusters repeatable under Restart & Run All instead of depending on
# whatever default seed the session happens to pick.
# Note: cluster ids are arbitrary labels, so after a re-run the counts shown
# further down may appear under different prediction numbers.
KMEANS_SEED = 42
model_k2 = KMeans(featuresCol='scaledFeatures', k=2, seed=KMEANS_SEED)
model_k3 = KMeans(featuresCol='scaledFeatures', k=3, seed=KMEANS_SEED)

In [29]:
# Train both estimators on the scaled features (k=2 first, then k=3).
fitted_model_k2, fitted_model_k3 = (
    estimator.fit(final_data) for estimator in (model_k2, model_k3)
)

In [33]:
# Assign each row to one of the three clusters and show how many rows land in each.
k3_assignments = fitted_model_k3.transform(final_data)
k3_cluster_sizes = k3_assignments.groupBy('prediction').count()
k3_cluster_sizes.show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   84|
|         2|  167|
|         0|   83|
+----------+-----+



In [34]:
# Assign each row to one of the two clusters and show how many rows land in each.
k2_assignments = fitted_model_k2.transform(final_data)
k2_cluster_sizes = k2_assignments.groupBy('prediction').count()
k2_cluster_sizes.show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+

