In [1]:
from pyspark.sql import SparkSession

In [9]:
# Reuse the active Spark session if there is one; otherwise start a new one
# registered under the application name 'hack-data'.
spark = (
    SparkSession.builder
    .appName('hack-data')
    .getOrCreate()
)

In [10]:
# Load the session log. The first CSV line supplies the column names and
# Spark samples the file to infer each column's type instead of reading
# everything as strings.
data = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('hack_data.csv')
)

In [11]:
# Peek at the first record to sanity-check how the CSV was parsed.
data.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)]

In [12]:
# Show the inferred column types; only numeric columns can be fed to the
# feature vector later on.
data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [13]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

In [14]:
# List the column names so they can be copied into the assembler's input list.
data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [22]:
# Pack every numeric column into a single 'features' vector column.
# 'Location' is deliberately left out: it is a string column and is not
# used as a clustering feature here.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Session_Connection_Time', 'Bytes Transferred', 'Kali_Trace_Used',
        'Servers_Corrupted', 'Pages_Corrupted', 'WPM_Typing_Speed',
    ],
)

In [25]:
# Append the assembled 'features' vector column; the original columns are kept.
output = assembler.transform(data)

In [26]:
# Confirm the new 'features' column holds the six numeric values in order.
output.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]))]

In [27]:
from pyspark.ml.feature import StandardScaler

In [28]:
# Put all features on a comparable scale before clustering, so columns with
# large raw magnitudes do not dominate the distance computation.
# The defaults are spelled out: divide by the standard deviation, no centering.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
    withStd=True,
    withMean=False,
)

In [29]:
# Fit the scaler on the assembled vectors to learn the per-feature statistics.
scaler_model = scaler.fit(output)

In [30]:
# Apply the fitted scaler; this adds a 'scaled_features' column next to 'features'.
final_data = scaler_model.transform(output)

In [31]:
# With no argument, head() returns a single Row; compare raw vs. scaled vectors.
final_data.head()

Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]), scaled_features=DenseVector([0.5679, 1.3658, 1.9976, 1.2859, 2.2849, 5.3963]))

In [43]:
# K-means starts from randomly initialised centers, so pin the seed to make
# the fitted centers and cluster sizes repeatable on every Restart & Run All.
# Both candidate models share the seed so they differ only in k.
KMEANS_SEED = 42

# Two candidate cluster counts are compared on the scaled features.
kmeans2 = KMeans(featuresCol='scaled_features', k=2, seed=KMEANS_SEED)
kmeans3 = KMeans(featuresCol='scaled_features', k=3, seed=KMEANS_SEED)


In [44]:
# Fit both candidates on the same scaled data (k=2 first, then k=3).
model2, model3 = kmeans2.fit(final_data), kmeans3.fit(final_data)

In [45]:
# Pull out the learned centers; each is an array in scaled-feature space.
centers2, centers3 = model2.clusterCenters(), model3.clusterCenters()

In [46]:
# Print the k=2 centers, then the k=3 centers, one list per line.
print(centers2, centers3, sep='\n')

[array([1.26023837, 1.31829808, 0.99280765, 1.36491885, 2.5625043 ,
       5.26676612]), array([2.99991988, 2.92319035, 1.05261534, 3.20390443, 4.51321315,
       3.28474   ])]
[array([1.21780112, 1.37901802, 1.99757683, 1.37198977, 2.55237797,
       5.29152222]), array([2.99991988, 2.92319035, 1.05261534, 3.20390443, 4.51321315,
       3.28474   ]), array([1.30217042, 1.25830099, 0.        , 1.35793211, 2.57251009,
       5.24230473])]


In [47]:
# Cluster sizes for k=2: label every row, then count rows per predicted cluster.
(
    model2.transform(final_data)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [48]:
# Cluster sizes for k=3, for comparison with the k=2 split.
(
    model3.transform(final_data)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         2|   84|
|         0|   83|
+----------+-----+

