In [1]:
# Register the local Spark 2.4.3 install with findspark before importing pyspark.
# NOTE(review): presumably the `import pyspark` below only resolves after init() — confirm.
import findspark
findspark.init('/home/osboxes/spark-2.4.3-bin-hadoop2.7')
import pyspark
import os

In [2]:
# Assemble the course directory from its components, echo it, and make it the
# working directory so the relative file names used later (e.g. "hack_data.csv") resolve.
path_parts = ('/home', 'osboxes', 'CourseMaterial', 'Spark_for_Machine_Learning', 'Clustering')
myPath = os.path.join(*path_parts)
print(myPath)
os.chdir(myPath)

/home/osboxes/CourseMaterial/Spark_for_Machine_Learning/Clustering


In [3]:
# List the contents of the current working directory to confirm the data files are present.
os.listdir(os.curdir)

['Clustering_Consulting_Project.ipynb',
 'Clustering Code Along.ipynb',
 'seeds_dataset.csv',
 'seeds_dataset.txt',
 '.ipynb_checkpoints',
 'Clustering_Code_Example.ipynb',
 'Clustering_Consulting_Project_SOLUTIONS.ipynb',
 'hack_data.csv',
 'sample_kmeans_data.txt']

In [4]:
# Spark entry point and the clustering estimator used throughout the notebook.
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans

# Application name passed to the session builder.
name = 'cluster'

# Create the session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName(name)
    .getOrCreate()
)

In [6]:
# Read the session log: the first row supplies the column names and Spark
# infers each column's type from the values.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
dataset = csv_reader.csv("hack_data.csv")

# Preview the leading rows.
dataset.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

In [7]:
# Print the inferred column names, types and nullability of the loaded data.
dataset.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [8]:
# Summary statistics (count, mean, stddev, min, max) per column, used to compare
# the columns' scales and decide whether the features need scaling.

dataset.describe().show()

+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|summary|Session_Connection_Time| Bytes Transferred|   Kali_Trace_Used|Servers_Corrupted|   Pages_Corrupted|   Location|  WPM_Typing_Speed|
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|  count|                    334|               334|               334|              334|               334|        334|               334|
|   mean|     30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413|       null|57.342395209580864|
| stddev|     14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697|  3.06352633036022|       null| 13.41106336843464|
|    min|                    1.0|              10.0|                 0|              1.0|               6.0|Afghanistan|              40.0|
|    max|           

In [10]:
# List the column names so the feature list for the assembler can be picked from them.

dataset.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [11]:
# Imports for the feature-assembly step; VectorAssembler is used below to build
# the 'features' column.
# NOTE(review): Vectors is not referenced in the visible cells — confirm before removing.

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [12]:
# Columns used as clustering features. 'Location' is left out: it is the only
# string column in the schema; every column listed here is numeric.
feat_cols = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]

In [13]:
# Transformer that packs the selected feature columns into a single vector
# column named 'features'.
vec_assembler = VectorAssembler(
    outputCol='features',
    inputCols=feat_cols,
)

In [14]:
# Apply the assembler to the loaded data; the result carries the assembled
# 'features' vector column that the scaler consumes next.


final_data = vec_assembler.transform(dataset)

In [15]:
# The feature columns differ widely in scale (see the describe() output), so
# they are standardised before clustering.

from pyspark.ml.feature import StandardScaler

In [16]:
# Configure the scaler: read 'features', write 'scaledFeatures', divide each
# feature by its standard deviation (withStd) without centring it (withMean off).
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=False,
    withStd=True,
)

In [17]:
# Fit the StandardScaler on the assembled data to compute its summary statistics.

scalerModel = scaler.fit(final_data)

# Apply the fitted scaler: adds the 'scaledFeatures' column, in which each
# feature has unit standard deviation. This is the frame used for all clustering below.

cluster_final_data = scalerModel.transform(final_data)

In [18]:
# Candidate estimators for deciding between 2 and 3 clusters, both reading the
# scaled feature vectors.
# A fixed seed pins the random centroid initialisation so cluster assignments and
# WSSSE are reproducible across kernel restarts (otherwise results can vary per run).

kmeans3 = KMeans(featuresCol='scaledFeatures', k=3, seed=1)
kmeans2 = KMeans(featuresCol='scaledFeatures', k=2, seed=1)

In [19]:
# Fit both candidate estimators on the scaled data to obtain the k=3 and k=2 models.

model_k3 = kmeans3.fit(cluster_final_data)
model_k2 = kmeans2.fit(cluster_final_data)

In [20]:
# Evaluate each model's cost (reported below as the Within Set Sum of Squared
# Errors) on the same data it was trained on.
# NOTE(review): computeCost is reportedly deprecated in Spark 2.4 and removed in 3.0 —
# confirm before upgrading Spark.

wssse_k3 = model_k3.computeCost(cluster_final_data)
wssse_k2 = model_k2.computeCost(cluster_final_data)

In [21]:
# Report the WSSSE of both candidate models, separated by a 60-character rule.
report_lines = [
    "With K=3",
    "Within Set Sum of Squared Errors = " + str(wssse_k3),
    '--'*30,
    "With K=2",
    "Within Set Sum of Squared Errors = " + str(wssse_k2),
]
print(*report_lines, sep="\n")

With K=3
Within Set Sum of Squared Errors = 434.75507308487647
------------------------------------------------------------
With K=2
Within Set Sum of Squared Errors = 601.7707512676716


In [22]:
# Not much is gained from the WSSSE of two models alone: we would expect the
# WSSSE to decrease as K increases.
# We can, however, continue the analysis by looking at the drop from K=3 to K=4
# to check whether the clustering favors even or odd numbers of clusters.
# This won't be substantial, but it's worth a look.

for k in range(2, 9):
    # Use one fixed seed for every K so the sweep is reproducible and the
    # differences between rows come from K, not from random initialisation.
    kmeans = KMeans(featuresCol='scaledFeatures', k=k, seed=1)
    model = kmeans.fit(cluster_final_data)
    # Cost is evaluated on the training data itself.
    wssse = model.computeCost(cluster_final_data)
    print("With K={}".format(k))
    print("Within Set Sum of Squared Errors = " + str(wssse))
    print('--'*30)

With K=2
Within Set Sum of Squared Errors = 601.7707512676716
------------------------------------------------------------
With K=3
Within Set Sum of Squared Errors = 434.75507308487647
------------------------------------------------------------
With K=4
Within Set Sum of Squared Errors = 267.1336116887891
------------------------------------------------------------
With K=5
Within Set Sum of Squared Errors = 245.36421529748606
------------------------------------------------------------
With K=6
Within Set Sum of Squared Errors = 229.07089185394216
------------------------------------------------------------
With K=7
Within Set Sum of Squared Errors = 210.82416878734273
------------------------------------------------------------
With K=8
Within Set Sum of Squared Errors = 196.4796374094792
------------------------------------------------------------


In [23]:
# Cluster sizes for k=3: we are looking for an even split of sessions across clusters.
(
    model_k3
    .transform(cluster_final_data)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         2|   88|
|         0|   79|
+----------+-----+



In [24]:
# Cluster sizes for k=2, for comparison with the k=3 split.
(
    model_k2
    .transform(cluster_final_data)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+

