# Clustering

## Basic

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session used by the basic K-means example.
spark = (
    SparkSession.builder
    .appName("clustering_basic")
    .getOrCreate()
)

24/06/11 18:14:37 WARN Utils: Your hostname, agusrichard.local resolves to a loopback address: 127.0.0.1; using 192.168.0.104 instead (on interface en0)
24/06/11 18:14:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/11 18:14:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/06/11 18:14:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load the sample K-means points, which are stored in LIBSVM format.
# The feature count is given explicitly so Spark does not have to make an
# extra pass over the file to infer it; the vectors in this dataset have
# size 3.
df = (
    spark.read.format("libsvm")
    .option("numFeatures", "3")
    .load("./files/sample_kmeans_data.txt")
)
df.show()

24/06/11 18:16:38 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
                                                                                

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+


In [4]:
from pyspark.ml.clustering import KMeans

In [5]:
# Fit a two-cluster K-means model on the sample points, using a fixed seed.
model = KMeans(
    k=2,
    seed=21,
).fit(df)

24/06/11 18:17:47 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [10]:
# Display the K-means training cost recorded in the fitted model's summary
# (sum of squared distances from each training point to its nearest centroid).
model.summary.trainingCost

0.11999999999994547

## Real

In [11]:
# getOrCreate() returns the session that is already running in this kernel
# (Spark logs a warning about it), so the app name requested here is not
# applied to that existing session.
spark = (
    SparkSession.builder
    .appName('clustering_real')
    .getOrCreate()
)

24/06/11 18:26:37 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [12]:
# Read the seeds measurements: the first row supplies the column names and
# the column types are inferred from the data.
df = spark.read.csv(
    './files/seeds_dataset.csv',
    inferSchema=True,
    header=True,
)
df.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|             5.175|
|14.38|    14.21|     0.8951|             5.386|             3.312|   2.4619999999999997|             4.956|
|14.69|    14.49|  

In [13]:
from pyspark.ml.feature import VectorAssembler

In [14]:
# List the column names; all of them are used as clustering features below.
df.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [15]:
# Pack every non-vector column of `df` into a single `features` vector column.
# Filtering on dtype and dropping any earlier `features` column keeps this
# cell re-runnable: on a second execution, previously assembled/scaled vector
# columns are neither fed back in as inputs nor clash with the output column.
# (DataFrame.drop is a no-op when the column does not exist yet.)
feature_cols = [name for name, dtype in df.dtypes if dtype != 'vector']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
df = assembler.transform(df.drop('features'))

In [16]:
# Preview the data with the newly assembled `features` column.
df.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|[13.84,13.94,0.89...|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355| 

In [17]:
from pyspark.ml.feature import StandardScaler

In [18]:
# Fit a StandardScaler on the assembled vectors. withMean=True turns on
# mean-centring in addition to the scaler's default settings. Note that
# `scaler` holds the *fitted* model here, not the estimator.
scaler = StandardScaler(
    inputCol='features',
    outputCol='features_scaled',
    withMean=True,
).fit(df)

In [19]:
# Apply the fitted scaler; this appends the `features_scaled` column to `df`.
df = scaler.transform(df)

In [20]:
# Preview the data with both the raw and the scaled feature vectors.
df.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|     features_scaled|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|[0.14175903742014...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|[0.01116135575161...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|[-0.1916087289442...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|     

In [21]:
# Cluster the standardised seed measurements into three groups.
# The seed is pinned (same value as the other K-means fits in this notebook)
# so that centroid initialisation, and therefore the cluster numbering, sizes
# and cost, are repeatable across kernel restarts.
model = KMeans(k=3, featuresCol='features_scaled', seed=21).fit(df)

In [22]:
# Display the training cost of the k=3 model fitted on the scaled features.
model.summary.trainingCost

428.60820118724456

In [23]:
# Display the size of each cluster as reported by the training summary.
model.summary.clusterSizes

[72, 67, 71]

In [25]:
# Display the fitted centroids. The model was trained on `features_scaled`,
# so the coordinates are in the standardised feature space, not raw units.
model.clusterCenters()

[array([-1.02779666, -1.00424915, -0.96260496, -0.89554512, -1.08299564,
         0.693148  , -0.62331915]),
 array([ 1.25368596,  1.25895795,  0.55912833,  1.23493193,  1.1620751 ,
        -0.04511088,  1.28922727]),
 array([-0.14078309, -0.16963724,  0.44853463, -0.25719987,  0.00164301,
        -0.66034122, -0.58449646])]

In [26]:
# Show the cluster index assigned to the first rows of the dataset.
(
    model.transform(df)
    .select('prediction')
    .show()
)

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
+----------+


## Project

In [114]:
# Obtain the Spark session for the project section; getOrCreate() reuses the
# running session when one already exists in this kernel.
spark = (
    SparkSession.builder
    .appName('clustering_project')
    .getOrCreate()
)

In [115]:
# Read the hack-session records: the header row supplies the column names and
# the column types are inferred from the data.
df = spark.read.csv(
    './files/hack_data.csv',
    inferSchema=True,
    header=True,
)
df.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

In [116]:
# Summary statistics (count, mean, stddev, min, max) for every column.
df.describe().show()

[Stage 309:>                                                        (0 + 1) / 1]

+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|summary|Session_Connection_Time| Bytes Transferred|   Kali_Trace_Used|Servers_Corrupted|   Pages_Corrupted|   Location|  WPM_Typing_Speed|
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|  count|                    334|               334|               334|              334|               334|        334|               334|
|   mean|     30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413|       NULL|57.342395209580864|
| stddev|     14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697|  3.06352633036022|       NULL| 13.41106336843464|
|    min|                    1.0|              10.0|                 0|              1.0|               6.0|Afghanistan|              40.0|
|    max|           

                                                                                

In [117]:
# Count how many sessions were recorded per location.
(
    df.groupBy('Location')
    .count()
    .show()
)

+--------------------+-----+
|            Location|count|
+--------------------+-----+
|            Anguilla|    1|
|            Paraguay|    2|
|               Macao|    2|
|Heard Island and ...|    2|
|               Yemen|    1|
|             Tokelau|    2|
|              Sweden|    3|
|French Southern T...|    3|
|            Kiribati|    1|
|              Guyana|    2|
|         Philippines|    3|
|            Malaysia|    2|
|           Singapore|    1|
|United States Vir...|    6|
|              Turkey|    1|
|      Western Sahara|    2|
|              Malawi|    2|
|                Iraq|    3|
|Northern Mariana ...|    3|
|             Germany|    1|
+--------------------+-----+


In [118]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [119]:
# Disabled experiment: index the string `Location` column.
# Left commented out because `Location` is dropped from `df` further down
# instead of being encoded.
# indexer = StringIndexer(inputCol='Location', outputCol='location_index').fit(df)
# df = indexer.transform(df)
# df.show()

In [120]:
# Disabled experiment: one-hot encode the indexed location.
# It depends on the `location_index` column produced by the (also disabled)
# StringIndexer cell, so it cannot be enabled on its own.
# encoder = OneHotEncoder(inputCol='location_index', outputCol='location_encoded').fit(df)
# df = encoder.transform(df)
# df.show()

In [122]:
# Remove the string `Location` column so that only numeric columns remain
# for the feature vector assembled in the next step.
df = df.drop('Location')
df.show()

+-----------------------+-----------------+---------------+-----------------+---------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|            70.8|
|                   20.0|            408.5|              0|             3.57|            8.0|           71.28|
|                    1.0|           390.69|              1|             2.79|            9.0|           71.57|
|

In [123]:
# Pack every non-vector column of `df` into a single `features` vector column.
# Filtering on dtype and dropping any earlier `features` column keeps this
# cell re-runnable: on a second execution, previously assembled/scaled vector
# columns are neither fed back in as inputs nor clash with the output column.
# (DataFrame.drop is a no-op when the column does not exist yet.)
feature_cols = [name for name, dtype in df.dtypes if dtype != 'vector']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
df = assembler.transform(df.drop('features'))
df.show()

+-----------------------+-----------------+---------------+-----------------+---------------+----------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|WPM_Typing_Speed|            features|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+--------------------+
|                    8.0|           391.09|              1|             2.96|            7.0|           72.37|[8.0,391.09,1.0,2...|
|                   20.0|           720.99|              0|             3.04|            9.0|           69.08|[20.0,720.99,0.0,...|
|                   31.0|           356.32|              1|             3.71|            8.0|           70.58|[31.0,356.32,1.0,...|
|                    2.0|           228.08|              1|             2.48|            8.0|            70.8|[2.0,228.08,1.0,2...|
|                   20.0|            408.5|              0|             3.57

In [124]:
# Standardise the assembled vectors. withMean=True turns on mean-centring in
# addition to the scaler's default settings. Here `scaler` is the unfitted
# estimator; it is fitted and applied to the same DataFrame in one step.
scaler = StandardScaler(
    inputCol='features',
    outputCol='features_scaled',
    withMean=True,
)
df = scaler.fit(df).transform(df)

In [125]:
# Preview the data with both the raw and the scaled feature vectors.
df.show()

+-----------------------+-----------------+---------------+-----------------+---------------+----------------+--------------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|WPM_Typing_Speed|            features|     features_scaled|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+--------------------+--------------------+
|                    8.0|           391.09|              1|             2.96|            7.0|           72.37|[8.0,391.09,1.0,2...|[-1.5622280401844...|
|                   20.0|           720.99|              0|             3.04|            9.0|           69.08|[20.0,720.99,0.0,...|[-0.7104514131868...|
|                   31.0|           356.32|              1|             3.71|            8.0|           70.58|[31.0,356.32,1.0,...|[0.07034382822759...|
|                    2.0|           228.08|              1|             2.48|     

In [126]:
# Candidate model with three clusters, fitted on the scaled features with a
# fixed seed.
model_3 = KMeans(
    featuresCol='features_scaled',
    k=3,
    seed=21,
).fit(df)

                                                                                

In [127]:
# Training cost of the three-cluster model, for comparison with k=2 below.
model_3.summary.trainingCost

434.1492898715821

In [128]:
# Candidate model with two clusters, fitted on the same scaled features and
# with the same seed as the three-cluster model.
model_2 = KMeans(
    featuresCol='features_scaled',
    k=2,
    seed=21,
).fit(df)

In [129]:
# Training cost of the two-cluster model, for comparison with k=3 above.
model_2.summary.trainingCost

601.7707512676687

In [130]:
# Score the data with the two-cluster model; this adds a `prediction` column
# holding each row's assigned cluster index.
result = model_2.transform(df)

In [131]:
# Preview the rows together with their k=2 cluster assignment.
result.show()

+-----------------------+-----------------+---------------+-----------------+---------------+----------------+--------------------+--------------------+----------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|WPM_Typing_Speed|            features|     features_scaled|prediction|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+--------------------+--------------------+----------+
|                    8.0|           391.09|              1|             2.96|            7.0|           72.37|[8.0,391.09,1.0,2...|[-1.5622280401844...|         0|
|                   20.0|           720.99|              0|             3.04|            9.0|           69.08|[20.0,720.99,0.0,...|[-0.7104514131868...|         0|
|                   31.0|           356.32|              1|             3.71|            8.0|           70.58|[31.0,356.32,1.0,...|[0.07034382822759...|         0|
|               

In [132]:
# Count how many sessions fall into each cluster under the k=2 model.
(
    result.groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+


In [133]:
# Count how many sessions fall into each cluster under the k=3 model, for
# comparison with the k=2 split.
(
    model_3.transform(df)
    .groupBy('prediction')
    .count()
    .show()
)



+----------+-----+
|prediction|count|
+----------+-----+
|         1|   84|
|         2|  167|
|         0|   83|
+----------+-----+


                                                                                