# Curso Big Data #10 - K_Means

#### 1. Inicializamos la SparkSession


In [55]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('KMeans').getOrCreate()

#### 2.  Importamos el ddataset

In [56]:
df = spark.read.csv('C:/Users/pc/pruebas/hack_data.csv', inferSchema=True, header=True)
df = df.drop('Location')

In [50]:
df.show(5)

+-----------------------+-----------------+---------------+-----------------+---------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|            70.8|
|                   20.0|            408.5|              0|             3.57|            8.0|           71.28|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+
o

#### 3. Transformamos el dataset

In [None]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=df.columns, outputCol='features')
df = assembler.transform(df)

#### 4. Escalando las caracteristicas

In [59]:

from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
df = scaler.fit(df).transform(df)

#### 5. Creamos el modelo

In [60]:

from pyspark.ml.clustering import KMeans
kmeans = KMeans(featuresCol='scaled_features', k=2)
kmeans = kmeans.fit(df)



In [62]:
print (kmeans)

KMeansModel: uid=KMeans_df4fc99bc9f1, k=2, distanceMeasure=euclidean, numFeatures=6


#### 6. Obtenemos las predicciones

In [63]:
pdt = kmeans.transform(df)
kmeans.transform(df).show(3)

+-----------------------+-----------------+---------------+-----------------+---------------+----------------+--------------------+--------------------+----------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|WPM_Typing_Speed|            features|     scaled_features|prediction|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+--------------------+--------------------+----------+
|                    8.0|           391.09|              1|             2.96|            7.0|           72.37|[8.0,391.09,1.0,2...|[0.56785108466505...|         1|
|                   20.0|           720.99|              0|             3.04|            9.0|           69.08|[20.0,720.99,0.0,...|[1.41962771166263...|         1|
|                   31.0|           356.32|              1|             3.71|            8.0|           70.58|[31.0,356.32,1.0,...|[2.20042295307707...|         1|
+---------------

#### 7. Evaluamos el modelo (k = 2)

In [64]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Make predictions
predictions = kmeans.transform(df)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.6683623593283755


In [65]:
wssse = model.summary.trainingCost

In [66]:
print(wssse)

6914859.905711964


In [69]:

wssse = kmeans.summary.trainingCost
centers = kmeans.clusterCenters()
print(wssse)
print(centers)

601.7707512676691
[array([2.99991988, 2.92319035, 1.05261534, 3.20390443, 4.51321315,
       3.28474   ]), array([1.26023837, 1.31829808, 0.99280765, 1.36491885, 2.5625043 ,
       5.26676612])]


#### 8. Counting the number of samples in each predicted cluster (k = 2)

In [67]:

df_pred = kmeans.transform(df)
df_pred = df_pred.groupby('prediction').count()
df_pred.show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+

