# Clustering (Agrupamiento): Ejemplo 1

La implementación de k-means en MLlib incluye una versión paralelizada del método de inicialización <a href="http://en.wikipedia.org/wiki/K-means%2B%2B">k-means++</a>, llamada <a href="http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf">k-means||</a>.




In [1]:
# Solo necesario si se usa Google Colab
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/89/db/e18cfd78e408de957821ec5ca56de1250645b05f8523d169803d8df35a64/pyspark-3.1.2.tar.gz (212.4MB)
[K     |████████████████████████████████| 212.4MB 72kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 18.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=e627809fbca1643bcd96410a66c7895ed0d206357eeca40611f591ebe4aad03a
  Stored in directory: /root/.cache/pip/wheels/40/1b/2c/30f43be2627857ab80062bef1527c0128f7b4070b6b2d02139
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans

# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName('Ejemplo1_clustering')
    .getOrCreate()
)

In [4]:
# Load the data: a LIBSVM-formatted text file read into a DataFrame
df0 = spark.read.load("/content/datos_kmeans.txt", format="libsvm")

# Display the rows without truncating the column contents
df0.show(truncate=False)

+-----+-------------------------+
|label|features                 |
+-----+-------------------------+
|0.0  |(3,[],[])                |
|1.0  |(3,[0,1,2],[0.1,0.1,0.1])|
|2.0  |(3,[0,1,2],[0.2,0.2,0.2])|
|3.0  |(3,[0,1,2],[9.0,9.0,9.0])|
|4.0  |(3,[0,1,2],[9.1,9.1,9.1])|
|5.0  |(3,[0,1,2],[9.2,9.2,9.2])|
+-----+-------------------------+



In [6]:
# Clustering needs no labels: keep only the "features" column and expose it
# under the name "atributos"
df = df0.select('features').withColumnRenamed('features', 'atributos')
df.show(truncate=False)

+-------------------------+
|atributos                |
+-------------------------+
|(3,[],[])                |
|(3,[0,1,2],[0.1,0.1,0.1])|
|(3,[0,1,2],[0.2,0.2,0.2])|
|(3,[0,1,2],[9.0,9.0,9.0])|
|(3,[0,1,2],[9.1,9.1,9.1])|
|(3,[0,1,2],[9.2,9.2,9.2])|
+-------------------------+



### Entrenamiento

In [7]:
# k-means estimator with 2 clusters (try changing the value of K to 3 or 4)
nclusteres = 2

# The seed is used for the initialization of the centroids
kmeans = KMeans(featuresCol='atributos', k=nclusteres, seed=1)
type(kmeans)

pyspark.ml.clustering.KMeans

In [8]:
# Train the model: fit the k-means estimator on the "atributos" DataFrame
modelo = kmeans.fit(dataset=df)

In [10]:
# Inspect the class of the fitted object returned by kmeans.fit()
type(modelo)

pyspark.ml.clustering.KMeansModel

In [11]:
# Evaluate the clustering with the sum of squared errors (SSE) within the
# clusters, taken from the training summary of the fitted model
sse = modelo.summary.trainingCost

plantilla = "Suma de errores cuadráticos dentro de la clase: {:.3f}"
print(plantilla.format(sse))

Suma de errores cuadráticos dentro de la clase: 0.120


In [12]:
# Retrieve the cluster centers computed by the model
centros = modelo.clusterCenters()

# Show them first as a single list ...
print("Centros del clúster: ", centros)

# ... and then one center per line
print("Centros del clúster: ")
for centro in centros:
    print(centro)

Centros del clúster:  [array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]
Centros del clúster: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]


In [13]:
# Predictions for the training set (the cluster each row belongs to),
# as stored in the model's training summary
predicciones_entrenamiento = modelo.summary.predictions
predicciones_entrenamiento.show()

+--------------------+----------+
|           atributos|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



### Predicción

In [14]:
# Apply the fitted model to data to obtain a cluster assignment per row.
# Here the training DataFrame is reused, but it could just as well be a
# separate test set.
resultados = modelo.transform(dataset=df)
resultados.show()

+--------------------+----------+
|           atributos|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+

