# Árboles de Decisión y Derivados: Ejemplo 1

In [1]:
# Solo si ce corre en Google Colab
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/89/db/e18cfd78e408de957821ec5ca56de1250645b05f8523d169803d8df35a64/pyspark-3.1.2.tar.gz (212.4MB)
[K     |████████████████████████████████| 212.4MB 68kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 18.6MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=eeec1aa481f5d3e59e57d36dafc9d461386a12eb6022c691f200b7719110f980
  Stored in directory: /root/.cache/pip/wheels/40/1b/2c/30f43be2627857ab80062bef1527c0128f7b4070b6b2d02139
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [2]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Ejemplo-Arboles-1')
    .getOrCreate()
)

## Carga de los Datos

In [3]:
# Load the LIBSVM-formatted file into a DataFrame
df = spark.read.format("libsvm").load("datos-ejemplo.libsvm.txt")

# Split into training (70%) and test (30%) sets.
# The fixed seed makes the split repeatable across re-runs (for the same input
# file and partitioning), so the metrics computed later stay comparable.
df_train, df_test = df.randomSplit([0.7, 0.3], seed=42)

# Preview a few training rows
df_train.show(5)
# df_train.printSchema()  # uncomment to inspect the column types

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[95,96,97,12...|
|  0.0|(692,[98,99,100,1...|
|  0.0|(692,[100,101,102...|
|  0.0|(692,[121,122,123...|
|  0.0|(692,[122,123,124...|
+-----+--------------------+
only showing top 5 rows



## Modelos de Clasificación basados en Árboles

Se van a utilizar 3 modelos de clasificación:
* Clasificación usando árboles de decisión
* Clasificación usando Random Forest
* Clasificación usando Gradient Boosted Tree

In [4]:
from pyspark.ml.classification import (DecisionTreeClassifier,
                                       RandomForestClassifier,
                                       GBTClassifier)

# Si el problema es de regresión:
# from pyspark.ml.regression import (DecisionTreeRegressor, 
#                                    RandomForestRegressor, GBTRegressor)

In [5]:
# All three estimators read the "label" and "features" columns of the input
# DataFrame. An explicit seed is passed to each one so that repeated runs of
# the notebook train the same models.

# Decision tree (depth capped at 5, at least 1 instance per node)
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features",
                            maxDepth=5, minInstancesPerNode=1, seed=42)

# Random Forest with 100 trees
rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                            numTrees=100, seed=42)

# Gradient-boosted trees with 20 boosting iterations
gb = GBTClassifier(labelCol="label", featuresCol="features",
                   maxIter=20, seed=42)

## Entrenamiento de los modelos

In [6]:
# Fit each estimator on the training split, in the order DT, RF, GB;
# every fit() call returns the corresponding trained model.
modelo_DT, modelo_RF, modelo_GB = [
    estimador.fit(df_train) for estimador in (dt, rf, gb)
]

In [8]:
# Display the fitted decision-tree model; its repr summarises depth, node count,
# number of classes and number of features.
modelo_DT

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b38556dd121c, depth=1, numNodes=3, numClasses=2, numFeatures=692

In [11]:
# getMaxDepth() returns the configured maxDepth hyperparameter (the upper bound
# given when the estimator was built), not the depth the fitted tree actually
# reached; in the recorded run the model summary reports depth=1.
modelo_DT.getMaxDepth()

5

## Predicción

In [12]:
# Score the test split with the decision-tree model; transform() appends the
# rawPrediction, probability and prediction columns.
preds_DT = modelo_DT.transform(df_test)

# Preview the decision-tree predictions
preds_DT.show(n=5)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[122,123,148...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [28.0,0.0]|  [1.0,0.0]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 5 rows



In [13]:
# Show only a subset of the columns, with the prediction first
(
    preds_DT
    .select("prediction", "label", "features")
    .show(n=5)
)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[122,123,148...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows



In [14]:
# Score the test split with the Random Forest model
preds_RF = modelo_RF.transform(df_test)

# Preview the Random Forest predictions
preds_RF.show(n=5)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[122,123,148...|   [91.0,9.0]|[0.91,0.09]|       0.0|
|  0.0|(692,[123,124,125...|  [100.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [97.0,3.0]|[0.97,0.03]|       0.0|
|  0.0|(692,[124,125,126...|   [98.0,2.0]|[0.98,0.02]|       0.0|
|  0.0|(692,[124,125,126...|   [98.0,2.0]|[0.98,0.02]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 5 rows



In [15]:
# Score the test split with the gradient-boosted trees model
preds_GB = modelo_GB.transform(df_test)

# Preview the gradient-boosting predictions
preds_GB.show(n=5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[122,123,148...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[123,124,125...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[123,124,125...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



## Evaluación de la Predicción

In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [19]:
# Accuracy evaluator: compares the "prediction" column against "label"
evaluador = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# Accuracy of the decision tree on the test split; the error is its complement
exactitud = evaluador.evaluate(preds_DT)
print("Error DT en conjunto de prueba: {:.3f}".format(1.0 - exactitud))
exactitud

Error DT en conjunto de prueba: 0.050


0.95

In [22]:
# Accuracy of the Random Forest predictions, using the evaluator built earlier;
# the printed error is 1 - accuracy.
exactitud = evaluador.evaluate(preds_RF)
print("Error RF en conjunto de prueba: {:.3f}".format(1.0 - exactitud))
# Leave the accuracy as the cell's displayed value
exactitud

Error RF en conjunto de prueba: 0.050


0.95

In [23]:
# Accuracy of the gradient-boosting predictions, using the evaluator built
# earlier; the printed error is 1 - accuracy.
exactitud = evaluador.evaluate(preds_GB)
print("Error GB en conjunto de prueba: {:.3f}".format(1.0 - exactitud))

Error GB en conjunto de prueba: 0.050


## Importancia de los atributos

In [24]:
# Feature importances of the fitted Random Forest, returned as a sparse vector
# over the 692 features. The larger the value, the more important the feature.
vimp = modelo_RF.featureImportances
vimp

SparseVector(692, {122: 0.0007, 153: 0.0015, 156: 0.0042, 174: 0.0012, 185: 0.0014, 187: 0.002, 203: 0.0024, 205: 0.0023, 207: 0.0019, 208: 0.0004, 209: 0.0022, 212: 0.0007, 213: 0.0011, 235: 0.0024, 236: 0.0018, 243: 0.011, 244: 0.0004, 259: 0.0036, 262: 0.0094, 263: 0.0075, 270: 0.0007, 272: 0.0025, 273: 0.0007, 287: 0.0013, 290: 0.0075, 291: 0.0014, 296: 0.0006, 300: 0.0143, 301: 0.0004, 303: 0.0007, 317: 0.0008, 318: 0.0064, 319: 0.0008, 321: 0.0001, 322: 0.0035, 323: 0.0024, 326: 0.0012, 327: 0.0009, 329: 0.0063, 331: 0.0035, 342: 0.007, 345: 0.0017, 349: 0.0009, 350: 0.0208, 351: 0.0181, 354: 0.0006, 355: 0.0057, 356: 0.0065, 357: 0.0076, 358: 0.0127, 371: 0.0006, 372: 0.0112, 373: 0.0031, 374: 0.008, 377: 0.0232, 378: 0.0391, 379: 0.0083, 380: 0.0007, 383: 0.0012, 384: 0.0014, 385: 0.0065, 397: 0.0021, 400: 0.0084, 401: 0.0073, 404: 0.0005, 405: 0.0306, 406: 0.03, 407: 0.0094, 408: 0.0007, 413: 0.0077, 414: 0.0007, 416: 0.003, 427: 0.0009, 428: 0.0156, 429: 0.0217, 432: 0.0015, 

In [28]:
# Feature importances of the fitted gradient-boosting model, for comparison
# with the Random Forest ones.
modelo_GB.featureImportances

SparseVector(692, {406: 0.6465, 433: 0.2179, 434: 0.0569, 462: 0.014, 490: 0.0647})

In [29]:
import numpy as np

# Positions of the stored importance values ranked from largest to smallest:
# argsort sorts ascending, so the resulting order is reversed.
idx = np.argsort(vimp.values)[::-1]

# Feature indices ordered from most to least important
vimp.indices[idx]

array([489, 461, 434, 490, 378, 462, 405, 406, 433, 377, 512, 568, 429,
       350, 351, 524, 428, 551, 605, 456, 578, 300, 457, 495, 358, 540,
       517, 435, 372, 243, 539, 468, 407, 262, 496, 628, 511, 400, 379,
       518, 455, 374, 413, 357, 463, 290, 263, 401, 342, 356, 385, 318,
       329, 516, 355, 596, 156, 259, 331, 322, 609, 373, 416, 485, 272,
       523, 323, 235, 439, 203, 205, 209, 397, 187, 471, 579, 207, 492,
       236, 345, 662, 432, 466, 153, 291, 384, 582, 185, 287, 594, 174,
       383, 326, 436, 213, 658, 327, 427, 349, 319, 317, 270, 408, 599,
       212, 552, 273, 606, 627, 414, 303, 380, 122, 354, 460, 665, 296,
       371, 623, 601, 404, 244, 546, 570, 301, 208, 663, 493, 655, 653,
       598, 321], dtype=int32)

In [30]:
# Importance values in the same most-to-least-important order, rounded to
# three decimals
vimp.values[idx].round(3)

array([0.063, 0.059, 0.043, 0.04 , 0.039, 0.034, 0.031, 0.03 , 0.027,
       0.023, 0.022, 0.022, 0.022, 0.021, 0.018, 0.017, 0.016, 0.016,
       0.015, 0.014, 0.014, 0.014, 0.014, 0.013, 0.013, 0.012, 0.011,
       0.011, 0.011, 0.011, 0.011, 0.01 , 0.009, 0.009, 0.009, 0.009,
       0.009, 0.008, 0.008, 0.008, 0.008, 0.008, 0.008, 0.008, 0.007,
       0.007, 0.007, 0.007, 0.007, 0.006, 0.006, 0.006, 0.006, 0.006,
       0.006, 0.005, 0.004, 0.004, 0.004, 0.003, 0.003, 0.003, 0.003,
       0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
       0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
       0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
       0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
       0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
       0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
       0.001, 0.001, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   ,