# Curso Big Data #9 - Decision Tree, Random Forest y Gradient-Boosted Trees

#### 1. Inicializamos la sesión de Spark

In [2]:
import os

from pyspark.sql import SparkSession
# Obtain a Spark session registered under the application name 'dt_rf_gbt'
# (getOrCreate returns the existing session when one is already running).
spark = (
    SparkSession.builder
    .appName('dt_rf_gbt')
    .getOrCreate()
)

#### 2. Cargamos el dataset

In [3]:
# Location of the dog-food CSV. Set the DOG_FOOD_CSV environment variable to
# point at your own copy of the file; when it is unset, the original local
# path is used so existing setups keep working unchanged.
DATA_PATH = os.environ.get('DOG_FOOD_CSV', 'C:/Users/pc/pruebas/dog_food.csv')

# header=True takes the column names from the first row; inferSchema=True asks
# Spark to infer each column's type instead of reading everything as strings.
df = spark.read.csv(DATA_PATH, inferSchema=True, header=True)

In [5]:
# Preview the first five rows to check that the file was parsed as expected.
df.show(n=5)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
+---+---+----+---+-------+
only showing top 5 rows



In [6]:
# Print summary statistics (count, mean, stddev, min, max) for every column.
(
    df
    .describe()
    .show()
)

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



#### 3. Convertimos el dataset en características y etiquetas

In [7]:
from pyspark.ml.feature import VectorAssembler

# Choose the feature columns by excluding the label by name instead of
# assuming it is the last column, so a reordered CSV cannot silently put the
# label into the feature vector.
label_col = 'Spoiled'
feature_cols = [name for name in df.columns if name != label_col]

# Pack the feature columns into a single vector column named 'features', then
# keep only that vector and the label for training.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
final_df = assembler.transform(df).select('features', label_col)

#### 4. Inicializamos los modelos

In [9]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier

# Fixed seed so that any random sampling done by the estimators -- and hence
# the feature importances inspected later -- is repeatable across sessions.
SEED = 42

# All three classifiers predict the 'Spoiled' column; the features column is
# left at its default name. The random forest is limited to 3 trees.
dtc = DecisionTreeClassifier(labelCol='Spoiled', seed=SEED)
rfc = RandomForestClassifier(labelCol='Spoiled', numTrees=3, seed=SEED)
gbtc = GBTClassifier(labelCol='Spoiled', seed=SEED)

#### 5. Entrenamos los modelos

In [10]:

# Train each classifier on the same assembled (features, label) data:
# decision tree first, then random forest, then gradient-boosted trees.
dtc_model = dtc.fit(dataset=final_df)
rfc_model = rfc.fit(dataset=final_df)
gbtc_model = gbtc.fit(dataset=final_df)

#### 6. Seleccionamos las mejores características

In [12]:
# Decision tree: print a label, then let the notebook display the importance
# vector as the cell's result.
print('DTC:', end='')
dtc_importances = dtc_model.featureImportances
dtc_importances

DTC:

SparseVector(4, {1: 0.0019, 2: 0.9832, 3: 0.0149})

In [13]:
# Random forest: print a label, then let the notebook display the importance
# vector as the cell's result.
print('RFC:', end='')
rfc_importances = rfc_model.featureImportances
rfc_importances

RFC:

SparseVector(4, {0: 0.0287, 1: 0.034, 2: 0.9048, 3: 0.0325})

In [14]:
# Gradient-boosted trees: print a label, then let the notebook display the
# importance vector as the cell's result.
print('GBTC:', end='')
gbtc_importances = gbtc_model.featureImportances
gbtc_importances

GBTC:

SparseVector(4, {0: 0.0296, 1: 0.0383, 2: 0.8286, 3: 0.1034})

In [15]:
# Show the first 20 rows of the raw data.
# NOTE(review): presumably shown here to relate the importance indices above
# to the column names -- confirm.
df.show(n=20)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows

