## Análisis del dataset Iris y predicción de la especie de la planta a partir de sus características

In [1]:
#Generic Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Apache Spark Libraries
import pyspark
from pyspark.sql import SparkSession

#Apache Spark ML CLassifier Libraries
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier,NaiveBayes

#Apache Spark Evaluation Library
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Apache Spark Features libraries
from pyspark.ml.feature import StandardScaler,StringIndexer, VectorAssembler, VectorIndexer, OneHotEncoder

#Apache Spark Pipelin Library
from pyspark.ml import Pipeline

# Apache Spark `DenseVector`
from pyspark.ml.linalg import DenseVector

#Data Split Libraries
import sklearn
from sklearn.model_selection import train_test_split

#Garbage
import gc

### Creación de la sesión de Spark

In [2]:
# Build (or reuse) the Spark session, applying each executor setting in turn.
_executor_settings = {
    "spark.executor.memory": "1G",
    "spark.executor.cores": "4",
}
_session_builder = SparkSession.builder.appName('Apache Spark Beginner Tutorial')
for _setting, _value in _executor_settings.items():
    _session_builder = _session_builder.config(_setting, _value)
spark = _session_builder.getOrCreate()

In [3]:
# Set the log level of this session's SparkContext to INFO.
spark.sparkContext.setLogLevel('INFO')

#### Carga de la información

In [4]:
url = 'iris.csv'

# Configure a CSV reader that treats the first row as a header and infers column types.
_csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
data = _csv_reader.load(url)

# Mark the DataFrame for caching so later cells can reuse it; as the last
# expression of the cell, the returned DataFrame is also displayed.
data.cache()

DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]

### Exploración y preparación de los datos

In [5]:
# Total number of records in the dataset
data.count()

150

In [6]:
# Print the column names and data types
data.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



In [7]:
# Show the first 5 records
data.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [8]:
# Number of records per species
data.groupBy('species').count().show()

+----------+-----+
|   species|count|
+----------+-----+
| virginica|   50|
|versicolor|   50|
|    setosa|   50|
+----------+-----+



In [9]:
# Summary statistics (count, mean, stddev, min, max) for every column
data.describe().show()

+-------+------------------+-------------------+------------------+------------------+---------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|  species|
+-------+------------------+-------------------+------------------+------------------+---------+
|  count|               150|                150|               150|               150|      150|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|     null|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|     null|
|    min|               4.3|                2.0|               1.0|               0.1|   setosa|
|    max|               7.9|                4.4|               6.9|               2.5|virginica|
+-------+------------------+-------------------+------------------+------------------+---------+



In [10]:
# Index the string `species` column into a numeric `species_indx` column.
#
# `data` is reassigned from itself, so re-running this cell used to fail: the
# fitted indexer refuses to transform a DataFrame that already contains its
# output column. Dropping a stale `species_indx` first makes the cell
# re-runnable; DataFrame.drop is a no-op when the column is not present, so the
# first run behaves exactly as before.
data = data.drop('species_indx')

SIndexer = StringIndexer(inputCol='species', outputCol='species_indx')
data = SIndexer.fit(data).transform(data)

# Inspect the dataset with the new index column
data.show(5)

+------------+-----------+------------+-----------+-------+------------+
|sepal_length|sepal_width|petal_length|petal_width|species|species_indx|
+------------+-----------+------------+-----------+-------+------------+
|         5.1|        3.5|         1.4|        0.2| setosa|         0.0|
|         4.9|        3.0|         1.4|        0.2| setosa|         0.0|
|         4.7|        3.2|         1.3|        0.2| setosa|         0.0|
|         4.6|        3.1|         1.5|        0.2| setosa|         0.0|
|         5.0|        3.6|         1.4|        0.2| setosa|         0.0|
+------------+-----------+------------+-----------+-------+------------+
only showing top 5 rows



### Ingeniería de características

In [11]:
# Build a separate DataFrame with the indexed label first, followed by the
# four measurement columns.
_ordered_columns = [
    "species_indx",
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]
df = data.select(*_ordered_columns)

# Preview the reordered DataFrame
df.show(5)

+------------+------------+-----------+------------+-----------+
|species_indx|sepal_length|sepal_width|petal_length|petal_width|
+------------+------------+-----------+------------+-----------+
|         0.0|         5.1|        3.5|         1.4|        0.2|
|         0.0|         4.9|        3.0|         1.4|        0.2|
|         0.0|         4.7|        3.2|         1.3|        0.2|
|         0.0|         4.6|        3.1|         1.5|        0.2|
|         0.0|         5.0|        3.6|         1.4|        0.2|
+------------+------------+-----------+------------+-----------+
only showing top 5 rows



Ahora podemos observar que la columna de especies indexada, que es nuestra etiqueta (también conocida como *target*), está al principio del DataFrame.

In [12]:
def _to_label_and_features(row):
    """Return (first field of the row, DenseVector of the remaining fields)."""
    return (row[0], DenseVector(row[1:]))


# Define `input_data` as an RDD of (label, dense feature vector) pairs
input_data = df.rdd.map(_to_label_and_features)

Creamos un vector denso con las características de cada fila. Así, cuando más abajo creamos el nuevo DataFrame indexado, la primera columna se interpreta como la etiqueta (objetivo) y el vector denso contiene las características.

In [13]:
# Create the indexed DataFrame from `input_data`, naming its two fields
# "label" and "features".
df_indx = spark.createDataFrame(input_data, ["label", "features"])

In [14]:
# Preview the indexed DataFrame
df_indx.show(5)

+-----+-----------------+
|label|         features|
+-----+-----------------+
|  0.0|[5.1,3.5,1.4,0.2]|
|  0.0|[4.9,3.0,1.4,0.2]|
|  0.0|[4.7,3.2,1.3,0.2]|
|  0.0|[4.6,3.1,1.5,0.2]|
|  0.0|[5.0,3.6,1.4,0.2]|
+-----+-----------------+
only showing top 5 rows



### Escalado de la información

In [15]:
# Configure the scaler. withMean/withStd are spelled out at their default
# values: features are divided by their standard deviation but not centered.
stdScaler = StandardScaler(
    inputCol="features",
    outputCol="features_scaled",
    withMean=False,
    withStd=True,
)

# Fit the scaler on the indexed DataFrame to compute the per-feature statistics.
# NOTE(review): the statistics are computed on the full dataset, before the
# train/test split performed later in the notebook.
scaler = stdScaler.fit(df_indx)

# Apply the fitted scaler, adding the `features_scaled` column
df_scaled = scaler.transform(df_indx)

In [16]:
# Preview the scaled data
df_scaled.show(5)

+-----+-----------------+--------------------+
|label|         features|     features_scaled|
+-----+-----------------+--------------------+
|  0.0|[5.1,3.5,1.4,0.2]|[6.15892840883878...|
|  0.0|[4.9,3.0,1.4,0.2]|[5.9174018045706,...|
|  0.0|[4.7,3.2,1.3,0.2]|[5.67587520030241...|
|  0.0|[4.6,3.1,1.5,0.2]|[5.55511189816831...|
|  0.0|[5.0,3.6,1.4,0.2]|[6.03816510670469...|
+-----+-----------------+--------------------+
only showing top 5 rows



In [17]:
# Keep only the label and the scaled features; the unscaled `features`
# column is no longer needed.
df_scaled = df_scaled.select("label", "features_scaled")

Dividimos la información en conjuntos de entrenamiento y de prueba

In [18]:
# Split weights: 0.9 for training, 0.1 for testing; a fixed seed is passed to randomSplit.
_split_weights = [0.9, 0.1]
train_data, test_data = df_scaled.randomSplit(_split_weights, seed=12345)

In [19]:
# Preview the training data
train_data.show(5)

+-----+--------------------+
|label|     features_scaled|
+-----+--------------------+
|  0.0|[5.19282199176603...|
|  0.0|[5.31358529390013...|
|  0.0|[5.31358529390013...|
|  0.0|[5.31358529390013...|
|  0.0|[5.43434859603422...|
+-----+--------------------+
only showing top 5 rows



Construimos, entrenamos y evaluamos los modelos

In [20]:
# Display names of the classifiers, indexed by the cells below as model[0..2]
model = ['Decision Tree','Random Forest','Naive Bayes']
# Accumulates one [model name, formatted accuracy] row per evaluated classifier
model_results = []

In [21]:
# -- Decision Tree classifier --

# Instantiate the classifier, train it on the training split and predict on the test split
dtc = DecisionTreeClassifier(labelCol="label", featuresCol="features_scaled")
dtc_model = dtc.fit(train_data)
dtc_pred = dtc_model.transform(test_data)

# Evaluate accuracy on the test predictions
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
dtc_acc = evaluator.evaluate(dtc_pred)

# Record the result. Any earlier row for this model is removed first, so
# re-running the cell does not add a duplicate row to the final table.
dtc_row = [model[0], '{:.2%}'.format(dtc_acc)]
model_results[:] = [row for row in model_results if row[0] != model[0]]
model_results.append(dtc_row)

In [22]:
# -- Random Forest classifier --

# Instantiate the classifier with an explicit seed so the randomized forest is
# reproducible across kernel restarts instead of depending on the library's
# default seed; then train on the training split and predict on the test split.
rfc = RandomForestClassifier(labelCol="label", featuresCol="features_scaled", numTrees=10, seed=12345)
rfc_model = rfc.fit(train_data)
rfc_pred = rfc_model.transform(test_data)

# Evaluate accuracy on the test predictions
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
rfc_acc = evaluator.evaluate(rfc_pred)

# Record the result. Any earlier row for this model is removed first, so
# re-running the cell does not add a duplicate row to the final table.
rfc_row = [model[1], '{:.2%}'.format(rfc_acc)]
model_results[:] = [row for row in model_results if row[0] != model[1]]
model_results.append(rfc_row)

In [23]:
# -- Naive Bayes classifier --

# Instantiate the classifier, train it on the training split and predict on the test split.
# NOTE(review): the multinomial model type is normally used for count-like,
# non-negative features; these are scaled continuous measurements. Consider
# modelType="gaussian" if the installed Spark version supports it - confirm.
nbc = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="label", featuresCol="features_scaled")
nbc_model = nbc.fit(train_data)
nbc_pred = nbc_model.transform(test_data)

# Evaluate accuracy on the test predictions
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
nbc_acc = evaluator.evaluate(nbc_pred)

# Record the result. Any earlier row for this model is removed first, so
# re-running the cell does not add a duplicate row to the final table.
nbc_row = [model[2], '{:.2%}'.format(nbc_acc)]
model_results[:] = [row for row in model_results if row[0] != model[2]]
model_results.append(nbc_row)

In [24]:
# Free memory: run a garbage-collection pass (the cell displays gc.collect()'s return value)
gc.collect()

479

In [25]:
!pip install tabulate
from tabulate import tabulate
# Tabulamos la informacion obtenida
print (tabulate(model_results, headers=["Classifier Models", "Accuracy"]))

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
Classifier Models    Accuracy
-------------------  ----------
Decision Tree        90.91%
Random Forest        100.00%
Naive Bayes          100.00%


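Como podemos ver, los modelos alcanzan un altísimo nivel de precisión (accuracy). No obstante, el conjunto de prueba es muy pequeño (aproximadamente el 10 % de 150 registros), por lo que estas cifras deben interpretarse con cautela.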