In [17]:
import seaborn as sns
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.evaluation import (BinaryClassificationEvaluator, 
                                   MulticlassClassificationEvaluator)
from pyspark.ml.classification import (LogisticRegression, 
                                       DecisionTreeClassifier, 
                                       RandomForestClassifier, 
                                       GBTClassifier, 
                                       MultilayerPerceptronClassifier)

In [3]:
spark = SparkSession.builder.appName('clasificacion_penguins').getOrCreate()
df = spark.createDataFrame(sns.load_dataset('penguins').dropna())
df.show(5)

+-------+---------+--------------+-------------+-----------------+-----------+------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|
+-------+---------+--------------+-------------+-----------------+-----------+------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|
| Adelie|Torgersen|          40.3|         18.0|            195.0|     3250.0|Female|
| Adelie|Torgersen|          36.7|         19.3|            193.0|     3450.0|Female|
| Adelie|Torgersen|          39.3|         20.6|            190.0|     3650.0|  Male|
+-------+---------+--------------+-------------+-----------------+-----------+------+
only showing top 5 rows



## Encoding y Assembler

In [6]:
# paso 1 . aplicar StringIndexer

indexer_species = StringIndexer(inputCol='species', outputCol='species_indexed')
df = indexer_species.fit(df).transform(df)

indexer_islands = StringIndexer(inputCol='island', outputCol='label')
df = indexer_islands.fit(df).transform(df)

indexer_sex = StringIndexer(inputCol='sex', outputCol='sex_indexed')
df = indexer_sex.fit(df).transform(df)

df.show(2)


+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|species_indexed|label|sex_indexed|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|            0.0|  2.0|        0.0|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|            0.0|  2.0|        1.0|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+
only showing top 2 rows



In [None]:
# Paso 2 . aplicar OneHotEncoder sobre las cateoricas de entrada en features: species_indexed, sex_indexed
encoder = OneHotEncoder(
    inputCols=['sex_indexed', 'species_indexed'],
    outputCols=['sex_onehot', 'species_onehot']
)
df = encoder.fit(df).transform(df)
df.show(2)

+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+-------------+--------------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|species_indexed|label|sex_indexed|   sex_onehot|species_onehot|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+-------------+--------------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|            0.0|  2.0|        0.0|(1,[0],[1.0])| (2,[0],[1.0])|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|            0.0|  2.0|        1.0|    (1,[],[])| (2,[0],[1.0])|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+-------------+--------------+
only showing top 2 rows



In [None]:
assembler = VectorAssembler(
    inputCols=['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex_onehot', 'species_onehot'],
    outputCol='features' #le llamamos features para que coincida con lo que piden los algoritmos
)
# df = assembler.transform(df).select('features','label') se puede hacer asi o en dos celdas
df = assembler.transform(df)
df.show(2)

+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+-------------+--------------+--------------------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|species_indexed|label|sex_indexed|   sex_onehot|species_onehot|            features|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+-------------+--------------+--------------------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|            0.0|  2.0|        0.0|(1,[0],[1.0])| (2,[0],[1.0])|[39.1,18.7,181.0,...|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|            0.0|  2.0|        1.0|    (1,[],[])| (2,[0],[1.0])|[39.5,17.4,186.0,...|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+-------------+--------------+--

In [12]:
df_to_predict = df.select('features', 'label')
df_to_predict.show(2)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[39.1,18.7,181.0,...|  2.0|
|[39.5,17.4,186.0,...|  2.0|
+--------------------+-----+
only showing top 2 rows



In [13]:
#particionamiento de datos
df_train, df_test = df_to_predict.randomSplit([0.8, 0.2], seed=42)

In [14]:
lr = LogisticRegression()
model = lr.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(4)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[35.3,18.9,187.0,...|  0.0|[0.07212777470235...|[0.35779265402679...|       0.0|
|[36.5,18.0,182.0,...|  1.0|[0.09888914177782...|[0.36476506792571...|       0.0|
|[36.7,19.3,193.0,...|  2.0|[-0.1922890272724...|[0.27058540798590...|       2.0|
|[37.8,18.3,174.0,...|  0.0|[0.61964645747517...|[0.55839875149316...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 4 rows



In [20]:
# evaluator_auc = MulticlassClassificationEvaluator(metricName='areaUnderROC') no lo tiene
evaluator_accuracy = MulticlassClassificationEvaluator(metricName='accuracy')
evaluator_f1 = MulticlassClassificationEvaluator(metricName='f1')
evaluator_precision = MulticlassClassificationEvaluator(metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(metricName='weightedRecall')

print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

accuracy 0.7285714285714285
f1 0.7085714285714286
precision 0.7103174603174602
recall 0.7285714285714286


In [21]:
rfc = RandomForestClassifier()
model = rfc.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(4)
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[35.3,18.9,187.0,...|  0.0|[7.95302810384073...|[0.39765140519203...|       0.0|
|[36.5,18.0,182.0,...|  1.0|[6.49383582821825...|[0.32469179141091...|       1.0|
|[36.7,19.3,193.0,...|  2.0|[5.72120306613334...|[0.28606015330666...|       1.0|
|[37.8,18.3,174.0,...|  0.0|[6.21449860747620...|[0.31072493037381...|       1.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 4 rows

accuracy 0.6714285714285714
f1 0.6683713485394158
precision 0.6801879372438631
recall 0.6714285714285715


In [22]:
tree = DecisionTreeClassifier()
model = tree.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(4)
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))


+--------------------+-----+---------------+--------------------+----------+
|            features|label|  rawPrediction|         probability|prediction|
+--------------------+-----+---------------+--------------------+----------+
|[35.3,18.9,187.0,...|  0.0|[21.0,21.0,9.0]|[0.41176470588235...|       0.0|
|[36.5,18.0,182.0,...|  1.0|[21.0,21.0,9.0]|[0.41176470588235...|       0.0|
|[36.7,19.3,193.0,...|  2.0|[21.0,21.0,9.0]|[0.41176470588235...|       0.0|
|[37.8,18.3,174.0,...|  0.0|[21.0,21.0,9.0]|[0.41176470588235...|       0.0|
+--------------------+-----+---------------+--------------------+----------+
only showing top 4 rows

accuracy 0.7142857142857143
f1 0.6841168467386954
precision 0.670390243902439
recall 0.7142857142857142


In [29]:
# si tenemos mas de dos labels da error
# gbtc = GBTClassifier(seed=42)
# model = gbtc.fit(df_train)
# df_pred = model.transform(df_test)
# df_pred.show(4)
# print('accuracy', evaluator_accuracy.evaluate(df_pred))
# print('f1', evaluator_f1.evaluate(df_pred))
# print('precision', evaluator_precision.evaluate(df_pred))
# print('recall', evaluator_recall.evaluate(df_pred))

In [28]:
num_features = df_train.first()['features'].size
num_labels = df.select('label').distinct().count()
mlp = MultilayerPerceptronClassifier(layers=[num_features, 32, 32, num_labels], seed=42, maxIter=20) #layers=[capa input, capas ocultas..., capa output]
model = mlp.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(4)
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[35.3,18.9,187.0,...|  0.0|[0.30848638607658...|[0.48967809759043...|       0.0|
|[36.5,18.0,182.0,...|  1.0|[0.30848638607658...|[0.48967809759043...|       0.0|
|[36.7,19.3,193.0,...|  2.0|[0.30848638607658...|[0.48967809759043...|       0.0|
|[37.8,18.3,174.0,...|  0.0|[0.30848638607658...|[0.48967809759043...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 4 rows

accuracy 0.4857142857142857
f1 0.31758241758241756
precision 0.23591836734693875
recall 0.4857142857142857
