In [19]:
import seaborn as sns
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, Imputer, StringIndexer, StandardScaler
from pyspark.ml.evaluation import (BinaryClassificationEvaluator, 
                                   MulticlassClassificationEvaluator)
from pyspark.ml.classification import (LogisticRegression, 
                                       DecisionTreeClassifier, 
                                       RandomForestClassifier, 
                                       GBTClassifier, 
                                       MultilayerPerceptronClassifier,
                                       OneVsRest)
from pyspark.sql.types import NumericType, StringType
from pyspark.sql.functions import col

In [20]:
spark = SparkSession.builder.appName('clasificacion_penguins_preprocesados').getOrCreate()
df = spark.createDataFrame(sns.load_dataset('penguins').dropna())
df.show(5)

+-------+---------+--------------+-------------+-----------------+-----------+------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|
+-------+---------+--------------+-------------+-----------------+-----------+------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|
| Adelie|Torgersen|          40.3|         18.0|            195.0|     3250.0|Female|
| Adelie|Torgersen|          36.7|         19.3|            193.0|     3450.0|Female|
| Adelie|Torgersen|          39.3|         20.6|            190.0|     3650.0|  Male|
+-------+---------+--------------+-------------+-----------------+-----------+------+
only showing top 5 rows



In [21]:
#queremos predecir species por lo tanto elimino filas donde species dea Nan
df = df.dropna(subset=['species'])
# si estuviera en dataframe de pandas: 
# df['island'] = df['island'].fillna('other')

In [22]:
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)

## Imputers

In [23]:
numeric_cols =  [field.name for field in df.schema.fields if isinstance(field.dataType, NumericType)]
# Filtramos species porque species es la variable a predecir y ya hemos asegurado que no tiene nan
categorical_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, StringType) and field.name != 'species']

In [24]:
imputer = Imputer(
    strategy='median',
    inputCols=numeric_cols,
    outputCols=[col +'_imputed' for col in numeric_cols]
)
imputer_model = imputer.fit(df_train)
df_train = imputer_model.transform(df_train)
df_test = imputer_model.transform(df_test)
df_train.show(5)



+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+
|species|island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|bill_length_mm_imputed|bill_depth_mm_imputed|flipper_length_mm_imputed|body_mass_g_imputed|
+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+
| Adelie|Biscoe|          35.3|         18.9|            187.0|     3800.0|Female|                  35.3|                 18.9|                    187.0|             3800.0|
| Adelie|Biscoe|          35.9|         19.2|            189.0|     3800.0|Female|                  35.9|                 19.2|                    189.0|             3800.0|
| Adelie|Biscoe|          37.8|         18.3|            174.0|     3400.0|Female|                  37.8|                 18.3|   

In [25]:
# Opcion 1 para las categoricas: rellenar con un valor fijo
# al inferir el schema automáticamente nos está diciendo que la columna sex NO es nullable y no tiene nan, por lo que en realidad
# lo que está pasando es que los NaN los tiene como palabras 'NaN' texto, por tanto no sirve el fill y usamos replace:


# df_train = df_train.na.fill('other', subset=categorical_cols)
# df_test = df_test.na.fill('other', subset=categorical_cols)

# df_train = df_train.fillna('other', subset=categorical_cols)
# df_test = df_test.fillna('other', subset=categorical_cols)

# df_train = df_train.replace('NaN', 'other', subset=categorical_cols)
# df_test = df_test.replace('NaN', 'other', subset=categorical_cols)


In [26]:
# ver value_counts para ver valores mas frecuentes
df.groupBy('island').count().orderBy(col('count').desc()).show()

+---------+-----+
|   island|count|
+---------+-----+
|   Biscoe|  163|
|    Dream|  123|
|Torgersen|   47|
+---------+-----+



In [27]:
# Opcion 2: usar Imputer
# No se puede, IllegalArgumentException, requiere numéricas.
# Se haría si hemos hecho primero un StringIndexer para convertir a numéricas a índices
# imputer = Imputer(
#     strategy='mode',
#     inputCols=categorical_cols,
#     outputCols=[col + '_imputed' for col in categorical_cols]
# )
# imputer_model = imputer.fit(df_train) # fit solo sobre train para evitar fuga de datos data leakage
# df_train = imputer_model.transform(df_train)
# df_test = imputer_model.transform(df_test)

In [28]:
# 3 rellenar con la moda
island_mode = df.groupBy('island').count().orderBy(col('count').desc()).first()['island']
sex_mode = df.groupBy('sex').count().orderBy(col('count').desc()).first()['sex']

df_train = df_train.replace('NaN', island_mode, subset=['island'])
df_test = df_test.replace('NaN', island_mode, subset=['island'])

df_train = df_train.replace('NaN', sex_mode, subset=['sex'])
df_test = df_test.replace('NaN', sex_mode, subset=['sex'])

## StringIndexer + OneHotEncoder

In [29]:
#indexar 'species' columna  a predecir. Ya no le aplicamos OneHotEncoder
indexer_label = StringIndexer(inputCol='species', outputCol='label')
indexer_model = indexer_label.fit(df_train)
df_train = indexer_model.transform(df_train)
df_test = indexer_model.transform(df_test)

#indexar las otras categoricas
for categorical_col in categorical_cols:
    indexer = StringIndexer(inputCol=categorical_col, outputCol=categorical_col + '_indexed')
    model = indexer.fit(df_train)
    df_train = model.transform(df_train)
    df_test = model.transform(df_test)
    
df_train.show(3)

+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+
|species|island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|bill_length_mm_imputed|bill_depth_mm_imputed|flipper_length_mm_imputed|body_mass_g_imputed|label|island_indexed|sex_indexed|
+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+
| Adelie|Biscoe|          35.3|         18.9|            187.0|     3800.0|Female|                  35.3|                 18.9|                    187.0|             3800.0|  0.0|           0.0|        0.0|
| Adelie|Biscoe|          35.9|         19.2|            189.0|     3800.0|Female|                  35.9|                 19.2|                    189.0|             3800.0

In [30]:
encoder = OneHotEncoder(
    inputCols=[col + '_indexed' for col in categorical_cols],
    outputCols=[col + '_onehot' for col in categorical_cols]
)

model = encoder.fit(df_train)
df_train = model.transform(df_train)
df_test = model.transform(df_test)

df_train.show(3)

+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+
|species|island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|bill_length_mm_imputed|bill_depth_mm_imputed|flipper_length_mm_imputed|body_mass_g_imputed|label|island_indexed|sex_indexed|island_onehot|   sex_onehot|
+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+
| Adelie|Biscoe|          35.3|         18.9|            187.0|     3800.0|Female|                  35.3|                 18.9|                    187.0|             3800.0|  0.0|           0.0|        0.0|(2,[0],[1.0])|(1,[0],[1.0])|
| Adelie|Biscoe|          35.9|         19.2|            189

## Assembler

In [31]:
onehot = [col + '_onehot' for col in categorical_cols]
imputed = [col + '_imputed' for col in numeric_cols]
onehot + imputed
 
assembler = VectorAssembler(
    inputCols= onehot + imputed,
    outputCol='features'
)

df_train = assembler.transform(df_train)
df_test = assembler.transform(df_test)

df_train.show(2)

+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+
|species|island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|bill_length_mm_imputed|bill_depth_mm_imputed|flipper_length_mm_imputed|body_mass_g_imputed|label|island_indexed|sex_indexed|island_onehot|   sex_onehot|            features|
+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+
| Adelie|Biscoe|          35.3|         18.9|            187.0|     3800.0|Female|                  35.3|                 18.9|                    187.0|             3800.0|  0.0|           0.0|        0.0|(2,[0],[1.0])|(1,[0],[1.0]

## Scaler

In [32]:
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features'
)
model = scaler.fit(df_train)
df_train = model.transform(df_train)
df_test = model.transform(df_test)

## Modelado

In [33]:
lr = LogisticRegression(featuresCol='scaled_features')
model = lr.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(4)

+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|species|island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|bill_length_mm_imputed|bill_depth_mm_imputed|flipper_length_mm_imputed|body_mass_g_imputed|label|island_indexed|sex_indexed|island_onehot|   sex_onehot|            features|     scaled_features|       rawPrediction|         probability|prediction|
+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
| Adelie|B

In [34]:
evaluator_accuracy = MulticlassClassificationEvaluator(metricName='accuracy')
evaluator_f1 = MulticlassClassificationEvaluator(metricName='f1')
evaluator_precision = MulticlassClassificationEvaluator(metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(metricName='weightedRecall')

print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

accuracy 1.0
f1 1.0
precision 1.0
recall 1.0


In [35]:
# sin feature_scaled
lr = LogisticRegression()
model = lr.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(4)
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|species|island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|bill_length_mm_imputed|bill_depth_mm_imputed|flipper_length_mm_imputed|body_mass_g_imputed|label|island_indexed|sex_indexed|island_onehot|   sex_onehot|            features|     scaled_features|       rawPrediction|         probability|prediction|
+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
| Adelie|B

In [37]:
rfc = RandomForestClassifier()
model = rfc.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(4)
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|species|island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|bill_length_mm_imputed|bill_depth_mm_imputed|flipper_length_mm_imputed|body_mass_g_imputed|label|island_indexed|sex_indexed|island_onehot|   sex_onehot|            features|     scaled_features|       rawPrediction|         probability|prediction|
+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
| Adelie|B

In [41]:
rfc = RandomForestClassifier(featuresCol='scaled_features')
model = rfc.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(4)
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|species|island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|bill_length_mm_imputed|bill_depth_mm_imputed|flipper_length_mm_imputed|body_mass_g_imputed|label|island_indexed|sex_indexed|island_onehot|   sex_onehot|            features|     scaled_features|       rawPrediction|         probability|prediction|
+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
| Adelie|B

In [38]:
tree = DecisionTreeClassifier()
model = tree.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(4)
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+--------------------+--------------+-------------+----------+
|species|island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|bill_length_mm_imputed|bill_depth_mm_imputed|flipper_length_mm_imputed|body_mass_g_imputed|label|island_indexed|sex_indexed|island_onehot|   sex_onehot|            features|     scaled_features| rawPrediction|  probability|prediction|
+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+--------------------+--------------+-------------+----------+
| Adelie|Biscoe|          37.7|         18.7|    

In [39]:
# si tenemos mas de dos labels da error
gbtc = GBTClassifier()
ovr = OneVsRest(classifier=gbtc)
model = ovr.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(4)
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|species|island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|bill_length_mm_imputed|bill_depth_mm_imputed|flipper_length_mm_imputed|body_mass_g_imputed|label|island_indexed|sex_indexed|island_onehot|   sex_onehot|            features|     scaled_features|       rawPrediction|prediction|
+-------+------+--------------+-------------+-----------------+-----------+------+----------------------+---------------------+-------------------------+-------------------+-----+--------------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
| Adelie|Biscoe|          37.7|         18.7|            180.0|     3600.

In [40]:
num_features = df_train.first()['features'].size
num_labels = df.select('label').distinct().count()
mlp = MultilayerPerceptronClassifier(layers=[num_features, 32, 32, num_labels], seed=42, maxIter=20) #layers=[capa input, capas ocultas..., capa output]
model = mlp.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(4)
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `label` cannot be resolved. Did you mean one of the following? [`sex`, `island`, `species`, `body_mass_g`, `bill_depth_mm`].;
'Project ['label]
+- Filter atleastnnonnulls(1, species#1300)
   +- LogicalRDD [species#1300, island#1301, bill_length_mm#1302, bill_depth_mm#1303, flipper_length_mm#1304, body_mass_g#1305, sex#1306], false
