# Spark Machine Learning

Modelo de Clasificación binaria para tratar de predecir si los costos médicos de una persona serán superiores a 50.000 dólares o no.

## Load the dataset

In [0]:
%fs head --maxBytes=1024 databricks-datasets/adult/adult.data

Because the dataset does not include column names, create a schema to assign column names and datatypes.

In [0]:
schema = """`age` DOUBLE,
`workclass` STRING,
`fnlwgt` DOUBLE,
`education` STRING,
`education_num` DOUBLE,
`marital_status` STRING,
`occupation` STRING,
`relationship` STRING,
`race` STRING,
`sex` STRING,
`capital_gain` DOUBLE,
`capital_loss` DOUBLE,
`hours_per_week` DOUBLE,
`native_country` STRING,
`income` STRING"""

dataset = spark.read.csv("/databricks-datasets/adult/adult.data", schema=schema)

In [0]:
display(dataset)

age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
37.0,Private,284582.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,<=50K
49.0,Private,160187.0,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,<=50K
52.0,Self-emp-not-inc,209642.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,45.0,United-States,>50K
31.0,Private,45781.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084.0,0.0,50.0,United-States,>50K
42.0,Private,159449.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178.0,0.0,40.0,United-States,>50K


In [0]:
#Review if there is any None, NULL & Empty String Literal Values

from pyspark.sql.functions import *
col_null_cnt =  dataset.select([count(when(col(c).contains('None') | col(c).contains('NULL') | col(c).contains('?') | (col(c) == '')|  (col(c).isNull()) | isnan(c), c)).alias(c) for c in dataset.columns])

display(col_null_cnt)

age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,1836,0,0,0,0,1843,0,0,0,0,0,0,583,0


In [0]:

from pyspark.sql.functions import col,when
df2=dataset.select([when(col(c).contains('?'),None).otherwise(col(c)).alias(c) for c in dataset.columns])
df2.display()

age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
37.0,Private,284582.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,<=50K
49.0,Private,160187.0,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,<=50K
52.0,Self-emp-not-inc,209642.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,45.0,United-States,>50K
31.0,Private,45781.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084.0,0.0,50.0,United-States,>50K
42.0,Private,159449.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178.0,0.0,40.0,United-States,>50K


In [0]:
dataset = df2.na.drop("any")
dataset.display()

age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
37.0,Private,284582.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,<=50K
49.0,Private,160187.0,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,<=50K
52.0,Self-emp-not-inc,209642.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,45.0,United-States,>50K
31.0,Private,45781.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084.0,0.0,50.0,United-States,>50K
42.0,Private,159449.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178.0,0.0,40.0,United-States,>50K


Randomly split data into training and test sets, and set seed for reproducibility.

It's best to split the data before doing any preprocessing. This allows the test dataset to more closely simulate new data when we evaluate the model.

In [0]:
trainDF, testDF = dataset.randomSplit([0.8, 0.2], seed=42)
print(trainDF.cache().count()) # Cache because accessing training data multiple times
print(testDF.count())

24162
6000


Review the data.

In [0]:
display(trainDF)

age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
17.0,Federal-gov,99893.0,11th,7.0,Never-married,Adm-clerical,Not-in-family,Black,Female,0.0,1602.0,40.0,United-States,<=50K
17.0,Local-gov,32124.0,9th,5.0,Never-married,Other-service,Own-child,Black,Male,0.0,0.0,9.0,United-States,<=50K
17.0,Local-gov,148194.0,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,0.0,12.0,United-States,<=50K
17.0,Local-gov,170916.0,10th,6.0,Never-married,Protective-serv,Own-child,White,Female,0.0,1602.0,40.0,United-States,<=50K
17.0,Local-gov,173497.0,11th,7.0,Never-married,Prof-specialty,Own-child,Black,Male,0.0,0.0,15.0,United-States,<=50K
17.0,Local-gov,182070.0,11th,7.0,Never-married,Other-service,Own-child,White,Female,0.0,0.0,16.0,United-States,<=50K
17.0,Local-gov,192387.0,9th,5.0,Never-married,Other-service,Own-child,White,Male,0.0,0.0,45.0,United-States,<=50K
17.0,Local-gov,195262.0,11th,7.0,Never-married,Craft-repair,Own-child,White,Male,0.0,0.0,35.0,United-States,<=50K
17.0,Local-gov,244856.0,11th,7.0,Never-married,Prof-specialty,Own-child,White,Female,0.0,0.0,40.0,United-States,<=50K
17.0,Local-gov,246308.0,11th,7.0,Never-married,Prof-specialty,Own-child,White,Female,0.0,0.0,20.0,Puerto-Rico,<=50K


What's the distribution of the number of `hours_per_week`?

In [0]:
display(trainDF.select("hours_per_week").summary())

summary,hours_per_week
count,24162.0
mean,40.89185497889248
stddev,12.039386710998391
min,1.0
25%,40.0
50%,40.0
75%,45.0
max,99.0


How about `education` status?

In [0]:
display(trainDF
        .groupBy("education")
        .count()
        .sort("count", ascending=False))

education,count
HS-grad,7904
Some-college,5366
Bachelors,4039
Masters,1290
Assoc-voc,1035
11th,854
Assoc-acdm,803
10th,668
7th-8th,449
Prof-school,426


## Transformadores, estimadores y tuberías

Tres conceptos importantes en el aprendizaje automático de MLlib que se ilustran en este notebook son **Transformers**, **estimadores** y **Pipelines**.

- **Transformer**: Toma un DataFrame como entrada y devuelve un nuevo DataFrame. Los transformadores no aprenden ningún parámetro de los datos y simplemente aplican transformaciones basadas en reglas para preparar los datos para el entrenamiento del modelo o generar predicciones usando un modelo MLlib entrenado. Llamas a un transformador con un método `.transform ()`.

- **Estimador**: aprende los parámetros del DataFrame a través de un método `.fit ()` y devuelve un modelo, que es un transformador.

- **Pipeline**: combina varios pasos en un solo flujo de trabajo que se puede ejecutar fácilmente. La creación de un modelo de aprendizaje automático generalmente implica configurar muchos pasos diferentes e iterar sobre ellos. Los pipelines  ayudan a automatizar este proceso.

## Preprocesamiento de características

El objetivo de este cuaderno es construir un modelo que prediga el nivel de "ingresos" a partir de las características incluidas en el conjunto de datos (nivel de educación, estado civil, ocupación, etc.). El primer paso es manipular, o preprocesar, las características para que estén en el formato que requiere MLlib.

### Convertir variables categóricas en numéricas

Algunos algoritmos de aprendizaje automático, como la regresión lineal y logística, requieren funciones numéricas. El conjunto de datos incluye características categóricas como educación, ocupación y estado civil.

El siguiente bloque de código ilustra cómo usar `StringIndexer` y` OneHotEncoder` para convertir variables categóricas en un conjunto de variables numéricas que solo toman los valores 0 y 1.

- `StringIndexer` convierte una columna de valores de cadena en una columna de índices de etiquetas. 
- 'OneHotEncoder` mapea una columna de índices de categoría a una columna de vectores binarios, con como máximo un "1" en cada fila que indica el índice de categoría para esa fila.

La codificación one-hot en Spark es un proceso de dos pasos. Primero usa StringIndexer, seguido de OneHotEncoder. El siguiente bloque de código define StringIndexer y OneHotEncoder pero aún no lo aplica a ningún dato.

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

categoricalCols = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex"]

# The following two lines are estimators. They return functions that we will later apply to transform the dataset.
stringIndexer = StringIndexer(inputCols=categoricalCols, outputCols=[x + "Index" for x in categoricalCols]) 
encoder = OneHotEncoder(inputCols=stringIndexer.getOutputCols(), outputCols=[x + "OHE" for x in categoricalCols]) 

# The label column ("income") is also a string value - it has two possible values, "<=50K" and ">50K". 
# Convert it to a numeric value using StringIndexer.
labelToIndex = StringIndexer(inputCol="income", outputCol="label")

In [0]:
stringIndexerModel = stringIndexer.fit(trainDF)
display(stringIndexerModel.transform(trainDF))

age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,workclassIndex,educationIndex,marital_statusIndex,occupationIndex,relationshipIndex,raceIndex,sexIndex
17.0,Federal-gov,99893.0,11th,7.0,Never-married,Adm-clerical,Not-in-family,Black,Female,0.0,1602.0,40.0,United-States,<=50K,5.0,5.0,1.0,3.0,1.0,1.0,1.0
17.0,Local-gov,32124.0,9th,5.0,Never-married,Other-service,Own-child,Black,Male,0.0,0.0,9.0,United-States,<=50K,2.0,10.0,1.0,5.0,2.0,1.0,0.0
17.0,Local-gov,148194.0,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,0.0,12.0,United-States,<=50K,2.0,5.0,1.0,3.0,2.0,0.0,1.0
17.0,Local-gov,170916.0,10th,6.0,Never-married,Protective-serv,Own-child,White,Female,0.0,1602.0,40.0,United-States,<=50K,2.0,7.0,1.0,11.0,2.0,0.0,1.0
17.0,Local-gov,173497.0,11th,7.0,Never-married,Prof-specialty,Own-child,Black,Male,0.0,0.0,15.0,United-States,<=50K,2.0,5.0,1.0,1.0,2.0,1.0,0.0
17.0,Local-gov,182070.0,11th,7.0,Never-married,Other-service,Own-child,White,Female,0.0,0.0,16.0,United-States,<=50K,2.0,5.0,1.0,5.0,2.0,0.0,1.0
17.0,Local-gov,192387.0,9th,5.0,Never-married,Other-service,Own-child,White,Male,0.0,0.0,45.0,United-States,<=50K,2.0,10.0,1.0,5.0,2.0,0.0,0.0
17.0,Local-gov,195262.0,11th,7.0,Never-married,Craft-repair,Own-child,White,Male,0.0,0.0,35.0,United-States,<=50K,2.0,5.0,1.0,0.0,2.0,0.0,0.0
17.0,Local-gov,244856.0,11th,7.0,Never-married,Prof-specialty,Own-child,White,Female,0.0,0.0,40.0,United-States,<=50K,2.0,5.0,1.0,1.0,2.0,0.0,1.0
17.0,Local-gov,246308.0,11th,7.0,Never-married,Prof-specialty,Own-child,White,Female,0.0,0.0,20.0,Puerto-Rico,<=50K,2.0,5.0,1.0,1.0,2.0,0.0,1.0


###VectorAssembler
Combina todas las columnas de características en un solo vector de características.
La mayoría de los algoritmos MLlib requieren una sola columna de características como entrada. Cada fila de esta columna contiene un vector de puntos de datos correspondiente al conjunto de características utilizadas para la predicción.

MLlib proporciona el transformador `VectorAssembler` para crear una única columna vectorial a partir de una lista de columnas.

In [0]:
from pyspark.ml.feature import VectorAssembler

# This includes both the numeric columns and the one-hot encoded binary vector columns in our dataset.
numericCols = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
assemblerInputs = [c + "OHE" for c in categoricalCols] + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

##Definir el modelo

Utilizaremos Regresión logística (https://spark.apache.org/docs/latest/ml-classification-regression.html#logistic-regression).

In [0]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="label", regParam=1.0)

##Construir el Pipeline

Un Pipeline es una lista ordenada de transformadores y estimadores. Puede definir una canalización para automatizar y garantizar la repetibilidad de las transformaciones que se aplicarán a un conjunto de datos. En este paso, definimos la canalización y luego la aplicamos al conjunto de datos de prueba.

Similar a lo que vimos con `StringIndexer`, un` Pipeline` es un estimador. El método `pipeline.fit ()` devuelve un `PipelineModel`, que es un transformador.

In [0]:
from pyspark.ml import Pipeline

# Define the pipeline based on the stages created in previous steps.
pipeline = Pipeline(stages=[stringIndexer, encoder, labelToIndex, vecAssembler, lr])

# Define the pipeline model.
pipelineModel = pipeline.fit(trainDF)

# Apply the pipeline model to the test dataset.
predDF = pipelineModel.transform(testDF)

Muestra las predicciones del modelo. La columna `features` es un vector disperso, que suele ser el caso después de la codificación one-hot, porque hay muchos valores 0.

In [0]:
display(predDF.select("features", "label", "prediction", "probability"))

features,label,prediction,probability
"Map(vectorType -> sparse, length -> 56, indices -> List(2, 13, 22, 32, 42, 45, 50, 51, 52, 55), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 17.0, 39815.0, 6.0, 25.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.8970638362534651, 0.10293616374653491))"
"Map(vectorType -> sparse, length -> 56, indices -> List(2, 11, 22, 38, 42, 45, 49, 50, 51, 52, 55), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 17.0, 175587.0, 7.0, 30.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.8597185394385873, 0.14028146056141266))"
"Map(vectorType -> sparse, length -> 56, indices -> List(2, 11, 22, 31, 42, 45, 49, 50, 51, 52, 55), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 17.0, 191910.0, 7.0, 20.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.8690531322816111, 0.13094686771838893))"
"Map(vectorType -> sparse, length -> 56, indices -> List(2, 11, 22, 30, 42, 45, 50, 51, 52, 55), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 17.0, 308901.0, 7.0, 15.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.8933498835524672, 0.10665011644753275))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 13, 22, 31, 42, 45, 50, 51, 52, 55), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 17.0, 27032.0, 6.0, 12.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.8939251021928951, 0.10607489780710488))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 13, 22, 32, 42, 45, 50, 51, 52, 55), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 17.0, 31007.0, 6.0, 30.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9004148302147796, 0.09958516978522036))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 11, 22, 30, 42, 45, 50, 51, 52, 55), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 17.0, 38611.0, 7.0, 23.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.8956390762137646, 0.1043609237862354))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 11, 22, 32, 42, 45, 50, 51, 52, 55), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 17.0, 47425.0, 7.0, 15.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9048381283851223, 0.09516187161487766))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 17, 22, 31, 42, 45, 50, 51, 52, 55), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 17.0, 56986.0, 8.0, 18.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.8824699502013702, 0.1175300497986298))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 13, 22, 32, 42, 45, 49, 50, 51, 52, 55), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 17.0, 57324.0, 6.0, 30.0))",0.0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.8902003689521937, 0.10979963104780632))"


## Evaluar el modelo

El comando `display` tiene una opción de curva ROC incorporada.

In [0]:
display(pipelineModel.stages[-1], predDF.drop("prediction", "rawPrediction", "probability"), "ROC")

False Positive Rate,True Positive Rate,Threshold
0.0,0.0,0.7025534511002796
0.0,0.03125,0.7025534511002796
0.0,0.0625,0.5122270895769052
0.0,0.09375,0.5008113784173951
0.0,0.125,0.4717456833182906
0.0,0.15625,0.4593269949455256
0.0,0.1875,0.456475672273857
0.0114942528735632,0.1875,0.4440375208206403
0.0114942528735632,0.21875,0.4400094108321785
0.0229885057471264,0.21875,0.4277504850181302


Para evaluar el modelo, usamos el `BinaryClassificationEvaluator` para evaluar el área bajo la curva ROC y el` MulticlassClassificationEvaluator` para evaluar la precisión.

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

bcEvaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
print(f"Area under ROC curve: {bcEvaluator.evaluate(predDF)}")

mcEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print(f"Accuracy: {mcEvaluator.evaluate(predDF)}")

Area under ROC curve: 0.8854775712113089
Accuracy: 0.7645


## Ajuste de hiperparámetros

MLlib proporciona métodos para facilitar el ajuste de hiperparámetros y la validación cruzada.
- Para el ajuste de hiperparámetros, `ParamGridBuilder` le permite definir una búsqueda sobre un conjunto de hiperparámetros del modelo.
- Para la validación cruzada, `CrossValidator` le permite especificar un estimador (la canalización que se aplicará al conjunto de datos de entrada), un evaluador, un espacio de hiperparámetros y el número de iteraciones o folds que se utilizarán para la validación cruzada.

`ParamGridBuilder` y` CrossValidator` para ajustar el modelo. Aquí usaremos tres valores para `regParam` y tres para` elasticNetParam`, para un total de 3 x 3 = 9 combinaciones de hiperparámetros para que `CrossValidator` las examine.

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

#Definimos la matrix de hiperparámetros "paramGrid"
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .build())

Siempre que llame a `CrossValidator` en MLlib, Databricks rastrea automáticamente todas las ejecuciones usando [MLflow] (https://mlflow.org/). Puede usar la interfaz de usuario de MLflow para comparar el rendimiento de cada modelo.

Usamos la canalización que creamos como estimador.

In [0]:
# Create a 3-fold CrossValidator
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=bcEvaluator, numFolds=3, parallelism = 4)

# Run cross validations. This step takes a few minutes and returns the best model found from the cross validation.
cvModel = cv.fit(trainDF)

## Predicciones y evaluación del rendimiento del modelo
Se utilizó el mejor modelo identificado por la validación cruzada para hacer predicciones en el conjunto de datos de prueba y luego evalué el rendimiento del modelo utilizando el área bajo la curva ROC.

In [0]:
# Use the model identified by the cross-validation to make predictions on the test dataset
cvPredDF = cvModel.transform(testDF)

# Evaluate the model's performance based on area under the ROC curve and accuracy 
print(f"Area under ROC curve: {bcEvaluator.evaluate(cvPredDF)}")
print(f"Accuracy: {mcEvaluator.evaluate(cvPredDF)}")

Area under ROC curve: 0.903068663917924
Accuracy: 0.8395


## Predicts Visualization with SQL

In [0]:
cvPredDF.createOrReplaceTempView("finalPredictions")

In [0]:
%sql
SELECT occupation, prediction, count(*) AS count
FROM finalPredictions
GROUP BY occupation, prediction
ORDER BY occupation

occupation,prediction,count
Adm-clerical,0.0,695
Adm-clerical,1.0,57
Armed-Forces,0.0,2
Craft-repair,1.0,60
Craft-repair,0.0,681
Exec-managerial,0.0,398
Exec-managerial,1.0,402
Farming-fishing,0.0,170
Farming-fishing,1.0,11
Handlers-cleaners,1.0,3


In [0]:
%sql
SELECT age, prediction, count(*) AS count
FROM finalPredictions
GROUP BY age, prediction
ORDER BY age

age,prediction,count
17.0,0.0,50
18.0,0.0,74
19.0,0.0,110
20.0,0.0,124
21.0,0.0,114
21.0,1.0,1
22.0,0.0,130
22.0,1.0,2
23.0,0.0,161
24.0,0.0,162
