In [1]:
from pyspark import SparkContext, SparkConf
from pyspark import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


conf = SparkConf().setAppName("NN_1").setMaster("local")
sc = SparkContext(conf=conf)

sqlContext = SQLContext(sc)

# Abrimos el csv
rdd = sqlContext.read.csv("mexico_covid19.csv", header=True).rdd
rdd = rdd.map(lambda x: ( 0 if x["RESULTADO"] == "2" else 1, 
                               int(x["DELAY"]), 
                               int(x["SEXO"]), 
                               int(x["TIPO_PACIENTE"]), 
                               0 if x["FECHA_DEF"] == "9999-99-99" else 1, 
                               (int(x["INTUBADO"])-1)/(98-1), 
                               int(x["NEUMONIA"]), 
                               int(x["EDAD"])/120, 
                               (int(x["EMBARAZO"])-1)/(98-1), 
                               int(x["DIABETES"]), 
                               int(x["EPOC"]), 
                               int(x["ASMA"]),
                               int(x["INMUSUPR"]), 
                               int(x["HIPERTENSION"]), 
                               int(x["OTRA_COM"]), 
                               int(x["CARDIOVASCULAR"]), 
                               int(x["OBESIDAD"]), 
                               int(x["RENAL_CRONICA"]), 
                               int(x["TABAQUISMO"]), 
                               (int(x["OTRO_CASO"])-1)/(98-1), 
                               (int(x["UCI"])-1)/(99-1) ) )


df = rdd.toDF(["RESULTADO","DELAY","SEXO","TIPO_PACIENTE","DEFUNCION","INTUBADO","NEUMONIA","EDAD","EMBARAZO",
                           "DIABETES","EPOC","ASMA","INMUSUPR","HIPERTENSION","OTRA_COM","CARDIOVASCULAR","OBESIDAD",
                           "RENAL_CRONICA","TABAQUISMO","OTRO_CASO","UCI"])

df = df.filter(df['RESULTADO'] == 1)

df.show()

+---------+-----+----+-------------+---------+--------------------+--------+-------------------+--------------------+--------+----+----+--------+------------+--------+--------------+--------+-------------+----------+--------------------+-------------------+
|RESULTADO|DELAY|SEXO|TIPO_PACIENTE|DEFUNCION|            INTUBADO|NEUMONIA|               EDAD|            EMBARAZO|DIABETES|EPOC|ASMA|INMUSUPR|HIPERTENSION|OTRA_COM|CARDIOVASCULAR|OBESIDAD|RENAL_CRONICA|TABAQUISMO|           OTRO_CASO|                UCI|
+---------+-----+----+-------------+---------+--------------------+--------+-------------------+--------------------+--------+----+----+--------+------------+--------+--------------+--------+-------------+----------+--------------------+-------------------+
|        1|    0|   2|            2|        0|0.010309278350515464|       1| 0.4166666666666667|  0.9896907216494846|       2|   2|   2|       2|           2|       2|             2|       2|            2|         2|  1.010309

In [2]:
# Test Chi2
assembler = VectorAssembler(inputCols=["DELAY","SEXO","TIPO_PACIENTE","INTUBADO","NEUMONIA","EDAD","EMBARAZO",
                           "DIABETES","EPOC","ASMA","INMUSUPR","HIPERTENSION","OTRA_COM","CARDIOVASCULAR","OBESIDAD",
                           "RENAL_CRONICA","TABAQUISMO","OTRO_CASO","UCI"], outputCol="featuresChi2")

df_chi2 = assembler.transform(df)
df_chi2 = df_chi2.select("featuresChi2", "DEFUNCION")

selector = ChiSqSelector(
    numTopFeatures=19,
    featuresCol="featuresChi2",
    labelCol="DEFUNCION",
    outputCol="featuresSelected")

df_result = selector.fit(df_chi2).transform(df_chi2)

# Dividir data en training y test
(df_training, df_test) = df_result.randomSplit([0.7, 0.3])

# Definir arquitectura de nuestra red (hiperparametro)
capas = [19, 12, 9 , 2]

# Construimos al entrenador
# Hiperparametro: maxIter
entrenador = MultilayerPerceptronClassifier(
    featuresCol="featuresSelected", labelCol="DEFUNCION", maxIter=100, layers=capas
)
# Entrenar nuestro modelo
modelo = entrenador.fit(df_training)

# Validar nuestro modelo
df_predictions = modelo.transform(df_test)
evaluador = MulticlassClassificationEvaluator(
    labelCol="DEFUNCION", predictionCol="prediction", metricName="accuracy"
)
accuracy = evaluador.evaluate(df_predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9700460077658498
