In [1]:
from pyspark import SparkContext, SparkConf
from pyspark import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Local Spark configuration for this notebook.
# NOTE(review): the app name mentions "Housing" while the file loaded below is
# mexico_covid19.csv -- consider renaming for clarity.
conf = SparkConf().setAppName("Analisis Housing").setMaster("local")

# getOrCreate reuses an already-running context, so re-executing this cell in a
# live kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

# Read the CSV once, using the first line as the header. No schema inference is
# requested, so every column is read as a string.
df = sqlContext.read.csv("mexico_covid19.csv", header=True)

# Row-level RDD view of the same data, used by the cleaning step further down.
rdd = df.rdd

df.show()

+-----+-------------+-----------+----------+-----------+---------+-----+----------------+----------------+-------+-------------------+------+------+----+-----------+-------------+-------------+-------------+--------------+----------+--------+--------+----+------------+--------+------------------+--------+----+----+--------+------------+--------+--------------+--------+-------------+----------+---------+--------+-----------------+-----------+---+
|   id|FECHA_ARCHIVO|ID_REGISTRO|ENTIDAD_UM|ENTIDAD_RES|RESULTADO|DELAY|ENTIDAD_REGISTRO|         ENTIDAD|ABR_ENT|FECHA_ACTUALIZACION|ORIGEN|SECTOR|SEXO|ENTIDAD_NAC|MUNICIPIO_RES|TIPO_PACIENTE|FECHA_INGRESO|FECHA_SINTOMAS| FECHA_DEF|INTUBADO|NEUMONIA|EDAD|NACIONALIDAD|EMBARAZO|HABLA_LENGUA_INDIG|DIABETES|EPOC|ASMA|INMUSUPR|HIPERTENSION|OTRA_COM|CARDIOVASCULAR|OBESIDAD|RENAL_CRONICA|TABAQUISMO|OTRO_CASO|MIGRANTE|PAIS_NACIONALIDAD|PAIS_ORIGEN|UCI|
+-----+-------------+-----------+----------+-----------+---------+-----+----------------+-----------

In [111]:
def _scaled(raw, top):
    """Parse ``raw`` as an int and rescale it linearly so 1 -> 0.0 and ``top`` -> 1.0."""
    return (int(raw) - 1) / (top - 1)


def _row_to_features(row):
    """Convert one raw (string-valued) row into the numeric tuple behind ``clean_df``.

    The tuple order matches ``clean_columns`` below.
    """
    return (
        # RESULTADO: 0 when the raw value is the string "2", 1 for anything else.
        0 if row["RESULTADO"] == "2" else 1,
        int(row["DELAY"]),
        int(row["SEXO"]),
        int(row["TIPO_PACIENTE"]),
        # DEFUNCION: 0 when FECHA_DEF is the placeholder "9999-99-99", else 1.
        0 if row["FECHA_DEF"] == "9999-99-99" else 1,
        _scaled(row["INTUBADO"], 98),
        int(row["NEUMONIA"]),
        # EDAD is divided by a fixed 120.
        int(row["EDAD"]) / 120,
        _scaled(row["EMBARAZO"], 98),
        int(row["DIABETES"]),
        int(row["EPOC"]),
        int(row["ASMA"]),
        int(row["INMUSUPR"]),
        int(row["HIPERTENSION"]),
        int(row["OTRA_COM"]),
        int(row["CARDIOVASCULAR"]),
        int(row["OBESIDAD"]),
        int(row["RENAL_CRONICA"]),
        int(row["TABAQUISMO"]),
        _scaled(row["OTRO_CASO"], 98),
        # NOTE(review): UCI uses 99 as the upper bound while the other scaled
        # columns use 98 -- confirm this difference is intended.
        _scaled(row["UCI"], 99),
    )


clean_rdd = rdd.map(_row_to_features)

# Column names for the cleaned DataFrame, in the same order as the tuple above.
clean_columns = [
    "RESULTADO", "DELAY", "SEXO", "TIPO_PACIENTE", "DEFUNCION", "INTUBADO",
    "NEUMONIA", "EDAD", "EMBARAZO", "DIABETES", "EPOC", "ASMA", "INMUSUPR",
    "HIPERTENSION", "OTRA_COM", "CARDIOVASCULAR", "OBESIDAD", "RENAL_CRONICA",
    "TABAQUISMO", "OTRO_CASO", "UCI",
]
clean_df = clean_rdd.toDF(clean_columns)
clean_df.show()

+---------+-----+----+-------------+---------+--------------------+--------+-------------------+--------------------+--------+----+----+--------+------------+--------+--------------+--------+-------------+----------+--------------------+-------------------+
|RESULTADO|DELAY|SEXO|TIPO_PACIENTE|DEFUNCION|            INTUBADO|NEUMONIA|               EDAD|            EMBARAZO|DIABETES|EPOC|ASMA|INMUSUPR|HIPERTENSION|OTRA_COM|CARDIOVASCULAR|OBESIDAD|RENAL_CRONICA|TABAQUISMO|           OTRO_CASO|                UCI|
+---------+-----+----+-------------+---------+--------------------+--------+-------------------+--------------------+--------+----+----+--------+------------+--------+--------------+--------+-------------+----------+--------------------+-------------------+
|        0|    0|   2|            1|        0|  0.9896907216494846|       2| 0.6166666666666667|  0.9896907216494846|       1|   2|   2|       2|           1|       2|             2|       1|            2|         2|0.01030927

In [33]:
# Print the schema (column names and types) of the cleaned DataFrame.
clean_df.printSchema()

root
 |-- RESULTADO: long (nullable = true)
 |-- DELAY: long (nullable = true)
 |-- SEXO: long (nullable = true)
 |-- TIPO_PACIENTE: long (nullable = true)
 |-- DEFUNCION: long (nullable = true)
 |-- INTUBADO: long (nullable = true)
 |-- NEUMONIA: long (nullable = true)
 |-- EDAD: long (nullable = true)
 |-- EMBARAZO: long (nullable = true)
 |-- DIABETES: long (nullable = true)
 |-- EPOC: long (nullable = true)
 |-- ASMA: long (nullable = true)
 |-- INMUSUPR: long (nullable = true)
 |-- HIPERTENSION: long (nullable = true)
 |-- OTRA_COM: long (nullable = true)
 |-- CARDIOVASCULAR: long (nullable = true)
 |-- OBESIDAD: long (nullable = true)
 |-- RENAL_CRONICA: long (nullable = true)
 |-- TABAQUISMO: long (nullable = true)
 |-- OTRO_CASO: long (nullable = true)
 |-- UCI: long (nullable = true)



In [29]:
# Total number of rows in the cleaned DataFrame.
clean_df.count()

263007

In [39]:
# Show the distinct values present in the RESULTADO column.
clean_df.select('RESULTADO').distinct().show()

+---------+
|RESULTADO|
+---------+
|        0|
|        1|
+---------+



In [45]:
# Count of negative and positive results
clean_df.groupby('RESULTADO').count().show()

+---------+------+
|RESULTADO| count|
+---------+------+
|        0|160348|
|        1|102659|
+---------+------+



In [106]:

# Row count per value of the DEFUNCION flag.
clean_df.groupby('DEFUNCION').count().show()

+---------+------+
|DEFUNCION| count|
+---------+------+
|        0|258465|
|        1|  4542|
+---------+------+



In [112]:
# Count of deaths among rows with a positive result
es_positivo_y_defuncion = (clean_df['RESULTADO'] == 1) & (clean_df['DEFUNCION'] == 1)
clean_df.filter(es_positivo_y_defuncion).count()

3104

In [113]:
# Keep only the rows whose RESULTADO flag is 1.
filter_df = clean_df.filter(clean_df['RESULTADO'] == 1)

# Columns packed into the "features" vector; RESULTADO and DEFUNCION are left out.
columnas_entrada = [
    "DELAY", "SEXO", "TIPO_PACIENTE", "INTUBADO", "NEUMONIA", "EDAD", "EMBARAZO",
    "DIABETES", "EPOC", "ASMA", "INMUSUPR", "HIPERTENSION", "OTRA_COM",
    "CARDIOVASCULAR", "OBESIDAD", "RENAL_CRONICA", "TABAQUISMO", "OTRO_CASO", "UCI",
]

assembler = VectorAssembler(inputCols=columnas_entrada, outputCol="features")

# Append the assembled "features" column to the filtered rows.
a_df = assembler.transform(filter_df)
a_df.show()

+---------+-----+----+-------------+---------+--------------------+--------+-------------------+--------------------+--------+----+----+--------+------------+--------+--------------+--------+-------------+----------+--------------------+-------------------+--------------------+
|RESULTADO|DELAY|SEXO|TIPO_PACIENTE|DEFUNCION|            INTUBADO|NEUMONIA|               EDAD|            EMBARAZO|DIABETES|EPOC|ASMA|INMUSUPR|HIPERTENSION|OTRA_COM|CARDIOVASCULAR|OBESIDAD|RENAL_CRONICA|TABAQUISMO|           OTRO_CASO|                UCI|            features|
+---------+-----+----+-------------+---------+--------------------+--------+-------------------+--------------------+--------+----+----+--------+------------+--------+--------------+--------+-------------+----------+--------------------+-------------------+--------------------+
|        1|    0|   2|            2|        0|0.010309278350515464|       1| 0.4166666666666667|  0.9896907216494846|       2|   2|   2|       2|           2|     

In [121]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import StandardScaler

# Fixed seed so that both the train/test split and the network's weight
# initialisation are reproducible across kernel restarts.
SEED = 42

# 70/30 random split of the assembled rows into training and test sets.
(df_training, df_test) = a_df.randomSplit([0.7, 0.3], seed=SEED)

# Layer sizes. The input layer must be as wide as the assembled feature vector,
# so derive it from the assembler instead of hard-coding the column count;
# then two hidden layers (10 and 9 units) and 2 output classes.
capas = [len(assembler.getInputCols()), 10, 9, 2]

entrenador = MultilayerPerceptronClassifier(
        featuresCol="features", labelCol="DEFUNCION", maxIter=100, layers=capas, seed=SEED
    )

modelo = entrenador.fit(df_training)

# Score the held-out rows; adds prediction / rawPrediction / probability columns.
df_predictions = modelo.transform(df_test)
evaluador = MulticlassClassificationEvaluator(
        labelCol="DEFUNCION", predictionCol="prediction", metricName="accuracy"
    )

# NOTE(review): DEFUNCION is heavily imbalanced among positive-result rows
# (3104 deaths out of 102659 in the counts shown earlier in this notebook), so
# a high accuracy is reachable by almost always predicting 0. Consider also
# reporting f1 or per-class recall before drawing conclusions.
accuracy = evaluador.evaluate(df_predictions)

In [116]:
# Display the accuracy value computed by the evaluator on the test predictions.
accuracy

0.9698639121167405

In [124]:
# Inspect the model outputs (predicted label, raw scores, class probabilities).
df_predictions.select("prediction", "rawPrediction", "probability").show()

+----------+--------------------+--------------------+
|prediction|       rawPrediction|         probability|
+----------+--------------------+--------------------+
|       0.0|[2.90567689312543...|[0.99768727767642...|
|       0.0|[2.47267574996960...|[0.99482977636962...|
|       0.0|[2.76614696862494...|[0.99705841846153...|
|       0.0|[2.26633397639093...|[0.99235505718563...|
|       0.0|[2.54425389313278...|[0.99552868401481...|
|       0.0|[1.75639609450048...|[0.98051693088538...|
|       0.0|[2.52010781446215...|[0.99532004379647...|
|       0.0|[2.49852799566395...|[0.99512536540420...|
|       0.0|[1.98017708810495...|[0.98740868141371...|
|       0.0|[2.47678361956341...|[0.99492099173154...|
|       0.0|[2.47218568352578...|[0.99487603147406...|
|       0.0|[2.06876765428625...|[0.98840253047590...|
|       0.0|[2.45487573299570...|[0.99470639672252...|
|       0.0|[2.45487573299570...|[0.99470639672252...|
|       0.0|[2.45028485806711...|[0.99465959835116...|
|       0.

In [123]:
# Number of test rows assigned to each predicted class.
df_predictions.groupby('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|30716|
|       1.0|    8|
+----------+-----+

