### Calcula correlação letalidade x IBGE municipal indexes
(Spark)

* Inputs:
    * datasets/prepared/df_ind_ibge_complete/ (parquet)
    * datasets/prepared/df_covid_complete/ (parquet)
* Outputs:
    * datasets/correlations_full.csv (final; built from the intermediate datasets/correlations.csv and datasets/correlations_2.csv)

In [1]:
# Spark
import findspark
findspark.init()

from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql.window import Window

# Create (or reuse) the Spark session for this notebook, with a long broadcast
# timeout, Arrow conversion turned off and explicit driver/executor memory limits.
spark = (
    SparkSession.builder
    .config("spark.sql.broadcastTimeout", "360000")
    .config('spark.sql.execution.arrow.enabled', 'false')
    .config("spark.driver.memory", '14G')
    .config("spark.executor.memory", '14G')
    .config("spark.driver.maxResultSize", '4G')
    .getOrCreate()
)

In [2]:
import pandas as pd

# Remove pandas' display limits so every row and the full cell text are shown.
for _display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [3]:
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline

### 0. Constantes

In [4]:
# Root folder of the project datasets. Every input/output path in this notebook
# is built as `main_path + '<relative path>'`, so the value must end with a
# path separator.
# The location can be overridden with the DATA_DASH_COVID_DATASETS environment
# variable so the notebook is not tied to one machine's drive layout; when the
# variable is unset or empty, the original location is used.
import os

_default_main_path = 'D:/data_dash_covid/datasets/'
main_path = os.environ.get('DATA_DASH_COVID_DATASETS') or _default_main_path
if not main_path.endswith(('/', '\\')):
    # Keep plain string concatenation working for overrides given without a
    # trailing separator.
    main_path += '/'

In [5]:
# Most recent date available in the data
data_max = '2020-10-24'

### 1. Read data

In [6]:
# COVID figures per municipality on the single date `data_max`: cumulative
# cases, cumulative deaths and the fatality rate (deaths / cases, 4 decimals).
df_covid_complete = (
    spark.read.parquet(main_path + 'prepared/df_covid_complete/')
    .filter(f.col('Data') == f.lit(data_max))
    .select('Município_Cod', 'Casos_Cumul', 'Mortes_Cumul')
    # The rate is derived first, from the columns as stored; the integer casts
    # are applied only afterwards.
    .withColumn(
        'Taxa_Letalidade',
        f.round(f.col('Mortes_Cumul') / f.col('Casos_Cumul'), 4),
    )
    .withColumn('Casos_Cumul', f.col('Casos_Cumul').cast('int'))
    .withColumn('Mortes_Cumul', f.col('Mortes_Cumul').cast('int'))
)

In [7]:
# Load the prepared IBGE indicators table (parquet) from under `main_path`.
df_ind_ibge_complete = spark.read.parquet(main_path+'prepared/df_ind_ibge_complete/')

In [8]:
# Show the column names and types of the IBGE indicators table.
df_ind_ibge_complete.printSchema()

root
 |-- id_indicador: integer (nullable = true)
 |-- periodo: integer (nullable = true)
 |-- id_pesquisa: integer (nullable = true)
 |-- nome_indicador: string (nullable = true)
 |-- nome_pesquisa: string (nullable = true)
 |-- arvore_indicador: string (nullable = true)
 |-- descri_indicador: string (nullable = true)
 |-- nivel_indicador: string (nullable = true)
 |-- nome_completo_indicador: string (nullable = true)
 |-- id_cidade: integer (nullable = true)
 |-- resultado: string (nullable = true)



### 2. Join data

Covid (casos, mortes e letalidade) versus os indicadores IBGE

In [8]:
# Reshape the IBGE indicators to wide format (one row per id_cidade, one column
# per id_indicador holding the mean of `resultado`) and attach the COVID
# metrics of the same municipality through an inner join on the city code.
# The join key is dropped afterwards, so only indicator columns and the three
# COVID columns remain.
df = (
    df_ind_ibge_complete
    .select('id_cidade', 'id_indicador', 'resultado')
    .groupBy('id_cidade')
    .pivot('id_indicador')
    .agg(f.mean('resultado'))
    .withColumnRenamed('id_cidade', 'Município_Cod')
    .join(df_covid_complete, 'Município_Cod', 'inner')
    .drop('Município_Cod')
    # Every df.count() / df.stat.corr(...) call below is a separate Spark
    # action; without caching, each one would redo the pivot and the join.
    .cache()
)

In [9]:
# Sanity check: number of rows and number of columns of the joined wide table.
print(df.count(), len(df.columns))

5570 282


In [10]:
# Show the columns of the joined table (indicator ids plus the COVID metrics).
df.printSchema()

root
 |-- 5903: double (nullable = true)
 |-- 5908: double (nullable = true)
 |-- 5913: double (nullable = true)
 |-- 5924: double (nullable = true)
 |-- 5929: double (nullable = true)
 |-- 5934: double (nullable = true)
 |-- 5945: double (nullable = true)
 |-- 5950: double (nullable = true)
 |-- 5955: double (nullable = true)
 |-- 15752: double (nullable = true)
 |-- 15754: double (nullable = true)
 |-- 15755: double (nullable = true)
 |-- 15756: double (nullable = true)
 |-- 15760: double (nullable = true)
 |-- 15765: double (nullable = true)
 |-- 15769: double (nullable = true)
 |-- 15773: double (nullable = true)
 |-- 15777: double (nullable = true)
 |-- 15781: double (nullable = true)
 |-- 15785: double (nullable = true)
 |-- 15789: double (nullable = true)
 |-- 15793: double (nullable = true)
 |-- 15797: double (nullable = true)
 |-- 15801: double (nullable = true)
 |-- 15805: double (nullable = true)
 |-- 15809: double (nullable = true)
 |-- 15813: double (nullable = true)
 |-- 

### 3. Correlação para Casos, Mortes, Letalidade

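Based on the Databricks blog post [Statistical and Mathematical Functions with DataFrames in Spark](https://databricks.com/blog/2015/06/02/statistical-and-mathematical-functions-with-dataframes-in-spark.html) and the [Spark 2.2.0 ML basic statistics guide](https://spark.apache.org/docs/2.2.0/ml-statistics.html)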

In [11]:
# Correlation of every indicator column of `df` with each COVID metric.
# Result layout: dict_corr[<metric label>][<indicator column>] = coefficient.
# The key order (Mortes, Casos, Letalidade) sets the column order of the
# DataFrame built from this dict later on.
dict_corr = {}
dict_corr['Mortes'] = {}
dict_corr['Casos'] = {}
dict_corr['Letalidade'] = {}

covid_metric_cols = ['Casos_Cumul', 'Mortes_Cumul', 'Taxa_Letalidade']

for c in df.columns:
    if c in covid_metric_cols:
        # The COVID metrics are the targets, not indicators.
        continue
    print(c)
    try:
        dict_corr['Casos'][c] = df.stat.corr(c, 'Casos_Cumul')
        dict_corr['Mortes'][c] = df.stat.corr(c, 'Mortes_Cumul')
        dict_corr['Letalidade'][c] = df.stat.corr(c, 'Taxa_Letalidade')
    except Exception as err:
        # Best effort: report the indicator that failed, and why, then carry on.
        # Catching Exception rather than using a bare `except` lets a kernel
        # interrupt (KeyboardInterrupt) actually stop this long-running loop.
        print('Não foi possível calcular correlação com indicador {}: {!r}'.format(c, err))

5903
5908
5913
5924
5929
5934
5945
5950
5955
15752
15754
15755
15756
15760
15765
15769
15773
15777
15781
15785
15789
15793
15797
15801
15805
15809
15813
15817
15820
15824
15829
15833
15837
21636
21637
21638
21639
21640
21641
21642
21643
21644
21645
21646
21647
21906
21907
21910
21911
21912
21913
28120
28122
28123
28124
28125
28126
28127
28128
28129
28130
28131
28132
28133
28135
28136
28137
28139
28140
28141
28142
28143
28144
28145
28147
28148
28149
28150
28151
28152
28154
28155
28157
28158
28159
28160
28161
28162
28163
28164
28166
28167
28168
28169
28170
28171
28172
28173
28174
28231
28232
28233
28234
28235
28236
28237
28239
28240
28241
28242
28243
28244
28245
28246
28247
28249
28257
28266
28267
28268
28269
28270
28271
28280
28288
28297
28298
28299
28301
28302
28304
28305
28306
28307
28308
28309
28310
28311
28313
28314
28315
28316
28317
28318
29166
29167
29168
29169
29170
29171
29748
29749
29750
29751
29752
29753
29754
29755
29756
30255
30279
46997
46998
46999
47001
47002
47004
47006
4

In [12]:
# Turn the nested correlation dict into a table: one row per indicator id,
# one column per COVID metric, coefficients rounded to 4 decimals.
corr_values = (
    pd.DataFrame(dict_corr)
    .reset_index()
    .rename(columns={'index': 'id_indicador'})
    .round(4)
)
corr_values

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


Unnamed: 0,id_indicador,Mortes,Casos,Letalidade
0,5903,0.0332,0.0356,0.0259
1,5908,0.0404,0.0428,0.027
2,5913,0.0434,0.044,0.027
3,5924,0.0244,0.0264,0.0252
4,5929,0.0319,0.0337,0.0268
5,5934,0.0328,0.0328,0.0286
6,5945,0.0282,0.0287,0.03
7,5950,0.0399,0.0429,0.0301
8,5955,0.0299,0.0297,0.0287
9,15752,-0.0012,-0.0018,-0.0003


In [13]:
# Persist the first batch of correlations as CSV (without the pandas index).
corr_values.to_csv(main_path+'correlations.csv', index=False)

#### Refazendo para os faltantes

In [11]:
# Indicator ids handled in this second pass ("faltantes" = missing).
# NOTE(review): presumably the indicators the first correlation run did not
# cover - confirm against correlations.csv.
ids_faltantes = [76838,76839,76906,76908,76910,76912,76915,76916,76918,76920,76921,76922,76923,76924,76925,76926,76927,
                 76928,76929,76930,76931,76932,76933,76934,76936,76937,76938,76939,76940,76941,77861,77862,77863,77881,
                 77882,77887,77888,77893,77894,77899,77900,77901,77902,77903,77904,77905,77906,77907,77908,77909,77910,
                 77911,77912,77913,77914,77927,77928,78122,78159,78160,78161,78162,78163,78164,78165,78166,78167,78168,
                 78169,78170,78187,78192,82270]

In [12]:
# Second pass: correlation of each indicator listed in `ids_faltantes` with
# each COVID metric. This starts from a fresh dict, so any results held in
# `dict_corr` from an earlier cell are discarded.
# Result layout: dict_corr[<metric label>][<indicator id as str>] = coefficient.
dict_corr = {}
dict_corr['Mortes'] = {}
dict_corr['Casos'] = {}
dict_corr['Letalidade'] = {}

for i in ids_faltantes:
    # Columns of `df` are named after the indicator id, as strings.
    c = str(i)
    print(c)
    try:
        dict_corr['Casos'][c] = df.stat.corr(c, 'Casos_Cumul')
        dict_corr['Mortes'][c] = df.stat.corr(c, 'Mortes_Cumul')
        dict_corr['Letalidade'][c] = df.stat.corr(c, 'Taxa_Letalidade')
    except Exception as err:
        # Best effort: report the indicator that failed, and why, then carry on.
        # Catching Exception rather than using a bare `except` lets a kernel
        # interrupt (KeyboardInterrupt) actually stop this long-running loop.
        print('Não foi possível calcular correlação com indicador {}: {!r}'.format(c, err))

76838
76839
76906
76908
76910
76912
76915
76916
76918
76920
76921
76922
76923
76924
76925
76926
76927
76928
76929
76930
76931
76932
76933
76934
76936
76937
76938
76939
76940
76941
77861
77862
77863
77881
77882
77887
77888
77893
77894
77899
77900
77901
77902
77903
77904
77905
77906
77907
77908
77909
77910
77911
77912
77913
77914
77927
77928
78122
78159
78160
78161
78162
78163
78164
78165
78166
78167
78168
78169
78170
78187
78192
82270


In [49]:
# Tabulate the second batch of correlations: one row per indicator id,
# one column per COVID metric, coefficients rounded to 4 decimals.
corr_values_2 = (
    pd.DataFrame(dict_corr)
    .reset_index()
    .rename(columns={'index': 'id_indicador'})
    .round(4)
)
corr_values_2

Unnamed: 0,id_indicador,Mortes,Casos,Letalidade
0,76838,,,
1,76839,,,
2,76906,,,
3,76908,,,
4,76910,,,
5,76912,,,
6,76915,,,
7,76916,,,
8,76918,,,
9,76920,,,


In [50]:
# Persist the second batch of correlations as its own CSV (without the index).
# NOTE(review): this round(4) repeats the rounding already applied when
# corr_values_2 was built.
corr_values_2 = corr_values_2.round(4)
corr_values_2.to_csv(main_path+'correlations_2.csv', index=False)

#### Junta em um só arquivo

In [51]:
# Merge the two batches of correlations into a single table.
# The first batch is read back from CSV, where pandas would infer id_indicador
# as an integer, while corr_values_2 holds it as a string (its ids come from
# str(i) dict keys). Reading the column as str gives the merged column a single
# type, so a uniqueness check on id_indicador can detect an id that appears in
# both batches.
corr_values_1 = pd.read_csv(main_path+'correlations.csv', dtype={'id_indicador': str})
# ignore_index gives the merged frame one continuous 0..n-1 index instead of
# two overlapping ranges.
corr_values_full = pd.concat([corr_values_1, corr_values_2], ignore_index=True)

In [52]:
# Shape of the merged table, then the number of distinct indicator ids
# (it equals the row count when no id value is repeated).
print(corr_values_full.shape)
corr_values_full['id_indicador'].nunique()

(280, 4)


280

In [55]:
# Write the merged correlations table as CSV (without the pandas index).
corr_values_full.to_csv(main_path+'correlations_full.csv', index=False)

In [56]:
# Display the merged correlations table.
corr_values_full

Unnamed: 0,id_indicador,Mortes,Casos,Letalidade
0,5903,0.0332,0.0356,0.0259
1,5908,0.0404,0.0428,0.027
2,5913,0.0434,0.044,0.027
3,5924,0.0244,0.0264,0.0252
4,5929,0.0319,0.0337,0.0268
5,5934,0.0328,0.0328,0.0286
6,5945,0.0282,0.0287,0.03
7,5950,0.0399,0.0429,0.0301
8,5955,0.0299,0.0297,0.0287
9,15752,-0.0012,-0.0018,-0.0003
