### Data Preparation
(Spark)

* Inputs:
    * /datasets/raw/brazil_covid19_cities.csv
    * /datasets/raw/Lista_Municípios_com_IBGE_Brasil_Versao_CSV.csv
    * /datasets/pesquisas_ibge_cidades/df_id_ibge/ (parquet)
* Outputs:
    * /datasets/prepared/df_covid_complete/ (parquet)
    * /datasets/prepared/pd_df_covid_complete.csv (CSV, `;` separator)

In [1]:
# Spark
import findspark
findspark.init()

from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql.window import Window

# Create the notebook's Spark session, or reuse one that is already running.
# Options are applied in the same order as before:
#   - a very long broadcast timeout,
#   - Arrow-based conversion switched off,
#   - 14G for driver and executor memory, 4G cap on results collected to the driver.
spark = (
    SparkSession.builder
    .config("spark.sql.broadcastTimeout", "360000")
    .config('spark.sql.execution.arrow.enabled', 'false')
    .config("spark.driver.memory", '14G')
    .config("spark.executor.memory", '14G')
    .config("spark.driver.maxResultSize", '4G')
    .getOrCreate()
)

In [2]:
import os
from itertools import chain

import pandas as pd

### 0. Constantes

In [3]:
# Root folder of the project datasets. Every input/output path in this notebook
# is built as `main_path + '<relative path>'`, so the value must end with '/'.
#
# The DATA_DASH_COVID_PATH environment variable overrides the location so the
# notebook is not tied to a single machine; when it is unset (or empty) the
# original hard-coded folder is used.
main_path = os.environ.get('DATA_DASH_COVID_PATH') or 'D:/data_dash_covid/datasets/'
# Normalise to exactly one trailing '/', whatever the override looked like.
main_path = main_path.rstrip('/\\') + '/'

### 1. Read data

In [4]:
# IBGE uses both 6-digit and 7-digit municipality codes; load the lookup table
# that maps one onto the other (the "DE-PARA").
munic_codes_csv = main_path + 'raw/Lista_Municípios_com_IBGE_Brasil_Versao_CSV.csv'

br_cod_munic = (
    spark.read.csv(munic_codes_csv, sep=';', header=True, encoding='ISO-8859-1')
    .select(
        f.col('IBGE').alias('Município_Cod_6'),
        f.col('IBGE7').alias('Município_Cod_7'),
    )
)

In [5]:
# Daily cumulative COVID-19 figures per municipality.
# Only the four columns used downstream are kept, renamed to the notebook's
# Portuguese naming; the municipality code is cast to int.
# NOTE(review): no schema is supplied or inferred, so 'Data', 'Casos_Cumul' and
# 'Mortes_Cumul' keep whatever type the CSV reader assigns (string by default).
covid_csv = main_path + 'raw/brazil_covid19_cities.csv'

df = (
    spark.read.csv(covid_csv, header=True)
    .select(
        f.col('date').alias('Data'),
        f.col('code').cast('int').alias('Município_Cod_6'),
        f.col('cases').alias('Casos_Cumul'),
        f.col('deaths').alias('Mortes_Cumul'),
    )
)

In [6]:
# IBGE identifiers (state, region, city): source column -> name used in this notebook.
# The dict order is also the column order of the resulting DataFrame.
ibge_renames = {
    'id_uf': 'UF_Cod',
    'sigla_uf': 'UF',
    'nome_uf': 'UF_Nome',
    'id_regiao': 'Região_Cod',
    'nome_regiao': 'Região',
    'id_cidade': 'Município_Cod_7',
    'nome_cidade': 'Município',
}

# Keep one row per distinct combination of the selected identifier columns.
df_id_ibge = (
    spark.read.parquet(main_path + 'pesquisas_ibge_cidades/df_id_ibge/')
    .select([f.col(source).alias(target) for source, target in ibge_renames.items()])
    .distinct()
)

In [7]:
# Step 1: translate the 6-digit code of each COVID row into the 7-digit code.
# Left join: rows without a match are kept, with a null 7-digit code.
df_with_cod_7 = df.join(br_cod_munic, on='Município_Cod_6', how='left')

# Step 2: attach state / region / city attributes through the 7-digit code,
# then keep only that code, under the shorter name 'Município_Cod'.
df = (
    df_with_cod_7
    .join(df_id_ibge, on='Município_Cod_7', how='left')
    .drop('Município_Cod_6')
    .withColumnRenamed('Município_Cod_7', 'Município_Cod')
)

### 2. Dimensões tempo

In [8]:
# Calendar attributes derived from the report date.
# The two *_Nome columns hold the names produced by date_format; they are
# translated to Portuguese in a later cell.
data_col = f.col('Data')

df = (
    df
    .withColumn('Mês', f.month(data_col))
    .withColumn('Mês_Nome', f.date_format(data_col, 'MMMM'))
    .withColumn('Dia_Semana', f.dayofweek(data_col))
    .withColumn('Dia_Semana_Nome', f.date_format(data_col, 'EEEE'))
    .withColumn('Semana_Ano', f.weekofyear(data_col))
)

In [9]:
# English -> Portuguese month names.
# All twelve months are listed: a month missing from this mapping would turn
# 'Mês_Nome' into null when the mapping is applied, which silently affected
# any data dated outside March-October.
month_dict = {
    "January": "Janeiro",
    "February": "Fevereiro",
    "March": "Março",
    "April": "Abril",
    "May": "Maio",
    "June": "Junho",
    "July": "Julho",
    "August": "Agosto",
    "September": "Setembro",
    "October": "Outubro",
    "November": "Novembro",
    "December": "Dezembro",
}

# Spark map literal built from the flattened key, value, key, value, ... sequence.
mapping_month = f.create_map([f.lit(x) for x in chain(*month_dict.items())])

In [10]:
# English -> Portuguese weekday names.
weekday_dict = {
    "Sunday": "Domingo",
    "Monday": "Segunda-feira",
    "Tuesday": "Terça-feira",
    "Wednesday": "Quarta-feira",
    "Thursday": "Quinta-feira",
    "Friday": "Sexta-feira",
    "Saturday": "Sábado",
}

# Spark map literal built from the flattened key, value, key, value, ... sequence.
mapping_weekday = f.create_map(
    [f.lit(name) for name in chain.from_iterable(weekday_dict.items())]
)

In [11]:
# Replace the month and weekday names with their Portuguese equivalents by
# looking each current value up in the corresponding map literal.
mes_nome_pt = mapping_month[f.col('Mês_Nome')]
dia_semana_nome_pt = mapping_weekday[f.col('Dia_Semana_Nome')]

df = (
    df
    .withColumn('Mês_Nome', mes_nome_pt)
    .withColumn('Dia_Semana_Nome', dia_semana_nome_pt)
)

### 3. Novos casos e novas mortes

O dado original exibe o acumulado até a data, fazer a diferença entre hoje e ontem para obter os novos casos e mortes

In [12]:
# One window per municipality, ordered by report date.
w = Window.partitionBy('Município_Cod').orderBy('Data')

# Daily increment = today's cumulative value minus the previous row's value.
# The first row of each municipality has no previous row, so its increment is null.
novos_casos = f.col('Casos_Cumul') - f.lag('Casos_Cumul').over(w)
novas_mortes = f.col('Mortes_Cumul') - f.lag('Mortes_Cumul').over(w)

df = (
    df
    .withColumn('Novos_Casos', novos_casos)
    .withColumn('Novas_Mortes', novas_mortes)
)

### 4. Export final data

In [13]:
# Persist the prepared dataset as parquet, replacing any previous export.
df.write.mode('overwrite').parquet(main_path + 'prepared/df_covid_complete/')

In [None]:
# Also export a ';'-separated CSV copy. toPandas() collects the whole
# DataFrame to the driver before pandas writes the file.
csv_path = main_path + 'prepared/pd_df_covid_complete.csv'
df.toPandas().to_csv(path_or_buf=csv_path, sep=';', header=True, index=False)

In [14]:
# Preview: bring only the first 10 rows to the driver as a pandas DataFrame.
df.limit(10).toPandas()

Unnamed: 0,Município_Cod,Data,Casos_Cumul,Mortes_Cumul,UF_Cod,UF,UF_Nome,Região_Cod,Região,Município,Mês,Mês_Nome,Dia_Semana,Dia_Semana_Nome,Semana_Ano,Novos_Casos,Novas_Mortes
0,1302108,2020-03-27,0,0,13,AM,Amazonas,1,Norte,Japurá,3,Março,6,Sexta-feira,13,,
1,1302108,2020-03-28,0,0,13,AM,Amazonas,1,Norte,Japurá,3,Março,7,Sábado,13,0.0,0.0
2,1302108,2020-03-29,0,0,13,AM,Amazonas,1,Norte,Japurá,3,Março,1,Domingo,13,0.0,0.0
3,1302108,2020-03-30,0,0,13,AM,Amazonas,1,Norte,Japurá,3,Março,2,Segunda-feira,14,0.0,0.0
4,1302108,2020-03-31,0,0,13,AM,Amazonas,1,Norte,Japurá,3,Março,3,Terça-feira,14,0.0,0.0
5,1302108,2020-04-01,0,0,13,AM,Amazonas,1,Norte,Japurá,4,Abril,4,Quarta-feira,14,0.0,0.0
6,1302108,2020-04-02,0,0,13,AM,Amazonas,1,Norte,Japurá,4,Abril,5,Quinta-feira,14,0.0,0.0
7,1302108,2020-04-03,0,0,13,AM,Amazonas,1,Norte,Japurá,4,Abril,6,Sexta-feira,14,0.0,0.0
8,1302108,2020-04-04,0,0,13,AM,Amazonas,1,Norte,Japurá,4,Abril,7,Sábado,14,0.0,0.0
9,1302108,2020-04-05,0,0,13,AM,Amazonas,1,Norte,Japurá,4,Abril,1,Domingo,14,0.0,0.0


In [15]:
# Sanity check: total number of rows in the prepared DataFrame.
df.count()

1180840