In [168]:
# Importa as bibliotecas necessárias
import pandas as pd
import datetime
import os
import lxml
import html5lib
from bs4 import BeautifulSoup
import requests

In [169]:
# Define a localização de arquivos e variáveis importantes

PASTA_SAIDA = '../dados/'
HISTORICO = '../dados/corona_brasil.csv'
LINK_MINSAUDE = '../dados/auxiliares/link_minsaude.csv'
HOJE = str(datetime.date.today())

HOJE

'2020-03-27'

In [170]:
# Lê a tabela com links do Ministério da Saúde e encontra o dia de hoje
acervo = pd.read_csv(LINK_MINSAUDE)

url = acervo[acervo.Data == HOJE].Link.values[0]

url

'https://www.saude.gov.br/noticias/agencia-saude/46614-brasil-registra-3-417-casos-confirmados-de-coronavirus-e-92-mortes'

In [171]:
# Baixa o link 
arquivo = requests.get(url).text

In [172]:
# Lê o arquivo HTML com os casos por estado

soup = BeautifulSoup(arquivo, 'lxml') 

table = soup.find_all('table')[0] 

rows = table.find_all('tr')

df = []

for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    df.append([ele for ele in cols if ele])
    
df = pd.DataFrame(df)

# Define nome das colunas
df.columns = df.iloc[0]

# Retira colunas desnecessárias para a conciliação da série histórica
df = df.iloc[2:,1:4]

df

Unnamed: 0,UF,CONFIRMADOS,ÓBITOS
2,,,
3,AC,25.0,0
4,AM,81.0,1
5,AP,2.0,0
6,PA,13.0,0
7,RO,6.0,0
8,RR,10.0,0
9,TO,8.0,0
10,,,
11,AL,11.0,0


In [173]:
# Remove entradas impróprias
df = df[df.CONFIRMADOS.notnull()]
df = df[~df['ÓBITOS'].str.contains('%', na = False)] # Remove valores percentuais

df


Unnamed: 0,UF,CONFIRMADOS,ÓBITOS
3,AC,25.0,0
4,AM,81.0,1
5,AP,2.0,0
6,PA,13.0,0
7,RO,6.0,0
8,RR,10.0,0
9,TO,8.0,0
11,AL,11.0,0
12,BA,115.0,0
13,CE,282.0,3


In [174]:
# Adiciona coluna de data e renomeia

df['date'] = agora
df.rename(columns={'UF':'uf', 'ÓBITOS':'deaths', 'CONFIRMADOS':'cases'}, inplace=True)

df= df.reset_index()

df # Contagem de zero até 26: ok, temos as 27 UFs

Unnamed: 0,index,uf,cases,deaths,date
0,3,AC,25.0,0,2020-03-27
1,4,AM,81.0,1,2020-03-27
2,5,AP,2.0,0,2020-03-27
3,6,PA,13.0,0,2020-03-27
4,7,RO,6.0,0,2020-03-27
5,8,RR,10.0,0,2020-03-27
6,9,TO,8.0,0,2020-03-27
7,11,AL,11.0,0,2020-03-27
8,12,BA,115.0,0,2020-03-27
9,13,CE,282.0,3,2020-03-27


In [175]:
# Carrega a série histórica
corona = pd.read_csv(CORONABR)

# Agrega o resultado de hoje
corona.append(df)

Unnamed: 0,uid,date,time,suspects,refuses,confirmado,deads,local,cases,comments,broadcast,deaths,uf,index
0,11.0,2020-03-25,,,,,,,5,,,0,RO,
1,12.0,2020-03-25,,,,,,,23,,,0,AC,
2,13.0,2020-03-25,,,,,,,54,,,1,AM,
3,14.0,2020-03-25,,,,,,,8,,,0,RR,
4,15.0,2020-03-25,,,,,,,7,,,0,PA,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22,,2020-03-27,,,,,,,28,,,0,MS,28.0
23,,2020-03-27,,,,,,,11,,,0,MT,29.0
24,,2020-03-27,,,,,,,119,,,2,PR,31.0
25,,2020-03-27,,,,,,,149,,,1,SC,32.0


In [133]:
# O dataset originalmente disponibilizado pelo Ministério da Saúde 
# continha os seguintes registros duplicados no dia 25/02/2020
# Vamos salvar eles no arquivo registros_duplicados.csv para registro histórico
# duplifile = os.path.join(PASTA_SAIDA, 'registros_duplicados' + '.csv')
# corona[corona.duplicated()].to_csv(duplifile, index = False)

Unnamed: 0,uid,date,time,suspects,refuses,confirmado,deads,local,cases,comments,broadcast,deaths,uf
657,23.0,2020-02-25,13:50,0.0,1.0,0.0,0.0,0,0.0,0,0,0,CE
659,31.0,2020-02-25,13:50,0.0,2.0,0.0,0.0,0,0.0,0,0,0,MG
661,33.0,2020-02-25,13:50,0.0,8.0,0.0,0.0,0,0.0,0,0,0,RJ
663,35.0,2020-02-25,13:50,4.0,26.0,0.0,0.0,0,0.0,0,0,0,SP
665,41.0,2020-02-25,13:50,0.0,3.0,0.0,0.0,0,0.0,0,0,0,PR
667,42.0,2020-02-25,13:50,0.0,4.0,0.0,0.0,0,0.0,0,0,0,SC
669,43.0,2020-02-25,13:50,0.0,10.0,0.0,0.0,0,0.0,0,0,0,RS
671,53.0,2020-02-25,13:50,0.0,1.0,0.0,0.0,0,0.0,0,0,0,DF
811,23.0,2020-02-06,15:30,0.0,1.0,0.0,0.0,0,0.0,0,0,0,CE
813,31.0,2020-02-06,15:30,1.0,1.0,0.0,0.0,0,0.0,0,0,0,MG


In [176]:
# Aqui vamos remover aquelas e outras entradas duplicados. 
# Por exemplo, caso o script seja rodado sem novas atualizações.
corona = corona.drop_duplicates()

# Transofrma casos e mortes em números inteiros
corona['deaths'] = corona['deaths'].astype(int)
corona['cases'] = corona['cases'].astype(int)

corona

Unnamed: 0,uid,date,time,suspects,refuses,confirmado,deads,local,cases,comments,broadcast,deaths,uf
0,11.0,2020-03-25,,,,,,,5,,,0,RO
1,12.0,2020-03-25,,,,,,,23,,,0,AC
2,13.0,2020-03-25,,,,,,,54,,,1,AM
3,14.0,2020-03-25,,,,,,,8,,,0,RR
4,15.0,2020-03-25,,,,,,,7,,,0,PA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
907,,2020-03-27,,,,,,,28,,,0,MS
908,,2020-03-27,,,,,,,11,,,0,MT
909,,2020-03-27,,,,,,,119,,,2,PR
910,,2020-03-27,,,,,,,149,,,1,SC


In [177]:
# Exporta a base em CSV
dados = os.path.join(PASTA_SAIDA, 'corona_brasil' + '.csv')

corona.to_csv(dados, index = False)