In [84]:
# Importa as bibliotecas necessárias
import pandas as pd
import datetime
import os
import lxml
import html5lib
from bs4 import BeautifulSoup
import requests

In [85]:
# Configuration: output folder, historical-series CSV and the CSV that maps
# each date to a Ministry of Health bulletin link
PASTA_SAIDA = '../dados/'
CORONABR = '../dados/corona_brasil.csv'
LINK_MINSAUDE = '../dados/auxiliares/link_minsaude.csv'

# Today's date as an ISO "YYYY-MM-DD" string
hoje = datetime.date.today().isoformat()

hoje

'2020-03-27'

In [86]:
# Read the table of Ministry of Health links and pick the entry for today
acervo = pd.read_csv(LINK_MINSAUDE)

# Rows of the link table whose `Data` column equals today's date string
links_hoje = acervo.loc[acervo['Data'] == hoje, 'Link']

# Fail with an explicit message instead of an opaque "index 0 is out of
# bounds" when the link table has no entry for today yet
if links_hoje.empty:
    raise IndexError(
        f'No link registered for {hoje} in {LINK_MINSAUDE}; '
        'add the bulletin URL for this date before running the notebook.'
    )

# If several rows match, the first one is used (same as before)
url = links_hoje.iloc[0]

url

'https://www.saude.gov.br/noticias/agencia-saude/46614-brasil-registra-3-417-casos-confirmados-de-coronavirus-e-92-mortes'

In [87]:
# Download the bulletin page
# A timeout keeps the cell from hanging forever on an unresponsive server
resposta = requests.get(url, timeout=30)

# Stop here on HTTP errors (4xx/5xx) rather than parsing an error page as data
resposta.raise_for_status()

arquivo = resposta.text

# Show only the beginning of the document; the full HTML is too large to display
arquivo[:500]

'<!DOCTYPE html>\n<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="pt-br" dir="ltr"> <![endif]-->\n<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8" lang="pt-br" dir="ltr"> <![endif]-->\n<!--[if IE 8]>         <html class="no-js lt-ie9" lang="pt-br" dir="ltr"> <![endif]-->\n<!--[if gt IE 8]><!--> <html class="no-js" lang="pt-br" dir="ltr"> <!--<![endif]-->\n<head>\n<!-- Google Tag Manager -->\n<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({\'gtm.start\':\nnew Date().getTime(),event:\'gtm.js\'});var f=d.getElementsByTagName(s)[0],\nj=d.createElement(s),dl=l!=\'dataLayer\'?\'&l=\'+l:\'\';j.async=true;j.src=\n\'https://www.googletagmanager.com/gtm.js?id=\'+i+dl;f.parentNode.insertBefore(j,f);\n})(window,document,\'script\',\'dataLayer\',\'GTM-N9GV4MG\');</script>\n<!-- End Google Tag Manager -->\n       <!--[if lt IE 9]>\n    <script src="/templates/padraogoverno01/js/html5shiv.js"></script>\n    <![endif]-->\n    <link rel="stylesheet" href="/templates/

In [88]:
# Parse the downloaded HTML and extract the table with cases by state

soup = BeautifulSoup(arquivo, 'lxml')

# The first <table> element of the page is the one that gets parsed
table = soup.find_all('table')[0]

rows = table.find_all('tr')

# One list per <tr>: the stripped text of each <td>, with empty cells dropped
df = pd.DataFrame([
    [texto for texto in (celula.text.strip() for celula in row.find_all('td')) if texto]
    for row in rows
])

# Use the first parsed row as the column labels
df.columns = df.iloc[0]

# Drop what is not needed to reconcile the historical series: skip the first
# two parsed rows and keep only the columns at positions 1 to 3
df = df.iloc[2:,1:4]

df

Unnamed: 0,UF,CONFIRMADOS,ÓBITOS
2,,,
3,AC,25.0,0
4,AM,81.0,1
5,AP,2.0,0
6,PA,13.0,0
7,RO,6.0,0
8,RR,10.0,0
9,TO,8.0,0
10,,,
11,AL,11.0,0


In [89]:
# Remove improper rows from the scraped table:
#   - rows without a value in CONFIRMADOS
#   - rows whose ÓBITOS cell contains a '%' sign (percentage values)
tem_confirmados = df['CONFIRMADOS'].notna()
obitos_percentuais = df['ÓBITOS'].str.contains('%', na=False)

# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
df = df[tem_confirmados & ~obitos_percentuais].copy()

df

Unnamed: 0,UF,CONFIRMADOS,ÓBITOS
3,AC,25.0,0
4,AM,81.0,1
5,AP,2.0,0
6,PA,13.0,0
7,RO,6.0,0
8,RR,10.0,0
9,TO,8.0,0
11,AL,11.0,0
12,BA,115.0,0
13,CE,282.0,3


In [92]:
# Add the date column and rename the columns to the historical-series schema

# The date comes from `hoje`, defined in the configuration cell. The previous
# version used `agora`, which is not defined anywhere in this notebook and
# raised NameError on a fresh kernel (Restart & Run All).
# assign/rename return a new frame instead of mutating a filtered slice in place.
df = (
    df
    .assign(date=hoje)
    .rename(columns={'UF':'uf', 'ÓBITOS':'deaths', 'CONFIRMADOS':'cases'})
)
df

Unnamed: 0,uf,cases,deaths,date
3,AC,25.0,0,2020-03-27
4,AM,81.0,1,2020-03-27
5,AP,2.0,0,2020-03-27
6,PA,13.0,0,2020-03-27
7,RO,6.0,0,2020-03-27
8,RR,10.0,0,2020-03-27
9,TO,8.0,0,2020-03-27
11,AL,11.0,0,2020-03-27
12,BA,115.0,0,2020-03-27
13,CE,282.0,3,2020-03-27


In [93]:
# Load the historical series
corona = pd.read_csv(CORONABR)

# Preview the final result: historical rows followed by today's rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; like append, it keeps the original row labels and
# takes the union of the columns.
pd.concat([corona, df])

Unnamed: 0,uid,date,time,suspects,refuses,confirmado,deads,local,cases,comments,broadcast,deaths,uf
0,11.0,2020-03-25,,,,,,,5,,,0,RO
1,12.0,2020-03-25,,,,,,,23,,,0,AC
2,13.0,2020-03-25,,,,,,,54,,,1,AM
3,14.0,2020-03-25,,,,,,,8,,,0,RR
4,15.0,2020-03-25,,,,,,,7,,,0,PA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,,2020-03-27,,,,,,,28,,,0,MS
29,,2020-03-27,,,,,,,11,,,0,MT
31,,2020-03-27,,,,,,,119,,,2,PR
32,,2020-03-27,,,,,,,149,,,1,SC


In [95]:
# Export the updated series to CSV

dados = os.path.join(PASTA_SAIDA, 'corona_brasil.csv')

# NOTE(review): `dados` resolves to the same file that CORONABR is read from,
# so running the notebook more than once on the same day appends today's rows
# again — confirm whether de-duplication is wanted.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0).
pd.concat([corona, df]).to_csv(dados, index = False)