# Projeto Covid-19
## Digital Inovation One

Primeiro vamos importar algumas das bibliotecas necessárias para nosso projeto de hoje.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
import re

In [2]:
# importando os dados para o projeto
df = pd.read_csv("covid_19_data.csv", parse_dates=['ObservationDate', 'Last Update'])
df

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,2020-01-22,Anhui,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
1,2,2020-01-22,Beijing,Mainland China,2020-01-22 17:00:00,14.0,0.0,0.0
2,3,2020-01-22,Chongqing,Mainland China,2020-01-22 17:00:00,6.0,0.0,0.0
3,4,2020-01-22,Fujian,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
4,5,2020-01-22,Gansu,Mainland China,2020-01-22 17:00:00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
26708,26709,2020-05-19,Wyoming,US,2020-05-20 02:32:19,776.0,10.0,0.0
26709,26710,2020-05-19,Xinjiang,Mainland China,2020-05-20 02:32:19,76.0,3.0,73.0
26710,26711,2020-05-19,Yukon,Canada,2020-05-20 02:32:19,11.0,0.0,11.0
26711,26712,2020-05-19,Yunnan,Mainland China,2020-05-20 02:32:19,185.0,2.0,183.0


In [3]:
# Conferir os tipos de cada coluna
df.dtypes

SNo                         int64
ObservationDate    datetime64[ns]
Province/State             object
Country/Region             object
Last Update        datetime64[ns]
Confirmed                 float64
Deaths                    float64
Recovered                 float64
dtype: object

Nomes de colunas não devem ter letras maiúsculas e nem caracteres especiais.
Vamos implementar uma função para fazer a limepsa dos nomes dessas colunas.

In [4]:
def corrige_colunas(col_name):
    return re.sub(r"[/| ]", "", col_name).lower()

In [5]:
corrige_colunas("AdgE/P ou") # testando a função

'adgepou'

In [6]:
# Vamos corrigir todas as colunas do df
df.columns = [corrige_colunas(col) for col in df.columns]

In [7]:
df

Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered
0,1,2020-01-22,Anhui,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
1,2,2020-01-22,Beijing,Mainland China,2020-01-22 17:00:00,14.0,0.0,0.0
2,3,2020-01-22,Chongqing,Mainland China,2020-01-22 17:00:00,6.0,0.0,0.0
3,4,2020-01-22,Fujian,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
4,5,2020-01-22,Gansu,Mainland China,2020-01-22 17:00:00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
26708,26709,2020-05-19,Wyoming,US,2020-05-20 02:32:19,776.0,10.0,0.0
26709,26710,2020-05-19,Xinjiang,Mainland China,2020-05-20 02:32:19,76.0,3.0,73.0
26710,26711,2020-05-19,Yukon,Canada,2020-05-20 02:32:19,11.0,0.0,11.0
26711,26712,2020-05-19,Yunnan,Mainland China,2020-05-20 02:32:19,185.0,2.0,183.0


# Brasil
 - Vamos selecionar apenas os dados do Brasil para investigar

In [8]:
df.countryregion.value_counts()

US                     4990
Mainland China         3687
Canada                 1093
Australia               788
France                  752
                       ... 
North Ireland             1
Channel Islands           1
Cape Verde                1
Republic of Ireland       1
East Timor                1
Name: countryregion, Length: 223, dtype: int64

In [9]:
df.countryregion.unique()

array(['Mainland China', 'Hong Kong', 'Macau', 'Taiwan', 'US', 'Japan',
       'Thailand', 'South Korea', 'Singapore', 'Philippines', 'Malaysia',
       'Vietnam', 'Australia', 'Mexico', 'Brazil', 'Colombia', 'France',
       'Nepal', 'Canada', 'Cambodia', 'Sri Lanka', 'Ivory Coast',
       'Germany', 'Finland', 'United Arab Emirates', 'India', 'Italy',
       'UK', 'Russia', 'Sweden', 'Spain', 'Belgium', 'Others', 'Egypt',
       'Iran', 'Israel', 'Lebanon', 'Iraq', 'Oman', 'Afghanistan',
       'Bahrain', 'Kuwait', 'Austria', 'Algeria', 'Croatia',
       'Switzerland', 'Pakistan', 'Georgia', 'Greece', 'North Macedonia',
       'Norway', 'Romania', 'Denmark', 'Estonia', 'Netherlands',
       'San Marino', ' Azerbaijan', 'Belarus', 'Iceland', 'Lithuania',
       'New Zealand', 'Nigeria', 'North Ireland', 'Ireland', 'Luxembourg',
       'Monaco', 'Qatar', 'Ecuador', 'Azerbaijan', 'Czech Republic',
       'Armenia', 'Dominican Republic', 'Indonesia', 'Portugal',
       'Andorra', 'Latvia

In [10]:
df.loc[df.countryregion == "Brazil"]

Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered
82,83,2020-01-23,,Brazil,2020-01-23 17:00:00,0.0,0.0,0.0
2455,2456,2020-02-26,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2559,2560,2020-02-27,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2668,2669,2020-02-28,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2776,2777,2020-02-29,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...
24850,24851,2020-05-15,,Brazil,2020-05-16 02:32:19,220291.0,14962.0,84970.0
25227,25228,2020-05-16,,Brazil,2020-05-17 02:32:32,233511.0,15662.0,89672.0
25604,25605,2020-05-17,,Brazil,2020-05-18 02:32:21,241080.0,16118.0,94122.0
25981,25982,2020-05-18,,Brazil,2020-05-19 02:32:18,255368.0,16853.0,100459.0


In [11]:
# filtro para identificar no brasil apenas as linhas que tenham pelo menos 1 caso confirmado
brasil = df.loc[
    (df.countryregion == 'Brazil') &
    (df.confirmed > 0)
]

In [12]:
brasil

Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered
2455,2456,2020-02-26,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2559,2560,2020-02-27,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2668,2669,2020-02-28,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2776,2777,2020-02-29,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
2903,2904,2020-03-01,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...
24850,24851,2020-05-15,,Brazil,2020-05-16 02:32:19,220291.0,14962.0,84970.0
25227,25228,2020-05-16,,Brazil,2020-05-17 02:32:32,233511.0,15662.0,89672.0
25604,25605,2020-05-17,,Brazil,2020-05-18 02:32:21,241080.0,16118.0,94122.0
25981,25982,2020-05-18,,Brazil,2020-05-19 02:32:18,255368.0,16853.0,100459.0


# Casos Confirmados

In [13]:
# Gráfico da evolução de casos confirmados
px.line(brasil, 'observationdate', 'confirmed', title='Casos confirmados no Brasil')

# Novos casos por dia

In [14]:
# Técnica de programação funcional
brasil['novos_casos'] = list(map(
    lambda x: 0 if (x==0) else brasil['confirmed'].iloc[x] - brasil['confirmed'].iloc[x-1],
    np.arange(brasil.shape[0])
))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [15]:
brasil

Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered,novos_casos
2455,2456,2020-02-26,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0,0.0
2559,2560,2020-02-27,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0,0.0
2668,2669,2020-02-28,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0,0.0
2776,2777,2020-02-29,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0,1.0
2903,2904,2020-03-01,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
24850,24851,2020-05-15,,Brazil,2020-05-16 02:32:19,220291.0,14962.0,84970.0,17126.0
25227,25228,2020-05-16,,Brazil,2020-05-17 02:32:32,233511.0,15662.0,89672.0,13220.0
25604,25605,2020-05-17,,Brazil,2020-05-18 02:32:21,241080.0,16118.0,94122.0,7569.0
25981,25982,2020-05-18,,Brazil,2020-05-19 02:32:18,255368.0,16853.0,100459.0,14288.0


In [16]:
# Visualizando
px.line(brasil, 'observationdate', 'novos_casos', title='Novos casos por dia')

## Mortes

In [19]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(x=brasil.observationdate, y=brasil.deaths, name='Mortes',
            mode='lines+markers',line={'color':'red'})

)

#Layout
fig.update_layout(title='Mortes por COVID-19 no Brasil')

fig.show()

# Taxa de crescimento

taxa_crescimento = (presente/passado)**(1/n) - 1

In [21]:
def taxa_cescimento(data, variable, data_inicio=None, data_fim=None):
    # se data inicio for None, define como a primeira data disponível
    if (data_inicio == None):
        data_inicio = data.observationdate.loc[data[variable] > 0].min()
    else:
        data_inicio = pd.to_datetime(data_inicio)

    if (data_fim == None):
        data_fim = data.observationdate.iloc[-1]
    else:
        data_fim = pd.to_datetime(data_fim)

    #Define os valores do presente e passado
    passado = data.loc[data.observationdate == data_inicio, variable].values[0]
    presente = data.loc[data.observationdate == data_fim, variable].values[0]

    #Define o número de pontos no tempo que vamos avaliar
    n = (data_fim - data_inicio).days

    # Calcular a taxa
    taxa = (presente/passado)**(1/n) - 1

    return taxa*100

In [22]:
# Taxa de crescimento médio da Covid no Brasil em todo o período
taxa_cescimento(brasil, 'confirmed')

16.27183353112116

In [32]:
def taxa_crescimento_diario(data, variable, data_inicio=None):
    # se data inicio for None, define como a primeira data disponível
    if (data_inicio == None):
        data_inicio = data.observationdate.loc[data[variable] > 0].min()
    else:
        data_inicio = pd.to_datetime(data_inicio)

    data_fim = data.observationdate.max()
    #Define o número de pontos no tempo que vamos avaliar
    n = (data_fim - data_inicio).days

    #Taxa calculada de um  dia para o outro
    taxas = list(map(
        lambda x: (data[variable].iloc[x] - data[variable].iloc[x-1]) / data[variable].iloc[x-1],
                range(1, n+1)
    ))
    return np.array(taxas) * 100

In [33]:
tx_dia = taxa_crescimento_diario(brasil, 'confirmed')

In [28]:
tx_dia

array([0.00000e+00, 0.00000e+00, 1.00000e+02, 1.00000e+02, 1.00000e+02,
       1.00000e+02, 3.00000e+02, 3.00000e+02, 1.20000e+03, 1.20000e+03,
       1.90000e+03, 2.40000e+03, 3.00000e+03, 3.70000e+03, 5.10000e+03,
       1.50000e+04, 1.50000e+04, 1.61000e+04, 1.99000e+04, 3.20000e+04,
       3.71000e+04, 6.20000e+04, 7.92000e+04, 1.02000e+05, 1.54500e+05,
       1.92300e+05, 2.24600e+05, 2.55300e+05, 2.98400e+05, 3.41600e+05,
       3.90300e+05, 4.25500e+05, 4.57800e+05, 5.71600e+05, 6.83500e+05,
       8.04300e+05, 9.05500e+05, 1.03590e+06, 1.11290e+06, 1.21600e+06,
       1.40330e+06, 1.61690e+06, 1.80910e+06, 1.96370e+06, 2.07260e+06,
       2.21910e+06, 2.34290e+06, 2.52610e+06, 2.83190e+06, 3.04240e+06,
       3.36810e+06, 3.66570e+06, 3.86530e+06, 4.07420e+06, 4.30780e+06,
       4.57560e+06, 5.00350e+06, 5.40420e+06, 5.93230e+06, 6.30990e+06,
       6.74450e+06, 7.32340e+06, 7.96840e+06, 8.71860e+06, 9.22010e+06,
       9.70990e+06, 1.01825e+07, 1.08619e+07, 1.15454e+07, 1.266

In [34]:
primeiro_dia = brasil.observationdate.loc[brasil.confirmed > 0].min()

px.line(x=pd.date_range(primeiro_dia, brasil.observationdate.max())[1:],
        y=tx_dia, title='Taxa de crescimento de casos confirmados no Brasil')