Import data from Brazil

In [1]:
import pandas as pd
import glob
import os

# Location of the raw Brazilian COVID-19 CSV below the current working directory;
# every "/" is swapped for the platform's path separator.
path = (os.getcwd() + "/data_source/Brazil/brazil_covid19.csv").replace("/", os.path.sep)

# Column layout of the normalized output table that is built further down.
columns = [
    "total_cases",
    "daily_cases",
    "actual_cases",
    "total_dead",
    "daily_dead",
    "total_test",
    "daily_test",
]

# Read the raw data using the first CSV column as index and replace missing values with 0.
dataset = pd.read_csv(path, index_col=0).fillna(0)
dataset

Unnamed: 0_level_0,hour,state,suspects,refuses,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-30,16:00,Minas Gerais,1,0,0,0
2020-01-30,16:00,Rio de Janeiro,1,0,0,0
2020-01-30,16:00,Santa Catarina,0,2,0,0
2020-01-30,16:00,São Paulo,3,1,0,0
2020-01-30,16:00,Rio Grande do Sul,2,2,0,0
...,...,...,...,...,...,...
2020-03-23,0,Roraima,7,22,2,0
2020-03-23,0,Santa Catarina,410,0,68,0
2020-03-23,0,São Paulo,0,0,745,30
2020-03-23,0,Sergipe,0,0,10,0


In [12]:
# Delete fully duplicated rows. The index is turned into a regular column first,
# so two rows only count as duplicates when both the index value and all column
# values match; the first occurrence of each row is kept.
duplicate_mask = dataset.reset_index().duplicated().values
dataset = dataset.loc[~duplicate_mask]

Grouping by date

In [13]:
# Collapse the per-state rows into a single national row per date.
# numeric_only=True restricts the sum to the numeric count columns. Without it,
# pandas >= 2.0 also tries to "sum" the non-numeric columns (hour, state), which
# concatenates strings or raises a TypeError; older pandas dropped them silently.
dataset = dataset.groupby(["date"]).sum(numeric_only=True)
dataset

Unnamed: 0_level_0,suspects,refuses,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-30,9,6,0,0
2020-01-31,13,9,0,0
2020-02-01,16,10,0,0
2020-02-02,16,10,0,0
2020-02-03,14,13,0,0
2020-02-04,13,16,0,0
2020-02-05,11,21,0,0
2020-02-06,9,24,0,0
2020-02-07,8,26,0,0
2020-02-08,8,28,0,0


Day 2020-03-09 is missing so we add the date with unchanged values from 2020-03-08

In [14]:
# The source has no rows for 2020-03-09, so that day is added by carrying the
# values of 2020-03-08 forward. The guard keeps real data intact should the
# date already be present (e.g. after the source CSV has been updated).
if "2020-03-09" not in dataset.index:
    dataset.loc["2020-03-09"] = dataset.loc["2020-03-08"]

# A row added through .loc is appended at the end; restore chronological order.
dataset = dataset.sort_index()
dataset

Unnamed: 0_level_0,suspects,refuses,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-30,9,6,0,0
2020-01-31,13,9,0,0
2020-02-01,16,10,0,0
2020-02-02,16,10,0,0
2020-02-03,14,13,0,0
2020-02-04,13,16,0,0
2020-02-05,11,21,0,0
2020-02-06,9,24,0,0
2020-02-07,8,26,0,0
2020-02-08,8,28,0,0


Creating a new dataframe with our specific format

In [15]:
# Empty output table: same dates as `dataset`, one column per entry of `columns`,
# every cell initialised to integer 0. Passing the scalar to the constructor
# yields int64 columns directly, instead of building an object-dtype frame of
# NaN and relying on fillna() to downcast it (deprecated in recent pandas).
brazil_dataset = pd.DataFrame(0, index=dataset.index, columns=columns)
brazil_dataset

Unnamed: 0_level_0,total_cases,daily_cases,actual_cases,total_dead,daily_dead,total_test,daily_test
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-30,0,0,0,0,0,0,0
2020-01-31,0,0,0,0,0,0,0
2020-02-01,0,0,0,0,0,0,0
2020-02-02,0,0,0,0,0,0,0
2020-02-03,0,0,0,0,0,0,0
2020-02-04,0,0,0,0,0,0,0
2020-02-05,0,0,0,0,0,0,0
2020-02-06,0,0,0,0,0,0,0
2020-02-07,0,0,0,0,0,0,0
2020-02-08,0,0,0,0,0,0,0


Filling the new dataframe with the available data

In [16]:
from datetime import datetime, timedelta

# Cumulative series taken from the source data. The number of tests is not
# reported directly; it is approximated here by suspects + refuses + cases.
cumulative_totals = {
    "cases": dataset["cases"],
    "dead": dataset["deaths"],
    "test": dataset["suspects"] + dataset["refuses"] + dataset["cases"],
}

# Whole-column assignment replaces the former row-by-row chained indexing
# (brazil_dataset.loc[row][col] = ...), which writes to a temporary object and
# silently does nothing when pandas Copy-on-Write is active.
for metric, totals in cumulative_totals.items():
    brazil_dataset["total_" + metric] = totals
    # Daily figure = change versus the previous row. shift(fill_value=0) makes
    # the first row's daily value equal to its cumulative value and keeps the
    # integer dtype. A gap in the dates yields the change since the last
    # available day instead of the whole cumulative total.
    brazil_dataset["daily_" + metric] = totals - totals.shift(1, fill_value=0)

# "actual_cases" stays at 0: the source has no recovered counts to derive it from.

brazil_dataset

Unnamed: 0_level_0,total_cases,daily_cases,actual_cases,total_dead,daily_dead,total_test,daily_test
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-30,0,0,0,0,0,15,15
2020-01-31,0,0,0,0,0,22,7
2020-02-01,0,0,0,0,0,26,4
2020-02-02,0,0,0,0,0,26,0
2020-02-03,0,0,0,0,0,27,1
2020-02-04,0,0,0,0,0,29,2
2020-02-05,0,0,0,0,0,32,3
2020-02-06,0,0,0,0,0,33,1
2020-02-07,0,0,0,0,0,34,1
2020-02-08,0,0,0,0,0,36,2


Converting dates from String to Datetime

In [17]:
import datetime as dt

# Replace the 'YYYY-MM-DD' string labels of the index with datetime.date objects.
index_list = [
    dt.datetime.strptime(day_label, '%Y-%m-%d').date()
    for day_label in brazil_dataset.index.tolist()
]

brazil_dataset.index = index_list
brazil_dataset.index.name = 'date'

In the last few days the database has become inconsistent: for example, for São Paulo the number of suspects and negatives (refuses) drops to 0, even though the data are cumulative.

In [18]:
# Save the normalized table as CSV under <cwd>/parsed_data/brazil/.
path = (os.getcwd() + "/parsed_data/brazil/").replace("/", os.path.sep)

# Create the output folder if needed; to_csv() does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs(path, exist_ok=True)

brazil_dataset.to_csv(path + "brazil.csv", index=True, header=True)

