In [None]:
#Import packages
import pandas as pd
import plotly.express as px
import plotly.offline as pyo
import plotly.io as pio
import matplotlib.pyplot as plt
import seaborn as sns

# Make figures render inside the notebook: plotly through its offline
# notebook mode, matplotlib (and therefore seaborn) through the inline backend.
pyo.init_notebook_mode()
%matplotlib inline


# Default look for every figure created below: plotly figures use the
# 'ggplot2' template and seaborn uses its "darkgrid" style.
# NOTE(review): these calls sit after `%matplotlib inline`; some IPython
# versions apply their own rc settings when the inline backend is activated,
# so confirm before reordering.
pio.templates.default = 'ggplot2'
sns.set_style("darkgrid")

In [None]:
#CSV locations in the Johns Hopkins University COVID-19 repository.
#Lookup table, read below as the country ISO dataframe
iso_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv'

#One global time-series file per metric, keyed by metric name
urls = {
    'cases': 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
    'deaths': 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',
    'recovers': 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv',
}

#Column renames: a '/' inside a column name is awkward to work with,
#so it is replaced by '_'
columns_renaming = {
    'Province/State': 'Province_State',
    'Country/Region': 'Country_Region',
}

In [None]:
#Get Countries ISO dataframe
#The first column is kept as the index but is no longer passed to
#`parse_dates`: it is an identifier (presumably the UID named in the file
#name -- confirm), not a date, and date-parsing it can only mangle it.
df_iso = pd.read_csv(iso_url, index_col = 0)

#Keep only country-level rows, i.e. those without a Province_State value
#(the table has no rows for Brazil's states), then drop the now-empty column
df_iso = (
    df_iso[df_iso['Province_State'].isnull()]
    .drop(columns = 'Province_State')
)

In [None]:
#Get DataFrames for Registered Cases, Deaths and Recovers

def load_country_series(url, value_name):
    """Read one wide time-series CSV and return tidy per-country totals.

    Parameters
    ----------
    url : str or file-like
        CSV with the columns 'Province/State', 'Country/Region', 'Lat' and
        'Long', followed by one column per day.
    value_name : str
        Name given to the value column of the result (e.g. 'cases').

    Returns
    -------
    pandas.DataFrame
        One row per country and day with the columns 'Country_Region',
        'date' (datetime) and `value_name` (nullable 'Int64'); the values
        of all provinces/states of a country are summed.
    """
    #No index_col/parse_dates here: the first column holds province names,
    #which must not be parsed as dates.
    wide = pd.read_csv(url).rename(columns = columns_renaming)

    #Put the data in tidy format (the original file has one column per day);
    #every column that is not an id column is melted, so value_vars is implicit
    tidy = wide.melt(
        id_vars = ['Province_State', 'Country_Region', 'Lat', 'Long'],
        var_name = 'date',
        value_name = value_name
    )

    #Define data types correctly
    tidy['date'] = pd.to_datetime(tidy['date'])
    tidy[value_name] = tidy[value_name].astype('Int64')

    #Aggregate the data of states and provinces into their country
    return (
        tidy
        .groupby(['Country_Region', 'date'], as_index = False)[value_name]
        .sum()
    )


#Load every metric once, then bind the names the following cells rely on
#explicitly instead of creating them through exec()
covid_frames = {key: load_country_series(url, key) for key, url in urls.items()}

df_cases = covid_frames['cases']
df_deaths = covid_frames['deaths']
df_recovers = covid_frames['recovers']

In [None]:
#Combine the ISO lookup with the three metric DataFrames.
#The cases are attached per country with the default (inner) join.
df_covid = df_iso.merge(df_cases, on = 'Country_Region')

#Deaths and recovers are then outer-joined on country and date.
#NOTE(review): an outer join can add rows for countries that did not match
#the ISO table (they would carry missing ISO columns) -- confirm this is wanted.
for metric_frame in (df_deaths, df_recovers):
    df_covid = df_covid.merge(
        metric_frame,
        on = ['Country_Region', 'date'],
        how = 'outer'
    )

In [None]:
#Find out on which day each country had its first case:
#the earliest date with at least one registered case
day_1st_case = (
    df_covid[df_covid.cases >= 1]
    .groupby('Country_Region', as_index = False)['date']
    .min()
    .rename(columns = {'date': 'day_1st_case'})
)

#Merge the first-case day into df_covid.
#Columns left over from a previous run of this cell are dropped first, so
#re-running does not create 'day_1st_case_x' / 'day_1st_case_y' duplicates.
df_covid = (
    df_covid
    .drop(columns = ['day_1st_case', 'days_since_1st_case'], errors = 'ignore')
    .merge(day_1st_case, on = 'Country_Region')
)
df_covid['day_1st_case'] = pd.to_datetime(df_covid['day_1st_case'])


#Normalize days starting on 1st case's day.
#'date' goes through to_datetime so the subtraction also works when the
#column has meanwhile been converted to str for plotting.
df_covid['days_since_1st_case'] = (
    pd.to_datetime(df_covid['date']) - df_covid['day_1st_case']
).dt.days

In [None]:
#Find out on which day each country reached 100 cases:
#the earliest date with at least 100 registered cases
day_100th_case = (
    df_covid[df_covid.cases >= 100]
    .groupby('Country_Region', as_index = False)['date']
    .min()
    .rename(columns = {'date': 'day_100th_case'})
)

#Merge the 100th-case day into df_covid.
#- Columns left over from a previous run of this cell are dropped first, so
#  re-running does not create '_x' / '_y' duplicates.
#- Left join: countries that never reached 100 cases stay in df_covid (with
#  missing values in the two new columns) instead of being removed from the
#  whole dataset; filters of the form `days_since_100th_case >= 0` still
#  exclude them.
df_covid = (
    df_covid
    .drop(columns = ['day_100th_case', 'days_since_100th_case'], errors = 'ignore')
    .merge(day_100th_case, on = 'Country_Region', how = 'left')
)
df_covid['day_100th_case'] = pd.to_datetime(df_covid['day_100th_case'])


#Normalize days starting on 100th case's day.
#'date' goes through to_datetime so the subtraction also works when the
#column has meanwhile been converted to str for plotting.
df_covid['days_since_100th_case'] = (
    pd.to_datetime(df_covid['date']) - df_covid['day_100th_case']
).dt.days

In [None]:
#Select only countries with more than MIN_DEATHS accumulated deaths
MIN_DEATHS = 1500

#Highest death count registered for each country; sort = False keeps the
#countries in their order of first appearance in df_covid
peak_deaths = (
    df_covid
    .groupby('Country_Region', sort = False)['deaths']
    .max()
)

#Series of country names. A country without any death figure compares as
#missing and is treated as "not above the threshold".
most_deaths = (
    peak_deaths[(peak_deaths > MIN_DEATHS).fillna(False)]
    .index
    .to_series()
    .reset_index(drop = True)
)

In [None]:
#Daily new observations: row-to-row difference of each accumulated metric,
#computed inside every country; a row without a previous one gets 0
accumulated_cols = ['cases', 'deaths', 'recovers']
daily_cols = ['new_cases', 'new_deaths', 'new_recovers']

per_country = df_covid.groupby('Country_Region')[accumulated_cols]
df_covid[daily_cols] = per_country.diff().fillna(0)

In [None]:
#Add deaths by 1M people.
#No fill_value: a missing population must give a missing rate. Filling it
#with 0 would divide by zero (inf), and filling missing deaths with 0 would
#report a rate of 0 where nothing is known.
population_in_millions = df_covid['Population'].divide(1000000)

df_covid['deaths_by_1m'] = (
    df_covid['deaths']
    .divide(population_in_millions)
    .round(1)
)

In [None]:
#plotly doesn't animate over datetime values, so 'date' is stored as str and
#the rows are ordered by it.
#NOTE(review): this replaces df_covid['date'] for every cell that runs
#afterwards -- confirm nothing later needs real datetimes.
df_covid = (
    df_covid
    .assign(date = df_covid['date'].astype(str))
    .sort_values('date')
)

#Rows from each country's first registered case onwards
since_1st_case = df_covid[df_covid['days_since_1st_case'] >= 0]

#Readable names for the columns shown in the figure
map_labels = {
    'deaths': "Accumulated registered deaths",
    'date': 'Date',
    'Country_Region': "Country",
    'cases': "Registered cases",
    'recovers': "Registered recovered pacients",
    'deaths_by_1m': "Deaths by 1M people",
}

#Animated world map of accumulated cases, one frame per date
px.choropleth(
    since_1st_case,
    locations = 'iso3',
    color = 'cases',
    animation_frame = 'date',
    animation_group = 'Country_Region',
    color_continuous_scale = px.colors.sequential.YlOrRd,
    projection = 'natural earth',
    hover_name = 'Country_Region',
    hover_data = ['cases', 'deaths', 'recovers', 'deaths_by_1m'],
    title = 'Total COVID-19 cases',
    labels = map_labels,
)

In [None]:
#Plot the deaths of the countries in most_deaths, starting on the day each
#one reached 100 cases

reached_100_cases = df_covid['days_since_100th_case'] >= 0
in_most_deaths = df_covid['Country_Region'].isin(most_deaths)

#Readable names for the columns shown in the figure
line_labels = {
    'deaths': "Accumulated registered deaths",
    'days_since_100th_case': "Days since 100th case",
    'Country_Region': "Country",
    'cases': "Registered cases",
    'recovers': "Registered recovered pacients",
    'deaths_by_1m': "Deaths by 1M people",
}

#One line per country, deaths on a logarithmic axis
px.line(
    df_covid[reached_100_cases & in_most_deaths],
    x = 'days_since_100th_case',
    y = 'deaths',
    color = 'Country_Region',
    hover_name = 'Country_Region',
    hover_data = ['cases', 'deaths', 'recovers', 'deaths_by_1m'],
    log_y = True,
    title = 'Deaths since 100th case - Countries with more than 1500 deaths',
    labels = line_labels,
)

In [None]:
#Bar chart of daily new deaths, one panel per country in most_deaths.
#NOTE(review): the rows are filtered on days_since_100th_case while the
#x axis and the title use the days since the 1st case -- confirm that
#starting each panel at the 100th case is intended.
in_most_deaths = df_covid['Country_Region'].isin(most_deaths)
reached_100_cases = df_covid['days_since_100th_case'] >= 0
new_deaths_data = df_covid[reached_100_cases & in_most_deaths]

g = sns.catplot(
    data = new_deaths_data,
    x = 'days_since_1st_case',
    y = 'new_deaths',
    col = 'Country_Region',
    col_wrap = 3,
    kind = 'bar',
)

#Logarithmic y axis and vertical tick labels on every panel
g.set(yscale = 'log')
g.set_xticklabels(rotation = 90)

#Figure-level title, with room reserved above the panels so it does not overlap
g.fig.suptitle("New deaths by day since 1st case - Countries with more than 1500 deaths")
plt.subplots_adjust(top = 0.95)