# COVID19 International Version 2

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Python-setup" data-toc-modified-id="Python-setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Python setup</a></span></li><li><span><a href="#Get-the-raw-data-and-adjust-for-anomalies" data-toc-modified-id="Get-the-raw-data-and-adjust-for-anomalies-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Get the raw data and adjust for anomalies</a></span></li><li><span><a href="#Plot-weekly-data-since-the-year-began" data-toc-modified-id="Plot-weekly-data-since-the-year-began-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Plot weekly data since the year began</a></span></li><li><span><a href="#Plot-new-vs-cumulative-for-the-past-three-months" data-toc-modified-id="Plot-new-vs-cumulative-for-the-past-three-months-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Plot new-vs-cumulative for the past three months</a></span></li></ul></div>

## Python setup

In [1]:
# system imports
import sys
from pathlib import Path

#analytic imports
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.dates as mdates
import matplotlib.units as munits
from pandas.plotting import register_matplotlib_converters

# COVID19 specific imports
sys.path.append(r'../bin')
from datagrabber import get_data, get_population_from_eu
import plotstuff as ps

# where saved charts go - created up front if it does not exist yet
CHART_DIRECTORY = '../charts'
Path(CHART_DIRECTORY).mkdir(parents=True, exist_ok=True)

# let pandas display (effectively) whole frames rather than truncating
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

# plotting defaults: pandas date converters and the ggplot look
register_matplotlib_converters()
plt.style.use('ggplot')

## Get the raw data and adjust for anomalies

In [2]:
DATA_SOURCE = 'EU' # 'EU' or 'OWID'

# each mode owns one slot in the per-mode lists below
CASES = 0
DEATHS = 1
modes = {'Cases': CASES, 'Deaths': DEATHS}

# per-mode results, addressed as some_list[CASES] / some_list[DEATHS]
raw_cum_data = [None, None]
raw_daily_data = [None, None]
adj_daily_data = [None, None]
adj_cum_data = [None, None]
data_quality = [None, None]

for mode, index in modes.items():

    # fetch the cumulative series and treat missing values as zero
    cumulative, source = get_data(data_type=mode.lower(),
                                  from_where=DATA_SOURCE)
    cumulative = cumulative.fillna(0)
    raw_cum_data[index] = cumulative

    # derive the daily series and their anomaly-adjusted counterparts
    raw_daily, adj_daily, adj_cum = ps.dataframe_correction(cumulative)
    raw_daily_data[index] = raw_daily
    adj_daily_data[index] = adj_daily
    adj_cum_data[index] = adj_cum

    # record, per column, whether the anomaly adjustment changed the data
    labels = []
    for col in cumulative.columns:
        if (raw_daily[col] == adj_daily[col]).all():
            labels.append(f'Source: {source}; original data')
        else:
            labels.append(f'Source: {source}; data adjusted for extreme outliers')
    data_quality[index] = pd.Series(labels,
                                    index=cumulative.columns,
                                    dtype='str')

Retrieving data from: https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-2020-11-21.xlsx
https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-2020-11-21.xlsx
Spikes in Andorra
DateRep  2020-06-03
spike     79.000000
mean       0.714286
zeros     10.000000
Data too sparse in Anguilla (max_consecutive=1)
Data too sparse in Antigua And Barbuda (max_consecutive=2)
Data too sparse in Barbados (max_consecutive=11)
Negatives in Benin
DateRep
2020-05-20   -209.0
Name: Benin, dtype: float64
Data too sparse in Benin (max_consecutive=10)
Data too sparse in Bermuda (max_consecutive=3)
Data too sparse in Bhutan (max_consecutive=6)
Data too sparse in Bonaire, Saint Eustatius And Saba (max_consecutive=5)
Data too sparse in Botswana (max_consecutive=2)
Data too sparse in British Virgin Islands (max_consecutive=2)
Spikes in Burkina Faso
DateRep  2020-05-07  2020-09-14
spike     41.000000  193.000000
mean  

Spikes in China
DateRep  2020-04-17
spike        1290.0
mean            0.5
zeros          10.0
Data too sparse in Comoros (max_consecutive=1)
Data too sparse in Congo (max_consecutive=2)
Data too sparse in Cote D'Ivoire (max_consecutive=5)
Data too sparse in Cuba (max_consecutive=14)
Data too sparse in Curaã§Ao (max_consecutive=1)
Data too sparse in Cyprus (max_consecutive=6)
Negatives in Czechia
DateRep
2020-07-05   -1.0
2020-07-06   -3.0
Name: Czechia, dtype: float64
Data too sparse in Democratic Republic Of Congo (max_consecutive=9)
Data too sparse in Djibouti (max_consecutive=4)
Data too sparse in Dominica (max_consecutive=0)
Spikes in Ecuador
DateRep   2020-09-07  2020-10-09
spike    3800.000000       398.0
mean       40.428571        39.5
zeros       0.000000         0.0
Data too sparse in Equatorial Guinea (max_consecutive=2)
Data too sparse in Eritrea (max_consecutive=0)
Data too sparse in Estonia (max_consecutive=9)
Data too sparse in Eswatini (max_consecutive=8)
Data too spa

## Plot weekly data since the year began

In [3]:
def plot_weekly(daily, mode, data_quality, dfrom="2020-01-21"):
    """Plot weekly bar charts for daily new cases and deaths.

        One chart is produced per column of `daily`; each is saved under
        CHART_DIRECTORY as '<name>-<mode>-new-weekly' and then closed.
        Columns whose total is zero are skipped.

        Function parameters:
        - daily is a DataFrame of daily timeseries data (datetime index);
            the weekday of its last row is used as the week-ending day
        - mode is one of 'cases' or 'deaths'
        - data_quality supplies the left footnote on charts. It is
            either a Series of strings indexed by column name, or the
            per-mode list of such Series, in which case the entry for
            `mode` is selected through the module-level `modes` mapping
        - dfrom is a date string to display from

        Returns: weekly data in a DataFrame (every week, including
        those before dfrom) """

    display_from = pd.Timestamp(dfrom)

    # Resolve the footnote Series from the arguments alone. Reading a
    # leaked loop variable here would make the result depend on whichever
    # notebook cell happened to run last.
    if isinstance(data_quality, pd.Series):
        footnotes = data_quality
    else:
        footnotes = data_quality[modes[mode.title()]]

    # find the day that the week ends - last day of dataframe
    last_day = daily.index[-1]
    rule = {
        0: 'W-MON',
        1: 'W-TUE',
        2: 'W-WED',
        3: 'W-THU',
        4: 'W-FRI',
        5: 'W-SAT',
        6: 'W-SUN',
    }[last_day.dayofweek]

    # convert the data to weekly
    returnable = daily.resample(rule=rule, closed='right').sum()
    total = returnable.sum()

    # adjust data and dates for plotting
    # we move the data by half a week because we want the bars to be centred
    weekly = returnable[returnable.index > display_from]
    weekly.index = weekly.index - pd.Timedelta(3.5, unit='d')

    # tick-value converters used to put commas in the y tick labels
    def small(x): return np.round(x, 2)
    def big(x): return int(x)

    for name in daily.columns:

        # avoid plotting an empty plot
        if total[name] == 0:
            continue

        # plot the data
        fig, ax = plt.subplots(figsize=(8, 4))
        ax.bar(weekly.index, weekly[name], width=5, color='#dd0000', )
        ax.margins(0.01)

        # This makes the dates for xticklabels look a little nicer
        locator = mdates.AutoDateLocator(minticks=4, maxticks=13)
        formatter = mdates.ConciseDateFormatter(locator)
        ax.xaxis.set_major_locator(locator)
        ax.xaxis.set_major_formatter(formatter)

        # label the plot
        ax.set_title(f'COVID-19 {mode.title()}: {name}')
        ax.set_ylabel(f'New {mode.lower()}/week ending {rule[-3:].title()}')
        ax.set_xlabel(None)

        # adjust y-limits to be prettier,
        # assume ylim[0] is zero
        # this adjustment should not be needed, but it is
        ylim = ax.get_ylim()
        ylim = ylim[0], ylim[1] * 1.025
        ax.set_ylim(ylim)
        if ylim[0] != 0:
            # this should not happen - ever.
            print(f'Warning: ylim[0] is {ylim[0]} for {name}')

        # an ugly kludge for putting commas in the ylabels;
        # small values keep two decimals, larger ones are shown as integers.
        # functor is bound as a default so the formatter does not pick up
        # a later iteration's choice.
        functor = small if weekly[name].max() <= 4 else big
        ax.get_yaxis().set_major_formatter(
            mpl.ticker.FuncFormatter(
                lambda x, p, functor=functor: format(functor(x), ',')))

        # footnote the plot
        fig.text(0.01, 0.01,
            footnotes[name],
            ha='left', va='bottom',
            fontsize=9, fontstyle='italic',
            color='#999999')
        fig.text(0.99, 0.01,
            f'Total {mode.lower()}: {int(total[name]):,}',
            ha='right', va='bottom',
            fontsize=9, fontstyle='italic',
            color='#999999')

        # final tidy - save - close
        fig.tight_layout(pad=1.2)
        #plt.show()
        fig.savefig(
            f'{CHART_DIRECTORY}/{name}-{mode.lower()}-new-weekly',
            dpi=125)
        # close this figure explicitly rather than "whatever is current"
        plt.close(fig)

    return returnable

In [4]:
if False: # switch - turn this output on/off
    # weekly totals per mode, filled in as each mode's charts are drawn
    adj_weekly_data = [None, None]
    for (mode, index) in modes.items():
        adj_weekly_data[index] = plot_weekly(
            adj_daily_data[index],
            mode.lower(),
            data_quality,
        )

## Plot new-vs-cumulative for the past three months

In [5]:
if True: # switch - turn this output on/off
    THREE_MONTHS = 93 # days
    for mode, index in modes.items():
        daily_frame = adj_daily_data[index]
        cum_frame = adj_cum_data[index]

        for name in cum_frame.columns:
            recent_daily = daily_frame[name][-THREE_MONTHS:]

            # nothing reported in the window - skip rather than draw an empty chart
            if recent_daily.sum() == 0:
                continue

            recent_cum = cum_frame[name][-THREE_MONTHS:]
            ps.plot_new_cum(
                recent_daily.copy(),
                recent_cum.copy(),
                mode, name,
                title=f'{name}: COVID-19 {mode.title()} (recent daily)',
                rfooter=data_quality[index][name],
                tight=1.2,
                savefig=(f'{CHART_DIRECTORY}/{name}-{mode.lower()}-'
                         f'new-vs-cum-3months.png'),
            )