# COVID19 International

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Python-setup" data-toc-modified-id="Python-setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Python setup</a></span></li><li><span><a href="#Get-the-raw-data-and-adjust-for-anomalies" data-toc-modified-id="Get-the-raw-data-and-adjust-for-anomalies-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Get the raw data and adjust for anomalies</a></span><ul class="toc-item"><li><span><a href="#Nation-naming-stuff" data-toc-modified-id="Nation-naming-stuff-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Nation naming stuff</a></span></li><li><span><a href="#COVID-data-retrieval" data-toc-modified-id="COVID-data-retrieval-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>COVID data retrieval</a></span></li><li><span><a href="#Check-for-missing/odd-data" data-toc-modified-id="Check-for-missing/odd-data-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Check for missing/odd data</a></span></li><li><span><a href="#Adjust-the-data-for-negative-growth-and-outlier-spikes" data-toc-modified-id="Adjust-the-data-for-negative-growth-and-outlier-spikes-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Adjust the data for negative growth and outlier spikes</a></span></li></ul></li><li><span><a href="#International-comparisons---maps/leader-boards/swarms" data-toc-modified-id="International-comparisons---maps/leader-boards/swarms-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>International comparisons - maps/leader-boards/swarms</a></span><ul class="toc-item"><li><span><a href="#supporting-functions" data-toc-modified-id="supporting-functions-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>supporting functions</a></span></li><li><span><a href="#Bar-charts-of-top-performers" data-toc-modified-id="Bar-charts-of-top-performers-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Bar charts of top performers</a></span></li><li><span><a href="#Maps" data-toc-modified-id="Maps-3.3"><span 
class="toc-item-num">3.3&nbsp;&nbsp;</span>Maps</a></span></li><li><span><a href="#Swarm-plots" data-toc-modified-id="Swarm-plots-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Swarm plots</a></span></li></ul></li><li><span><a href="#Semi-log-comparison-plots-of-cumulative" data-toc-modified-id="Semi-log-comparison-plots-of-cumulative-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Semi-log comparison plots of cumulative</a></span></li><li><span><a href="#Comparison-plots-of-daily-new-per-capita" data-toc-modified-id="Comparison-plots-of-daily-new-per-capita-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Comparison plots of daily new per capita</a></span></li><li><span><a href="#Plot-weekly-new-case/deaths-data-from-January-2020" data-toc-modified-id="Plot-weekly-new-case/deaths-data-from-January-2020-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Plot weekly new-case/deaths data from January 2020</a></span></li><li><span><a href="#Plot-new-vs-cumulative-(all-and-last-3-months)" data-toc-modified-id="Plot-new-vs-cumulative-(all-and-last-3-months)-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Plot new-vs-cumulative (all and last 3 months)</a></span></li><li><span><a href="#Growth" data-toc-modified-id="Growth-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Growth</a></span></li><li><span><a href="#The-End" data-toc-modified-id="The-End-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>The End</a></span></li></ul></div>

## Python setup

In [1]:
# system imports
import sys
from pathlib import Path

#analytic imports
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.dates as mdates
import matplotlib.units as munits
import seaborn as sns
from pandas.plotting import register_matplotlib_converters
import geopandas as gpd
import iso3166

# COVID19 specific imports
sys.path.append(r'../bin')
import plotstuff as ps

# chart output location - create the directory if needed, then
# keep the name as a string ending in '/' so that file-name
# prefixes can simply be appended to it
CHART_DIRECTORY = '../charts'
Path(CHART_DIRECTORY).mkdir(exist_ok=True, parents=True)
CHART_DIRECTORY = CHART_DIRECTORY + '/'
I_PREFIX = '!I-'  # appended to CHART_DIRECTORY when charts are saved

# pandas display settings - allow up to 999 rows and columns
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

# matplotlib set-up
register_matplotlib_converters()
plt.style.use('ggplot')

## Get the raw data and adjust for anomalies

### Nation naming stuff

In [2]:
# make country names ISO compliant
# Each entry maps a nation name to rename ("from") onto its
# replacement ("to"). Several nations appear more than once so that
# different spellings/capitalisations of the same name are caught.
iso_name_map = {
    "Bolivia": "Bolivia, Plurinational State of",
    "Bonaire, Saint Eustatius And Saba": "Bonaire, Sint Eustatius and Saba",
    "British Virgin Islands": "Virgin Islands, British",
    "Brunei": "Brunei Darussalam",
    "Cape Verde": "Cabo Verde",
    # two capitalisations of the same name
    "Cote D'Ivoire": "Côte d'Ivoire",
    "Cote d'Ivoire": "Côte d'Ivoire",
    # NOTE(review): this key looks like a mis-encoded spelling of
    # Curaçao - presumably as received from the data; left untouched
    "Curaã§Ao": "Curaçao",
    # two capitalisations of the same name
    "Democratic Republic Of Congo": "Congo, Democratic Republic of the",
    "Democratic Republic of Congo": "Congo, Democratic Republic of the",
    "Guinea Bissau": "Guinea-Bissau",
    "Vatican": "Holy See",
    "Iran": "Iran, Islamic Republic of",
    "Laos": "Lao People's Democratic Republic",
    "Moldova": "Moldova, Republic of",
    "Russia": "Russian Federation",
    "Sint Maarten": "Sint Maarten (Dutch part)",
    "South Korea": "Korea, Republic of",
    "Syria": "Syrian Arab Republic",
    "Tanzania": "Tanzania, United Republic of",
    "Timor": "Timor-Leste",
    "Timor Leste": "Timor-Leste",
    "United Kingdom": "United Kingdom of Great Britain and Northern Ireland",
    "United Republic Of Tanzania": "Tanzania, United Republic of",
    "United States": "United States of America",
    "United States Virgin Islands": "Virgin Islands, U.S.",
    "Venezuela": "Venezuela, Bolivarian Republic of",
    "Vietnam": "Viet Nam",
}


In [3]:
def name_map(object_, map_):
    """Apply a dictionary of name changes to a pandas object, in place.
       For a Series it is the index labels that are renamed; for
       anything else (expected to be a DataFrame) it is the columns.
       Only those old names actually present are passed to rename.
       Parameters
       - object_ - the pandas Series or DataFrame
       - map_ - a dictionary of old to new mappings 
       Returns: None """

    is_series = isinstance(object_, pd.Series)
    labels = object_.index if is_series else object_.columns

    # restrict the mapping to the names that are actually present
    fixing = {old: new for old, new in map_.items() if old in labels}

    if is_series:
        object_.rename(index=fixing, inplace=True)
    else:
        object_.rename(columns=fixing, inplace=True)

    return None

In [4]:
def get_national_code(nation):
    """Return a two-letter code for a nation name.
       The name is first translated through iso_name_map (when it has
       an entry there). 'European Union' is special-cased to 'EU';
       every other name is looked up with iso3166 and its alpha2
       attribute is returned."""

    iso_name = iso_name_map.get(nation, nation)
    if iso_name == 'European Union':
        return 'EU'
    return iso3166.countries.get(iso_name).alpha2

### COVID data retrieval

In [5]:
_data_lake = {}
def get_OWID_data():
    """Load the OWID COVID-19 CSV and reshape it for this notebook.
       The downloaded frame is kept in _data_lake (keyed by URL), so
       repeat calls in the same session work from a copy of that
       frame rather than downloading again.
       Returns a tuple of
       - cases - DataFrame of 'total_cases' (date rows, location cols)
       - deaths - DataFrame of 'total_deaths' (same layout)
       - population - integer Series of the last valid 'population'
         value for each location
       - source - the string 'OWID ' plus the last date in cases"""

    URL = ('https://github.com/owid/covid-19-data/raw/master/'
           'public/data/owid-covid-data.csv')

    cached = _data_lake.get(URL)
    if cached is None:
        print(URL)
        df = pd.read_csv(URL, header=0)
        print('We have the data')
        _data_lake[URL] = df.copy()
    else:
        df = cached.copy()
        print('Data retrieved from the lake')

    def by_location(values):
        """Date-by-location table for one column of the raw data,
           less the columns and rows that are entirely null (the
           null rows largely being a last row not yet populated)."""
        table = df.pivot(columns='location', index='date', values=values)
        table = table.dropna(axis='columns', how='all')
        return table.dropna(axis='index', how='all')

    # cases - note the source label is built from the last index
    # entry before that index is converted to a DatetimeIndex
    cases = by_location('total_cases')
    source = f'OWID {cases.index[-1]}'
    cases.index = pd.DatetimeIndex(cases.index)

    # deaths
    deaths = by_location('total_deaths')
    deaths.index = pd.DatetimeIndex(deaths.index)

    # population - the last valid value reported for each location;
    # locations with no valid value at all are left out
    population = df.pivot(columns='location', index='date', values='population')
    populations = {}
    for nation, history in population.items():
        latest = history.last_valid_index()
        if latest is not None:
            populations[nation] = history.loc[latest]

    # return the lot
    return cases, deaths, pd.Series(populations).astype(int), source

In [6]:
# retrieve the raw cumulative data, the populations and the
# source label; SOURCE is the footer text used on the charts
(raw_cum_cases,
 raw_cum_deaths,
 population,
 source) = get_OWID_data()
SOURCE = 'Source: {}'.format(source)

https://github.com/owid/covid-19-data/raw/master/public/data/owid-covid-data.csv
We have the data


### Check for missing/odd data

In [7]:
# Report how many nations have no value (NAN) in the last row 
# of the cumulative cases and deaths data.
# My experience is that these gaps are usually plentiful in
# the early morning but resolved by the afternoon/evening 
# (Sydney time). 

print(f'In respect of data from {source}')
for mode, data in (('cases', raw_cum_cases), 
                   ('deaths', raw_cum_deaths)):
    last_row = data.iloc[-1]
    missing = last_row[last_row.isna()]
    if len(missing) == 0:
        print(f'All {mode} data on last day appear to be present')
    else: 
        print(f'{len(missing)} missing {mode} on last day')


In respect of data from OWID 2020-12-31
All cases data on last day appear to be present
All deaths data on last day appear to be present


In [8]:
# Check for unusual no-growth patterns in last row of the 
# cumulative data when there is substantial growth (of more 
# than 100) over the past week. 
# Note: this often just exposes those nations that do not 
# have a practice of daily reporting their cumulative cases 
# and deaths.

WEEKLY_THRESHOLD = 100

for mode, data in zip(['cases', 'deaths'],
                      [raw_cum_cases.copy(), 
                       raw_cum_deaths.copy()]):
    
    # ignore NANs - as they are picked up in the previous cell. 
    nan_last = data.iloc[-1].isna()
    data = data.drop(columns=nan_last[nan_last].index)

    # identify columns of concern in respect of absent last-day growth:
    # the week's growth is the last row less the row seven days before
    last_day_growth = data.iloc[-1] - data.iloc[-2]
    week_growth = data.iloc[-1] - data.iloc[-8]
    odd = (week_growth > WEEKLY_THRESHOLD) & (last_day_growth == 0)

    # only report when at least one nation is flagged; len(odd) would
    # count every nation (flagged or not) and so was always true
    if odd.any():
        print(f'\nThere are {odd.sum()} odd, zero-growth in {mode} on the last day')
        print('They are: ', odd[odd].index.values)
        print('Their last week of data is as follows:')
        display(data[odd[odd].index].iloc[-8:])


There are 11 odd, zero-growth in cases on the last day
They are:  ['Bosnia and Herzegovina' 'Cyprus' 'El Salvador' 'Eritrea' 'Gabon' 'Ghana'
 'India' 'Lesotho' 'Luxembourg' 'Sweden' 'Tajikistan']
Their last week of data is as follows:


location,Bosnia and Herzegovina,Cyprus,El Salvador,Eritrea,Gabon,Ghana,India,Lesotho,Luxembourg,Sweden,Tajikistan
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-12-24,108298.0,19316.0,44619.0,951.0,9469.0,54043.0,10146845.0,2725.0,45209.0,396048.0,13106.0
2020-12-25,108891.0,19366.0,44619.0,992.0,9497.0,54043.0,10169118.0,2725.0,45209.0,396048.0,13138.0
2020-12-26,109330.0,19391.0,44619.0,992.0,9497.0,54286.0,10187850.0,2725.0,45209.0,396048.0,13172.0
2020-12-27,109691.0,19657.0,44619.0,1039.0,9497.0,54401.0,10207871.0,2725.0,45209.0,396048.0,13205.0
2020-12-28,109911.0,20408.0,45415.0,1039.0,9510.0,54503.0,10224303.0,2956.0,45849.0,396048.0,13237.0
2020-12-29,110454.0,21315.0,45415.0,1220.0,9510.0,54681.0,10244852.0,3005.0,46088.0,428533.0,13265.0
2020-12-30,110985.0,22019.0,45960.0,1252.0,9571.0,54771.0,10266674.0,3094.0,46415.0,437379.0,13296.0
2020-12-31,110985.0,22019.0,45960.0,1252.0,9571.0,54771.0,10266674.0,3094.0,46415.0,437379.0,13296.0



There are 3 odd, zero-growth in deaths on the last day
They are:  ['Bosnia and Herzegovina' 'India' 'Sweden']
Their last week of data is as follows:


location,Bosnia and Herzegovina,India,Sweden
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-12-24,3878.0,147092.0,8279.0
2020-12-25,3901.0,147343.0,8279.0
2020-12-26,3923.0,147622.0,8279.0
2020-12-27,3953.0,147901.0,8279.0
2020-12-28,3942.0,148153.0,8279.0
2020-12-29,4024.0,148439.0,8484.0
2020-12-30,4050.0,148738.0,8727.0
2020-12-31,4050.0,148738.0,8727.0


### Adjust the data for negative growth and outlier spikes

In [9]:
# make adjustments to the data
# - CASES/DEATHS are the list positions used for each mode in 
#   the parallel lists below
CASES, DEATHS = 0, 1
modes = {'Cases': CASES, 'Deaths': DEATHS}
raw_cum_data = [raw_cum_cases, raw_cum_deaths]
raw_daily_data = [None] * len(modes)
adj_daily_data = [None] * len(modes)
adj_cum_data = [None] * len(modes)

data_quality = [None] * len(modes)

for mode, index in modes.items():

    # adjust raw data for anomalies - the correction returns the raw
    # daily, adjusted daily and adjusted cumulative data, in that order
    (raw_daily_data[index],
     adj_daily_data[index],
     adj_cum_data[index]) = ps.dataframe_correction(raw_cum_data[index])

    # for each nation, note whether the adjustment for anomalies
    # changed the daily data
    quality = pd.Series(None,
                        index=raw_cum_data[index].columns,
                        dtype='str')
    for col in raw_cum_data[index].columns:
        unchanged = (raw_daily_data[index][col] ==
                     adj_daily_data[index][col]).all()
        if unchanged:
            quality[col] = f'{SOURCE}; original data'
        else:
            quality[col] = f'{SOURCE}, data adjusted for extreme outliers'
    data_quality[index] = quality

Spikes in Andorra
date   2020-06-02
spike   79.000000
mean     0.714286
zeros   10.000000
There are negatives in Antigua and Barbuda
date
2020-07-03   -1.0
Name: Antigua and Barbuda, dtype: float64
Data too sparse in Antigua and Barbuda (max_consecutive=2)
Data too sparse in Barbados (max_consecutive=11)
Spikes in Belize
date    2020-12-03
spike  1382.000000
mean    142.857143
zeros     2.000000
There are negatives in Benin
date
2020-05-19   -209.0
Name: Benin, dtype: float64
Data too sparse in Benin (max_consecutive=10)
Data too sparse in Bhutan (max_consecutive=9)
Data too sparse in Botswana (max_consecutive=2)
Spikes in Burkina Faso
date   2020-05-06  2020-06-02  2020-09-13
spike   41.000000   34.000000       193.0
mean     6.714286    1.785714        17.5
zeros    0.000000    6.000000         0.0
Data too sparse in Burundi (max_consecutive=10)
Data too sparse in Cambodia (max_consecutive=14)
Data too sparse in Cameroon (max_consecutive=7)
Spikes in Cape Verde
date   2020-04-15
spik

Data too sparse in Antigua and Barbuda (max_consecutive=2)
Spikes in Argentina
date    2020-10-01
spike  3351.000000
mean    355.928571
zeros     0.000000
There are negatives in Australia
date
2020-06-01   -1.0
Name: Australia, dtype: float64
There are negatives in Austria
date
2020-07-21   -1.0
2020-10-11   -1.0
Name: Austria, dtype: float64
Data too sparse in Bahamas (max_consecutive=9)
Data too sparse in Barbados (max_consecutive=3)
There are negatives in Belgium
date
2020-08-26   -117.0
Name: Belgium, dtype: float64
Data too sparse in Benin (max_consecutive=3)
Spikes in Bolivia
date    2020-09-07
spike  1656.000000
mean     55.142857
zeros     1.000000
There are negatives in Bosnia and Herzegovina
date
2020-12-28   -11.0
Name: Bosnia and Herzegovina, dtype: float64
Data too sparse in Botswana (max_consecutive=1)
Data too sparse in Brunei (max_consecutive=1)
Data too sparse in Burkina Faso (max_consecutive=5)
Data too sparse in Burundi (max_consecutive=1)
Data too sparse in Cameroon

Data too sparse in Uganda (max_consecutive=9)
Data too sparse in Uruguay (max_consecutive=10)
There are negatives in Venezuela
date
2020-05-01   -6.0
Name: Venezuela, dtype: float64
There are negatives in Vietnam
date
2020-08-19   -1.0
Name: Vietnam, dtype: float64
Data too sparse in Vietnam (max_consecutive=7)
There are negatives in Yemen
date
2020-12-08   -43.0
Name: Yemen, dtype: float64
Spikes in Yemen
date   2020-07-12
spike   48.394910
mean     3.722685
zeros    0.000000
Spikes in Zambia
date   2020-07-17
spike   67.000000
mean     1.928571
zeros   10.000000
Data too sparse in Zimbabwe (max_consecutive=12)


## International comparisons - maps/leader-boards/swarms

### supporting functions

In [10]:
def per_million_population(population: pd.Series, power:int = 6):
    """Scale a population series down by a power of ten.
       With the default power of 6 the result is in millions.
       Returns a tuple of:
       - the power (as given)
       - the factor (10 raised to that power)
       - the population series divided by the factor"""

    scale = 10 ** power
    scaled_population = population / scale
    return power, scale, scaled_population

In [11]:
def get_rear_offsets(dataframe, period):
    """For each column, count the rows that follow its last 
       non-zero value (ie. the offset from the end of the data).
       The offset is reported as zero when the column has no
       non-zero values, or when the offset is larger than period.
       [We do this because some nations are slow in reporting,
       and without this adjustment late reporting nations would 
       look better than they actually are.]
       Returns a Series of offsets, indexed by column name."""

    final_row = len(dataframe) - 1
    rear_offsets = pd.Series(0, index=dataframe.columns)
    for label in dataframe.columns:
        nonzero_rows = np.nonzero(dataframe[label].to_numpy())[0]
        if len(nonzero_rows) == 0:
            continue  # nothing non-zero: the offset stays at 0
        rear_offsets[label] = final_row - nonzero_rows[-1]

    # offsets beyond the period are not applied
    within_period = rear_offsets <= period
    return rear_offsets.where(within_period, other=0)

def get_recent_total(dataframe, rear_offsets, period):
    """Total the final rows of each column of a dataframe.
       The number of rows totalled for a column is period plus 
       that column's entry in rear_offsets, which allows for 
       zero rows at the very end of the column.
       Returns a float Series of totals, indexed by column name."""

    totals = pd.Series(0.0, index=dataframe.columns)
    for label in dataframe.columns:
        window = period + rear_offsets[label]
        totals[label] = dataframe[label].iloc[-window:].sum()
    return totals

def get_larger_nations(population, thresh=100_000):
    """Return the index labels (nations) of those entries in the 
       population Series that are greater than or equal to thresh"""

    big_enough = population >= thresh
    return population.loc[big_enough].index

In [12]:
def get_data_for_comapative(mode, index, data, population):
    """Build the summary series used by the comparative charts.
       Parameters
       - mode, index - accepted for the callers' convenience; 
         not used in the calculations
       - data - DataFrame with one column per nation; its column 
         sums are reported as the cumulative totals 
         (the callers in this notebook pass adjusted daily data)
       - population - Series of populations, indexed by nation
       Neither data nor population is modified.
       Returns a tuple of
       - power - the power of ten used for the per capita rates
       - THRESH - the minimum population for the per capita series
       - PERIOD - the days used for the recent daily average
       - cumulative - column sums of data
       - cumulative_percapita - cumulative per 10**power population,
         for nations with a population of at least THRESH
       - log_cumulative_percapita - log(cumulative_percapita + 1)
       - daily_ave - average daily value over the recent period
       - daily_ave_percapita - daily_ave per 10**power population,
         for nations with a population of at least THRESH
       - log_daily_ave_percapita - log(daily_ave_percapita + 1)"""

    PERIOD = 7 # days (recent data period for daily averages)
    THRESH = 100_000 # people (minimum nation size for plotting)
    
    # drop World - using drops that return new objects, so that 
    # the caller's data and population are left as they were
    data = data.drop(columns='World', errors='ignore')
    population = population.drop(labels='World', errors='ignore')

    keepers = get_larger_nations(population, THRESH)
    power, factor, pop_millions =  per_million_population(population)

    # cumulative data
    cumulative = data.sum()
    cumulative_percapita = (cumulative / pop_millions)[keepers]
    log_cumulative_percapita = np.log(cumulative_percapita + 1)

    # latest daily average data - the window is extended for nations 
    # whose data ends in zeros (see get_rear_offsets)
    rear_offsets = get_rear_offsets(data, PERIOD)
    daily_ave = (get_recent_total(data, rear_offsets, PERIOD) / PERIOD)
    daily_ave_percapita = (daily_ave / pop_millions)[keepers]
    log_daily_ave_percapita = np.log(daily_ave_percapita + 1)

    return (power, THRESH, PERIOD, cumulative, cumulative_percapita, 
            log_cumulative_percapita, daily_ave, 
            daily_ave_percapita, log_daily_ave_percapita)

### Bar charts of top performers

In [13]:
if True: # switch this output on/off
    
    BAR_N = 40 # maximum bars on chart
    BAR_PLOT_SIZE = (8, 8)

    for mode, index in modes.items():
        print(mode)

        # get the data
        (power, THRESH, PERIOD, cumulative, cumulative_percapita, 
            log_cumulative_percapita, daily_ave, 
            daily_ave_percapita, log_daily_ave_percapita) = (
            get_data_for_comapative(mode, index, 
                                    adj_daily_data[index].copy(),
                                    population.copy())
        )
        lfooter = f'For nations with a population >= {THRESH:,}'
        what = mode.lower()

        # bar charts - one row per chart:
        # (data to rank, decimal places for rounding, title, xlabel)
        bar_charts = [
            # - the top cumulative performers
            (cumulative, 0,
             f'COVID-19: Top cumulative {what}',
             f'Cumulative {what}'),
            # - the top cumulative performers per capita
            (cumulative_percapita, 1,
             f'COVID-19: Top cumulative {what} per capita',
             f'Cumulative {what} per $10^{power}$ population'),
            # - the top daily averages - past week
            (daily_ave, 1,
             f'COVID-19: Top {what} - past {PERIOD} days',
             f'Average daily {what}'),
            # - the top daily averages per capita
            (daily_ave_percapita, 1,
             f'COVID-19: Top {what} per capita - past {PERIOD} days',
             f'Average daily {what} per $10^{power}$ population'),
        ]

        for ranked, decimals, title, xlabel in bar_charts:
            # the BAR_N largest values, in ascending order
            top_tier = (ranked.dropna()
                        .sort_values(ascending=True)[-BAR_N:])
            series = np.round(top_tier.copy(), decimals)
            if decimals == 0:
                series = series.astype(int) # whole numbers only
            ps.plot_barh(
                series=series, 
                title=title,
                xlabel=xlabel,
                lfooter=lfooter, rfooter=SOURCE,
                set_size_inches=BAR_PLOT_SIZE,
                chart_directory=CHART_DIRECTORY + I_PREFIX,
            )

Cases
Deaths


### Maps

In [14]:
def map_world(series, title, legend_title, source):
    """Plot a world map shaded by the values in series, and pass
       it to ps.finalise_plot for finishing and saving.
       Parameters
       - series - pandas Series of values, indexed by nation name
         (note: its index is renamed in place using iso_name_map)
       - title - plot title; also used in the saved file name
       - legend_title - text placed on the figure for the colorbar
       - source - string for data source - becomes right footer
       """

    # prepare data for mapping - values, nation names and the 
    # third element of each nation's iso3166 record
    # (presumably the alpha-3 code, to match ADM0_A3 - TODO confirm)
    name_map(series, iso_name_map)
    score = pd.DataFrame(series) # back to DataFrame
    score.columns = ['Score']
    score['country'] = score.index
    score['code'] = [iso3166.countries.get(nation.upper())[2] 
                     for nation in score['country']]

    # get map data - everything except Antarctica
    shapefile = ('../geo-data/ne_110m_admin_0_countries/'
                 'ne_110m_admin_0_countries.shp')
    wanted = ['ADMIN', 'ADM0_A3', 'geometry']
    gdf = gpd.read_file(shapefile)[wanted]
    gdf.columns = ['country', 'country_code', 'geometry']
    not_antarctica = gdf['country'] != 'Antarctica'
    gdf = gdf[not_antarctica] 

    # attach the scores to the map shapes; shapes without a 
    # matching score are kept (left join)
    merged = gdf.merge(score, how='left',
                       left_on='country_code', 
                       right_on='code')

    # plot - reversed viridis, with white for bad/under values
    cmap = mpl.cm.get_cmap('viridis').reversed()
    cmap.set_bad('white')
    cmap.set_under('white')
    ax = merged.plot(column='Score', cmap=cmap, legend=False)

    # colorbar - built from the first collection on the axes
    plt.colorbar(ax.collections[0], ax=ax, orientation='horizontal')

    # legend title
    ax.figure.text(0.5, 0.175, legend_title,
                   ha='center', va='bottom',
                   fontsize=12, # fontstyle='italic',
                   color='#222222')

    save_as = f'{CHART_DIRECTORY}{I_PREFIX}MAP-{title}.png'
    ps.finalise_plot(ax, title=title,
                     xticklabels=[], yticklabels=[],
                     xticks=[], yticks=[],
                     rfooter=source,
                     set_size_inches=(8,5),
                     save_as=save_as,
                    )

In [15]:
if True: # switch this output on/off
    
    for mode, index in modes.items():
        print(mode)

        # get the data
        (power, THRESH, PERIOD, cumulative, cumulative_percapita, 
            log_cumulative_percapita, daily_ave, 
            daily_ave_percapita, log_daily_ave_percapita) = (
            get_data_for_comapative(mode, index, 
                                    adj_daily_data[index].copy(),
                                    population.copy())
        )
        what = mode.lower()

        # world maps - one row per map: (data, title, legend)
        world_maps = [
            # - cumulative per capita
            (cumulative_percapita,
             f'COVID-19 Cumulative {what} per capita',
             f'Cumulative {what} per $10^{power}$ population'),
            # - cumulative per capita - log scale
            (log_cumulative_percapita,
             f'COVID-19 Cumulative {what} per capita (log scale)',
             f'log((cumulative {what} per $10^{power}$ pop.) + 1)'),
            # - average daily per capita past PERIOD days
            (daily_ave_percapita,
             f'COVID-19 Ave. daily {what} per capita '
             f'- past {PERIOD} days',
             f'Ave. daily {what} per $10^{power}$ population'),
            # - average daily per capita past PERIOD days - log scale
            (log_daily_ave_percapita,
             f'COVID-19 Ave. daily {what} per capita '
             f'- past {PERIOD} days (log scale)',
             f'log((average daily {what} per '
             f'$10^{power}$-population) + 1)'),
        ]

        # each map gets its own copy, as map_world renames the 
        # index of the series it is given
        for scores, title, legend in world_maps:
            map_world(scores.copy(), title, legend, SOURCE)

Cases
Deaths


### Swarm plots

In [16]:
def swarm(data: pd.Series,
          title="Don't forget the title",
          ylabel="Don't forget the ylabel",
          source="Don't forget the source",
          color='cornflowerblue'):
    """Produce a swarm plot, with one swarm per income group and 
       each point labelled with its nation's two-letter code.
       Parameters:
        - data - a pandas Series of values, with an index of nations
          (note: this index is renamed in place using iso_name_map)
        - title - plot title
        - ylabel - label for the y axis
        - source - string for data source - becomes right footer
        - color - colour of swarm plot dots
       Nations without an income group in the '../data/CLASS.xls'
       spreadsheet are left off the plot.
       Returns: None
       """
    
    DEFAULT_SIZE = (8, 6)
    ROUNDING = 7 # decimal places used when matching points to nations
    
    # get country information - a lookup from the 'Code' column
    # to the 'Income group' column of the spreadsheet
    wbd = pd.read_excel('../data/CLASS.xls', header=4, index_col=0, 
                        ).iloc[1:219].dropna(how='all', axis=1)
    mapping = wbd.set_index('Code')['Income group']
    
    # prepare for plot
    name_map(data, iso_name_map)
    data = pd.DataFrame(data)
    data.columns = ['Rate']
    nations = [iso3166.countries.get(x) for x in data.index]
    data['alpha3'] = [nation.alpha3 for nation in nations]
    data['alpha2'] = [nation.alpha2 for nation in nations]
    data['Income Group'] = data.alpha3.map(mapping)
    # take a copy of the selection, so that the column assignment 
    # below is made on an independent frame
    data = data[data['Income Group'].notna()].copy()
    
    # labels 
    data['alpha2'] = data['alpha2'].where(data['alpha2'].notna(), other='')
    
    # swarm plot
    # - set up
    categories = ['Low income', 'Lower middle income',
                  'Upper middle income', 'High income']
    # - plot
    fig, ax = plt.subplots(figsize=DEFAULT_SIZE)
    ax.margins(0.02)
    sns.swarmplot(x='Income Group',
                  y='Rate', 
                  data=data, 
                  size=10,
                  dodge=True, 
                  color=color, alpha=0.5,
                  order=categories,
                  ax=ax)
    
    # - point annotation - this is one ugly hack
    #   The x positions of the plotted points are recovered from the
    #   plot and matched back to nations by their (rounded) y values.
    for collect, name in zip(ax.collections, categories):
        
        # retrieve positional data from plot
        retrieved_xy_pairs = collect.get_offsets()
        
        # build a map from rounded y to the list of x positions
        # at that y (there can be more than one point per y)
        dup_map = {}
        for x, y in retrieved_xy_pairs:
            dup_map.setdefault(np.round(y, ROUNDING), []).append(x)
        
        # use this map to plot in the data labels; a nation is 
        # skipped if there is no unused point left at its y value
        for _, row in data[data['Income Group'] == name].iterrows():
            positions = dup_map.get(np.round(row['Rate'], ROUNDING))
            if positions:
                ax.text(positions.pop(), row['Rate'], row['alpha2'],
                        ha='center', va='center', fontsize='xx-small', 
                        color='#333333')
        
    # the footer comes from the source argument 
    # (it had been taken from the global SOURCE)
    ps.finalise_plot(ax, title=title,
                     xlabel=None, ylabel=ylabel,
                     rfooter=source,
                     set_size_inches=DEFAULT_SIZE,
                     chart_directory=CHART_DIRECTORY
                    )

In [17]:
if True: # switch this output on/off
    
    for mode, index in modes.items():
        print(mode)

        # get the data
        (power, THRESH, PERIOD, cumulative, cumulative_percapita, 
            log_cumulative_percapita, daily_ave, 
            daily_ave_percapita, log_daily_ave_percapita) = (
            get_data_for_comapative(mode, index, 
                                    adj_daily_data[index].copy(),
                                    population.copy())
        )
        log_cumulative_percapita = np.log(cumulative_percapita + 1)
        what = mode.lower()

        # swarm plots - one row per plot: 
        # (data, title, ylabel, extra keyword arguments for swarm)
        swarm_plots = [
            # - cumulative per capita
            (cumulative_percapita,
             f'COVID-19 Cumulative {what} per capita',
             f'Cumulative {what} per $10^{power}$ population',
             {'color': 'darkorchid'}),
            # - cumulative per capita - log scale
            (log_cumulative_percapita,
             f'COVID-19 Cumulative {what} per capita (log scale)',
             f'log((cumulative {what} per $10^{power}$ pop.) + 1)',
             {'color': 'hotpink'}),
            # - daily average per capita past PERIOD days
            #   (plotted in swarm's default colour)
            (daily_ave_percapita,
             f'COVID-19 Ave. daily {what} per capita '
             f'- past {PERIOD} days',
             f'Ave. daily {what} per $10^{power}$ population',
             {}),
            # - daily average per capita past PERIOD days - log scale
            (log_daily_ave_percapita,
             f'COVID-19 Ave. daily {what} per capita '
             f'- past {PERIOD} days (log scale)',
             f'log((ave. daily {what} per $10^{power}$ pop.) + 1)',
             {'color': 'darkorange'}),
        ]

        # nulls are dropped, and a copy is passed as swarm renames
        # the index of the series it is given
        for rates, title, ylabel, extras in swarm_plots:
            swarm(rates.dropna().copy(), title, 
                  ylabel, SOURCE, **extras)

Cases




Deaths




## Semi-log comparison plots of cumulative

In [18]:
def plot_semi_log_trajectory(data, mode, threshold, selections, source):
    """Produce semi-log plots of cumulative data from the 
       following inputs:
        - data - a pandas dataframe of cumulative data, 
          one column per nation
        - mode - a string, either 'Cases' or 'Deaths'
        - threshold - starting point (eg. 100th case)
        - selections - a python dictionary of 
         'grouping': ['list', 'of', 'nations'], pairs;
          the lists are read in sorted order but are not re-ordered
        - source - string for data source
       One chart is produced per grouping, with the grouping name 
       passed to ps.finalise_plot() as the save_tag.
       Nations missing from data are reported and skipped. Nations 
       with fewer than MIN_POINTS observations at or above the 
       threshold are skipped without a message. A grouping with no 
       plottable nation is reported and no chart is made for it.
       Returns None."""
    
    MIN_POINTS = 5    # observations needed at/above the threshold
    EXPANSION = 0.02  # share of the x-axis added as right-hand padding
    colours = ['blue', 'red', 'maroon', 'darkorange', 'brown', 
               'olivedrab', 'darkgoldenrod', 'green',  
               'purple', 'black', 'teal'] # 11 colours

    for tag, nation_list in selections.items():
    
        # gather the lines first, so that an empty grouping can be skipped
        # note: sorted() works on a copy - the caller's list keeps its order
        lines = []
        for i, name in enumerate(sorted(nation_list)):
            if name not in data.columns:
                print(f'{name} is not in data')
                continue
            
            # positional index on a new Series - data itself is untouched
            y = data[name].reset_index(drop=True)
            reached = y[y >= threshold]
            if len(reached) < MIN_POINTS:
                continue
            
            # start one observation before the threshold was first 
            # reached (when there is one); x is numbered from -1
            start = max(reached.index[0] - 1, 0)
            y = y.iloc[start:].values
            x = range(-1, len(y) - 1)
            
            code = get_national_code(name)
            lines.append({
                'x': x, 
                'y': y, 
                'code': f'{code}',
                'label': f'{name} ({code}) {int(y[-1]):,}',
                # colour follows the position in the sorted list,
                # wrapping around when the palette is exhausted
                'color': colours[i % len(colours)],
            })

        if not lines:
            print(f'{tag}: no nations to plot')
            continue

        # set up for the plot and add the data
        ax = plt.subplot(111,)
        for line in lines:
            ax.plot(line['x'], line['y'], label=line['label'],
                    color=line['color'],)

        # add endpoint labels, nudged right of the longest line's scale
        additional = max(line['x'][-1] for line in lines) * EXPANSION
        for line in lines: 
            ax.text(x=line['x'][-1]+additional, y=line['y'][-1], 
                    s=line['code'],
                    size='small', color='black',
                    ha='left', va='center',
                    bbox={'alpha':0.5, 'facecolor':'white'})

        # etc. - widen the x-axis to leave room for the labels
        min_, max_ = ax.get_xlim()
        max_ = max_ + (max_ * EXPANSION)
        ax.set_xlim(min_, max_)
        ax.legend(loc='lower right', fontsize='8', ncol=3)
 
        ps.finalise_plot(ax,
                         title='COVID-19 Semilog plot of selected '
                               f'{mode.lower()[:-1]} trajectories',
                         xlabel='Days from the notional '
                                f'{int(threshold)}th {mode.lower()[:-1]}',
                         ylabel=f'Cumulative {mode} (log scale)',
                         yscale='log',
                         rfooter=source,
                         set_size_inches=(8,6),
                         chart_directory=CHART_DIRECTORY+I_PREFIX,
                         save_tag=tag,
                        )

In [19]:
if True: # switch this output on/off

    # groupings to chart: {save tag: [nations in the chart]}
    selections = {
        'Anglophone':  ['Australia', 'United States', 'Canada', 
                        'United Kingdom', 'New Zealand', 'Ireland', ],
        'Neighbours':  ['Australia', 'New Zealand', 'Papua New Guinea',
                        'Indonesia', 'Singapore', 'Malaysia', #'Brunei',
                        'Timor', 'China', 'Japan', 'South Korea', 'India'],
        "SelectedEurope":['Austria', 'Belgium', 'Denmark', 
                        'France', 'Germany', 'Greece', 'Italy', 'Netherlands', 
                        'Norway', 'Poland', 'Portugal', 'Russia', 'Spain',  
                        'Sweden', 'Switzerland', 'United Kingdom', ],
        "EU-v-US":     ['United States', "European Union"],
        "Italy-v-UK":  ['Italy', "United Kingdom"]
    }
    
    # https://europa.eu/european-union/about-eu/countries_en
    EU = [
        'Austria', 'Belgium', 'Bulgaria', 'Croatia', 
        'Cyprus', 'Czechia', 'Denmark',
        'Estonia', 'Finland', 'France', 'Germany',
        'Greece', 'Hungary', 'Ireland', 'Italy',
        'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 
        'Netherlands', 'Poland', 'Portugal', 'Romania', 
        'Slovakia', 'Slovenia', 'Spain', 'Sweden', 
    ]
    assert len(EU) == 27, 'expected the 27 EU member states'
    
    # for cases then deaths ...
    for mode, index in modes.items():
        print(mode)
        cumulative = adj_cum_data[index].copy()
        if 'World' in cumulative.columns:
            del cumulative['World']
        
        # include top N nations as a chart
        # - ranked on the latest row, before the European Union 
        #   aggregate is added, so that the aggregate (like 'World') 
        #   cannot take a nation's place in the ranking
        N = 9
        top = cumulative.iloc[-1].dropna().sort_values(ascending=False)[:N]
        selections['Top'] = top.index.to_list()
        
        # aggregate column used by the "EU-v-US" grouping
        cumulative["European Union"] = cumulative[EU].sum(axis=1)
        
        # and plot
        thresh = 100 if mode.lower() == 'cases' else 10
        plot_semi_log_trajectory(cumulative, mode, thresh, 
                                 selections, f'Source: {source}')

Cases
Deaths
Timor is not in data


## Comparison plots of daily new per capita

In [20]:
def plot_daily_comparative(data, population, mode, regions, source, period=14):
    """Produce daily plots highlighting new cases/deaths for a 
       specific set of nations against a backdrop of all nations.
    Inputs:
        - data - pandas DataFrame of daily new cases/deaths
        - population - pandas Series of national populations 
        - mode - string - either "cases" or "deaths"
        - regions - python list of lists of strings 
          (nation names - same as columns in data);
          the lists are read in sorted order but are not re-ordered
        - source - string for data source
        - period - period for the rolling average 
    One chart is produced per list in regions. A name that is not 
    a column in data is reported and left off the chart. 
    Returns None."""
    
    # set-up
    colours = ['blue', 'red', 'maroon', 'darkorange', 'brown', 
               'olivedrab', 'darkgoldenrod', 'green',  
               'purple', 'black', 'teal'] # 11 colours
    power, _, pop_millions =  per_million_population(population)
    keepers = get_larger_nations(population, 100_000) # used for backgrounds

    # get rolling average per capita from start date
    df = data.rolling(period).mean().div(other=pop_millions)
    df = df[df.index >= pd.Timestamp('2020-02-01')]

    for region_list in regions:
    
        # note: sorted() works on a copy - the caller's list keeps its order
        names = sorted(region_list)
    
        # plot background
        ax = df[keepers].plot(c='#aaaaaa', lw=0.5)
        ax.get_legend().remove()

        # plot highlighted regions on a twin axis, so that the 
        # legend only lists the highlighted names
        ax_new = ax.twinx()        
        for i, name in enumerate(names):
            if name not in df.columns:
                print(f'{name} is not in data')
                continue
            # wrap around the palette for groups larger than it
            df[name].plot(c=colours[i % len(colours)], label=name, 
                          lw=2.5, ax=ax_new)
        ax_new.legend(title=None, loc="upper left")
        ax_new.grid(False)
        ax_new.set_yticklabels([])
        # keep the highlighted lines on the same scale as the background
        ax_new.set_ylim(ax.get_ylim())

        ps.finalise_plot(ax,
                      title=f'COVID-19 Daily New {mode.title()}',
                      xlabel=None,
                      ylabel=f'Daily new {mode.lower()} per $10^{power}$ '
                              f'population\n{period}-day rolling average',
                      rfooter=source,
                      set_size_inches=(8, 5),
                      save_as=f'{CHART_DIRECTORY}{I_PREFIX}'
                              f'daily-{mode}-{" ".join(names)}',
                     )

In [21]:
if True: # switch this output on/off

    # the regional sets to be plotted - one chart per inner list
    regions = [
        ['European Union', 'United States'],
        ['Belgium', 'Ireland', 'Netherlands', 'United Kingdom', 'Iceland'],
        ['France', 'Italy', 'Portugal', 'Spain'],
        ['Denmark', 'Norway', 'Sweden', 
         'Austria', 'Germany', 'Switzerland'], 
        ['Finland', 'Estonia', 'Latvia', 'Lithuania', 'Poland'],
        ['Belarus', 'Russia', 'Ukraine', 
         'Romania', 'Bulgaria', 'Moldova'],
        ['Czechia', 'Slovakia', 'Hungary', 'Slovenia'],
        ['Croatia', 'Bosnia and Herzegovina', 'Montenegro', 'Serbia', 
         'Kosovo', 'Albania', 'North Macedonia', 'Greece'],
        ['Turkey', 'Syria', 'Lebanon', 'Israel', 
         'Jordan', 'Egypt', 'Libya', 'Cyprus'],
        ['Georgia', 'Armenia', 'Azerbaijan'],
        ['Iraq', 'Iran', 'Saudi Arabia', 'Bahrain', 'Qatar', 
         'United Arab Emirates', 'Kuwait', 'Yemen', 'Oman'], 
        ['Ghana', 'Ethiopia', 'Kenya', 
         'Nigeria', 'South Africa', 'Tanzania'],
        ['Afghanistan', 'Kazakhstan', 'Kyrgyzstan', 
         'Pakistan', 'Tajikistan', 'Uzbekistan'],
        ['Bangladesh', 'Nepal', 'India', 'Sri Lanka'], 
        ['Indonesia', 'Malaysia', 'Philippines', 'Singapore', 'Thailand'],
        ['China', 'Japan', 'South Korea', 'Taiwan', 'Vietnam'],
        ['Australia', 'New Zealand', 'Papua New Guinea', 'Timor'], 
        ['Canada', 'Mexico', 'United States'], 
        ['Costa Rica', 'Guatemala', 'Honduras', 'Nicaragua', 'Panama'], 
        ['Argentina', 'Brazil', 'Chile', 'Colombia', 
         'Ecuador', 'Peru', 'Venezuela'], 
        ['Cuba', 'Dominican Republic', 'Haiti', 'Jamaica'],
    ]

    # population including a European Union total
    # (EU is the member list set up in the semi-log selections cell)
    pop = population.copy()
    pop['European Union'] = pop[EU].sum()

    # for information only - the two large blocs side by side
    print('Populations:')
    for bloc in ('United States', 'European Union'):
        print(f'{bloc}: {int(pop[bloc]):,}')
    print()

    # cases first, then deaths
    for mode, index in modes.items():
        print(mode)
        # copy of the daily data with a European Union total column
        daily = adj_daily_data[index].assign(
            **{'European Union': lambda frame: frame[EU].sum(axis=1)}
        )
        plot_daily_comparative(daily, pop, mode, regions, SOURCE)

Populations:
United States: 331,002,647
European Union: 444,919,060

Cases
Deaths


## Plot weekly new-case/deaths data from January 2020

In [22]:
if True: # switch this output on/off
    # one slot per mode, holding whatever ps.plot_weekly() hands back
    adj_weekly_data = [None, None]
    for mode, index in modes.items():
        daily = adj_daily_data[index].copy()
        adj_weekly_data[index] = ps.plot_weekly(
            daily, 
            mode.lower(), 
            data_quality[index],
            save_tag='weekly',
            chart_directory=CHART_DIRECTORY,
        )

## Plot new-vs-cumulative

In [23]:
if True: # switch this output on/off

    # cases first, then deaths - one chart per nation
    for mode, index in modes.items():
        daily_frame = adj_daily_data[index]
        cum_frame = adj_cum_data[index]
        
        for name in cum_frame.columns:
            new = daily_frame[name]
            
            # only plot where there is something to show
            if new.sum() != 0: 
                ps.plot_new_cum(
                    new.copy(), 
                    cum_frame[name].copy(), 
                    mode, 
                    name,
                    'day',
                    title=f'{name}: COVID-19 {mode.title()}',
                    rfooter=data_quality[index][name],
                    save_as=f'{CHART_DIRECTORY}{name}-{mode.lower()}'
                            f'-daily-new-vs-cum.png',
                )

## Growth

In [24]:
if False: # switch this output on/off
    
    # cases first, then deaths - one growth chart per nation
    for mode, index in modes.items():
        daily = adj_daily_data[index].copy()
        
        for name in daily.columns:
            # left footer: how the factor is derived, then the data note
            footer = (f'Ave. daily new {mode.lower()} this-week/last-week; '
                      + data_quality[index][name])
            ps.plot_growth_factor(
                daily[name], 
                title=f'{name}: W/W growth in new COVID-19 {mode.lower()}',
                xlabel=None,
                ylabel='Week on Week Growth Factor',
                lfooter=footer,
                save_as=f'{CHART_DIRECTORY}/{name}-{mode}-growth-factor.png',
            )


## The End

In [25]:
# last cell of the notebook - prints a completion message
print('Finished')

Finished
