In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import matplotlib as mpl


# Base text size; the axis, tick and legend sizes below are all derived from it.
FONT_SIZE = 12
AXES_SIZE = 1.5 * FONT_SIZE
TICK_SIZE = 1.25 * FONT_SIZE
LEGEND_SIZE = FONT_SIZE

# Style overrides applied to matplotlib's global rcParams for the whole notebook.
my_rcParams = dict(
    [
        ("axes.spines.top", False),
        ("axes.spines.right", False),
        ("lines.linewidth", 5),
        ("font.size", FONT_SIZE),
        ("font.family", "Helvetica"),
        ("axes.titlesize", AXES_SIZE),
        ("axes.labelsize", AXES_SIZE),
        ("xtick.labelsize", TICK_SIZE),
        ("ytick.labelsize", TICK_SIZE),
        ("legend.fontsize", LEGEND_SIZE),
        ("xtick.major.pad", FONT_SIZE / 2),
        ("ytick.major.pad", FONT_SIZE / 2),
    ]
)

# Install every override in one call.
mpl.rcParams.update(my_rcParams)

## Country-specific sequences per week

In [None]:
# Countries shown in the sequencing-rate bar chart.
plot_countries = [
    "USA",
    "Japan",
    "United Kingdom",
    "France",
    "Germany",
    "Mexico",
    "Colombia",
    "Chile",
    "Argentina",
    "China",
    "Australia",
    "Brazil",
    "South Africa",
    "Nigeria",
    "Ghana",
    "Senegal",
    "Egypt",
    "Morocco",
    "South Korea",
    "Vietnam",
    "Philippines",
    "Singapore",
    "Malaysia",
    "India",
    "Pakistan",
    "Spain",
    "Portugal",
]

# Tab-separated per-country table; the plotting code further down reads its
# `country`, `mean_seq_per_week` and `sem_seq_per_week` columns.
plot_df = pd.read_csv("../data/down_scaled/weekly_sequences_by_country.tsv", sep="\t")

# Restrict the table to the selected countries.
is_selected_country = plot_df["country"].isin(plot_countries)
_plot_df = plot_df[is_selected_country]

In [None]:
# Sequences-per-week levels drawn as dashed reference lines on the country bar chart.
THRESES = [100, 500, 1000, 2000, 5000]

In [None]:
# Square single-panel figure for the per-country sequencing rates.
fig = plt.figure(constrained_layout=True, figsize=(8.0, 8.0))
spec = fig.add_gridspec(nrows=1, ncols=1)
ax = fig.add_subplot(spec[0])

def plot_country_sequencing_rates(ax, plot_df, thresholds=None):
    """Draw per-country mean weekly sequence counts as a log-scale bar chart.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    plot_df : pandas.DataFrame
        Must provide the columns ``country``, ``mean_seq_per_week`` and
        ``sem_seq_per_week``.
    thresholds : iterable of numbers, optional
        Levels marked with dashed horizontal lines and added to the y ticks.
        Defaults to the notebook-level ``THRESES`` list.

    Returns
    -------
    None
        The function only draws on ``ax``.
    """
    if thresholds is None:
        thresholds = THRESES
    thresholds = list(thresholds)

    ax.bar(
        plot_df.country,
        plot_df.mean_seq_per_week,
        # Error bars are 1.96 x the `sem_seq_per_week` column.
        yerr=1.96 * plot_df.sem_seq_per_week,
        ec="k",
        color="#f1ddff",
        error_kw={"elinewidth": 1.5},
    )

    ax.set_yscale("log")
    # Merge the fixed ticks with the thresholds; the set removes values that
    # appear in both lists (e.g. 100 and 1000) so no tick is placed twice.
    yticks = sorted({50, 100, 1000, 10_000, *thresholds})
    ax.set_yticks(yticks)
    ax.set_yticklabels(yticks)

    # Rotate the country names via tick_params. Re-assigning the label texts
    # with set_xticklabels would pin fixed labels onto the categorical axis.
    ax.tick_params(axis="x", labelrotation=90)

    for thres in thresholds:
        ax.axhline(y=thres, linestyle="--", color="#525252", linewidth=1.)

    ax.set_ylabel("Sequences per week (2022)")
    
# Render the bar chart for the selected countries on the axes created above.
plot_country_sequencing_rates(ax=ax, plot_df=_plot_df)

## Plotting forecast errors

In [None]:
# Leads (in days, per the legend labels used below) highlighted in the summary plots.
LEADS_TO_PLOT = np.array([-30, -14, 0, 14, 30])

# Colormap used to colour one series per lead.
lead_cmap = mpl.colormaps["coolwarm"]

In [None]:
# Load the tab-separated forecast-error table; later cells read its
# `thres`, `lead`, `pivot_date` and `MAE` columns.
errors_df = pd.read_csv("../errors/down_scaled_errors.tsv", sep="\t")

In [None]:
# Distinct values of the `thres` column, in order of first appearance; one panel is drawn per value below.
threses = errors_df["thres"].unique()

In [None]:
# Preview the first rows of the loaded error table.
errors_df.head()

In [None]:
# All leads are kept (no lead filter is applied here). Work on a copy so that
# columns added to `errors` later on do not modify the frame held in `errors_df`.
errors = errors_df.copy()

# Mean and standard error of MAE for every (thres, lead) combination.
# TODO: Why does mean lead differ? (open question carried over from the original notes)
mean_mae = (
    errors.groupby(["thres", "lead"])
    .agg(MAE_mean=("MAE", "mean"), MAE_sem=("MAE", "sem"))
    .reset_index()
)

In [None]:
# Display the working errors frame.
errors

In [None]:
def simplify_axes(ax):
    """Hide the right and top spines of ``ax``; the other spines are left untouched."""
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)

In [None]:
# Figure with a 2 x 3 grid: one panel per lead, each showing MAE against the sequencing threshold.
fig = plt.figure(constrained_layout=True, figsize=(14.0, 4.0))
spec = fig.add_gridspec(nrows=2, ncols=3)

# TODO: decide what the panels should be.


def plot_error_by_thres_at_lead(ax, errors, lead, ylabel=None):
    """Scatter individual MAE values against threshold and overlay per-threshold means.

    NOTE: this cell defines a second function with the same name directly
    below; that later definition replaces this one when the cell runs.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    errors : pandas.DataFrame
        Must provide the columns ``thres``, ``lead`` and ``MAE``.
    lead : scalar or None
        When not None, only rows whose ``lead`` equals this value are used.
    ylabel : str, optional
        Label for the y-axis.
    """
    _errors = errors if lead is None else errors[errors.lead == lead]

    # Mean and standard error of MAE per threshold.
    summary = _errors.groupby("thres")["MAE"].agg(["mean", "sem"]).reset_index()

    # Scatter, not plot: the edge-colour keyword `ec` is a marker property and
    # is rejected by line plots.
    ax.scatter(_errors.thres, _errors.MAE, ec="k", alpha=0.5)
    ax.errorbar(summary["thres"],
                summary["mean"],
                yerr=2 * summary["sem"],  # bars span two standard errors each way
                color="k",
                linewidth=2.5)

    ax.set_xscale("log")
    ax.set_xlabel("Maximum # of sequences weekly")
    ax.set_ylabel(ylabel)
    
def plot_error_by_thres_at_lead(ax, errors, lead, ylabel=None, desired_thres=0.05):
    """Box-plot the MAE distribution for each threshold at a single lead.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    errors : pandas.DataFrame
        Must provide the columns ``thres``, ``lead`` and ``MAE``.
    lead : scalar or None
        When not None, only rows whose ``lead`` equals this value are used.
    ylabel : str, optional
        Label for the y-axis.
    desired_thres : float
        Height of the dashed horizontal reference line.
    """
    subset = errors.copy()
    if lead is not None:
        subset = subset[subset.lead == lead]

    # Split once by threshold and reuse the groups for both boxes and tick labels.
    per_thres = list(subset.groupby("thres"))
    mae_samples = [frame.MAE for _, frame in per_thres]
    thres_labels = [label for label, _ in per_thres]

    ax.boxplot(mae_samples)
    ax.set_xticklabels(thres_labels)

    ax.axhline(y=desired_thres, color="k", linestyle="--", linewidth=1.5)
    ax.set_xlabel("Maximum number of sequences weekly")
    ax.set_ylabel(ylabel)

    
# One panel per lead in LEADS_TO_PLOT; every panel after the first shares the
# first panel's y-axis.
for panel_idx, lead in enumerate(LEADS_TO_PLOT):
    ax = fig.add_subplot(spec[panel_idx])

    if panel_idx:
        ax.sharey(ax0)
    else:
        ax0 = ax

    simplify_axes(ax)
    plot_error_by_thres_at_lead(ax, errors, lead, ylabel=f"MAE {lead} days after forecast")

    # Fixed linear y-range. A log scale with percentage tick labels
    # (0.1%, 0.5%, 5%, 10%, 50%, 100%) was tried here and is currently disabled.
    ax.set_ylim((1e-4, 1.0))

# TODO: change y-axes (the original note was cut off: "Change y-axes lo").

In [None]:
def plot_error_by_thres(ax, mean_mae, lead, ylabel=None):
    """Scatter mean MAE against threshold and connect the per-threshold averages.

    NOTE: a later cell defines another function with this same name (taking a
    threshold instead of a lead); whichever definition ran last is in effect.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    mean_mae : pandas.DataFrame
        Must provide the columns ``thres``, ``lead`` and ``MAE_mean``.
    lead : scalar or None
        When not None, only rows whose ``lead`` equals this value are used;
        None keeps every lead.
    ylabel : str, optional
        Label for the y-axis. When omitted, "Average MAE over all leads" is used.
    """
    _mean_mae = mean_mae if lead is None else mean_mae[mean_mae.lead == lead]

    # Average of MAE_mean per threshold (no error bars are drawn for it).
    mean_by_thres = _mean_mae.groupby("thres")["MAE_mean"].mean().reset_index()

    ax.scatter(_mean_mae.thres, _mean_mae.MAE_mean, ec="k", alpha=0.3)
    ax.errorbar(mean_by_thres["thres"],
                mean_by_thres["MAE_mean"],
                color="k",
                linewidth=2.5)

    ax.set_xscale("log")
    # The x-axis carries the threshold, so it gets the threshold label; the
    # caller's `ylabel` goes on the y-axis (it used to be written to the x-axis).
    ax.set_xlabel("Maximum number of sequences weekly")
    ax.set_ylabel("Average MAE over all leads" if ylabel is None else ylabel)

In [None]:
# Show the distinct threshold values found in the error table.
threses

In [None]:
# Figure with a 2 x 6 grid: one panel per threshold, each showing mean MAE over lead.
fig = plt.figure(constrained_layout=True, figsize=(18.0, 6.0))
spec = fig.add_gridspec(nrows=2, ncols=6)

def plot_error_by_thres(ax, mean_mae, thres, ylabel=None, desired_thres=0.05):
    """Plot mean MAE over lead for one threshold, with a band of two standard errors.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    mean_mae : pandas.DataFrame
        Must provide the columns ``thres``, ``lead``, ``MAE_mean`` and ``MAE_sem``.
    thres : scalar or None
        When not None, only rows whose ``thres`` equals this value are drawn;
        None draws every row.
    ylabel : str, optional
        Label for the y-axis.
    desired_thres : float
        Height of the dashed horizontal reference line.
    """
    # Guard on the `thres` argument itself. The previous guard tested a global
    # named `lead`, which only existed if an earlier loop had already run.
    _mean_mae = mean_mae if thres is None else mean_mae[mean_mae.thres == thres]

    ax.plot(_mean_mae.lead, _mean_mae.MAE_mean, alpha=0.4)
    ax.fill_between(_mean_mae.lead,
                    _mean_mae.MAE_mean - 2*_mean_mae.MAE_sem,
                    _mean_mae.MAE_mean + 2*_mean_mae.MAE_sem,
                    alpha=0.1)

    ax.set_xlabel("Days from forecast")
    ax.set_ylabel(ylabel)
    ax.axhline(y=desired_thres, color="k", linestyle="--", linewidth=1.5)

# One panel per sequencing threshold; every panel after the first shares the
# first panel's y-axis.
for panel_idx, thres in enumerate(threses):
    ax = fig.add_subplot(spec[panel_idx])

    if panel_idx:
        ax.sharey(ax0)
    else:
        ax0 = ax

    simplify_axes(ax)
    plot_error_by_thres(ax, mean_mae, thres, ylabel=f"Average MAE \n {thres} sequences weekly")
    ax.set_xlim((-30, 30))
    # Idea: find the first point where the accuracy threshold is crossed at each sequencing threshold.
    ax.set_xticks([-30, -14, 0, 14, 30])

# Notes:
# - Need to repeat for given leads.
# - Pick thresholds that resemble actual historical / country-like rates.
# - Can we make a companion figure showing what each country's sequencing effort
#   currently looks like in sequences per week? (Colours can tie in with the
#   rest of the manuscript.)

In [None]:
# Proportion of forecasts with MAE below the desired threshold

In [None]:
# A forecast counts as "acceptable" when its MAE is strictly below this value.
desired_thres = 0.05

# Attach the flag via assign(), which returns a new frame, rather than writing
# a column into the existing frame in place (that frame may be shared with the
# originally loaded table).
errors = errors.assign(acceptable=errors.MAE < desired_thres)

# Share of acceptable forecasts (and its standard error) per (thres, lead).
# TODO: Why does mean lead differ? (open question carried over from the original notes)
mean_acceptable = (
    errors.groupby(["thres", "lead"])
    .agg(acceptable_mean=("acceptable", "mean"), acceptable_sem=("acceptable", "sem"))
    .reset_index()
)

In [None]:
# Show the per-(thres, lead) acceptable-share summary.
mean_acceptable

In [None]:
# Figure with a 2 x 6 grid: one panel per threshold, each showing the share of
# acceptable forecasts over lead.
fig = plt.figure(constrained_layout=True, figsize=(17.0, 6.0))
spec = fig.add_gridspec(nrows=2, ncols=6)

def plot_accept_by_thres(ax, mean_accept, thres, ylabel=None):
    """Plot the acceptable-forecast share over lead for one threshold, with a band of two standard errors.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    mean_accept : pandas.DataFrame
        Must provide the columns ``thres``, ``lead``, ``acceptable_mean`` and
        ``acceptable_sem``.
    thres : scalar or None
        When not None, only rows whose ``thres`` equals this value are drawn;
        None draws every row.
    ylabel : str, optional
        Label for the y-axis.
    """
    # Guard on the `thres` argument itself. The previous guard tested a global
    # named `lead`, which only existed if an earlier loop had already run.
    _mean_accept = mean_accept if thres is None else mean_accept[mean_accept.thres == thres]

    ax.plot(_mean_accept.lead, _mean_accept.acceptable_mean, alpha=0.4)
    ax.fill_between(_mean_accept.lead,
                    _mean_accept.acceptable_mean - 2*_mean_accept.acceptable_sem,
                    _mean_accept.acceptable_mean + 2*_mean_accept.acceptable_sem,
                    alpha=0.1)

    ax.set_xlabel("Days from forecast")
    ax.set_ylabel(ylabel)
    
# One panel per sequencing threshold; every panel after the first shares the
# first panel's y-axis.
for panel_idx, thres in enumerate(threses):
    ax = fig.add_subplot(spec[panel_idx])

    if panel_idx:
        ax.sharey(ax0)
    else:
        ax0 = ax

    simplify_axes(ax)
    plot_accept_by_thres(ax, mean_acceptable, thres, ylabel=f"Proportion with MAE<5% \n {thres} sequences weekly")
    ax.set_xlim((-30, 30))
    # Idea: find the first point where the accuracy threshold is crossed at each sequencing threshold.
    ax.set_xticks([-30, -14, 0, 14, 30])



In [None]:
# Mean MAE against the weekly sequence cap, one coloured series per lead.
fig = plt.figure(figsize=(6., 4.), constrained_layout=True)
spec = fig.add_gridspec(ncols=1, nrows=1)
ax = fig.add_subplot(spec[0])

def plot_mae_by_thres_at_lead(ax, mean_mae, leads, cmap = mpl.colormaps["plasma"], ylabel=None, desired_thres=0.05, legend=True):
    """Plot mean MAE against threshold, drawing one coloured series per lead.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on; its top and right spines are hidden first.
    mean_mae : pandas.DataFrame
        Must provide the columns ``thres``, ``lead`` and ``MAE_mean``.
    leads : sequence or None
        Leads to draw. The first and last entries set the two ends of the
        colour scale. None draws every lead found in ``mean_mae`` and scales
        colours between the smallest and largest of them.
    cmap : callable
        Maps a value in [0, 1] to a colour.
    ylabel : str, optional
        Label for the y-axis.
    desired_thres : float
        Height of the dashed horizontal reference line.
    legend : bool
        Whether to add a legend.
    """
    simplify_axes(ax)
    _mean_mae = mean_mae
    if leads is not None:
        _mean_mae = _mean_mae[_mean_mae.lead.isin(leads)]

    # Ends of the colour scale. Previously `leads[0]` / `leads[-1]` were used
    # unconditionally, which failed for leads=None and divided by zero when
    # only a single lead was given.
    if leads is not None and len(leads) > 0:
        lo, hi = leads[0], leads[-1]
    elif len(_mean_mae) > 0:
        lo, hi = _mean_mae.lead.min(), _mean_mae.lead.max()
    else:
        lo = hi = 0
    span = hi - lo

    for n, group in _mean_mae.groupby("lead"):
        # With a zero-width range every series gets the colormap midpoint.
        color = cmap((n - lo) / span if span else 0.5)
        ax.scatter(group.thres, group.MAE_mean, ec="k", label=f"{n} days", color=color)
        ax.plot(group.thres, group.MAE_mean, linewidth=1.0, color=color)
    ax.axhline(y=desired_thres, color="k", linestyle="--", linewidth=1.5)
    ax.set_xlabel("Maximum number of sequences weekly")
    ax.set_ylabel(ylabel)
    if legend:
        ax.legend()

plot_mae_by_thres_at_lead(ax, mean_mae, LEADS_TO_PLOT, cmap=lead_cmap,
                          ylabel="Mean MAE")

In [None]:
# Threshold is on the x-axis; the share of forecasts with MAE below 0.05 is on
# the y-axis, drawn at several leads.
#
# Question this answers: how many sequences should we pick to ensure our
# forecast error is below 0.05 a given number of days out?
fig = plt.figure(figsize=(6., 4.), constrained_layout=True)
spec = fig.add_gridspec(ncols=1, nrows=1)
ax = fig.add_subplot(spec[0])

def plot_prop_accept_by_thres_at_lead(ax, mean_accept, leads, cmap = mpl.colormaps["plasma"], ylabel=None, desired_thres=0.05):
    """Plot the acceptable-forecast share against threshold, one coloured series per lead.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on; its top and right spines are hidden first.
    mean_accept : pandas.DataFrame
        Must provide the columns ``thres``, ``lead`` and ``acceptable_mean``.
    leads : sequence or None
        Leads to draw. The first and last entries set the two ends of the
        colour scale. None draws every lead found in ``mean_accept`` and scales
        colours between the smallest and largest of them.
    cmap : callable
        Maps a value in [0, 1] to a colour.
    ylabel : str, optional
        Label for the y-axis.
    desired_thres : float
        Accepted for signature compatibility; not used by this function.
    """
    simplify_axes(ax)
    _mean_accept = mean_accept
    if leads is not None:
        _mean_accept = _mean_accept[_mean_accept.lead.isin(leads)]

    # Ends of the colour scale. Previously `leads[0]` / `leads[-1]` were used
    # unconditionally, which failed for leads=None and divided by zero when
    # only a single lead was given.
    if leads is not None and len(leads) > 0:
        lo, hi = leads[0], leads[-1]
    elif len(_mean_accept) > 0:
        lo, hi = _mean_accept.lead.min(), _mean_accept.lead.max()
    else:
        lo = hi = 0
    span = hi - lo

    for n, group in _mean_accept.groupby("lead"):
        # With a zero-width range every series gets the colormap midpoint.
        color = cmap((n - lo) / span if span else 0.5)
        ax.scatter(group.thres, group.acceptable_mean, ec="k", label=f"{n} days", color=color)
        ax.plot(group.thres, group.acceptable_mean, linewidth=1.0, color=color)

    # TODO: an uncertainty band (acceptable_mean +/- 2 * acceptable_sem) and
    # per-threshold tick labels were sketched here but are not drawn.
    ax.set_xlabel("Maximum number of sequences weekly")
    ax.set_ylabel(ylabel)
    ax.set_yticks([0.5, 0.6, 0.7, 0.8, 0.9])
    ax.legend()

plot_prop_accept_by_thres_at_lead(ax, mean_acceptable, LEADS_TO_PLOT, lead_cmap,
                                  ylabel="Proportion of forecasts within 5%\n of true frequency")

# Constructing figure

In [None]:
import string

# Composite manuscript figure on a 2 x 2 grid.
fig = plt.figure(figsize=(12., 10.), constrained_layout=True)
spec = fig.add_gridspec(ncols=2, nrows=2)

# Empirical sequencing: spec[0:2] covers grid cells 0 and 1, i.e. the full top row.
ax_empirical = fig.add_subplot(spec[0:2])
plot_country_sequencing_rates(ax_empirical, _plot_df)

# Mean MAE per threshold (bottom-left). Its legend is suppressed; the
# neighbouring panel draws one for the same leads.
ax_mae = fig.add_subplot(spec[2])
plot_mae_by_thres_at_lead(ax_mae, mean_mae, LEADS_TO_PLOT, lead_cmap,
                          ylabel="Mean MAE", legend=False)

# Proportion acceptable per threshold (bottom-right).
ax_prop = fig.add_subplot(spec[3])
plot_prop_accept_by_thres_at_lead(ax_prop, mean_acceptable, LEADS_TO_PLOT, lead_cmap,
                                  ylabel="Proportion of forecasts within 5%\n of true frequency")

# Letter each panel (A, B, C) just above its top-left corner, in axes coordinates.
axes = [ax_empirical, ax_mae, ax_prop]
ax_labels = string.ascii_uppercase

for ax, ax_label in zip(axes, ax_labels):
    ax.text(-0.05, 1.05, ax_label, transform=ax.transAxes, size=24, weight='bold')

In [None]:
# Write the composite figure to the manuscript figures directory as a PNG.
# No dpi or bounding-box options are passed, so matplotlib's current defaults apply.
fig.savefig("../manuscript/figures/downscaling_sequencing.png")