# Summarize delay between collection and submission dates

## Setup

In [None]:
from datetime import datetime, timedelta
import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import gamma, nbinom

%matplotlib inline

In [None]:
# Display the installed SciPy version alongside the results.
import scipy
scipy.__version__

In [None]:
# Display the installed Matplotlib version alongside the results.
mpl.__version__

In [None]:
# Fixed seed so the simulated gamma draws are reproducible; it is passed as
# `random_state` to every `gamma.rvs` call in this notebook.
random_seed = 314159

In [None]:
# Seeded NumPy generator.
# NOTE(review): no cell visible in this notebook reads `rng`; the gamma draws
# pass `random_seed` directly instead — confirm whether this is still needed.
rng = np.random.default_rng(random_seed)

In [None]:
# Use seaborn's "ticks" style for all figures.
sns.set_style("ticks")

In [None]:
# Global Matplotlib defaults for every figure in this notebook: no top/right
# spines, 150 dpi on screen and 300 dpi on disk, 14 pt text throughout, and a
# white figure background.
mpl.rcParams.update({
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.dpi": 300,
    "figure.dpi": 150,
    "font.size": 14,
    "axes.labelsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    "legend.fontsize": 14,
    "figure.facecolor": "white",
})

In [None]:
# Input/output paths come from the `snakemake` object.
# NOTE(review): `snakemake` is not defined in this notebook; presumably it is
# injected by Snakemake's notebook integration — confirm.
metadata_path = snakemake.input.metadata  # tab-separated metadata table read below
figure_path = snakemake.output.figure  # destination of the two-panel figure

## Prepare metadata

In [None]:
# Load the tab-separated metadata table, parsing the collection date and the
# three submission-date columns as datetimes.
date_columns = [
    "date",
    "submission_date",
    "ideal_submission_date",
    "realistic_submission_date",
]
metadata = pd.read_csv(metadata_path, sep="\t", parse_dates=date_columns)

In [None]:
# Preview the first rows of the loaded table.
metadata.head()

## Calculate and plot delay between collection and submission

In [None]:
# Delay per record: submission date minus collection date. Both columns were
# requested as parsed dates on load, so the result is expected to be a timedelta.
metadata["submission_delay"] = metadata["submission_date"] - metadata["date"]

In [None]:
# Preview the table again, now including the `submission_delay` column.
metadata.head()

In [None]:
# Summary statistics of the delay across all records (before any filtering).
metadata["submission_delay"].describe()

## Get the distribution of delays in the last complete year

Filter records to those collected in 2019 (the last complete year considered here) and submitted before October 1, 2020. These should have more reasonable and consistent submission delays.

In [None]:
# Keep records collected during 2019 whose submission date falls before
# October 1, 2020. The copy makes the subset independent of `metadata`.
collected_in_2019 = (metadata["date"] >= "2019-01-01") & (metadata["date"] < "2020-01-01")
submitted_before_cutoff = metadata["submission_date"] < "2020-10-01"
recent_metadata = metadata[collected_in_2019 & submitted_before_cutoff].copy()

In [None]:
# (rows, columns) of the full table.
metadata.shape

In [None]:
# (rows, columns) of the filtered subset.
recent_metadata.shape

In [None]:
# Number of records excluded by the filter.
metadata.shape[0] - recent_metadata.shape[0]

In [None]:
# Fraction of all records retained by the filter.
recent_metadata.shape[0] / metadata.shape[0]

In [None]:
# Whole-day component of each delay in the filtered subset, as a NumPy array.
delay_in_days = recent_metadata["submission_delay"].dt.days.values

In [None]:
# 100 evenly spaced bin edges (i.e. 99 bins) from zero to the longest delay in
# the filtered subset; reused by the histograms of `delay_in_days`.
bins = np.linspace(0, delay_in_days.max(), num=100)

In [None]:
# Histogram of the number of strains per delay bin for the filtered subset.
fig, ax = plt.subplots(figsize=(8, 4))
values, hist_bins, patches = ax.hist(delay_in_days, bins=bins)
ax.set_xlabel("Delay between collection and submission (days)")
ax.set_ylabel("Number of strains")

In [None]:
# Median delay (days) in the filtered subset.
np.median(delay_in_days)

In [None]:
# Mean delay (days) in the filtered subset.
delay_in_days.mean()

In [None]:
# Shortest delay (days); a negative value would mean submission precedes collection.
delay_in_days.min()

In [None]:
# Number of records submitted within 28 days of collection.
(delay_in_days <= 28).sum()

In [None]:
# Total number of records in the filtered subset.
delay_in_days.shape

In [None]:
# Fraction of records submitted within 28 days of collection.
(delay_in_days <= 28).sum() / delay_in_days.shape[0]

In [None]:
# Number of records submitted within 56 days of collection.
(delay_in_days <= 56).sum()

In [None]:
# Fraction of records submitted within 56 days of collection.
(delay_in_days <= 56).sum() / delay_in_days.shape[0]

In [None]:
# Which submitting labs account for the records delayed by more than 400 days.
recent_metadata[recent_metadata["submission_delay"].dt.days > 400]["submitting_lab"].value_counts()

In [None]:
# Same histogram as the count version, normalized to a density so it can be
# compared against fitted distributions.
fig, ax = plt.subplots(figsize=(8, 4))
values, hist_bins, patches = ax.hist(delay_in_days, density=True, bins=bins)
ax.set_xlabel("Delay between collection and submission (days)")
ax.set_ylabel("Density")

## Find a gamma distribution that matches the empirical distribution

Find gamma distribution parameters that best correspond to the observed delay in days for the last year using maximum likelihood estimation.

In [None]:
# Fit a gamma distribution to the observed delays. No parameter is held fixed
# (e.g. no `floc`), so `loc` is estimated together with `shape` and `scale`.
shape, loc, scale = gamma.fit(delay_in_days)

In [None]:
# Fitted shape parameter.
shape

In [None]:
# Fitted location (shift) parameter, in days.
loc

In [None]:
# Fitted scale parameter, in days.
scale

In [None]:
# Mean of the unshifted gamma; the mean of the fitted distribution is
# `loc + shape * scale`.
shape * scale

Generate and plot random values from the gamma distribution matching the MLE parameters from the empirical distribution.

In [None]:
# Simulate one delay per observed record from the fitted gamma distribution,
# seeded for reproducibility.
gamma_data = gamma.rvs(
    a=shape, loc=loc, scale=scale,
    size=delay_in_days.size,
    random_state=random_seed,
)

In [None]:
# Overlay the observed delays and the simulated gamma draws as semi-transparent
# density histograms on the same bin edges.
fig, ax = plt.subplots(figsize=(8, 4))
overlay_style = {"bins": bins, "density": True, "alpha": 0.5}

values, hist_bins, patches = ax.hist(
    delay_in_days, label="empirical distribution", color="C4", **overlay_style
)
values, hist_bins, patches = ax.hist(
    gamma_data, label="gamma distribution", color="C2", **overlay_style
)

ax.set_xlabel("Delay in days")
ax.set_ylabel("Density")
ax.legend(frameon=False)

In [None]:
# Smallest simulated delay (days).
gamma_data.min()

In [None]:
# Mean simulated delay (days), for comparison with the observed mean.
gamma_data.mean()

In [None]:
# Observed mean delay (days).
delay_in_days.mean()

In [None]:
# Fitted shape parameter, repeated for reference.
shape

In [None]:
# Fitted scale parameter, repeated for reference.
scale

In [None]:
# Mean of the unshifted gamma (excludes `loc`).
shape * scale

In [None]:
# Fitted location parameter, repeated for reference.
loc

In [None]:
# "Ideal" scenario: one third of the fitted scale, which divides the
# `shape * scale` product by three.
ideal_scale = scale / 3.0

In [None]:
# Scale used for the ideal scenario.
ideal_scale

In [None]:
# Mean of the unshifted gamma under the ideal scale (excludes `loc`).
shape * ideal_scale

In [None]:
# Simulate delays for the ideal scenario: same shape and loc as the fit, with
# the reduced scale. The integer seed is the same one used for `gamma_data`.
ideal_gamma_data = gamma.rvs(
    a=shape, loc=loc, scale=ideal_scale,
    size=delay_in_days.size,
    random_state=random_seed,
)

In [None]:
# Density histograms on shared bins: observed delays as filled bars, with the
# fitted and ideal gamma draws as step outlines on top.
fig, ax = plt.subplots(figsize=(8, 4))
overlay_style = {"bins": bins, "density": True, "alpha": 0.5}
step_style = {"histtype": "step", "linewidth": 2}

values, hist_bins, patches = ax.hist(
    delay_in_days, color="C4", label="empirical distribution", **overlay_style
)

values, hist_bins, patches = ax.hist(
    gamma_data, color="C2", label="gamma distribution", **overlay_style, **step_style
)

values, hist_bins, patches = ax.hist(
    ideal_gamma_data, color="C1", label="ideal gamma distribution", **overlay_style, **step_style
)

ax.set_xlabel("Delay in days")
ax.set_ylabel("Density")
ax.legend(frameon=False)

In [None]:
# Smallest simulated delay (days) under the ideal scale.
ideal_gamma_data.min()

In [None]:
# Alternative ideal scenario: divide the shape by three instead of the scale.
ideal_shape = shape / 3.0

In [None]:
# Shape used for the alternative ideal scenario.
ideal_shape

In [None]:
# Mean of the unshifted gamma under the reduced shape (excludes `loc`).
ideal_shape * scale

In [None]:
# Simulate delays for the alternative ideal scenario: reduced shape with the
# fitted loc and scale, using the same integer seed as the other draws.
ideal_gamma_data_by_shape = gamma.rvs(
    a=ideal_shape, loc=loc, scale=scale,
    size=delay_in_days.size,
    random_state=random_seed,
)

In [None]:
# Compare the two ways of shortening delays (reduced scale vs. reduced shape)
# against the observed delays and the fitted gamma, all as densities on the
# same bin edges.
fig, ax = plt.subplots(figsize=(8, 4))
overlay_style = {"bins": bins, "density": True, "alpha": 0.5}

values, hist_bins, patches = ax.hist(
    delay_in_days, color="C4", label="empirical distribution", **overlay_style
)

values, hist_bins, patches = ax.hist(
    gamma_data, color="C2", label="gamma distribution", histtype="step", **overlay_style
)

values, hist_bins, patches = ax.hist(
    ideal_gamma_data,
    color="C1",
    label="ideal gamma distribution (scale)",
    histtype="step",
    **overlay_style,
)

values, hist_bins, patches = ax.hist(
    ideal_gamma_data_by_shape,
    color="C3",
    label="ideal gamma distribution (shape)",
    histtype="step",
    **overlay_style,
)

ax.set_xlabel("Delay in days")
ax.set_ylabel("Density")
ax.legend(frameon=False)

In [None]:
# Smallest simulated delay (days) under the reduced shape.
ideal_gamma_data_by_shape.min()

In [None]:
# Fraction of reduced-scale draws shorter than 31 days.
(ideal_gamma_data < 31).sum() / ideal_gamma_data.shape[0]

In [None]:
# Fraction of reduced-scale draws shorter than 62 days.
(ideal_gamma_data < 62).sum() / ideal_gamma_data.shape[0]

In [None]:
# Fraction of reduced-shape draws shorter than 31 days.
(ideal_gamma_data_by_shape < 31).sum() / ideal_gamma_data_by_shape.shape[0]

In [None]:
# Fraction of reduced-shape draws shorter than 62 days.
(ideal_gamma_data_by_shape < 62).sum() / ideal_gamma_data_by_shape.shape[0]

In [None]:
# Mean simulated delay (days) under the reduced scale.
ideal_gamma_data.mean()

In [None]:
# Mean simulated delay (days) under the reduced shape.
ideal_gamma_data_by_shape.mean()

In [None]:
# Two-panel figure written to `figure_path`:
#   A) schematic of forecast horizons, one arrow per forecast from its initial
#      timepoint to the shared final timepoint;
#   B) density histograms of the observed, fitted ("realistic"), and ideal
#      submission delays.
fig, (ax_horizons, ax_submissions) = plt.subplots(nrows=2, ncols=1, figsize=(8, 7))

# Panel A: Forecast horizons

# One row per forecast: (initial timepoint, horizon label, marker color).
# The last three rows share the same initial date and differ only in marker
# color; only the first of them carries a horizon label.
date_format = "%Y-%m-%d"
horizon_rows = [
    ("2023-04-01", "12 months", "C0"),
    ("2023-07-01", "9 months", "C0"),
    ("2023-10-01", "6 months", "C0"),
    ("2024-01-01", "3 months", "C0"),
    ("2024-01-01", "", "C1"),
    ("2024-01-01", "", "C2"),
]
initial_dates = [datetime.strptime(row[0], date_format) for row in horizon_rows]
horizon_labels = [row[1] for row in horizon_rows]
colors = [row[2] for row in horizon_rows]
final_date = datetime.strptime("2024-04-01", date_format)

# Rows are stacked top to bottom: the first forecast gets the largest height.
heights = list(range(len(initial_dates), 0, -1))

ax_horizons.scatter(
    initial_dates,
    heights,
    s=70,
    color=colors,
    edgecolors="#000000",
)

for initial_date, height, horizon_label in zip(initial_dates, heights, horizon_labels):
    # Text-less annotation used purely for its arrow, drawn from the initial
    # to the final timepoint; the low zorder places it at the bottom of the
    # draw order.
    ax_horizons.annotate(
        "",
        xy=(final_date, height),
        xytext=(initial_date, height),
        xycoords="data",
        textcoords="data",
        arrowprops={"facecolor": "black", "width": 3},
        ha="right",
        va="center",
        color="#000000",
        fontsize=12,
        zorder=-10,
    )

    # Horizon label placed one week to the right of the final timepoint.
    ax_horizons.annotate(
        horizon_label,
        xy=(final_date + timedelta(weeks=1), height),
        ha="left",
        va="center",
    )

# Vertical line marking the final timepoint shared by every forecast.
ax_horizons.axvline(
    x=final_date,
    ymin=0.01,
    ymax=0.9,
    color="#000000",
)

# Column headings sit one row above the topmost forecast.
heading_height = heights[0] + 1
centered = {"ha": "center", "va": "center"}
ax_horizons.annotate("initial timepoint", xy=(initial_dates[0], heading_height), **centered)
ax_horizons.annotate("final timepoint", xy=(final_date, heading_height), **centered)

# Hide the y-axis and every spine.
ax_horizons.yaxis.set_visible(False)
ax_horizons.spines[["left", "top", "right", "bottom"]].set_visible(False)

# Tick only at the distinct initial dates and the final date.
ax_horizons.xaxis.set_ticks(sorted({*initial_dates, final_date}))
ax_horizons.xaxis.set_major_formatter(mdates.DateFormatter("%b %Y"))

# Leave room on the right for the horizon labels and on top for the headings.
ax_horizons.set_xlim(right=final_date + timedelta(weeks=4))
ax_horizons.set_ylim(top=heading_height)

ax_horizons.set_xlabel("Date")

# Panel B: Submission delay distributions.

# (data, color, legend label, extra histogram style). The observed delays are
# drawn as filled bars, the two simulated series as step outlines.
outline_style = {"histtype": "step", "linewidth": 2}
delay_series = [
    (delay_in_days, "C4", "observed delay (3-month average)", {}),
    (gamma_data, "C2", "realistic delay (3-month average)", outline_style),
    (ideal_gamma_data, "C1", "ideal delay (1-month average)", outline_style),
]
for series, series_color, series_label, extra_style in delay_series:
    values, hist_bins, patches = ax_submissions.hist(
        series,
        bins=bins,
        density=True,
        color=series_color,
        alpha=0.75,
        label=series_label,
        **extra_style,
    )

# Reference line at zero delay.
ax_submissions.axvline(x=0, color="C0", label="no delay")

ax_submissions.set_xlabel("Delay between sample collection and submission (days)")
ax_submissions.set_ylabel("Density")
ax_submissions.legend(frameon=False)

# Truncate the x-axis at one year.
ax_submissions.set_xlim(right=365)

# Bold panel letters positioned in figure coordinates.
panel_labels_dict = {"weight": "bold", "size": 14}
fig.text(0.01, 0.97, "A", **panel_labels_dict)
fig.text(0.01, 0.47, "B", **panel_labels_dict)

fig.tight_layout(h_pad=1.25)
fig.savefig(figure_path)

## Plot observed distribution of delays during the pandemic era (samples collected in 2022)

In [None]:
# Keep records collected during 2022. Unlike the 2019 subset, no cutoff is
# applied to the submission date. The copy is independent of `metadata`.
collected_in_2022 = (metadata["date"] >= "2022-01-01") & (metadata["date"] < "2023-01-01")
pandemic_metadata = metadata[collected_in_2022].copy()

In [None]:
# Whole-day component of each delay in the 2022 subset, as a NumPy array.
pandemic_delay_in_days = pandemic_metadata["submission_delay"].dt.days.values

In [None]:
# 100 evenly spaced bin edges (i.e. 99 bins) from zero to the longest delay in
# the 2022 subset.
pandemic_bins = np.linspace(0, pandemic_delay_in_days.max(), num=100)

In [None]:
# Compare strain counts per delay bin for the 2019 subset (`delay_in_days`)
# and the 2022 subset (`pandemic_delay_in_days`), mark both means, and save
# the figure.
# NOTE(review): each series is binned with its own edges (`bins` vs.
# `pandemic_bins`), so bin widths differ unless the two maxima coincide and
# bar heights are then not directly comparable — confirm this is intended.
fig, ax = plt.subplots(figsize=(8, 4))

pre_pandemic_mean = delay_in_days.mean()
pandemic_mean = pandemic_delay_in_days.mean()

values, hist_bins, patches = ax.hist(
    delay_in_days,
    bins=bins,
    color="C4",
    alpha=0.5,
    label="Pre-pandemic era",
)

# The lower zorder draws the pandemic-era bars behind the pre-pandemic ones.
values, hist_bins, patches = ax.hist(
    pandemic_delay_in_days,
    bins=pandemic_bins,
    color="C9",
    alpha=0.5,
    label="Pandemic era",
    zorder=-10,
)

ax.set_xlabel("Delay between collection and submission (days)")
ax.set_ylabel("Number of strains")

# Dashed vertical lines at each era's mean delay, colored like its histogram.
ax.axvline(x=pre_pandemic_mean, linestyle="dashed", color="C4")
ax.axvline(x=pandemic_mean, linestyle="dashed", color="C9")

ax.legend(frameon=False)

# Report both means as text, positioned in axes-fraction coordinates.
ax.text(
    x=0.5,
    y=0.7,
    s=f"Mean pre-pandemic delay: {pre_pandemic_mean:.0f} days",
    transform=ax.transAxes,
)
ax.text(
    x=0.5,
    y=0.6,
    s=f"Mean pandemic delay: {pandemic_mean:.0f} days",
    transform=ax.transAxes,
)

fig.tight_layout()
fig.savefig(snakemake.output.pre_vs_post_pandemic_era)

In [None]:
# Mean delay (days) in the 2019 subset.
delay_in_days.mean()

In [None]:
# Mean delay (days) in the 2022 subset.
pandemic_delay_in_days.mean()

In [None]:
# Median delay (days) in the 2019 subset.
np.median(delay_in_days)

In [None]:
# Median delay (days) in the 2022 subset.
np.median(pandemic_delay_in_days)

In [None]:
# Number of records in the 2022 subset.
pandemic_delay_in_days.shape

In [None]:
# Number of records in the 2019 subset.
delay_in_days.shape