# Summarize delay between collection and submission dates

## Setup

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import gamma, nbinom

%matplotlib inline

In [None]:
# Seed NumPy's global random number generator so the random draws made later
# in this notebook are reproducible between runs.
# NOTE(review): assumes scipy's `rvs` falls back to this global generator when
# no `random_state` is passed -- confirm against the installed scipy version.
np.random.seed(314159)

In [None]:
# Use seaborn's "ticks" style for every figure in this notebook.
sns.set_style("ticks")

In [None]:
# Shared figure styling: hide the top and right spines, render at 150 dpi on
# screen and 300 dpi when saving, use a 14 pt font for all text elements, and
# draw figures on a white background.
mpl.rcParams.update({
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.dpi": 300,
    "figure.dpi": 150,
    "font.size": 14,
    "axes.labelsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    "legend.fontsize": 14,
    "figure.facecolor": "white",
})

In [None]:
# Input and output paths come from the `snakemake` object, which is not
# defined anywhere in this notebook.
# NOTE(review): presumably injected by Snakemake when it runs the notebook --
# running the notebook standalone requires defining these paths by hand.
metadata_path = snakemake.input.metadata
figure_path = snakemake.output.figure

## Prepare metadata

In [None]:
# Load the tab-delimited metadata table.
metadata = pd.read_csv(metadata_path, sep="\t")

In [None]:
# Preview the first rows of the raw metadata.
metadata.head()

Drop records with ambiguous collection or submission dates. We need complete dates for both fields to calculate the distribution of the delay between collection and submission.

In [None]:
# Flag records whose collection or submission date is incomplete. Ambiguous
# dates are marked with the character "X", matched here as a literal
# (`regex=False`). `na=True` treats a missing date as incomplete too, so such
# records are dropped instead of leaving NaN in the mask, which would make the
# negation below fail.
has_incomplete_date = (
    metadata["date"].str.contains("X", regex=False, na=True) |
    metadata["submission_date"].str.contains("X", regex=False, na=True)
)

# Copy so that columns added later do not write into a view of `metadata`.
filtered_metadata = metadata[~has_incomplete_date].copy()

In [None]:
# (rows, columns) of the metadata before filtering.
metadata.shape

In [None]:
# (rows, columns) after dropping records with ambiguous dates.
filtered_metadata.shape

Convert date fields into timestamps.

In [None]:
# Parse both date columns from strings into pandas timestamps, in place.
for date_column in ("date", "submission_date"):
    filtered_metadata[date_column] = pd.to_datetime(filtered_metadata[date_column])

In [None]:
# Preview the filtered records after the date columns were converted.
filtered_metadata.head()

## Calculate and plot delay between collection and submission

In [None]:
# Delay per record: submission timestamp minus collection timestamp.
filtered_metadata["submission_delay"] = (
    filtered_metadata["submission_date"] - filtered_metadata["date"]
)

In [None]:
# Preview the records with the new `submission_delay` column.
filtered_metadata.head()

In [None]:
# Summary statistics of the submission delay across all filtered records.
filtered_metadata["submission_delay"].describe()

In [None]:
# Whole-day component of each delay, as a plain NumPy array.
delay_in_days = filtered_metadata["submission_delay"].dt.days.values

In [None]:
# 100 evenly spaced bin edges from zero to the longest observed delay.
bins = np.linspace(0, delay_in_days.max(), num=100)

In [None]:
# Histogram of submission delays across all records with complete dates.
fig, ax = plt.subplots(figsize=(8, 4))
values, hist_bins, patches = ax.hist(x=delay_in_days, bins=bins)
ax.set_xlabel(xlabel="Delay between collection and submission (days)")
ax.set_ylabel(ylabel="Number of strains")

To understand how the delay between collection and submission changed over time, we inspect the distribution in sliding windows that match our original forecasting analysis (6-year windows, sliding by 6 months).

In [None]:
# Window end dates: the first day of every sixth month from January 2006
# through January 2020.
end_dates = pd.date_range(start="2006-01-01", end="2020-01-01", freq="6MS")

In [None]:
# Length of each sliding window; subtracted from a window's end date to get
# its start date.
years_back = pd.DateOffset(years=6)

In [None]:
# Number of sliding windows.
end_dates.shape

In [None]:
# Longest observed delay, in days.
delay_in_days.max()

In [None]:
# Recompute the per-record delay in days from the full filtered dataset.
# NOTE(review): this repeats the assignment made before the first histogram;
# `filtered_metadata` has not changed in between, so the cell looks redundant.
delay_in_days = filtered_metadata["submission_delay"].dt.days.values

In [None]:
# Bin edges every 30 days from day 0 up to (but excluding) day 5000.
bins = np.arange(start=0, stop=5000, step=30)

In [None]:
# One histogram panel per sliding window, stacked vertically on shared axes.
fig, axes = plt.subplots(
    nrows=len(end_dates),
    ncols=1,
    figsize=(8, 12),
    sharex=True,
    sharey=True,
)

# Per-window summaries, collected for the cells that follow.
start_dates, mean_delays, std_delays = [], [], []

collection_dates = filtered_metadata["date"]
for end_date, ax in zip(end_dates, axes.ravel()):
    # Each window covers the half-open interval [end_date - years_back, end_date).
    start_date = end_date - years_back
    in_window = (collection_dates >= start_date) & (collection_dates < end_date)
    window_metadata = filtered_metadata[in_window]
    window_delay_in_days = window_metadata["submission_delay"].dt.days.values

    ax.hist(window_delay_in_days, bins=bins, density=True)
    # Hide the y tick labels on every panel.
    ax.set_yticklabels([])

    start_dates.append(start_date)
    mean_delays.append(window_delay_in_days.mean())
    std_delays.append(window_delay_in_days.std())

In [None]:
# Convert the list of window start timestamps with pandas for use as the
# x values of the next plot.
start_dates = pd.to_datetime(start_dates)

In [None]:
# Mean submission delay per sliding window, plotted at each window's start
# date. Error bars span one standard deviation of the window's delays on
# either side of the mean.
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
ax.errorbar(
    start_dates,
    mean_delays,
    yerr=std_delays,
    marker="o",
    fmt=""
)
ax.set_xlabel("Start of 6-year window")
# Means and standard deviations are computed from delays in days; state the
# unit as the other delay axes in this notebook do.
ax.set_ylabel("Delay between collection and submission (days)")

## Investigate distribution of delays in the last complete year

Filter records to those collected in the last complete year (2019). These should have more reasonable and consistent submission delays.

In [None]:
# Keep records collected during calendar year 2019, i.e. within the half-open
# interval [2019-01-01, 2020-01-01). Copy so the subset is independent.
collected_on_or_after_start = filtered_metadata["date"] >= "2019-01-01"
collected_before_end = filtered_metadata["date"] < "2020-01-01"
recent_metadata = filtered_metadata[
    collected_on_or_after_start & collected_before_end
].copy()

In [None]:
# (rows, columns) of all records with complete dates.
filtered_metadata.shape

In [None]:
# (rows, columns) of the records collected in 2019.
recent_metadata.shape

In [None]:
# Number of filtered records that fall outside 2019.
filtered_metadata.shape[0] - recent_metadata.shape[0]

In [None]:
# Fraction of filtered records that were collected in 2019.
recent_metadata.shape[0] / filtered_metadata.shape[0]

In [None]:
# From here on, `delay_in_days` holds only the delays of the 2019 records.
delay_in_days = recent_metadata["submission_delay"].dt.days.values

In [None]:
# 100 evenly spaced bin edges from zero to the longest delay among 2019 records.
bins = np.linspace(0, delay_in_days.max(), num=100)

In [None]:
# Histogram (record counts) of submission delays for the 2019 records.
fig, ax = plt.subplots(figsize=(8, 4))
values, hist_bins, patches = ax.hist(x=delay_in_days, bins=bins)
ax.set_xlabel(xlabel="Delay between collection and submission (days)")
ax.set_ylabel(ylabel="Number of strains")

In [None]:
# Mean delay, in days, among the 2019 records.
delay_in_days.mean()

In [None]:
# Shortest delay, in days, among the 2019 records.
delay_in_days.min()

In [None]:
# Number of 2019 records submitted within 31 days of collection.
(delay_in_days <= 31).sum()

In [None]:
# Total number of 2019 records.
delay_in_days.shape

In [None]:
# Fraction of 2019 records submitted within 31 days of collection.
(delay_in_days <= 31).sum() / delay_in_days.shape[0]

In [None]:
# Number of 2019 records submitted within 62 days of collection.
(delay_in_days <= 62).sum()

In [None]:
# Fraction of 2019 records submitted within 62 days of collection.
(delay_in_days <= 62).sum() / delay_in_days.shape[0]

In [None]:
# Count, per submitting lab, the 2019 records whose delay exceeds 400 days.
recent_metadata[recent_metadata["submission_delay"].dt.days > 400]["submitting_lab"].value_counts()

In [None]:
# Same histogram of 2019 delays, normalized to a density instead of counts.
fig, ax = plt.subplots(figsize=(8, 4))
values, hist_bins, patches = ax.hist(x=delay_in_days, bins=bins, density=True)
ax.set_xlabel(xlabel="Delay between collection and submission (days)")
ax.set_ylabel(ylabel="Density")

## Find a gamma distribution that matches the empirical distribution

Find gamma distribution parameters that best correspond to the observed delay in days for the last year using maximum likelihood estimation.

In [None]:
# Fit a gamma distribution to the 2019 delays. No parameter is held fixed in
# this call, so the location is estimated along with the shape and scale.
shape, loc, scale = gamma.fit(delay_in_days)

In [None]:
# Fitted gamma shape parameter.
shape

In [None]:
# Fitted gamma location parameter (shift along the delay axis, in days).
loc

In [None]:
# Fitted gamma scale parameter.
scale

Generate and plot random values from the gamma distribution matching the MLE parameters from the empirical distribution.

In [None]:
# Simulate as many delays from the fitted gamma as there are observed delays.
gamma_data = gamma.rvs(shape, loc=loc, scale=scale, size=delay_in_days.shape[0])

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))

# Overlay the observed delays and the samples drawn from the fitted gamma as
# semi-transparent density histograms on the same bins. The first layer keeps
# matplotlib's default color.
histogram_layers = (
    (delay_in_days, {"label": "empirical distribution"}),
    (gamma_data, {"color": "orange", "label": "gamma distribution"}),
)
for samples, layer_style in histogram_layers:
    values, hist_bins, patches = ax.hist(
        samples,
        bins=bins,
        density=True,
        alpha=0.5,
        **layer_style
    )

ax.set_xlabel("Delay in days")
ax.set_ylabel("Density")
ax.legend(frameon=False)

In [None]:
# Smallest delay among the samples drawn from the fitted gamma.
gamma_data.min()

In [None]:
# Mean of the samples drawn from the fitted gamma.
gamma_data.mean()

In [None]:
# Mean of the observed 2019 delays, for comparison with the simulated mean.
delay_in_days.mean()

In [None]:
# Fitted gamma shape parameter.
shape

In [None]:
# Fitted gamma scale parameter.
scale

In [None]:
# shape * scale is the mean of a gamma distribution with zero location; the
# mean of the fitted distribution is loc + shape * scale.
shape * scale

In [None]:
# Fitted gamma location parameter.
loc

In [None]:
# "Ideal" scenario, variant 1: one third of the fitted scale. With shape and
# loc left unchanged this divides shape * scale by three.
ideal_scale = scale / 3.0

In [None]:
# Scale parameter used for the "ideal" gamma distribution.
ideal_scale

In [None]:
# shape * scale under the reduced scale (excludes the loc shift).
shape * ideal_scale

In [None]:
# Simulate delays from the gamma with the reduced scale, keeping the fitted
# shape and location; one draw per observed delay.
ideal_gamma_data = gamma.rvs(shape, loc=loc, scale=ideal_scale, size=delay_in_days.shape[0])

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))

# Overlay three density histograms on the same bins: observed delays, samples
# from the fitted gamma, and samples from the reduced-scale gamma. The first
# layer keeps matplotlib's default color.
histogram_layers = (
    (delay_in_days, {"label": "empirical distribution"}),
    (gamma_data, {"color": "orange", "label": "gamma distribution"}),
    (ideal_gamma_data, {"color": "yellow", "label": "ideal gamma distribution"}),
)
for samples, layer_style in histogram_layers:
    values, hist_bins, patches = ax.hist(
        samples,
        bins=bins,
        density=True,
        alpha=0.5,
        **layer_style
    )

ax.set_xlabel("Delay in days")
ax.set_ylabel("Density")
ax.legend(frameon=False)

In [None]:
# Smallest delay among the samples from the reduced-scale gamma.
ideal_gamma_data.min()

In [None]:
# "Ideal" scenario, variant 2: one third of the fitted shape instead of the
# scale. This also divides shape * scale by three.
ideal_shape = shape / 3.0

In [None]:
# Shape parameter used for the alternative "ideal" gamma distribution.
ideal_shape

In [None]:
# shape * scale under the reduced shape (excludes the loc shift).
ideal_shape * scale

In [None]:
# Simulate delays from the gamma with the reduced shape, keeping the fitted
# location and scale; one draw per observed delay.
ideal_gamma_data_by_shape = gamma.rvs(ideal_shape, loc=loc, scale=scale, size=delay_in_days.shape[0])

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))

# Overlay four density histograms on the same bins: observed delays, samples
# from the fitted gamma, and samples from the two "ideal" variants (reduced
# scale and reduced shape). The first layer keeps matplotlib's default color.
histogram_layers = (
    (delay_in_days, {"label": "empirical distribution"}),
    (gamma_data, {"color": "orange", "label": "gamma distribution"}),
    (ideal_gamma_data, {"color": "yellow", "label": "ideal gamma distribution (scale)"}),
    (ideal_gamma_data_by_shape, {"color": "purple", "label": "ideal gamma distribution (shape)"}),
)
for samples, layer_style in histogram_layers:
    values, hist_bins, patches = ax.hist(
        samples,
        bins=bins,
        density=True,
        alpha=0.5,
        **layer_style
    )

ax.set_xlabel("Delay in days")
ax.set_ylabel("Density")
ax.legend(frameon=False)

In [None]:
# Smallest delay among the samples from the reduced-shape gamma.
ideal_gamma_data_by_shape.min()

In [None]:
# Fraction of reduced-scale samples with a delay under 31 days.
(ideal_gamma_data < 31).sum() / ideal_gamma_data.shape[0]

In [None]:
# Fraction of reduced-scale samples with a delay under 62 days.
(ideal_gamma_data < 62).sum() / ideal_gamma_data.shape[0]

In [None]:
# Fraction of reduced-shape samples with a delay under 31 days.
(ideal_gamma_data_by_shape < 31).sum() / ideal_gamma_data_by_shape.shape[0]

In [None]:
# Fraction of reduced-shape samples with a delay under 62 days.
(ideal_gamma_data_by_shape < 62).sum() / ideal_gamma_data_by_shape.shape[0]

In [None]:
# Mean simulated delay under the reduced-scale gamma.
ideal_gamma_data.mean()

In [None]:
# Mean simulated delay under the reduced-shape gamma.
ideal_gamma_data_by_shape.mean()

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))

# Final figure: observed 2019 delays, samples from the fitted gamma, and
# samples from the reduced-scale gamma as overlaid density histograms on the
# same bins. The empirical layer is drawn more opaque than the simulated ones
# and keeps matplotlib's default color.
histogram_layers = (
    (delay_in_days, {"alpha": 0.75, "label": "empirical distribution"}),
    (gamma_data, {"color": "orange", "alpha": 0.5, "label": "ML gamma distribution"}),
    (ideal_gamma_data, {"color": "purple", "alpha": 0.5, "label": "ideal gamma distribution"}),
)
for samples, layer_style in histogram_layers:
    values, hist_bins, patches = ax.hist(
        samples,
        bins=bins,
        density=True,
        **layer_style
    )

# A vertical line at zero days marks the "retrospective delay" in the legend.
ax.axvline(x=0, color="#999999", label="retrospective delay")

ax.set_xlabel("Delay between sample collection and submission (days)")
ax.set_ylabel("Density")
ax.legend(frameon=False)

# Tighten the layout, then write the figure to the workflow's output path.
plt.tight_layout()
plt.savefig(figure_path)