In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

In [3]:
sns.set_style("white")

In [4]:
# Display figures at a reasonable default size.
mpl.rcParams['figure.figsize'] = (6, 4)

# Disable top and right spines.
mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['axes.spines.right'] = False
    
# Display and save figures at higher resolution for presentations and manuscripts.
mpl.rcParams['savefig.dpi'] = 300
mpl.rcParams['figure.dpi'] = 300

# Display text at sizes large enough for presentations and manuscripts.
mpl.rcParams['font.weight'] = "normal"
mpl.rcParams['axes.labelweight'] = "normal"
mpl.rcParams['font.size'] = 14
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['legend.fontsize'] = 12
mpl.rcParams['xtick.labelsize'] = 14
mpl.rcParams['ytick.labelsize'] = 14

## Load data

Load exhaustive grid search data. For each possible embedding method, corresponding method parameters, and HDBSCAN distance threshold, we produced an embedding for training and validation data (using 2-fold validation with 3 repeats for N=6 cross-validation iterations per parameter combination), assigned clusters to each embedding, and evaluated how well all pairs of strains in the data were assigned to the same or different cluster compared to predetermined clade assignments. Note that the analyses below rank parameter combinations by the `mse` column of this table, which the figures label as the mean squared test error between observed and predicted Euclidean distance.

In [5]:
grid = pd.read_csv(snakemake.input.table)

In [6]:
grid.head()

In [7]:
list(grid.columns)

## Identify optimal method parameter values

Find the parameters for each method that minimize the mean squared error (MSE) across all replicates.

In [8]:
grid_columns = [
    "method",
    "components",
    "perplexity",
    "learning_rate",
    "nearest_neighbors",
    "min_dist",
]

In [9]:
grid

In [10]:
grid.groupby(grid_columns, dropna=False)["mse"].agg(["mean", "std"]).reset_index()

### PCA

In [11]:
pca_grid = grid.query("method == 'pca'")

In [12]:
pca_grid.shape

In [13]:
pca_grid

In [14]:
pca_mean_accuracy = pca_grid.groupby([
    "method",
    "components",
])["mse"].mean().reset_index().sort_values(
    "mse",
    ascending=False
)

In [15]:
pca_mean_accuracy

In [16]:
pca_best_accuracy = pca_mean_accuracy.sort_values("mse", ascending=True).head(1)

In [17]:
pca_best_accuracy

In [18]:
pca_best_accuracy["virus"] = snakemake.wildcards.virus
pca_best_accuracy["recombination_rate"] = snakemake.wildcards.recombination_rate

In [19]:
pca_best_accuracy

In [20]:
pca_best_accuracy.to_csv(
    snakemake.output.pca_parameters,
    index=False,
)

In [21]:
# Compare MSE across PCA component counts: a gray box plot per component count
# with a swarm plot of the individual rows drawn on the same axes.
fig, ax = plt.subplots(1, 1, figsize=(6, 6), dpi=120)
sns.boxplot(
    data=pca_grid,
    x="components",
    y="mse",
    ax=ax,
    color="#CCCCCC",
)

sns.swarmplot(
    data=pca_grid,
    x="components",
    y="mse",
    ax=ax
)

# Label the x-axis explicitly, as the t-SNE and UMAP figures do, instead of
# showing the raw column name.
ax.set_xlabel("Number of components")
ax.set_ylabel("Mean squared test error\nobserved and predicted Euclidean distance")

# Anchor the y-axis at zero so error magnitudes are comparable across figures.
ax.set_ylim(bottom=0)

plt.tight_layout()
plt.savefig(snakemake.output.score_by_pca_parameters)

### MDS

In [22]:
mds_grid = grid.query("method == 'mds'")

In [23]:
mds_grid.shape

In [24]:
mds_mean_accuracy = mds_grid.groupby([
    "method",
    "components",
])["mse"].mean().reset_index().sort_values(
    "mse",
    ascending=False
)

In [25]:
mds_mean_accuracy

In [26]:
mds_best_accuracy = mds_mean_accuracy.sort_values("mse", ascending=True).head(1)

In [27]:
mds_best_accuracy

In [28]:
mds_best_accuracy["virus"] = snakemake.wildcards.virus
mds_best_accuracy["recombination_rate"] = snakemake.wildcards.recombination_rate

In [29]:
mds_best_accuracy

In [30]:
mds_best_accuracy.to_csv(
    snakemake.output.mds_parameters,
    index=False,
)

In [31]:
# Compare MSE across MDS component counts: a gray box plot per component count
# with a swarm plot of the individual rows drawn on the same axes.
fig, ax = plt.subplots(1, 1, figsize=(6, 6), dpi=120)
sns.boxplot(
    data=mds_grid,
    x="components",
    y="mse",
    ax=ax,
    color="#CCCCCC",
)

sns.swarmplot(
    data=mds_grid,
    x="components",
    y="mse",
    ax=ax
)

# Label the x-axis explicitly, as the t-SNE and UMAP figures do, instead of
# showing the raw column name.
ax.set_xlabel("Number of components")
ax.set_ylabel("Mean squared test error\nobserved and predicted Euclidean distance")

# Anchor the y-axis at zero so error magnitudes are comparable across figures.
ax.set_ylim(bottom=0)

plt.tight_layout()
plt.savefig(snakemake.output.score_by_mds_parameters)

### t-SNE

In [32]:
tsne_grid = grid.query("method == 't-sne'")

In [33]:
tsne_grid.shape

In [34]:
tsne_mean_accuracy = tsne_grid.groupby([
    "method",
    "perplexity",
    "learning_rate"
])["mse"].mean().reset_index().sort_values(
    "mse",
    ascending=False
)

In [35]:
tsne_mean_accuracy

In [36]:
tsne_best_accuracy = tsne_mean_accuracy.sort_values("mse", ascending=True).head(1)

In [37]:
tsne_best_accuracy

In [38]:
tsne_best_accuracy["virus"] = snakemake.wildcards.virus
tsne_best_accuracy["recombination_rate"] = snakemake.wildcards.recombination_rate

In [39]:
tsne_best_accuracy

In [40]:
tsne_best_accuracy.to_csv(
    snakemake.output.tsne_parameters,
    index=False,
)

In [41]:
# Box plot of MSE by perplexity, with one dodged box per learning rate.
facet_grid = sns.catplot(
    kind="box",
    data=tsne_grid,
    x="perplexity",
    y="mse",
    hue="learning_rate",
    dodge=True,
    height=6,
    aspect=1.41,
    legend=False,  # the legend is added explicitly below with its own title
)

# Give every facet axis readable labels and a y-axis that starts at zero.
for ax in facet_grid.axes.flat:
    ax.set_xlabel("Perplexity")
    ax.set_ylabel("Mean squared test error\nobserved and predicted Euclidean distance")
    ax.set_ylim(bottom=0)

facet_grid.add_legend(loc="upper right", title="Learning rate")

plt.tight_layout()
plt.savefig(snakemake.output.score_by_tsne_parameters)

### UMAP

In [42]:
umap_grid = grid.query("method == 'umap'")

In [43]:
umap_grid.head()

In [44]:
umap_grid.shape

In [45]:
umap_mean_accuracy = umap_grid.groupby([
    "method",
    "min_dist",
    "nearest_neighbors",
])["mse"].mean().reset_index().sort_values(
    "mse",
    ascending=False
)

In [46]:
umap_mean_accuracy

In [47]:
umap_best_accuracy = umap_mean_accuracy.sort_values("mse", ascending=True).head(1)

In [48]:
umap_best_accuracy

In [49]:
umap_best_accuracy["virus"] = snakemake.wildcards.virus
umap_best_accuracy["recombination_rate"] = snakemake.wildcards.recombination_rate

In [50]:
umap_best_accuracy

In [51]:
umap_best_accuracy.to_csv(
    snakemake.output.umap_parameters,
    index=False,
)

In [52]:
# Box plot of MSE by minimum distance, with one dodged box per neighbor count.
facet_grid = sns.catplot(
    kind="box",
    data=umap_grid,
    x="min_dist",
    y="mse",
    hue="nearest_neighbors",
    dodge=True,
    height=6,
    aspect=1.41,
    legend=False,  # the legend is added explicitly below with its own title
)

# Give every facet axis readable labels and a y-axis that starts at zero.
for ax in facet_grid.axes.flat:
    ax.set_xlabel("Minimum distance between points")
    ax.set_ylabel("Mean squared test error\nobserved and predicted Euclidean distance")
    ax.set_ylim(bottom=0)

facet_grid.add_legend(loc="upper right", title="Nearest neighbors")

plt.tight_layout()
plt.savefig(snakemake.output.score_by_umap_parameters)

## Find best accuracy per method

In [53]:
best_accuracy = pd.concat([pca_best_accuracy, mds_best_accuracy, tsne_best_accuracy, umap_best_accuracy])

In [54]:
best_accuracy

In [55]:
output_columns = ["virus", "recombination_rate"] + grid_columns + ["mse"]
best_accuracy.to_csv(
    snakemake.output.summary_score_by_method,
    index=False,
    columns=output_columns,
)