In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

In [None]:
# Use seaborn's "white" axes style for all figures in this notebook.
sns.set_style(style="white")

In [None]:
# Shared matplotlib defaults for every figure in this notebook, applied in one call.
mpl.rcParams.update({
    # Display figures at a reasonable default size.
    "figure.figsize": (6, 4),
    # Disable top and right spines.
    "axes.spines.top": False,
    "axes.spines.right": False,
    # Display and save figures at higher resolution for presentations and manuscripts.
    "savefig.dpi": 300,
    "figure.dpi": 300,
    # Display text at sizes large enough for presentations and manuscripts.
    "font.weight": "normal",
    "axes.labelweight": "normal",
    "font.size": 14,
    "axes.labelsize": 14,
    "legend.fontsize": 12,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
})

## Load data

Load exhaustive grid search data. For each possible embedding method, corresponding method parameters, and HDBSCAN distance threshold, we produced an embedding for training and validation data (using 2-fold validation with 3 repeats for N=6 cross-validation iterations per parameter combination), assigned clusters to each embedding, and evaluated how well all pairs of strains in the data were assigned to the same or different cluster compared to predetermined clade assignments.

In [None]:
# The grid search results are provided by Snakemake as a tab-delimited table.
grid_table_path = snakemake.input.table
grid = pd.read_csv(grid_table_path, sep="\t")

In [None]:
# Preview the first rows of the grid search table.
grid.head()

In [None]:
# List every column available in the grid search table.
grid.columns.tolist()

## Identify the distance threshold that maximized the training accuracy

We want to identify the distance threshold for HDBSCAN cluster assignment that produces the most accurate clusters (based on known clade assignments) for each method. To find this optimal threshold, we calculate the mean training MCC across all cross-validation iterations and method parameter combinations for each method and distance threshold, and then find the threshold that maximizes this mean for each method.

In [None]:
# Average the training MCC over all rows that share a method and distance threshold.
training_mcc_by_method_and_threshold = grid.groupby(
    ["method", "distance_threshold"]
)["training_mcc"]
mean_training_mcc_by_method = training_mcc_by_method_and_threshold.mean().reset_index()

In [None]:
# Inspect the mean training MCC for each method and distance threshold.
mean_training_mcc_by_method

In [None]:
# Plot the training MCC of every grid search row against the HDBSCAN distance
# threshold, with one panel per embedding method.
facet_grid = sns.relplot(
    data=grid,
    x="distance_threshold",
    y="training_mcc",
    col="method",
    col_wrap=2,
    alpha=0.5,
)

# Use the same 0-1 y-axis range on every panel so methods are directly comparable.
for ax in facet_grid.axes.flatten():
    ax.set_ylim(0, 1)

plt.tight_layout()

# NOTE: this figure is intentionally not saved. The path
# `snakemake.output.mcc_by_method_and_distance_threshold` is also written by the
# validation MCC plot later in this notebook, so a save here was always
# overwritten. TODO: add a dedicated Snakemake output if the training MCC
# figure should be kept on disk.

In [None]:
# Plot the validation MCC of every grid search row against the HDBSCAN distance
# threshold, with one panel per embedding method.
facet_grid = sns.relplot(
    x="distance_threshold",
    y="validation_mcc",
    col="method",
    col_wrap=2,
    alpha=0.5,
    data=grid,
)

# Use the same 0-1 y-axis range on every panel.
facet_grid.set(ylim=(0, 1))

# This is the figure that ends up at the Snakemake output path.
plt.tight_layout()
plt.savefig(snakemake.output.mcc_by_method_and_distance_threshold)

In [None]:
# Display the grid search table.
grid

Calculate the mean training MCC for each method and distance threshold across all cross-validation iterations. The maximum mean training MCC per method identifies the optimal distance threshold for that method.

In [None]:
# Mean training MCC for each (method, distance threshold) pair.
mean_training_mcc = (
    grid
    .groupby(["method", "distance_threshold"])["training_mcc"]
    .mean()
    .reset_index()
)

In [None]:
# Inspect the mean training MCC per method and distance threshold.
mean_training_mcc

In [None]:
# Rank rows so that, within each method, the highest mean training MCC comes first.
ranked_training_mcc = mean_training_mcc.sort_values(
    ["method", "training_mcc"],
    ascending=False,
)

# Take the first entry per method (keeping the sorted method order), which
# carries the distance threshold with the highest mean training MCC.
max_training_mcc = (
    ranked_training_mcc
    .groupby("method", sort=False)
    .first()
    .reset_index()
)

In [None]:
# Inspect the best distance threshold (by mean training MCC) for each method.
max_training_mcc

In [None]:
# Training MCC by distance threshold for t-SNE, with one panel per
# (perplexity, learning_rate) combination.
tsne_training_rows = grid.query("method == 't-sne'")
facet_grid = sns.relplot(
    x="distance_threshold",
    y="training_mcc",
    row="perplexity",
    col="learning_rate",
    alpha=0.75,
    data=tsne_training_rows,
)

# Use the same 0-1 y-axis range on every panel.
facet_grid.set(ylim=(0, 1))

In [None]:
# Training MCC by distance threshold for UMAP, with one panel per
# (min_dist, n_neighbors) combination.
umap_training_rows = grid.query("method == 'umap'")
facet_grid = sns.relplot(
    x="distance_threshold",
    y="training_mcc",
    row="min_dist",
    col="n_neighbors",
    alpha=0.75,
    data=umap_training_rows,
)

# Use the same 0-1 y-axis range on every panel.
facet_grid.set(ylim=(0, 1))

### PCA

In [None]:
# Select the row holding the best distance threshold for PCA.
is_pca = max_training_mcc["method"] == "pca"
pca_max_training_mcc = max_training_mcc[is_pca]

In [None]:
# Inspect the selected PCA distance threshold.
pca_max_training_mcc

In [None]:
# Write the selected PCA parameters as CSV, without the DataFrame index.
pca_parameters_path = snakemake.output.pca_parameters
pca_max_training_mcc.to_csv(pca_parameters_path, index=False)

### MDS

In [None]:
# Select the row holding the best distance threshold for MDS.
is_mds = max_training_mcc["method"] == "mds"
mds_max_training_mcc = max_training_mcc[is_mds]

In [None]:
# Inspect the selected MDS distance threshold.
mds_max_training_mcc

In [None]:
# Write the selected MDS parameters as CSV, without the DataFrame index.
mds_parameters_path = snakemake.output.mds_parameters
mds_max_training_mcc.to_csv(mds_parameters_path, index=False)

## Identify optimal method parameter values

Given the distance threshold that maximizes the training MCC above (threshold=4 for t-SNE and threshold=2 for UMAP), we next identify the combination of method parameters that maximizes the validation MCC for that distance threshold.

### t-SNE

In [None]:
# Select the row holding the best distance threshold for t-SNE.
is_tsne = max_training_mcc["method"] == "t-sne"
tsne_max_training_mcc = max_training_mcc[is_tsne]

In [None]:
# Keep only the grid search rows for t-SNE at its selected distance threshold
# by joining the selected (method, threshold) row back onto the full grid.
tsne_grid = pd.merge(
    tsne_max_training_mcc,
    grid,
    on=["method", "distance_threshold"],
)

In [None]:
# Preview the t-SNE rows at the selected distance threshold.
tsne_grid.head()

In [None]:
# Mean validation MCC for each t-SNE parameter combination, best first.
tsne_parameter_columns = [
    "method",
    "distance_threshold",
    "perplexity",
    "learning_rate",
]
tsne_mean_validation_mcc = (
    tsne_grid
    .groupby(tsne_parameter_columns)["validation_mcc"]
    .mean()
    .reset_index()
    .sort_values("validation_mcc", ascending=False)
)

In [None]:
# Inspect the mean validation MCC for each t-SNE parameter combination.
tsne_mean_validation_mcc

In [None]:
# Keep only the top-ranked t-SNE parameter combination by mean validation MCC.
tsne_ranked_validation_mcc = tsne_mean_validation_mcc.sort_values(
    "validation_mcc",
    ascending=False,
)
tsne_max_validation_mcc = tsne_ranked_validation_mcc.head(1)

In [None]:
# Inspect the selected t-SNE parameter combination.
tsne_max_validation_mcc

In [None]:
# Write the selected t-SNE parameters as CSV, without the DataFrame index.
tsne_parameters_path = snakemake.output.tsne_parameters
tsne_max_validation_mcc.to_csv(tsne_parameters_path, index=False)

In [None]:
# Validation MCC by perplexity at the selected distance threshold, with points
# colored and dodged by learning rate.
facet_grid = sns.catplot(
    x="perplexity",
    y="validation_mcc",
    hue="learning_rate",
    dodge=True,
    alpha=0.75,
    data=tsne_grid,
)

# Use a 0-1 y-axis range on every panel.
facet_grid.set(ylim=(0, 1))

As we’ve seen before, learning rate doesn’t matter much, but perplexity does.

### UMAP

In [None]:
# Select the row holding the best distance threshold for UMAP.
is_umap = max_training_mcc["method"] == "umap"
umap_max_training_mcc = max_training_mcc[is_umap]

In [None]:
# Keep only the grid search rows for UMAP at its selected distance threshold
# by joining the selected (method, threshold) row back onto the full grid.
umap_grid = pd.merge(
    umap_max_training_mcc,
    grid,
    on=["method", "distance_threshold"],
)

In [None]:
# Preview the UMAP rows at the selected distance threshold.
umap_grid.head()

In [None]:
# Mean validation MCC for each UMAP parameter combination, best first.
umap_parameter_columns = [
    "method",
    "distance_threshold",
    "min_dist",
    "n_neighbors",
]
umap_mean_validation_mcc = (
    umap_grid
    .groupby(umap_parameter_columns)["validation_mcc"]
    .mean()
    .reset_index()
    .sort_values("validation_mcc", ascending=False)
)

In [None]:
# Inspect the mean validation MCC for each UMAP parameter combination.
umap_mean_validation_mcc

In [None]:
# Keep only the top-ranked UMAP parameter combination by mean validation MCC.
umap_ranked_validation_mcc = umap_mean_validation_mcc.sort_values(
    "validation_mcc",
    ascending=False,
)
umap_max_validation_mcc = umap_ranked_validation_mcc.head(1)

In [None]:
# Inspect the selected UMAP parameter combination.
umap_max_validation_mcc

In [None]:
# Write the selected UMAP parameters as CSV, without the DataFrame index.
umap_parameters_path = snakemake.output.umap_parameters
umap_max_validation_mcc.to_csv(umap_parameters_path, index=False)

In [None]:
# Validation MCC by min_dist at the selected distance threshold, with points
# colored and dodged by n_neighbors.
facet_grid = sns.catplot(
    x="min_dist",
    y="validation_mcc",
    hue="n_neighbors",
    dodge=True,
    alpha=0.75,
    data=umap_grid,
)

# Use a 0-1 y-axis range on every panel.
facet_grid.set(ylim=(0, 1))

In contrast with t-SNE's parameters, UMAP appears robust across all tested parameter combinations, producing roughly the same MCC values for all combinations. Although the optimal parameter combination is min_dist=0.05, n_neighbors=50 (MCC=0.681), the second-best combination (min_dist=0.05, n_neighbors=100) has a nearly identical MCC of 0.6796.