In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

In [3]:
# Use seaborn's "white" style for all figures in this notebook.
sns.set_style(style="white")

In [4]:
# Apply all matplotlib defaults for this notebook in a single update.
mpl.rcParams.update({
    # Display figures at a reasonable default size.
    "figure.figsize": (6, 4),

    # Hide the top and right spines.
    "axes.spines.top": False,
    "axes.spines.right": False,

    # Display and save figures at higher resolution for presentations and manuscripts.
    "savefig.dpi": 300,
    "figure.dpi": 300,

    # Display text at sizes large enough for presentations and manuscripts.
    "font.weight": "normal",
    "axes.labelweight": "normal",
    "font.size": 14,
    "axes.labelsize": 14,
    "legend.fontsize": 12,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
})

## Load data

Load exhaustive grid search data. For each possible embedding method, corresponding method parameters, and HDBSCAN distance threshold, we produced an embedding for training and validation data (using 2-fold cross-validation with 3 repeats for N=6 cross-validation iterations per parameter combination), assigned clusters to each embedding, and evaluated how well all pairs of strains in the data were assigned to the same or different clusters compared to predetermined clade assignments.

In [5]:
# Read the tab-delimited grid search table provided as a Snakemake input.
grid = pd.read_table(snakemake.input.table, sep="\t")

In [6]:
# Preview the first five rows of the grid search table.
grid.head(n=5)

In [7]:
# List the column names available in the grid search table.
grid.columns.tolist()

## Identify the distance threshold that maximized the training accuracy

We want to identify the distance threshold for HDBSCAN cluster assignment that produces the most accurate clusters (based on known clade assignments) for each method. To find this optimal threshold, we calculate the mean training MCC across all cross-validation iterations and method parameter combinations for each method and distance threshold, and then find the threshold that maximizes this mean for each method.

In [8]:
# Average the training MCC over every row that shares the same method and
# distance threshold, returning one row per (method, threshold) pair.
mean_training_mcc_by_method = (
    grid
    .groupby(["method", "distance_threshold"])["training_mcc"]
    .mean()
    .reset_index()
)

In [9]:
# Display the mean training MCC for each method and distance threshold.
mean_training_mcc_by_method

In [10]:
# Plot the training MCC of every grid search row against the HDBSCAN distance
# threshold, with one panel per embedding method (two panels per row).
facet_grid = sns.relplot(
    x="distance_threshold",
    y="training_mcc",
    col="method",
    col_wrap=2,
    alpha=0.5,
    data=grid,
)

# Use the same 0-1 y-axis range in every panel.
facet_grid.set(ylim=(0, 1))

plt.tight_layout()
plt.savefig(snakemake.output.mcc_by_method_and_distance_threshold)

In [17]:
# Plot the validation MCC of every grid search row against the HDBSCAN distance
# threshold, with one panel per embedding method (two panels per row).
facet_grid = sns.relplot(
    data=grid,
    x="distance_threshold",
    y="validation_mcc",
    col="method",
    col_wrap=2,
    alpha=0.5,
)

# Use the same 0-1 y-axis range in every panel.
facet_grid.set(ylim=(0, 1))

plt.tight_layout()

# This figure is intentionally not saved: writing it to
# `snakemake.output.mcc_by_method_and_distance_threshold` would overwrite the
# training MCC figure that an earlier cell saves to that same path.
# NOTE(review): if the validation figure should be kept, give it its own
# Snakemake output instead of reusing the training figure's path.

In [28]:
# Display the grid search table for inspection.
grid

In [62]:
# Embedding methods and HDBSCAN distance thresholds to compare.
methods = ["pca", "mds", "t-sne", "umap"]
distance_thresholds = list(range(0, 14, 2))

# Average the validation MCC over all rows for each (method, distance threshold)
# pair. Grouping once avoids rebuilding the groupby for every pair, and pairs
# that are missing from the grid are simply absent instead of raising a
# KeyError from `get_group`.
is_candidate = (
    grid["method"].isin(methods)
    & grid["distance_threshold"].isin(distance_thresholds)
)
mean_validation_mcc_by_method = (
    grid[is_candidate]
    .groupby(["method", "distance_threshold"])["validation_mcc"]
    .mean()
    .reset_index()
)

# Keep the threshold with the highest mean validation MCC for each method.
# Sorting by descending MCC and then ascending threshold resolves ties to the
# smallest threshold. Reindexing by `methods` keeps the requested order and
# leaves a visible NaN row for any method absent from the grid.
best_threshold_by_method = (
    mean_validation_mcc_by_method
    .sort_values(
        ["method", "validation_mcc", "distance_threshold"],
        ascending=[True, False, True],
    )
    .drop_duplicates(subset="method", keep="first")
    .set_index("method")
    .reindex(methods)
)

# Show the labeled result: one row per method with its best threshold and MCC.
best_threshold_by_method

In [11]:
# Plot training MCC against distance threshold for the t-SNE rows only, with
# one column of panels per learning rate and one row of panels per perplexity.
facet_grid = sns.relplot(
    x="distance_threshold",
    y="training_mcc",
    row="perplexity",
    col="learning_rate",
    alpha=0.75,
    data=grid.query("method == 't-sne'"),
)

# Use the same 0-1 y-axis range in every panel.
facet_grid.set(ylim=(0, 1))

In [12]:
# Plot training MCC against distance threshold for the UMAP rows only, with
# one column of panels per n_neighbors value and one row of panels per min_dist.
facet_grid = sns.relplot(
    x="distance_threshold",
    y="training_mcc",
    row="min_dist",
    col="n_neighbors",
    alpha=0.75,
    data=grid.query("method == 'umap'"),
)

# Use the same 0-1 y-axis range in every panel.
facet_grid.set(ylim=(0, 1))

## Identify optimal method parameter values

Given the distance threshold that maximizes the training MCC above (threshold=4 for t-SNE and threshold=2 for UMAP), we next identify the combination of method parameters that maximizes the validation MCC for that distance threshold.

In [13]:
# Rank parameter combinations by mean validation MCC at distance threshold 4.
# NOTE(review): there is no explicit method filter here; presumably only t-SNE
# rows have perplexity and learning_rate values -- confirm against the table.
(
    grid.query("distance_threshold == 4")
    .groupby(["method", "perplexity", "learning_rate"])["validation_mcc"]
    .mean()
    .reset_index()
    .sort_values(by="validation_mcc", ascending=False)
)

In [14]:
# Plot validation MCC by perplexity for t-SNE at distance threshold 4, with
# points colored and dodged by learning rate.
facet_grid = sns.catplot(
    x="perplexity",
    y="validation_mcc",
    hue="learning_rate",
    dodge=True,
    alpha=0.75,
    data=grid.query("(method == 't-sne') & (distance_threshold == 4)"),
)

# Use a 0-1 y-axis range on every axes of the figure.
facet_grid.set(ylim=(0, 1))

As we’ve seen before, learning rate doesn’t matter much, but perplexity does. The maximum validation MCC for t-SNE is at perplexity=30 and learning rate = 200, which is what we’ve been using.

In [15]:
# Rank parameter combinations by mean validation MCC at distance threshold 2.
# NOTE(review): there is no explicit method filter here; presumably only UMAP
# rows have min_dist and n_neighbors values -- confirm against the table.
(
    grid.query("distance_threshold == 2")
    .groupby(["method", "min_dist", "n_neighbors"])["validation_mcc"]
    .mean()
    .reset_index()
    .sort_values(by="validation_mcc", ascending=False)
)

In [16]:
# Plot validation MCC by min_dist for UMAP at distance threshold 2, with
# points colored and dodged by n_neighbors.
facet_grid = sns.catplot(
    x="min_dist",
    y="validation_mcc",
    hue="n_neighbors",
    dodge=True,
    alpha=0.75,
    data=grid.query("(method == 'umap') & (distance_threshold == 2)"),
)

# Use a 0-1 y-axis range on every axes of the figure.
facet_grid.set(ylim=(0, 1))

In contrast with t-SNE's parameters, UMAP appears robust across all tested parameter combinations, producing roughly the same MCC values for all combinations. Although the optimal parameter combination is min_dist=0.05, n_neighbors=50 (MCC=0.681), the second best combination of min_dist=0.05, n_neighbors=100 has a nearly identical MCC (0.6796).