In [None]:
from pathlib import Path

import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [None]:
# Start from seaborn's "ticks" theme, then layer the notebook-wide matplotlib
# overrides on top in a single update.
sns.set_style("ticks")

mpl.rcParams.update({
    # Disable top and right spines.
    "axes.spines.top": False,
    "axes.spines.right": False,
    # Display and save figures at higher resolution for presentations and manuscripts.
    "savefig.dpi": 300,
    "figure.dpi": 120,
    # Display text at sizes large enough for presentations and manuscripts.
    "font.weight": "normal",
    "axes.labelweight": "normal",
    "font.size": 14,
    "axes.labelsize": 14,
    "legend.fontsize": 10,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    "axes.titlesize": 14,
    # Render text with matplotlib's own engine rather than an external LaTeX install.
    "text.usetex": False,
})

## Define inputs, outputs, and parameters

In [None]:
# Input file paths for this notebook. `snakemake` is not defined in the notebook
# itself; presumably it is injected by Snakemake's notebook integration.
_workflow_inputs = snakemake.input

scatterplot_metadata = _workflow_inputs.scatterplot_metadata  # per-embedding summary table
scatterplot_pca = _workflow_inputs.scatterplot_pca
scatterplot_mds = _workflow_inputs.scatterplot_mds
scatterplot_tsne = _workflow_inputs.scatterplot_tsne
scatterplot_umap = _workflow_inputs.scatterplot_umap

In [None]:
# Path the main multi-panel figure is saved to; the poster figure's filename is
# derived from this path further down.
scatterplot_chart = snakemake.output.scatterplot

## Load data

In [None]:
# Per-embedding summary table. The plotting cells select rows by its "embedding"
# column and read "mean", "line_slope", and "line_intercept" for the panel titles.
metadata = pd.read_csv(scatterplot_metadata)

In [None]:
# Display the summary table to check that it loaded as expected.
metadata

In [None]:
# PCA distance table; the plotting cells use its "genetic" and "euclidean" columns.
pca = pd.read_csv(scatterplot_pca)

In [None]:
# Preview the first rows of the PCA table.
pca.head()

In [None]:
# Row and column counts of the PCA table.
pca.shape

In [None]:
# MDS distance table; the plotting cells use its "genetic" and "euclidean" columns.
mds = pd.read_csv(scatterplot_mds)

In [None]:
# Preview the first rows of the MDS table.
mds.head()

In [None]:
# t-SNE distance table; the plotting cells use its "genetic" and "euclidean" columns.
tsne = pd.read_csv(scatterplot_tsne)

In [None]:
# Preview the first rows of the t-SNE table.
tsne.head()

In [None]:
# UMAP distance table; the plotting cells use its "genetic" and "euclidean" columns.
umap = pd.read_csv(scatterplot_umap)

In [None]:
# Preview the first rows of the UMAP table.
umap.head()

## Plot scatterplots per embedding

In [None]:
def distance_tick_formatter(tick_val, tick_pos):
    """Format a tick value as a whole-number label.

    The signature matches what matplotlib passes to a function-based tick
    formatter: the tick's value and its position index.

    Parameters
    ----------
    tick_val : float
        Tick value; any fractional part is truncated toward zero.
    tick_pos : int
        Position of the tick on the axis (unused).

    Returns
    -------
    str
        The truncated tick value as text, e.g. ``"3"`` for ``3.0``.
    """
    whole_value = int(tick_val)
    return f"{whole_value}"

In [None]:
# Tick locator that places ticks on integers, using multiples of 1, 4, 5, or 10.
# NOTE(review): a matplotlib Locator keeps a reference to the one Axis it was
# last attached to, so a single instance should only ever be set on one axis.
# Build a fresh MaxNLocator per axis when several panels need the same settings.
xaxis_locator = MaxNLocator(steps=[1, 4, 5, 10], integer=True)

In [None]:
# Manuscript figure: one boxplot panel per embedding in a 2x2 grid, showing
# Euclidean distance grouped by genetic distance. Each title reports the
# embedding's "mean" value from `metadata` as R^2 together with its line equation.
fig, axes = plt.subplots(2, 2, figsize=(8, 8), dpi=300)

# One entry per panel, in drawing order:
# (title label, value in metadata["embedding"], distance table,
#  outlier marker size, allowed y-axis tick steps)
panels = [
    ("PCA", "pca", pca, 1, [1, 4, 5, 10]),
    ("MDS", "mds", mds, 0.5, [1, 4, 5, 10]),
    ("t-SNE", "t-sne", tsne, 0.5, [1, 2, 4, 5, 10]),
    ("UMAP", "umap", umap, 0.5, [1, 4, 5, 10]),
]

for ax, (label, embedding, distances, fliersize, y_steps) in zip(axes.flatten(), panels):
    embedding_metadata = metadata.loc[metadata["embedding"] == embedding]
    if embedding_metadata.empty:
        # Fail with a clear message instead of an opaque IndexError below.
        raise ValueError(f"No metadata row found for embedding '{embedding}'.")

    r_squared = embedding_metadata["mean"].iloc[0]
    slope = embedding_metadata["line_slope"].iloc[0]
    intercept = embedding_metadata["line_intercept"].iloc[0]
    # The sign is written separately so the title reads "ax + b" or "ax - b".
    intercept_sign = "+" if intercept >= 0 else "-"

    sns.boxplot(
        x="genetic",
        y="euclidean",
        data=distances,
        ax=ax,
        fliersize=fliersize,
        linewidth=0.25,
        color="lightblue",
    )

    ax.set_xlabel("Genetic distance")
    ax.set_ylabel("Euclidean distance")

    # Every axis gets its own locator instance: a matplotlib locator is bound to
    # the single Axis it was last attached to, so sharing one across panels would
    # make all of them compute ticks from the last panel's view limits.
    # NOTE(review): boxplot x positions are categorical indices (0..n-1), so these
    # integer labels only equal the genetic distance if the categories are
    # consecutive integers starting at 0 - confirm against the data.
    ax.xaxis.set_major_formatter(distance_tick_formatter)
    ax.xaxis.set_major_locator(MaxNLocator(steps=[1, 4, 5, 10], integer=True))

    ax.yaxis.set_major_formatter(distance_tick_formatter)
    ax.yaxis.set_major_locator(MaxNLocator(steps=y_steps, integer=True))

    ax.set_title(f"{label} ($R^2={r_squared:.2f}, y = {slope:.2f}x {intercept_sign} {np.abs(intercept):.2f}$)")

fig.tight_layout()
fig.savefig(scatterplot_chart)

## Plot figure in wide layout for poster

In [None]:
# Poster figure: the same four boxplot panels laid out in a single wide row.
# Each title reports the embedding's "mean" value from `metadata` as R^2
# together with its line equation.
fig, axes = plt.subplots(1, 4, figsize=(16, 4), dpi=350)

# One entry per panel, in drawing order:
# (title label, value in metadata["embedding"], distance table,
#  outlier marker size, allowed y-axis tick steps)
panels = [
    ("PCA", "pca", pca, 1, [1, 4, 5, 10]),
    ("MDS", "mds", mds, 0.5, [1, 4, 5, 10]),
    ("t-SNE", "t-sne", tsne, 0.5, [1, 2, 4, 5, 10]),
    ("UMAP", "umap", umap, 0.5, [1, 4, 5, 10]),
]

for ax, (label, embedding, distances, fliersize, y_steps) in zip(axes.flatten(), panels):
    embedding_metadata = metadata.loc[metadata["embedding"] == embedding]
    if embedding_metadata.empty:
        # Fail with a clear message instead of an opaque IndexError below.
        raise ValueError(f"No metadata row found for embedding '{embedding}'.")

    r_squared = embedding_metadata["mean"].iloc[0]
    slope = embedding_metadata["line_slope"].iloc[0]
    intercept = embedding_metadata["line_intercept"].iloc[0]
    # The sign is written separately so the title reads "ax + b" or "ax - b".
    intercept_sign = "+" if intercept >= 0 else "-"

    sns.boxplot(
        x="genetic",
        y="euclidean",
        data=distances,
        ax=ax,
        fliersize=fliersize,
        linewidth=0.25,
        color="lightblue",
    )

    ax.set_xlabel("Genetic distance")
    ax.set_ylabel("Euclidean distance")

    # Every axis gets its own locator instance: a matplotlib locator is bound to
    # the single Axis it was last attached to, so sharing one across panels would
    # make all of them compute ticks from the last panel's view limits.
    # NOTE(review): boxplot x positions are categorical indices (0..n-1), so these
    # integer labels only equal the genetic distance if the categories are
    # consecutive integers starting at 0 - confirm against the data.
    ax.xaxis.set_major_formatter(distance_tick_formatter)
    ax.xaxis.set_major_locator(MaxNLocator(steps=[1, 4, 5, 10], integer=True))

    ax.yaxis.set_major_formatter(distance_tick_formatter)
    ax.yaxis.set_major_locator(MaxNLocator(steps=y_steps, integer=True))

    ax.set_title(f"{label} ($R^2={r_squared:.2f}, y = {slope:.2f}x {intercept_sign} {np.abs(intercept):.2f}$)")

fig.tight_layout()

# Insert "-poster" before the file extension. Building the name from the stem and
# suffix keeps the poster from overwriting the main figure when the output is not
# a ".png", and leaves any ".png" elsewhere in the path untouched.
chart_path = Path(scatterplot_chart)
poster_path = chart_path.with_name(f"{chart_path.stem}-poster{chart_path.suffix}")
fig.savefig(poster_path)