# This is Cartography put together into one place. 

## Modules to add to run this code:

- [BioPython][1]
- [Pandas][2]
- [Numpy][3]
- [Altair][4]
- [Seaborn][5]
- [Scikit-Learn][6]
- [UMAP][7]
- json
- nextstrain-augur
- statsmodels

[1]:https://biopython.org/wiki/Download
[2]:https://pandas.pydata.org/pandas-docs/version/0.23.3/install.html
[3]:https://docs.scipy.org/doc/numpy/user/quickstart.html
[4]:https://altair-viz.github.io/getting_started/installation.html
[5]:https://seaborn.pydata.org/installing.html
[6]:https://scikit-learn.org/stable/install.html
[7]:https://umap-learn.readthedocs.io/en/latest/


# Massive to-do: remove ALL code that duplicates what the scripts already do for you. 
- finish the KDE plot script
- push everything to GitHub
- fix the snakemake input/output in the main file
- delete everything that isn't visualization (or creating the tree and merging)

# Imports Section 

In [None]:
import pandas as pd
import altair as alt
import numpy as np
from scipy.spatial.distance import squareform, pdist
from Bio import SeqIO
import seaborn as sns
import re
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from augur.utils import json_to_tree
import json
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
import umap
from scipy.stats import linregress
from pathlib import Path
import statsmodels
import statistics
import matplotlib.pyplot as plt
from Helpers import get_euclidean_data_frame, get_hamming_distances, linking_tree_with_plots_brush
from Helpers import linking_tree_with_plots_clickable
from Helpers import scatterplot_xyvalues, scatterplot_tooltips, scatterplot_with_tooltip_interactive
from Helpers import get_y_positions
#import selenium
#from selenium.webdriver import Chrome 
#from svglib.svglib import svg2rlg
#from reportlab.graphics import renderPDF
from Helpers import get_y_positions
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [None]:
# Start a headless Chrome session through Selenium.
# NOTE(review): presumably used for exporting Altair charts to SVG/PNG - confirm.
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
for chrome_flag in (
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    "--remote-debugging-port=9222",
):
    chrome_options.add_argument(chrome_flag)

browser = webdriver.Chrome(options=chrome_options)

## Pathogen-specific variables

Consider consolidating these into a single configuration file that can be passed to the notebook as a command line argument for more scriptable generation of these figures.

# Flu Specific Variables

In [None]:
# Read the pathogen-specific inputs supplied by Snakemake.
path = snakemake.input.alignment
tree_path = snakemake.input.tree

# Context managers make sure every input file is closed after it is read.
with open(snakemake.input.dropped_strains, "r") as fh:
    dropped_strains = [line.rstrip('\n') for line in fh]

# Always define clades_to_plot (None when no clade file is given) so later
# cells can rely on the name existing. The file is looked up by its input
# name rather than by its position in the input list.
clade_names_path = getattr(snakemake.input, "clade_names", None)
if clade_names_path is not None:
    with open(clade_names_path, "r") as fh:
        clades_to_plot = [line.rstrip('\n') for line in fh]
else:
    clades_to_plot = None

# Strip surrounding whitespace: virus_name is concatenated into output file
# names, where a trailing newline would be an error.
with open(snakemake.input.disease_name, "r") as fh:
    virus_name = fh.read().strip()

### Old Flu Files

In [None]:
# Hard-coded inputs for the older seasonal flu build (2018-2020 results).
virus_name = "Flu"
path = "../seasonal-flu-nextstrain/results-20182020/variable_sites.fasta"
tree_path = "../seasonal-flu-nextstrain/auspice-20182020/flu_seasonal_h3n2_ha_2y.json"
dropped_strains = []
# Clades that keep their own colour in the plots; all others become "other".
clades_to_plot = [
    '3c3.A',
    'A3',
    'A2/re',
    'A2',
    'A1b/135N',
    'A1b/131K',
    'A1b/94N',
    'A1b/131K',
    'A1b/197R',
    'A1b/135K',
    'A1b/137F',
    'A1b/186D',
]

# Zika Specific Variables 

In [None]:
# Hard-coded inputs for the Zika build.
virus_name = "Zika"
path = "../zika-nextstrain/results-20182020/variable_sites.fasta"
tree_path = "../zika-nextstrain/auspice-20182020/zika-cartography_tree.json"
distance_matrix_file = "DistanceMatrixZika.csv"
dropped_strains = ["Yunnan/01/2019"]
# No clade list for Zika: clade colours fall back to raw clade membership.
clades_to_plot = None

# Reading in all the data from the scripts

In [None]:
# Load the precomputed pairwise matrix written by the pipeline scripts; the
# first CSV column becomes the row index.
# NOTE(review): the file is named "distance_matrix" while the variable is
# called "similarity" - later cells treat it as distances.
similarity_matrix = pd.read_csv("../seasonal-flu-nextstrain/results/distance_matrix_tsne.csv", index_col=0)

In [None]:
# Label the matrix rows with the index of the PCA embedding file.
# NOTE(review): assumes the rows of both files are in the same strain order - confirm.
# The former chained assignment also bound `principalDf` to this Index object
# (not a DataFrame); the PCA section loads `principalDf` properly before use.
similarity_matrix.index = pd.read_csv("../seasonal-flu-nextstrain/results/embed_pca.csv", index_col=0).index

In [None]:
# Reuse the row labels as column labels so the matrix is labelled identically on both axes.
similarity_matrix.columns = similarity_matrix.index

# Creating the Phylogenetic Tree in Altair
- I used Altair to make this tree (documentation linked [here][1]).
- The data from the JSON and the data from the tree usually differ slightly, so you may get some errors after merging the two data frames.

[1]: https://altair-viz.github.io/index.html

In [None]:
# Parse the tree JSON found at `tree_path`.
with open(tree_path) as tree_file:
    json_tree_handle = json.load(tree_file)

In [None]:
# Convert the parsed JSON into a tree object using augur's helper.
tree = json_to_tree(json_tree_handle)

In [None]:
tree

In [None]:
# Look up a vertical position for every clade and store it on the clade
# itself as `yvalue` so later cells can read it directly.
heights = get_y_positions(tree)
for clade in tree.find_clades():
    clade.yvalue = heights[clade]

In [None]:
# Build one record per tip of the tree with its annotations and the
# coordinates of its parent.
node_data = []
for node in tree.find_clades(terminal=True):
    attrs = node.node_attrs
    parent = node.parent
    num_date = attrs["num_date"]["value"]

    # Explicit conditionals replace the `x and a or b` idiom, which fell back
    # to the node's own value whenever the parent's value was falsy (e.g. a
    # parent y of 0) and returned the whole "num_date" dict, not its value.
    if parent is not None:
        parent_date = parent.node_attrs["num_date"]["value"]
        parent_y = parent.yvalue
    else:
        parent_date = num_date
        parent_y = node.yvalue

    node_data.append({
        "strain": node.name,
        "date": num_date,
        "y": node.yvalue,
        "region": attrs["region"]["value"],
        "country": attrs["country"]["value"],
        "parent_date": parent_date,
        "parent_y": parent_y,
        "clade_membership": attrs['clade_membership']["value"],
    })

In [None]:
node_data[10]

In [None]:
# One row per tip, one column per key of the node_data records.
node_df = pd.DataFrame(node_data)

In [None]:
node_df.head()

In [None]:
# Flip the y axis: the tip with the largest y becomes 0.
# NOTE: not idempotent - re-running this cell flips the values back.
node_df["y"] = node_df["y"].max() - node_df["y"]

In [None]:
# Flip parent_y with the SAME offset that is used for "y". node_df["y"] was
# filled from the tips' `yvalue`, so the largest tip yvalue is the maximum
# that "y" is flipped around. Using parent_y's own maximum would shift
# parents and tips into different coordinate frames.
# NOTE: not idempotent - re-running this cell flips the values back.
max_tip_y = max(node.yvalue for node in tree.find_clades(terminal=True))
node_df["parent_y"] = max_tip_y - node_df["parent_y"]

In [None]:
node_df.shape

In [None]:
node_df.head()

In [None]:
node_df["region"].unique()

In [None]:
# Reannotate clades that we aren't interested in as "other" to simplify color assignment in visualizations.
# When clades_to_plot is missing (NameError) or None (TypeError from the `in`
# test), fall back to colouring by the raw clade membership. Only these two
# exceptions are caught so unrelated errors are no longer silently hidden.
try:
    node_df["clade_membership_color"] = node_df["clade_membership"].apply(
        lambda clade: clade if clade in clades_to_plot else "other"
    )
except (NameError, TypeError):
    node_df["clade_membership_color"] = node_df["clade_membership"]
    print("clades_to_plot undefined")

In [None]:
# Keep only the strains that are present in the tree, on both axes.
in_tree = similarity_matrix.index.isin(node_df["strain"])

# Rows for strains missing from the tree (ignoring rows that are entirely NaN).
indices_to_drop = similarity_matrix[~in_tree].dropna(how='all')

similarity_matrix = similarity_matrix[in_tree].dropna(how='all')
similarity_matrix = similarity_matrix.drop(columns=indices_to_drop.index)
similarity_matrix

In [None]:
node_df

# Running PCA on Scaled and Centered Data
- I treated each nucleotide as a "site", or dimension, and found the probability of having a certain nucleotide given the frequency of that letter at that site.
- I used [this paper][1] as my source 
- The equation is as follows where C is the matrix of dimensions, M is the mean, and p is the frequency of a nucleotide at that given site. 
![](https://journals.plos.org/plosgenetics/article/file?type=thumbnail&id=info:doi/10.1371/journal.pgen.0020190.e003)

In [None]:
# Load the PCA embedding (index = strain) and expose the strain as a regular
# column so it can be merged with the tree data.
#principalDf = pd.read_csv(snakemake.input.pca)
principalDf = pd.read_csv(
    "../seasonal-flu-nextstrain/results/embed_pca.csv",
    index_col=0,
).assign(strain=lambda embedding: embedding.index)

In [None]:
# Load the explained variance table produced by the PCA script; later cells
# read its "principal components" and "explained variance" columns.
#explained_variance_df = pd.read_csv(snakemake.input.pca + "explained_variance")
explained_variance_df = pd.read_csv("../seasonal-flu-nextstrain/results/explained_variance_pca.csv")
explained_variance_df

In [None]:
# Line chart of explained variance against principal component number.
chart = (
    alt.Chart(explained_variance_df)
    .mark_line()
    .encode(
        x=alt.X('principal components:Q'),
        y=alt.Y('explained variance:Q'),
    )
)
chart

In [None]:
# Attach tree annotations to the PCA coordinates; the default inner merge
# keeps only strains present in both data frames.
merged_pca_df = principalDf.merge(node_df, on="strain")

In [None]:
merged_pca_df.head()

In [None]:
# Explained variance per principal component as a plain Python list.
explained_variance_PCA = explained_variance_df["explained variance"].tolist()

In [None]:
# Link the tree with scatterplots of the first six principal components.
# Each axis title carries the percentage of variance explained by that PC.
pca_columns = ['pca1','pca2','pca3','pca4', 'pca5', 'pca6']
pca_titles = [
    'PCA{} (Explained Variance : {}%)'.format(pc + 1, round(explained_variance_PCA[pc]*100,2))
    for pc in range(6)
]
list_of_chart = linking_tree_with_plots_brush(
    merged_pca_df,
    pca_columns,
    pca_titles,
    "clade_membership:N",
    ['strain','region'],
)
PCAFluBrush = list_of_chart[0]|list_of_chart[1]|list_of_chart[2]

PCAFluBrush
#PCAFluBrush.save("docs/PCA" + virus_name + "Brush.html")

In [None]:
# Genetic vs. Euclidean (PCA) distance, with the fitted LOWESS curve stored
# in the same table.
#total_df = pd.read_csv(snakemake.input.pca.replace(".csv", "") + "scatterplot.csv", index_col=0)
total_df_PCA = pd.read_csv("../seasonal-flu-nextstrain/results/scatterplot_pca.csv",index_col=0)

fig, ax = plt.subplots(figsize=(6, 6))

ax.plot(total_df_PCA["genetic"], total_df_PCA["euclidean"], "o", alpha=0.25)
ax.plot(total_df_PCA["LOWESS_x"], total_df_PCA["LOWESS_y"], label="LOESS")

# NOTE(review): the title reports the "pearson_coef" column as R^2 - confirm
# the column really holds the squared coefficient.
pca_coefficient = total_df_PCA["pearson_coef"].values.tolist()[0]
ax.set(
    xlabel="Genetic distance",
    ylabel="Euclidean distance (PCA)",
    title="PCA Euclidean distance vs. genetic distance ($R^2=%.3f$)" % pca_coefficient,
)

sns.despine()

In [None]:
# Within- vs. between-clade Euclidean distances in PCA space.
# The column arguments must name actual columns of merged_pca_df. Other cells
# of this notebook address the PCA coordinates as lowercase "pca1"/"pca2"
# (and the UMAP cell passes its real column names the same way), so the
# former "PCA1"/"PCA2" did not match the data frame.
PCA_violin_df = get_euclidean_data_frame(merged_pca_df, "pca1", "pca2", "PCA")
g = sns.FacetGrid(
    PCA_violin_df,
    col="embedding",
    col_wrap=3,
    sharey=False,
    height=4
)
g = g.map(sns.violinplot, "clade_status", "distance", order=["within", "between"])
g.set_axis_labels("Clade status", "Distance")
#plt.savefig("docs/PCAViolinPlot" + virus_name + ".png")

# Running UMAP on the first 10 PCs - colored by region and clade

In [None]:
# Two-dimensional UMAP embedding of `genomes_df`.
# NOTE(review): `genomes_df` is not defined anywhere in the visible notebook;
# this cell looks stale - confirm where it should come from.
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=200,
    min_dist=.05,
    init="spectral",
)
X_transformed_umap_1 = reducer.fit_transform(genomes_df)
UMAP_PCA_df_1 = pd.DataFrame(X_transformed_umap_1, columns=['UMAP_1', 'UMAP_2'])
merged_umap_pca_df_1 = UMAP_PCA_df_1

In [None]:
# Two-dimensional UMAP embedding of `principalComponents`, merged with the
# tree annotations by strain.
# NOTE(review): `principalComponents` and `strains` are not defined anywhere
# in the visible notebook; this cell looks stale - confirm their source.
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=200,
    min_dist=.05,
    init="spectral",
)
X_transformed_umap_pca = reducer.fit_transform(principalComponents)
UMAP_PCA_df = pd.DataFrame(X_transformed_umap_pca, columns=['UMAP1', 'UMAP2'])
UMAP_PCA_df["strain"] = strains
merged_umap_pca_df = UMAP_PCA_df.merge(node_df, on="strain")

In [None]:
# Put both UMAP embeddings side by side; DataFrame.join aligns on the row index.
# NOTE(review): the left frame was merged with node_df while the right one was
# not, so matching row positions are assumed - confirm the rows line up.
new_df = merged_umap_pca_df.join(merged_umap_pca_df_1)

# By clade:

In [None]:
# Link the tree with both UMAP embeddings, coloured by clade, and export it.
list_of_data_and_titles = ['UMAP1','UMAP2', 'UMAP_1', 'UMAP_2']
list_of_chart = linking_tree_with_plots_brush(
    new_df,
    list_of_data_and_titles,
    list_of_data_and_titles,
    'clade_membership_color',
    ["clade_membership","strain:N"]
)
# Other cells take index 0 as the tree and the scatterplots in column-pair
# order. UMAP1/UMAP2 come from the fit on `principalComponents` and
# UMAP_1/UMAP_2 from the fit on `genomes_df`, so the two titles were swapped.
chart = (
    list_of_chart[0]
    | list_of_chart[1].properties(title="UMAP run on 10 PCs")
    | list_of_chart[2].properties(title="UMAP on genomic data")
)
chart.save("docs/UmapPCLinkedClade" + virus_name + ".html")
chart.save("docs/UmapPCLinkedClade" + virus_name + ".svg", scale_factor=2.0)
# NOTE(review): the svglib/reportlab imports providing svg2rlg and renderPDF
# are commented out in the imports cell; restore them or the next two lines
# raise NameError.
drawing = svg2rlg("docs/UmapPCLinkedClade" + virus_name + ".svg")
renderPDF.drawToFile(drawing, "docs/UmapPCLinkedClade" + virus_name + ".pdf")

In [None]:
# Violin plot of within- vs. between-clade distances for the UMAP-on-PCs
# embedding, written to docs/.
UMAP_PCA_violin_df = get_euclidean_data_frame(merged_umap_pca_df, "UMAP1", "UMAP2", "UMAP")
g = sns.FacetGrid(UMAP_PCA_violin_df, col="embedding", col_wrap=3, sharey=False, height=4)
g = g.map(
    sns.violinplot,
    "clade_status",
    "distance",
    order=["within", "between"],
)
g.set_axis_labels("Clade status", "Distance")
plt.savefig("docs/UMAPPCCladeViolinPlot" + virus_name + ".png")

# Running MDS on the Dataset

In [None]:
# Load the MDS embedding (index = strain) and expose the strain as a column
# for merging.
#principalDf = pd.read_csv(snakemake.input.pca)
MDS_df = pd.read_csv(
    "../seasonal-flu-nextstrain/results/embed_mds.csv",
    index_col=0,
).assign(strain=lambda embedding: embedding.index)

In [None]:
# Attach tree annotations to the MDS coordinates (inner merge on strain).
merged_mds_df = MDS_df.merge(node_df, on="strain")

In [None]:
# Interactive scatterplots for three consecutive pairs of MDS dimensions,
# shown side by side.
chart_12_mds, chart_34_mds, chart_56_mds = (
    scatterplot_with_tooltip_interactive(
        merged_mds_df,
        x_column,
        y_column,
        x_column,
        y_column,
        ['strain','clade_membership'],
        'clade_membership_color',
    )
    for x_column, y_column in (('mds1', 'mds2'), ('mds3', 'mds4'), ('mds5', 'mds6'))
)
chart_12_mds|chart_34_mds|chart_56_mds

In [None]:
# Tree linked (by brushing) with the first two MDS dimensions.
list_of_chart = linking_tree_with_plots_brush(
    merged_mds_df,
    ['mds1','mds2'],
    ["MDS1", "MDS2"],
    'clade_membership_color:N',
    ['strain','clade_membership'],
)
list_of_chart[0]|list_of_chart[1]

In [None]:
# Genetic vs. Euclidean (MDS) distance, with the fitted LOWESS curve stored
# in the same table.
#total_df = pd.read_csv(snakemake.input.pca.replace(".csv", "") + "scatterplot.csv", index_col=0)
total_df_MDS = pd.read_csv("../seasonal-flu-nextstrain/results/scatterplot_mds.csv",index_col=0)

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.plot(total_df_MDS["genetic"], total_df_MDS["euclidean"], "o", alpha=0.25)
ax.plot(total_df_MDS["LOWESS_x"], total_df_MDS["LOWESS_y"], label="LOESS")

ax.set_xlabel("Genetic distance")
# The label previously said "(PCA)", copied from the PCA cell; this is the MDS plot.
ax.set_ylabel("Euclidean distance (MDS)")
ax.set_title("MDS Euclidean distance vs. genetic distance ($R^2=%.3f$)" % (total_df_MDS["pearson_coef"].values.tolist()[0]))

sns.despine()

In [None]:
# KDE plots of within- vs. between-clade distances for the MDS embedding
# (left) and for the genetic distances (right).
# The column arguments must name actual columns of merged_mds_df, which other
# cells address as lowercase "mds1"/"mds2"; the former "MDS1"/"MDS2" did not
# match the data frame.
MDS_violin_df = get_euclidean_data_frame(merged_mds_df, "mds1", "mds2", "MDS")
scaler = StandardScaler()

# Standardise the pairwise Euclidean distances over all embedding dimensions.
# NOTE(review): assumes pdist over MDS_df yields the same pairs, in the same
# order, as the rows of MDS_violin_df - confirm.
MDS_violin_df["scaled_distance"] = scaler.fit_transform(pdist(MDS_df.drop(["strain"], axis = 1)).reshape(-1, 1))

# Condensed form of the square genetic distance matrix.
MDS_violin_df["genetic_distance"] = squareform(similarity_matrix).reshape(-1, 1)

fig, axes = plt.subplots(1, 2, figsize=(12, 6))

panels = (
    ("scaled_distance", "Scaled Euclidean distance from embedding"),
    ("genetic_distance", "Genetic distance"),
)
for ax, (column, xlabel) in zip(axes, panels):
    sns.kdeplot(MDS_violin_df.query("clade_status == 'within'")[column], label="Same clade", ax=ax)
    sns.kdeplot(MDS_violin_df.query("clade_status == 'between'")[column], label="Different clade", ax=ax)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("KDE density")

fig.suptitle('MDS KDE Plot', fontsize=16)
sns.despine()

# Running T-SNE on the Dataset 

In [None]:
# Load the t-SNE embedding (index = strain) and expose the strain as a column
# for merging.
#principalDf = pd.read_csv(snakemake.input.pca)
TSNE_df = pd.read_csv(
    "../seasonal-flu-nextstrain/results/embed_tsne.csv",
    index_col=0,
).assign(strain=lambda embedding: embedding.index)

In [None]:
TSNE_df

In [None]:
# Attach tree annotations to the t-SNE coordinates (inner merge on strain).
merged_tsne_df = TSNE_df.merge(node_df, on="strain")

In [None]:
# Interactive t-SNE scatterplot coloured by raw clade membership, with strain
# and clade shown in the tooltip.
real = scatterplot_with_tooltip_interactive(merged_tsne_df,'tsne_x','tsne_y',"tsne_x","tsne_y",['strain','clade_membership'],'clade_membership')
real

In [None]:
# Tree linked (by brushing) with the t-SNE embedding.
list_of_chart = linking_tree_with_plots_brush(merged_tsne_df, ['tsne_x','tsne_y'], ['tsne_x','tsne_y'], 'clade_membership', ["clade_membership:N","strain:N"])
tree_chart, tsne_chart = list_of_chart[0], list_of_chart[1]
chart = tree_chart|tsne_chart
chart
#chart.save("docs/TSNE" + virus_name + "Brush.html")

In [None]:
# Genetic vs. Euclidean (t-SNE) distance, with the fitted LOWESS curve stored
# in the same table.
total_df_TSNE = pd.read_csv("../seasonal-flu-nextstrain/results/scatterplot_tsne.csv",index_col=0)

fig, ax = plt.subplots(figsize=(6, 6))

ax.plot(total_df_TSNE["genetic"], total_df_TSNE["euclidean"], "o", alpha=0.25)
ax.plot(total_df_TSNE["LOWESS_x"], total_df_TSNE["LOWESS_y"], label="LOESS")

tsne_coefficient = total_df_TSNE["pearson_coef"].values.tolist()[0]
ax.set(
    xlabel="Genetic distance",
    ylabel="Euclidean distance (t-SNE)",
    title="t-SNE Euclidean distance vs. genetic distance ($R^2=%.3f$)" % tsne_coefficient,
)

sns.despine()
#plt.savefig("docs/TSNEScatterplot" + virus_name + ".png")

In [None]:
# KDE plots of within- vs. between-clade distances for the t-SNE embedding
# (left) and for the genetic distances (right).
# The column arguments must name actual columns of merged_tsne_df, which other
# cells address as "tsne_x"/"tsne_y"; the former "TSNE1"/"TSNE2" did not match
# the data frame.
TSNE_violin_df = get_euclidean_data_frame(merged_tsne_df, "tsne_x", "tsne_y", "TSNE")

scaler = StandardScaler()

# Standardise the pairwise Euclidean distances of the embedding.
# NOTE(review): assumes pdist over TSNE_df yields the same pairs, in the same
# order, as the rows of TSNE_violin_df - confirm.
TSNE_violin_df["scaled_distance"] = scaler.fit_transform(pdist(TSNE_df.drop(["strain"], axis = 1)).reshape(-1, 1))

# Condensed form of the square genetic distance matrix.
TSNE_violin_df["genetic_distance"] = squareform(similarity_matrix).reshape(-1, 1)

fig, axes = plt.subplots(1, 2, figsize=(12, 6))

panels = (
    ("scaled_distance", "Scaled Euclidean distance from embedding"),
    ("genetic_distance", "Genetic distance"),
)
for ax, (column, xlabel) in zip(axes, panels):
    sns.kdeplot(TSNE_violin_df.query("clade_status == 'within'")[column], label="Same clade", ax=ax)
    sns.kdeplot(TSNE_violin_df.query("clade_status == 'between'")[column], label="Different clade", ax=ax)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("KDE density")

fig.suptitle('TSNE KDE Plot', fontsize=16)
sns.despine()

# Running UMAP on the Dataset

In [None]:
# Load the UMAP embedding (index = strain) and expose the strain as a column
# for merging.
#principalDf = pd.read_csv(snakemake.input.pca)
UMAP_df = pd.read_csv(
    "../seasonal-flu-nextstrain/results/embed_umap.csv",
    index_col=0,
).assign(strain=lambda embedding: embedding.index)

In [None]:
# Attach tree annotations to the UMAP coordinates (inner merge on strain).
merged_umap_df = UMAP_df.merge(node_df, on="strain")

In [None]:
# Interactive UMAP scatterplot coloured by the simplified clade colour
# column, with strain and clade shown in the tooltip.
scatterplot_with_tooltip_interactive(merged_umap_df,'umap_x','umap_y',"umap_x","umap_y",['strain','clade_membership'],'clade_membership_color')

In [None]:
# Tree linked (by brushing) with the UMAP embedding; the column names double
# as axis titles.
list_of_data_and_titles = ['umap_x','umap_y']
list_of_chart = linking_tree_with_plots_brush(merged_umap_df, list_of_data_and_titles, list_of_data_and_titles, 'clade_membership', ["clade_membership","strain:N"])
tree_chart, umap_chart = list_of_chart[0], list_of_chart[1]
chart = tree_chart|umap_chart
chart
#chart.save("docs/UMAP" + virus_name + "Brush.html")
#chart.save("docs/UMAP" + virus_name + "Brush.png", scale_factor=2.0)

In [None]:
# KDE plots of within- vs. between-clade distances for the UMAP embedding
# (left) and for the genetic distances (right).
UMAP_violin_df = get_euclidean_data_frame(merged_umap_df, "umap_x", "umap_y", "UMAP")

scaler = StandardScaler()

# Standardised pairwise Euclidean distances of the embedding.
umap_pairwise = pdist(UMAP_df.drop(["strain"], axis = 1)).reshape(-1, 1)
UMAP_violin_df["scaled_distance"] = scaler.fit_transform(umap_pairwise)

# Condensed form of the square genetic distance matrix.
UMAP_violin_df["genetic_distance"] = squareform(similarity_matrix).reshape(-1, 1)

fig, axes = plt.subplots(1, 2, figsize=(12, 6))

panels = (
    ("scaled_distance", "Scaled Euclidean distance from embedding"),
    ("genetic_distance", "Genetic distance"),
)
for ax, (column, xlabel) in zip(axes, panels):
    sns.kdeplot(UMAP_violin_df.query("clade_status == 'within'")[column], label="Same clade", ax=ax)
    sns.kdeplot(UMAP_violin_df.query("clade_status == 'between'")[column], label="Different clade", ax=ax)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("KDE density")

fig.suptitle('UMAP KDE Plot', fontsize=16)
sns.despine()

In [None]:
# Genetic vs. Euclidean (UMAP) distance, with the fitted LOWESS curve stored
# in the same table.
total_df_UMAP = pd.read_csv("../seasonal-flu-nextstrain/results/scatterplot_umap.csv",index_col=0)

fig, ax = plt.subplots(figsize=(6, 6))

ax.plot(total_df_UMAP["genetic"], total_df_UMAP["euclidean"], "o", alpha=0.25)
ax.plot(total_df_UMAP["LOWESS_x"], total_df_UMAP["LOWESS_y"], label="LOESS")

umap_coefficient = total_df_UMAP["pearson_coef"].values.tolist()[0]
ax.set(
    xlabel="Genetic distance",
    ylabel="Euclidean distance (UMAP)",
    title="UMAP Euclidean distance vs. genetic distance ($R^2=%.3f$)" % umap_coefficient,
)

sns.despine()
#plt.savefig("docs/UMAPScatterplot" + virus_name + ".png")

# Linking all plots together clickable with Tree

In [None]:
# Combine the tree annotations with every embedding, one inner merge on
# "strain" at a time, in the order PCA, MDS, t-SNE, UMAP.
merged_df = node_df
for embedding_df in (principalDf, MDS_df, TSNE_df, UMAP_df):
    merged_df = merged_df.merge(embedding_df, on="strain")

In [None]:
merged_df.shape

In [None]:
merged_df.head()

In [None]:
# Clickable tree linked with all four embeddings.
# The data column names follow the embedding files as used elsewhere in this
# notebook (mds1/mds2, tsne_x/tsne_y, pca1/pca2, umap_x/umap_y); the former
# uppercase names (e.g. 'MDS1', 'TSNE1') are only display titles and are not
# columns of merged_df. The PCA titles now read "Explained Variance" like the
# PCA brush chart instead of "Expected Variance".
data = linking_tree_with_plots_clickable(
    merged_df,
    ['mds1', 'mds2', 'tsne_x', 'tsne_y', 'pca1', 'pca2', 'umap_x', 'umap_y'],
    [
        'MDS1',
        'MDS2',
        'TSNE1',
        'TSNE2',
        'PCA1 (Explained Variance : {}%)'.format(round(explained_variance_PCA[0]*100,2)),
        'PCA2 (Explained Variance : {}%)'.format(round(explained_variance_PCA[1]*100,2)),
        'UMAP1',
        'UMAP2',
    ],
    'clade_membership_color:N',
    ['clade_membership'],
    ['strain','clade_membership']
)

In [None]:
# Tree linked (by brushing) with the t-SNE and UMAP embeddings.
# The data columns use the names found in the embedding files
# (tsne_x/tsne_y, umap_x/umap_y); the former 'TSNE1', 'UMAP1', ... are only
# display titles and are not columns of merged_df.
data1 = linking_tree_with_plots_brush(
    merged_df,
    ['tsne_x', 'tsne_y', 'umap_x', 'umap_y'],
    ['TSNE1', 'TSNE2','UMAP1','UMAP2'],
    'clade_membership:N',
    ['strain','clade_membership']
)

In [None]:
# Place the tree and the two brushed embeddings side by side, save the result
# as HTML and display it.
TSNEUMAP = data1[0] | data1[1] | data1[2]
TSNEUMAP.save("docs/TSNEUMAPClickable{}.html".format(virus_name))
TSNEUMAP

In [None]:
# Arrange the clickable charts: embeddings in two rows, tree on the left.
# NOTE(review): the clickable call above passes four column pairs; if it
# returns the tree plus one chart per pair, data[5] is out of range - confirm
# what the helper returns.
PCAMDS = data[3] | data[1] | data[5]
TSNEUMAP = data[2] | data[4]
embeddings = alt.vconcat(PCAMDS, TSNEUMAP)
fullChart = alt.hconcat(data[0], embeddings)
fullChart
#fullChart.save("docs/FullLinkedChartClickable" + virus_name + ".html")
#fullChart.save("docs/FullLinkedChartClickable" + virus_name + ".svg", scale_factor=2.0)
#drawing = svg2rlg("docs/FullLinkedChartClickable" + virus_name + ".svg")
#renderPDF.drawToFile(drawing, "docs/FullLinkedChartClickable" + virus_name + ".pdf")

## Scatterplots for all embeddings 
Concatenating all embedding data frames to plot genetic vs Euclidean distance for each embedding

In [None]:
# One panel per embedding: genetic distance against Euclidean distance in the
# embedding, with the LOWESS curve drawn on top.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(24, 6))
fig.tight_layout(pad=4.0)

scatter_panels = (
    (ax1, total_df_PCA, "PCA"),
    (ax2, total_df_MDS, "MDS"),
    (ax3, total_df_TSNE, "t-SNE"),
    (ax4, total_df_UMAP, "UMAP"),
)
for panel_ax, distances, method in scatter_panels:
    panel_ax.plot(distances["genetic"], distances["euclidean"], "o", alpha=0.25)
    panel_ax.plot(distances["LOWESS_x"], distances["LOWESS_y"], label="LOESS")

    panel_ax.set_xlabel("Genetic distance")
    panel_ax.set_ylabel("Euclidean distance (%s)" % method)
    panel_ax.set_title(
        "%s Euclidean distance vs. genetic distance ($R^2=%.3f$)"
        % (method, distances["pearson_coef"].values.tolist()[0])
    )


sns.despine()
#plt.savefig("docs/FullScatterplot" + virus_name + ".png")

## Within- and between-clade Euclidean distances for all embeddings

Use the complete embedding data frame to calculate pairwise Euclidean distances between samples and plot the results in a single figure.

In [None]:
# Four KDE panels comparing same-clade and different-clade pairs: genetic
# distance first, then the scaled embedding distance for MDS, t-SNE and UMAP.
scaler = StandardScaler()

fig, axes = plt.subplots(1, 4, figsize=(28, 6))

# (data frame, distance column, panel title, x-axis label)
kde_panels = (
    (UMAP_violin_df, "genetic_distance", 'genetic', "Genetic distance"),
    (MDS_violin_df, "scaled_distance", 'MDS', "Scaled Euclidean distance from embedding (MDS)"),
    (TSNE_violin_df, "scaled_distance", 'TSNE', "Scaled Euclidean distance from embedding (TSNE)"),
    (UMAP_violin_df, "scaled_distance", 'UMAP', "Scaled Euclidean distance from embedding (UMAP)"),
)
for ax, (distances, column, title, xlabel) in zip(axes, kde_panels):
    sns.kdeplot(distances.query("clade_status == 'within'")[column], label="Same clade", ax=ax)
    sns.kdeplot(distances.query("clade_status == 'between'")[column], label="Different clade", ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("KDE density")


fig.suptitle('Total KDE Plot', fontsize=16)
sns.despine()

# Supplementary Figures

## UMAP neighbors and distance 

In [None]:
# Fit UMAP for every combination of n_neighbors and min_dist and collect the
# 2-D coordinates in one data frame. Columns are named UMAP<i><j>_<k>, where
# i indexes `neighbors`, j indexes `min_dist` and k is the dimension (1 or 2).
neighbors = [5, 25, 200]
min_dist = [.05, .5]
X_transformed_umap = []
UMAP_dataframe = pd.DataFrame()
UMAP_dataframe["strain"] = similarity_matrix.index
for i, n_neighbors in enumerate(neighbors):
    for j, distance in enumerate(min_dist):
        reducer = umap.UMAP(
            n_neighbors=n_neighbors,
            min_dist=distance,
            n_components=2,
            init="spectral",
        )
        # Name the columns on the very frame that is joined. The previous code
        # renamed one DataFrame wrapper and joined another, which only works
        # while both wrappers share their axes and otherwise fails on the
        # second join with overlapping 0/1 column names.
        new_list_df = pd.DataFrame(
            reducer.fit_transform(similarity_matrix),
            columns=['UMAP' + str(i) + str(j) + '_' + str(k) for k in range(1, 3)],
        )
        UMAP_dataframe = UMAP_dataframe.join(new_list_df)

# Merge with the tree annotations once, after all embeddings are collected
# (it used to be recomputed in every iteration).
new_df = UMAP_dataframe.merge(node_df, on="strain")
        

In [None]:
new_df

# Across the x axis, the number of neighbors increases (5, 25, 200)
# From top to bottom, the min dist increases from .05 to .5

In [None]:
# Grid of UMAP embeddings: n_neighbors grows from left to right, min_dist
# from the top row (.05) to the bottom row (.5); the tree sits on the left.
umap_grid_columns = [
    'UMAP' + str(i) + str(j) + '_' + str(k)
    for i in range(3)
    for j in range(2)
    for k in range(1, 3)
]
data1 = linking_tree_with_plots_brush(
    new_df,
    umap_grid_columns,
    list(umap_grid_columns),
    'clade_membership_color:N',
    ['strain','clade_membership']
)
# data1[0] is the tree; the scatterplots follow in column-pair order:
# 1 -> (5, .05), 2 -> (5, .5), 3 -> (25, .05), 4 -> (25, .5),
# 5 -> (200, .05), 6 -> (200, .5). The rows previously picked the charts of
# the other min_dist, so every title contradicted the data it labelled.
top_row = (
    data1[1].properties(title = "UMAP Neighbors = 5, min dist = .05")
    | data1[3].properties(title = "UMAP Neighbors = 25, min dist = .05")
    | data1[5].properties(title = "UMAP Neighbors = 200, min dist = .05")
)
bottom_row = (
    data1[2].properties(title = "UMAP Neighbors = 5, min dist = .5")
    | data1[4].properties(title = "UMAP Neighbors = 25, min dist = .5")
    | data1[6].properties(title = "UMAP Neighbors = 200, min dist = .5")
)
total = alt.vconcat(top_row, bottom_row)
total = alt.hconcat(data1[0], total)
total.save("docs/UMAP_neighbors_mindist_chart_" + virus_name + ".html")
total.save("docs/UMAP_neighbors_mindist_chart_" + virus_name + ".svg", scale_factor=2.0)
# NOTE(review): the svglib/reportlab imports providing svg2rlg and renderPDF
# are commented out in the imports cell; restore them or the next two lines
# raise NameError.
drawing = svg2rlg("docs/UMAP_neighbors_mindist_chart_" + virus_name + ".svg")
renderPDF.drawToFile(drawing, "docs/UMAP_neighbors_mindist_chart_" + virus_name + ".pdf")

In [None]:
# Fit t-SNE on the precomputed distance matrix once per learning rate and
# collect the coordinates; columns are named TSNE<i>_<k>, where i indexes
# `learning_rate` and k is the dimension (1 or 2).
learning_rate = [100.0, 200.0, 300.0, 400.0, 500.0, 1000.0]
TSNE_dataframe = pd.DataFrame()
TSNE_dataframe["strain"] = similarity_matrix.index
for i, rate in enumerate(learning_rate):
    embedding = TSNE(n_components=2,metric='precomputed',learning_rate=rate,perplexity = 25.95)
    new_list_df = pd.DataFrame(
        embedding.fit_transform(similarity_matrix),
        columns=['TSNE' + str(i) + '_' + str(k) for k in range(1, 3)],
    )
    TSNE_dataframe = TSNE_dataframe.join(new_list_df)

# Merge with the tree annotations once, after the loop (it used to be
# recomputed in every iteration).
new_df = TSNE_dataframe.merge(node_df, on="strain")

In [None]:
new_df

In [None]:
# Grid of t-SNE embeddings, one per learning rate, linked with the tree and
# exported to docs/.
tsne_rate_columns = [
    "TSNE" + str(run) + "_" + str(dimension)
    for run in range(6)
    for dimension in (1, 2)
]
data1 = linking_tree_with_plots_brush(
    new_df,
    tsne_rate_columns,
    list(tsne_rate_columns),
    'clade_membership_color:N',
    ['strain','clade_membership'],
)
top_row = (
    data1[1].properties(title = "TSNE learning rate = 100.0")
    | data1[2].properties(title = "TSNE learning rate = 200.0 (default)")
    | data1[3].properties(title = "TSNE learning rate = 300.0")
)
bottom_row = (
    data1[4].properties(title = "TSNE learning rate = 400.0")
    | data1[5].properties(title = "TSNE learning rate = 500.0")
    | data1[6].properties(title = "TSNE learning rate = 1000.0")
)
total = alt.hconcat(data1[0], alt.vconcat(top_row, bottom_row))
total.save("docs/TSNELearningRates" + virus_name + ".html")
total.save("docs/TSNELearningRates" + virus_name + ".svg", scale_factor=2.0)
# NOTE(review): svg2rlg and renderPDF are only imported in commented-out
# lines of the imports cell, so these two lines need those imports restored.
drawing = svg2rlg("docs/TSNELearningRates" + virus_name + ".svg")
renderPDF.drawToFile(drawing, "docs/TSNELearningRates" + virus_name + ".pdf")

In [None]:
# Fit t-SNE on the precomputed distance matrix once per perplexity value and
# collect the coordinates; columns are named TSNE<i>_<k>, where i indexes
# `perplexity` and k is the dimension (1 or 2).
perplexity = [15, 20, 25, 30, 100]
TSNE_dataframe = pd.DataFrame()
TSNE_dataframe["strain"] = similarity_matrix.index
for i, perplexity_value in enumerate(perplexity):
    embedding = TSNE(n_components=2,metric='precomputed', perplexity = perplexity_value)
    new_list_df = pd.DataFrame(
        embedding.fit_transform(similarity_matrix),
        columns=['TSNE' + str(i) + '_' + str(k) for k in range(1, 3)],
    )
    TSNE_dataframe = TSNE_dataframe.join(new_list_df)

# Merge with the tree annotations once, after the loop (it used to be
# recomputed in every iteration).
new_df = TSNE_dataframe.merge(node_df, on="strain")

In [None]:
# Grid of t-SNE embeddings, one per perplexity value, linked with the tree
# and exported to docs/.
data1 = linking_tree_with_plots_brush(
    new_df,
    ["TSNE0_1", "TSNE0_2", "TSNE1_1", "TSNE1_2", "TSNE2_1", "TSNE2_2", "TSNE3_1", "TSNE3_2", "TSNE4_1", "TSNE4_2"],
    ["TSNE0_1", "TSNE0_2", "TSNE1_1", "TSNE1_2", "TSNE2_1", "TSNE2_2", "TSNE3_1", "TSNE3_2", "TSNE4_1", "TSNE4_2"],
    'clade_membership_color:N',
    ['strain','clade_membership']
)
top_row = data1[1].properties(title = "TSNE perplexity = 15")|data1[2].properties(title = "TSNE perplexity = 20")|data1[3].properties(title = "TSNE perplexity = 25")
bottom_row = data1[4].properties(title = "TSNE perplexity = 30")|data1[5].properties(title = "TSNE perplexity = 100")
total = alt.vconcat(top_row, bottom_row)
total = alt.hconcat(data1[0], total)
total.save("docs/TSNEPerplexity" + virus_name + ".html")
# Save the perplexity grid itself: this line used `chart`, a leftover from an
# earlier cell, so the SVG (and the PDF made from it) showed the wrong figure.
total.save("docs/TSNEPerplexity" + virus_name + ".svg", scale_factor=2.0)
# NOTE(review): the svglib/reportlab imports providing svg2rlg and renderPDF
# are commented out in the imports cell; restore them or the next two lines
# raise NameError.
drawing = svg2rlg("docs/TSNEPerplexity" + virus_name + ".svg")
renderPDF.drawToFile(drawing, "docs/TSNEPerplexity" + virus_name + ".pdf")