# These are the Cartography visuals.

# Imports Section 

In [None]:
import sys
sys.path.extend(["./notebooks/scripts"])

In [None]:
sys.path

In [None]:
import altair as alt
from altair_saver import save
from augur.utils import json_to_tree, read_node_data
import json
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import re
import seaborn as sns

from Helpers import linking_tree_with_plots_clickable, linking_tree_with_plots_brush, scatterplot_with_tooltip_interactive
from Helpers import get_y_positions, get_euclidean_data_frame

#%matplotlib inline

In [None]:
alt.renderers.set_embed_options(
    padding={"left": 0, "right": 0, "bottom": 1, "top": 1}
)

In [None]:
# Apply the seaborn "ticks" theme first, then layer the explicit matplotlib
# settings on top of it in a single update.
sns.set_style("ticks")
mpl.rcParams.update({
    # Disable top and right spines.
    "axes.spines.top": False,
    "axes.spines.right": False,
    # Display and save figures at higher resolution for presentations and manuscripts.
    "savefig.dpi": 300,
    "figure.dpi": 100,
    # Display text at sizes large enough for presentations and manuscripts.
    "font.weight": "normal",
    "axes.labelweight": "normal",
    "font.size": 10,
    "axes.labelsize": 10,
    "legend.fontsize": 8,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "axes.titlesize": 8,
    # Keep LaTeX text rendering switched off.
    "text.usetex": False,
})

In [None]:
import os
#current dir
cwd = os.getcwd()
cwd

In [None]:
# Read the MCC node-data JSON with augur's `read_node_data`.
# NOTE(review): this path is hard-coded rather than taken from `snakemake.input`, and
# `node_data` is not referenced again in the visible part of this notebook — confirm
# whether this cell is still needed.
node_data = read_node_data("ha-na-nextstrain/results/mccs.json")

## Pathogen-specific variables

Consider consolidating these into a single configuration file that can be passed to the notebook as a command line argument for more scriptable generation of these figures.

# Flu Specific Variables

In [None]:
# Input and output locations injected by the Snakemake rule that executes this notebook.
# NOTE: `node_df_ha` starts out as a path here and is later rebound to the DataFrame
# loaded from it, so this cell must be re-run before re-running that load cell.

# Tree/node table and the two sequence alignments (HA-only and concatenated).
node_df_ha = snakemake.input.node_df_ha
alignment_ha = snakemake.input.alignment_ha
alignment_concatenated = snakemake.input.alignment_concatenated

# PCA embeddings and their explained-variance tables.
pca_df_ha = snakemake.input.pca_df_ha
explained_variance_pca_ha = snakemake.input.explained_variance_pca_ha

pca_df_concatenated = snakemake.input.pca_df_concatenated
explained_variance_pca_concatenated = snakemake.input.explained_variance_pca_concatenated

# MDS embeddings.
mds_df_ha = snakemake.input.mds_df_ha
mds_df_concatenated = snakemake.input.mds_df_concatenated

# t-SNE embeddings.
tsne_df_ha = snakemake.input.tsne_df_ha
tsne_df_concatenated = snakemake.input.tsne_df_concatenated

# UMAP embeddings.
umap_df_ha = snakemake.input.umap_df_ha
umap_df_concatenated = snakemake.input.umap_df_concatenated

# HTML outputs written by this notebook; the PNG outputs are currently disabled.
output_mds_html = snakemake.output.HANAFullChartBrushableMDSHTML
#output_mds_png = snakemake.output.HANAFullChartBrushableMDSPNG
output_tsne_html = snakemake.output.HANAFullChartBrushableTSNEHTML
#output_tsne_png = snakemake.output.HANAFullChartBrushableTSNEPNG
output_full_html = snakemake.output.fullChartHTML
#output_full_png = snakemake.output.fullChartPNG

# Reading in all the data from the scripts

In [None]:
# Color palettes, one per row; columns are labelled with the integers 0..100.
colors = pd.read_csv(
    "notebooks/config/color_schemes.tsv",
    sep="\t",
    names=list(range(101)),
)

In [None]:
node_df_ha = pd.read_csv(node_df_ha, sep="\t")

In [None]:
# Use the short column names ("date", "y") that the rest of the notebook selects.
node_df_ha = node_df_ha.rename(columns={"num_date": "date", "y_value": "y"})

In [None]:
node_df_ha.head()

In [None]:
# Parametrizing node_df
clade_membership = "MCC"

In [None]:
import numpy as np

In [None]:
# Build the frame used for every MCC calculation: a copy of the node table with
# strains whose clade is "unassigned" removed. `node_df_ha` itself stays untouched
# and is used for graphing.
mcc_calc_df = node_df_ha.copy()
mcc_calc_df[clade_membership] = mcc_calc_df[clade_membership].replace("unassigned", np.nan)
# `dropna` returns a new frame, so the result has to be assigned back; without the
# assignment the unassigned rows would silently remain in `mcc_calc_df`.
mcc_calc_df = mcc_calc_df.dropna(subset=[clade_membership])
mcc_calc_df

# Running PCA on Scaled and Centered Data
- I treated each nucleotide as a "site", or dimension, and found the probability of having a certain nucleotide given the frequency of that letter at that site.
- I used [this paper](https://doi.org/10.1371/journal.pgen.0020190) as my source
- The equation is as follows where C is the matrix of dimensions, M is the mean, and p is the frequency of a nucleotide at that given site. 
![](https://journals.plos.org/plosgenetics/article/file?type=thumbnail&id=info:doi/10.1371/journal.pgen.0020190.e003)

In [None]:
principalDf_ha = pd.read_csv(pca_df_ha, index_col=0)
principalDf_concatenated = pd.read_csv(pca_df_concatenated, index_col=0)

In [None]:
explained_variance_df_ha = pd.read_csv(explained_variance_pca_ha)
explained_variance_df_concatenated = pd.read_csv(explained_variance_pca_concatenated)

In [None]:
explained_variance_df_ha

In [None]:
explained_variance_df_concatenated

In [None]:
# Explained variance of each principal component for the HA-only PCA.
plt.plot(
    explained_variance_df_ha["principal components"].values.tolist(),
    explained_variance_df_ha["explained variance"].values.tolist(),
    "o",
)
plt.xlabel("Principal Component")
plt.ylabel("Explained Variance")
plt.title("Explained Variance Plot (Flu, HA-only)")

In [None]:
# Explained variance of each principal component for the concatenated (HA and NA) PCA.
plt.plot(
    explained_variance_df_concatenated["principal components"].values.tolist(),
    explained_variance_df_concatenated["explained variance"].values.tolist(),
    "o",
)
plt.xlabel("Principal Component")
plt.ylabel("Explained Variance")
plt.title("Explained Variance Plot (Flu, HA and NA)")

In [None]:
merged_pca_df_ha = principalDf_ha.merge(node_df_ha[["strain", "date", "y", clade_membership]], on="strain")
merged_pca_df_concatenated = principalDf_concatenated.merge(node_df_ha[["strain", "date", "y", clade_membership]], on="strain")

In [None]:
explained_variance_PCA_ha = explained_variance_df_ha["explained variance"].values.tolist()
explained_variance_PCA_concatenated = explained_variance_df_concatenated["explained variance"].values.tolist()

### Bases Missing Analysis for PCA 

In [None]:
from Bio import SeqIO

# Read every record of the concatenated alignment once, then split it into
# parallel lists of strain names and sequence strings.
records = list(SeqIO.parse(alignment_concatenated, "fasta"))
strains = [str(record.id) for record in records]
genomes = [str(record.seq) for record in records]

# Count every character that is not an unambiguous base (A, G, C or T) as missing.
# This logic is here because MERS uses both "N" and "-" to delineate missing sequences.
genomes_missing_bases = [len(re.findall(r'[^AGCT]', genome)) for genome in genomes]

# One row per strain: its name and its number of missing bases.
bases_df = pd.DataFrame([strains, genomes_missing_bases]).transpose()
bases_df.columns = ["strain", "bases_missing"]

In [None]:
# Read every record of the HA-only alignment and split it into parallel lists of
# strain names and sequence strings.
records = list(SeqIO.parse(alignment_ha, "fasta"))
strains = [str(record.id) for record in records]
genomes = [str(record.seq) for record in records]

# Count every character that is not an unambiguous base (A, G, C or T) as missing.
# This logic is here because MERS uses both "N" and "-" to delineate missing sequences.
genomes_missing_bases = [len(re.findall(r'[^AGCT]', genome)) for genome in genomes]

bases_df_ha = pd.DataFrame([strains, genomes_missing_bases]).transpose()
bases_df_ha.columns = ["strain", "bases_missing"]

# Attach the HA-only missing-base counts to the HA-only PCA frame. (The previous
# version merged `bases_df`, i.e. the counts from the concatenated alignment, so the
# HA tooltips reported the wrong "bases_missing" values and `bases_df_ha` went unused.)
merged_total_ha = bases_df_ha.merge(merged_pca_df_ha, on="strain")

In [None]:
merged_total = bases_df.merge(merged_pca_df_concatenated, on="strain")
merged_total

In [None]:
# Axis titles for the first six principal components, e.g.
# "PCA1 (Explained Variance : 12.34%)", generated instead of written out six times.
pca_titles_ha = [
    'PCA{} (Explained Variance : {}%'.format(component + 1, round(explained_variance_PCA_ha[component]*100,2)) + ")"
    for component in range(6)
]
pca_titles_concatenated = [
    'PCA{} (Explained Variance : {}%'.format(component + 1, round(explained_variance_PCA_concatenated[component]*100,2)) + ")"
    for component in range(6)
]

# HA-only panels: color scale built from the clades present in the HA frame.
domain = sorted(merged_total_ha[clade_membership].drop_duplicates().values)
range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(
    merged_total_ha,
    ['pca1','pca2','pca3','pca4', 'pca5', 'pca6'],
    pca_titles_ha,
    clade_membership+":N",
    ['strain', clade_membership, "bases_missing"],
    domain,
    range_,
)

# Concatenated panels: same recipe on the concatenated frame.
domain = sorted(merged_total[clade_membership].drop_duplicates().values)
range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(
    merged_total,
    ['pca1','pca2','pca3','pca4', 'pca5', 'pca6'],
    pca_titles_concatenated,
    clade_membership+":N",
    ['strain', clade_membership, "bases_missing"],
    domain,
    range_,
)

# Put the first three returned charts of each call side by side, HA row above the
# concatenated row.
PCAFluBrush_ha = list_of_chart_ha[0]|list_of_chart_ha[1]|list_of_chart_ha[2]
PCAFluBrush_concatenated = list_of_chart_concatenated[0]|list_of_chart_concatenated[1]|list_of_chart_concatenated[2]

alt.vconcat(PCAFluBrush_ha,PCAFluBrush_concatenated)
#PCAFluBrush.save("docs/PCAHaNaBrush.html")

In [None]:
# Concatenated PCA restricted to the first two components. `domain` and `range_`
# are taken as-is from whichever cell set them last.
list_of_chart_concatenated = linking_tree_with_plots_brush(
    merged_total,
    ['pca1','pca2'],
    [
        'PCA{} (Explained Variance : {}%'.format(component + 1, round(explained_variance_PCA_concatenated[component]*100,2)) + ")"
        for component in range(2)
    ],
    clade_membership+":N",
    ['strain', clade_membership, "bases_missing"],
    domain,
    range_,
)

PCAFluBrush_concatenated = list_of_chart_concatenated[0]|list_of_chart_concatenated[1]
#PCAFluBrush_concatenated.save("../docs/PCAFluBrushHA_.html")

# Running MDS on the Dataset

In [None]:
MDS_df_ha = pd.read_csv(mds_df_ha,index_col=0)
MDS_df_concatenated = pd.read_csv(mds_df_concatenated,index_col=0)

In [None]:
merged_mds_df_ha = MDS_df_ha.merge(node_df_ha[["strain", "date", "y", clade_membership]], on="strain")
merged_mds_df_concatenated = MDS_df_concatenated.merge(node_df_ha[["strain", "date", "y", clade_membership]], on="strain")

In [None]:
merged_mds_df_ha

In [None]:
domain =  sorted(merged_mds_df_ha[clade_membership].drop_duplicates().values)
range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
chart_12_mds = scatterplot_with_tooltip_interactive(merged_mds_df_ha,'mds1','mds2',"mds1","mds2",['strain',clade_membership],clade_membership+":N", domain, range_)
chart_12_mds

In [None]:
domain =  sorted(merged_mds_df_ha[clade_membership].drop_duplicates().values)
range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(merged_mds_df_ha,['mds1','mds2'],["MDS1", "MDS2"], clade_membership+":N", ['strain',clade_membership], domain, range_)
chart_ha = list_of_chart_ha[0]|list_of_chart_ha[1]
domain =  sorted(merged_mds_df_concatenated[clade_membership].drop_duplicates().values)
range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(merged_mds_df_concatenated,['mds1','mds2'],["MDS1", "MDS2"], clade_membership+":N", ['strain',clade_membership], domain, range_)
chart_concat = list_of_chart_concatenated[0]|list_of_chart_concatenated[1]
alt.vconcat(chart_ha, chart_concat)

# HDBSCAN project:
- cluster on HA, find MCC value, same for HA+NA (from cluster_results script)
- check if HA+NA MCC > HA only

## MDS

In [None]:
MDS_df_ha = pd.read_csv(mds_df_ha,index_col=0)
MDS_df_concatenated = pd.read_csv(mds_df_concatenated,index_col=0)

In [None]:
merged_mds_df_ha = MDS_df_ha.merge(node_df_ha[["strain", "date", "y", clade_membership]], on="strain")
merged_mds_df_concatenated = MDS_df_concatenated.merge(node_df_ha[["strain", "date", "y", clade_membership]], on="strain")

In [None]:
mcc_calc_mds = MDS_df_ha.merge(mcc_calc_df[["strain", "date", "y", clade_membership]], on="strain")
mcc_calc_mds_concatenated = MDS_df_concatenated.merge(mcc_calc_df[["strain", "date", "y", clade_membership]], on="strain")
KDE_df_normal = get_euclidean_data_frame(sampled_df=mcc_calc_mds, column_for_analysis=clade_membership, embedding="method", column_list=['mds1', 'mds2'])

In [None]:
domain =  merged_mds_df_ha["mds_label"].drop_duplicates().values
range_ = colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
chart_12_mds = scatterplot_with_tooltip_interactive(merged_mds_df_ha,'mds1','mds2',"mds1","mds2",['strain',clade_membership],'mds_label:N', domain, range_)
chart_12_mds

In [None]:
from sklearn.metrics import confusion_matrix, matthews_corrcoef

In [None]:
KDE_df_cluster = get_euclidean_data_frame(sampled_df=mcc_calc_mds[["mds1", "mds2", "strain", "mds_label"]], column_for_analysis="mds_label", embedding="mds", column_list=["mds1", "mds2"])
confusion_matrix_val_ha = confusion_matrix(KDE_df_normal["clade_status"], KDE_df_cluster["clade_status"])
matthews_cc_val_ha = matthews_corrcoef(KDE_df_normal["clade_status"], KDE_df_cluster["clade_status"])

In [None]:
# Confusion matrix and Matthews correlation coefficient for the "mds_label" column of
# the concatenated MDS embedding, compared against the clade-based reference frame.
# NOTE(review): `KDE_df_normal` was built from the HA-only frame (`mcc_calc_mds`), not
# from `mcc_calc_mds_concatenated`. This presumably relies on both frames holding the
# same strains in the same order — confirm, or build a reference from the concatenated frame.
KDE_df_cluster = get_euclidean_data_frame(sampled_df=mcc_calc_mds_concatenated[["mds1", "mds2", "strain", "mds_label"]], column_for_analysis="mds_label", embedding="mds", column_list=["mds1", "mds2"])
confusion_matrix_val_concatenated = confusion_matrix(KDE_df_normal["clade_status"], KDE_df_cluster["clade_status"])
matthews_cc_val_concatenated = matthews_corrcoef(KDE_df_normal["clade_status"], KDE_df_cluster["clade_status"])

In [None]:
domain =  merged_mds_df_ha[clade_membership].drop_duplicates().values
range_ = colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(merged_mds_df_ha,['mds1','mds2'],["MDS1", "MDS2"], clade_membership+":N", ['strain',clade_membership], domain, range_)
chart_ha = list_of_chart_ha[0]|list_of_chart_ha[1].properties(title="MCC: " + str(round(matthews_cc_val_ha,4)))
domain =  merged_mds_df_concatenated[clade_membership].drop_duplicates().values
range_ = colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(merged_mds_df_concatenated,['mds1','mds2'],["MDS1", "MDS2"], clade_membership+":N", ['strain',clade_membership], domain, range_)
chart_concat = list_of_chart_concatenated[0]|list_of_chart_concatenated[1].properties(title="MCC: " + str(round(matthews_cc_val_concatenated,4)))
chart_total = alt.vconcat(chart_ha, chart_concat)
chart_total


In [None]:
# MDS figure colored by the "mds_label" cluster column, HA-only row above the
# concatenated row. When the label -1 is present it sorts first and is drawn grey.

# HA-only row.
domain = sorted(merged_mds_df_ha["mds_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(merged_mds_df_ha,['mds1','mds2'],["MDS1", "MDS2"], 'mds_label:N', ['strain',clade_membership], domain, range_)
chart_ha = list_of_chart_ha[0]|list_of_chart_ha[1].properties(title="MCC: " + str(round(matthews_cc_val_ha,4)))

# Concatenated row. The labels must come from the concatenated frame: the previous
# version recomputed the domain from `merged_mds_df_ha`, so clusters that exist only
# in the concatenated embedding had no entry in the color scale.
domain = sorted(merged_mds_df_concatenated["mds_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(merged_mds_df_concatenated,['mds1','mds2'],["MDS1", "MDS2"], 'mds_label:N', ['strain',clade_membership], domain, range_)
chart_concat = list_of_chart_concatenated[0]|list_of_chart_concatenated[1].properties(title="MCC: " + str(round(matthews_cc_val_concatenated,4)))

# The two rows have their own label sets, so keep their color scales independent
# (the t-SNE figure later in this notebook does the same).
final_chart = alt.vconcat(chart_ha, chart_concat).resolve_scale(color='independent')
final_chart

In [None]:
final_chart.save(output_mds_html)
#save(final_chart, output_mds_png, scale_factor=2.0)

## HDBSCAN clustering on t-SNE 

In [None]:
TSNE_df_ha = pd.read_csv(tsne_df_ha, index_col=0)
TSNE_df_concatenated = pd.read_csv(tsne_df_concatenated,index_col=0)

In [None]:
merged_tsne_df_ha = TSNE_df_ha.merge(node_df_ha[["strain", "date", "y", clade_membership]], on="strain")
merged_tsne_df_concatenated = TSNE_df_concatenated.merge(node_df_ha[["strain", "date", "y", clade_membership]], on="strain")

In [None]:
mcc_calc_tsne = TSNE_df_ha.merge(mcc_calc_df[["strain", "date", "y", clade_membership]], on="strain")
mcc_calc_tsne_concatenated = TSNE_df_concatenated.merge(mcc_calc_df[["strain", "date", "y", clade_membership]], on="strain")
KDE_df_normal = get_euclidean_data_frame(sampled_df=mcc_calc_tsne, column_for_analysis=clade_membership, embedding="method", column_list=['tsne_x', 'tsne_y'])

In [None]:
domain =  merged_tsne_df_ha["t-sne_label"].drop_duplicates().values
range_ = colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
chart_12_tsne = scatterplot_with_tooltip_interactive(merged_tsne_df_ha,'tsne_x','tsne_y',"tsne_x","tsne_y",['strain',clade_membership],'t-sne_label:N', domain, range_)
chart_12_tsne

In [None]:
KDE_df_cluster = get_euclidean_data_frame(sampled_df=mcc_calc_tsne[["tsne_x", "tsne_y", "strain", "t-sne_label"]], column_for_analysis="t-sne_label", embedding="tsne", column_list=["tsne_x", "tsne_y"])
confusion_matrix_val_ha = confusion_matrix(KDE_df_normal["clade_status"], KDE_df_cluster["clade_status"])
matthews_cc_val_ha = matthews_corrcoef(KDE_df_normal["clade_status"], KDE_df_cluster["clade_status"])

In [None]:
# Confusion matrix and Matthews correlation coefficient for the "t-sne_label" column of
# the concatenated t-SNE embedding, compared against the clade-based reference frame.
# NOTE(review): `KDE_df_normal` was built from the HA-only frame (`mcc_calc_tsne`), not
# from `mcc_calc_tsne_concatenated`. This presumably relies on both frames holding the
# same strains in the same order — confirm, or build a reference from the concatenated frame.
KDE_df_cluster = get_euclidean_data_frame(sampled_df=mcc_calc_tsne_concatenated[["tsne_x", "tsne_y", "strain", "t-sne_label"]], column_for_analysis="t-sne_label", embedding="tsne", column_list=["tsne_x", "tsne_y"])
confusion_matrix_val_concatenated = confusion_matrix(KDE_df_normal["clade_status"], KDE_df_cluster["clade_status"])
matthews_cc_val_concatenated = matthews_corrcoef(KDE_df_normal["clade_status"], KDE_df_cluster["clade_status"])

In [None]:
# t-SNE figure colored by the "t-sne_label" cluster column, HA-only row above the
# concatenated row. When the label -1 is present it sorts first and is drawn grey.

# HA-only row.
domain = sorted(merged_tsne_df_ha["t-sne_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
# Axis titles now name the t-SNE dimensions; they previously read "MDS1"/"MDS2".
list_of_chart_ha = linking_tree_with_plots_brush(merged_tsne_df_ha,['tsne_x','tsne_y'],["TSNE1", "TSNE2"], 't-sne_label:N', ['strain',clade_membership], domain, range_)
chart_ha = list_of_chart_ha[0]|list_of_chart_ha[1].properties(title="MCC: " + str(round(matthews_cc_val_ha,4)))

# Concatenated row: same sorted-domain and grey -1 handling as the HA row, so a given
# label position maps to a color the same way in both rows.
domain = sorted(merged_tsne_df_concatenated["t-sne_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(merged_tsne_df_concatenated,['tsne_x','tsne_y'],["TSNE1", "TSNE2"], 't-sne_label:N', ['strain',clade_membership], domain, range_)
chart_concat = list_of_chart_concatenated[0]|list_of_chart_concatenated[1].properties(title="MCC: " + str(round(matthews_cc_val_concatenated,4)))

# Each row has its own label set, so the color scales are resolved independently.
final_chart = alt.vconcat(chart_ha, chart_concat).resolve_scale(color='independent')
final_chart

In [None]:
save(final_chart, output_tsne_html)
#save(final_chart, output_tsne_png, scale_factor=2.0)

# Running T-SNE on the Dataset 

In [None]:
domain =  sorted(merged_tsne_df_ha[clade_membership].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else: 
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
scatterplot_with_tooltip_interactive(merged_tsne_df_ha,'tsne_x','tsne_y','tsne_x','tsne_y',['strain', clade_membership],clade_membership+":N", domain, range_)

In [None]:
domain =  sorted(merged_tsne_df_concatenated[clade_membership].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else: 
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
scatterplot_with_tooltip_interactive(merged_tsne_df_concatenated,'tsne_x','tsne_y','tsne_x','tsne_y',['strain', clade_membership],clade_membership+":N", domain, range_)

In [None]:
list_of_chart_ha = linking_tree_with_plots_brush(
    merged_tsne_df_ha,
    ['tsne_x','tsne_y'],
    ['tsne_x','tsne_y'],
    clade_membership+":N",
    ["strain:N", clade_membership+":N"],
    domain,
    range_
)
chart_tsne_ha = list_of_chart_ha[0]|list_of_chart_ha[1]
chart_tsne_ha

In [None]:
list_of_chart_concatenated = linking_tree_with_plots_brush(
    merged_tsne_df_concatenated,
    ['tsne_x','tsne_y'],
    ['tsne_x','tsne_y'],
    clade_membership+":N",
    ["strain:N", clade_membership+":N"],
    domain,
    range_
)
chart_tsne_concatenated = list_of_chart_concatenated[0]|list_of_chart_concatenated[1]
chart_tsne_concatenated

In [None]:
chart_tsne_ha & chart_tsne_concatenated

# Running UMAP on the Dataset

In [None]:
UMAP_df_ha = pd.read_csv(umap_df_ha, index_col=0)
UMAP_df_concatenated = pd.read_csv(umap_df_concatenated, index_col=0)

In [None]:
UMAP_df_concatenated

In [None]:
merged_umap_df_ha = UMAP_df_ha.merge(node_df_ha[["strain", "date", "y", clade_membership]], on="strain")
merged_umap_df_concatenated = UMAP_df_concatenated.merge(node_df_ha[["strain", "date", "y", clade_membership]], on="strain")

In [None]:
UMAP_df_ha.index.tolist() == UMAP_df_concatenated.index.values.tolist()

# Linking all plots together clickable with Tree

In [None]:
#merged_df = node_df.merge(
#    PrincipalDf_concatenated.merge)
merged_df = node_df_ha[["strain", "date", "y", clade_membership]].merge(
    principalDf_concatenated,
    on="strain"
).merge(
    MDS_df_concatenated,
    on="strain"
).merge(
    TSNE_df_concatenated,
    on="strain"
).merge(
    UMAP_df_concatenated,
    on="strain"
)

In [None]:
merged_df

In [None]:
# Linked charts built from `node_df_ha`, restricted to strains that also appear in
# `merged_df`. The embedding columns listed below are read from `node_df_ha` itself
# (presumably its HA-only embeddings — confirm), hence the HA explained-variance values
# in the PCA titles.
# NOTE(review): `domain` and `range_` are whatever the most recently executed cell
# left behind; they are not recomputed here.
data = linking_tree_with_plots_brush(
    node_df_ha.merge(merged_df[["strain"]], on="strain"),
    ['mds1', 'mds2','tsne_x','tsne_y', 'pca1', 'pca2', 'umap_x','umap_y'],
    # Titles read "Explained Variance" (previously mislabelled "Expected Variance").
    ['MDS1', 'MDS2', 'TSNE1', 'TSNE2','PCA1 (Explained Variance : {}%'.format(round(explained_variance_PCA_ha[0]*100,2)) + ")",
    'PCA2 (Explained Variance : {}%'.format(round(explained_variance_PCA_ha[1]*100,2)) + ")",'UMAP1','UMAP2'],
    clade_membership+":N",
    ['strain'],
    domain,
    range_
)

In [None]:
# Arrange the linked charts: a 2x2 grid of embeddings stacked under data[0]
# (presumably the tree, with data[1..4] = MDS, t-SNE, PCA, UMAP — confirm against
# the helper). Only the last expression of a cell is displayed, so the grid is not
# echoed separately.
PCAMDS = data[3] | data[1]
TSNEUMAP = data[2] | data[4]
embeddings = alt.vconcat(PCAMDS, TSNEUMAP)
fullChart = alt.vconcat(data[0], embeddings)
fullChart

In [None]:
filtered_merged_df = merged_df[merged_df["pca1"] < 10].copy()

In [None]:
filtered_merged_df.shape

In [None]:
merged_df.shape

In [None]:
# Linked charts for the concatenated embeddings, showing PCA components 3 and 4
# instead of 1 and 2.
# NOTE(review): `domain` and `range_` are whatever the most recently executed cell
# left behind; they are not recomputed here.
data = linking_tree_with_plots_brush(
    merged_df,
    ['mds1', 'mds2','tsne_x','tsne_y', 'pca3', 'pca4', 'umap_x','umap_y'],
    # Titles read "Explained Variance" (previously mislabelled "Expected Variance").
    ['MDS1', 'MDS2', 'TSNE1', 'TSNE2','PCA3 (Explained Variance : {}%'.format(round(explained_variance_PCA_concatenated[2]*100,2)) + ")",
    'PCA4 (Explained Variance : {}%'.format(round(explained_variance_PCA_concatenated[3]*100,2)) + ")",'UMAP1','UMAP2'],
    clade_membership+":N",
    ['strain', clade_membership],
    domain,
    range_
)

In [None]:
# Arrange the linked charts: a 2x2 grid of embeddings stacked under data[0]
# (presumably the tree, with data[1..4] = MDS, t-SNE, PCA, UMAP — confirm against
# the helper). Only the last expression of a cell is displayed, so the grid is not
# echoed separately.
PCAMDS = data[3] | data[1]
TSNEUMAP = data[2] | data[4]
embeddings = alt.vconcat(PCAMDS, TSNEUMAP)
fullChart = alt.vconcat(data[0], embeddings)
fullChart

In [None]:
# Final Chart

In [None]:
merged_df.columns

In [None]:
merged_df

In [None]:
node_df_ha

In [None]:
# Join the concatenated-embedding frame with the node table. Columns present in both
# keep their plain name for the `merged_df` side and get an "_ha" suffix for the
# `node_df_ha` side (e.g. "y" and "y_ha"), so no "y_x" column is ever produced and
# the former rename of 'y_x' to "y" was a no-op; it has been dropped.
total_df = merged_df.merge(node_df_ha, on="strain", suffixes=("", "_ha"))

In [None]:
total_df

In [None]:
total_df["date"].max() + 0.2

In [None]:
# Final linked figure over `total_df`: each embedding appears twice, first from the
# unsuffixed columns (labelled with the concatenated explained variance) and then from
# the "_ha" columns (labelled with the HA explained variance).
# NOTE(review): `domain` and `range_` are whatever the most recently executed cell
# left behind; they are not recomputed here.
data = linking_tree_with_plots_brush(
    total_df,
    ['mds1', 'mds2', 'mds1_ha', 'mds2_ha','tsne_x','tsne_y', 'tsne_x_ha', 'tsne_y_ha', 'pca1', 'pca2','pca1_ha', 'pca2_ha', 'umap_x','umap_y', 'umap_x_ha', 'umap_y_ha'],
    # Titles read "Explained Variance" (previously mislabelled "Expected Variance").
    ['MDS1', 'MDS2', 'MDS1', 'MDS2', 'TSNE1', 'TSNE2', 'TSNE1', 'TSNE2',
    'PCA1 (Explained Variance : {}%'.format(round(explained_variance_PCA_concatenated[0]*100,2)) + ")",
    'PCA2 (Explained Variance : {}%'.format(round(explained_variance_PCA_concatenated[1]*100,2)) + ")",
    'PCA1 (Explained Variance : {}%'.format(round(explained_variance_PCA_ha[0]*100,2)) + ")",
    'PCA2 (Explained Variance : {}%'.format(round(explained_variance_PCA_ha[1]*100,2)) + ")",'UMAP1','UMAP2','UMAP1','UMAP2'],
    clade_membership+":N",
    ['strain', clade_membership],
    domain,
    range_
)

In [None]:
chart_embeddings = alt.vconcat(data[6]|data[5], data[2]|data[1], data[4]|data[3], data[8]|data[7])
chart_embeddings

In [None]:
chart_embeddings.save(output_full_html)
#save(chart_embeddings, output_full_png, scale_factor=2.0)