# These are the Cartography visuals.

# Imports Section 

In [1]:
import sys
# Put the shared scripts directory on the import path (presumably where the
# `Helpers` module imported below lives — TODO confirm).
sys.path.append("../notebooks/scripts/")

In [2]:
import altair as alt
from altair_saver import save
from augur.utils import json_to_tree
import json
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import re
#from reportlab.graphics import renderPDF
import seaborn as sns
#from svglib.svglib import svg2rlg

from Helpers import linking_tree_with_plots_clickable, linking_tree_with_plots_brush, scatterplot_with_tooltip_interactive
from Helpers import get_y_positions, get_euclidean_data_frame

#%matplotlib inline

In [3]:
# Padding (in pixels) passed to Altair's renderer embed options.
embed_padding = dict(left=0, right=0, bottom=1, top=1)
alt.renderers.set_embed_options(padding=embed_padding)

RendererRegistry.enable('default')

In [4]:
sns.set_style("ticks")
mpl.rcParams.update({
    # Disable top and right spines.
    'axes.spines.top': False,
    'axes.spines.right': False,
    # Display and save figures at higher resolution for presentations and manuscripts.
    'savefig.dpi': 300,
    'figure.dpi': 100,
    # Display text at sizes large enough for presentations and manuscripts.
    'font.weight': "normal",
    'axes.labelweight': "normal",
    'font.size': 10,
    'axes.labelsize': 10,
    'legend.fontsize': 8,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'axes.titlesize': 8,
})
# Render text with matplotlib's own engine rather than LaTeX.
mpl.rc('text', usetex=False)

In [5]:
# Start a headless Chrome session only when Snakemake runs this notebook
# (presumably for selenium-based chart export — TODO confirm).
try:
    # Probe for the `snakemake` object: NameError means the notebook is run
    # outside Snakemake, AttributeError means there is no `node_df` input.
    snakemake.input.node_df
except (NameError, AttributeError):
    print("not in Snakemake, imports unnecessary")
else:
    try:
        import selenium
        from selenium.webdriver import Chrome
        from selenium import webdriver

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument("--remote-debugging-port=9222")

        browser = webdriver.Chrome(options=chrome_options)
    except Exception as error:
        # Stay best-effort like the original bare `except:`, but report the
        # real failure instead of claiming we are not in Snakemake.
        print(f"in Snakemake, but headless Chrome setup failed: {error!r}")

not in Snakemake, imports unnecessary


## Pathogen-specific variables

Consider consolidating these into a single configuration file that can be passed to the notebook as a command line argument for more scriptable generation of these figures.

# Flu Specific Variables

In [6]:
# Collect the input paths that Snakemake provides. Each name is bound to the
# input value itself: the original trailing commas wrapped every value in a
# 1-tuple, and the last line was missing the dot after `snakemake.input`.
try:
    node_df_ha = snakemake.input.node_df_ha

    pca_df_ha = snakemake.input.pca_df_ha
    explained_variance_pca_ha = snakemake.input.explained_variance_pca_ha

    pca_df_concatenated = snakemake.input.pca_df_concatenated
    explained_variance_pca_concatenated = snakemake.input.explained_variance_pca_concatenated

    mds_df_ha = snakemake.input.mds_df_ha
    mds_df_concatenated = snakemake.input.mds_df_concatenated
    mds_df_ma_concatenated = snakemake.input.mds_df_ma_concatenated

    tsne_df_ha = snakemake.input.tsne_df_ha
    tsne_df_concatenated = snakemake.input.tsne_df_concatenated
    tsne_df_ma_concatenated = snakemake.input.tsne_df_ma_concatenated

    threshold_information = snakemake.input.threshold_information

    # KDE density inputs
    KDE_mds_ha = snakemake.input.KDE_mds_ha
    KDE_mds_metadata_ha = snakemake.input.KDE_mds_metadata_ha

    KDE_mds_concatenated = snakemake.input.KDE_mds_concatenated
    KDE_mds_metadata_concatenated = snakemake.input.KDE_mds_metadata_concatenated

    KDE_umap_ha = snakemake.input.KDE_umap_ha
    KDE_umap_metadata_ha = snakemake.input.KDE_umap_metadata_ha

    KDE_umap_concatenated = snakemake.input.KDE_umap_concatenated
    KDE_umap_metadata_concatenated = snakemake.input.KDE_umap_metadata_concatenated

except NameError:
    # `snakemake` is undefined when the notebook is run by hand.
    print("not in Snakemake, imports unnecessary")
except AttributeError as error:
    # Running under Snakemake but an expected input is absent: keep going as
    # before, but say what is missing instead of hiding it.
    print(f"Snakemake input missing, path variables incomplete: {error}")

not in Snakemake, imports unnecessary


# Reading in all the data from the scripts

In [7]:
import os

# Show the working directory; the data paths used below are relative to it.
os.getcwd()

'/mnt/c/Work/BedfordProjects/cartography/ha-na-nextstrain'

In [8]:
# Read the colour-scheme table without a header row, labelling its columns 0..100.
colors = pd.read_csv("../notebooks/config/color_schemes.tsv", sep="\t", names=list(range(101)))

In [9]:
#node_df = pd.read_csv(node_df_ha, sep="\t")
# Per-strain table for the HA tree, read from a fixed path rather than the Snakemake input.
node_df_ha = pd.read_csv("results/table_ha.tsv", delimiter="\t")

In [10]:
# Use the short column names ("date", "y") that the merges and charts below expect.
node_df_ha.rename(columns=dict(num_date="date", y_value="y"), inplace=True)

In [11]:
# Preview the first rows of the renamed node table.
node_df_ha.head()

Unnamed: 0,strain,date,pca1,pca2,pca3,pca4,mds1,mds2,tsne_x,tsne_y,umap_x,umap_y,clade_membership,pca_label,mds_label,umap_label,t-sne_label,y
0,A/MUWRP-Uganda/579/2016,2016.67,5.01,1.2,-0.43,0.12,-27.31,-28.94,43.02,-5.57,-8.66,2.51,3c3,2,-1,0,2,1
1,A/Louisiana/17/2017,2017.17,6.63,1.07,1.18,-1.24,18.08,-41.19,33.79,-8.4,-9.44,2.71,3c3.A,1,-1,0,1,2
2,A/Texas/305/2017,2017.91,6.63,1.08,1.13,-1.37,17.23,-39.21,33.81,-8.37,-9.59,2.84,3c3.A,1,-1,0,1,3
3,A/Brisbane/1/2017,2017.0,6.7,1.14,1.02,-1.32,12.27,-35.29,32.92,-9.46,-8.97,3.17,3c3.A,1,0,0,1,4
4,A/Colorado/04/2017,2017.06,6.54,0.54,1.06,-2.08,13.21,-33.79,32.92,-9.47,-8.91,2.95,3c3.A,1,0,0,1,5


# Running PCA on Scaled and Centered Data
- I treated each nucleotide as a "site", or dimension, and found the probability of having a certain nucleotide given the frequency of that letter at that site.
- I used [this paper](https://doi.org/10.1371/journal.pgen.0020190) as my source 
- The equation is as follows where C is the matrix of dimensions, M is the mean, and p is the frequency of a nucleotide at that given site. 
![](https://journals.plos.org/plosgenetics/article/file?type=thumbnail&id=info:doi/10.1371/journal.pgen.0020190.e003)

In [12]:
#principalDf = pd.read_csv(pca_df, index_col=0)
# Load the PCA embeddings for HA only and for the concatenated alignment.
principalDf_ha, principalDf_concatenated = (
    pd.read_csv(embedding_path, index_col=0)
    for embedding_path in ("results/embed_pca_ha.csv", "results/embed_pca_concatenated.csv")
)

In [13]:
#explained_variance_df = pd.read_csv(explained_variance_pca)
# Load the per-component explained variance for both PCA runs.
explained_variance_df_ha, explained_variance_df_concatenated = (
    pd.read_csv(variance_path)
    for variance_path in (
        "results/explained_variance_pca_ha.csv",
        "results/explained_variance_pca_concatenated.csv",
    )
)

In [14]:
# Display the explained-variance table for the HA-only PCA.
explained_variance_df_ha

Unnamed: 0,explained variance,principal components
0,0.2599,1
1,0.1996,2
2,0.0805,3
3,0.0541,4
4,0.0333,5
5,0.0247,6
6,0.0201,7
7,0.014,8
8,0.011,9
9,0.0102,10


In [15]:
# Display the explained-variance table for the concatenated PCA.
explained_variance_df_concatenated

Unnamed: 0,explained variance,principal components
0,0.4091,1
1,0.1167,2
2,0.0942,3
3,0.0656,4
4,0.0399,5
5,0.0228,6
6,0.016,7
7,0.0131,8
8,0.0111,9
9,0.0103,10


In [16]:
# Scatter of explained variance against component number for the HA-only PCA.
component_numbers = explained_variance_df_ha['principal components'].tolist()
component_variance = explained_variance_df_ha["explained variance"].tolist()
plt.plot(component_numbers, component_variance, 'o')

plt.xlabel("Principal Component")
plt.ylabel("Explained Variance")

plt.title("Explained Variance Plot (Flu, HA-only)")

Text(0.5, 1.0, 'Explained Variance Plot (Flu, HA-only)')

In [17]:
# Scatter of explained variance against component number for the concatenated PCA.
component_numbers = explained_variance_df_concatenated['principal components'].tolist()
component_variance = explained_variance_df_concatenated["explained variance"].tolist()
plt.plot(component_numbers, component_variance, 'o')

plt.xlabel("Principal Component")
plt.ylabel("Explained Variance")

plt.title("Explained Variance Plot (Flu, HA and NA)")

Text(0.5, 1.0, 'Explained Variance Plot (Flu, HA and NA)')

In [18]:
# Attach date, tree y-position and clade to each PCA embedding, joined on strain.
# Both embeddings take this metadata from the HA node table.
merged_pca_df_ha = pd.merge(
    principalDf_ha,
    node_df_ha[["strain", "date", "y", "clade_membership"]],
    on="strain",
)
merged_pca_df_concatenated = pd.merge(
    principalDf_concatenated,
    node_df_ha[["strain", "date", "y", "clade_membership"]],
    on="strain",
)

In [19]:
# Plain Python lists of explained variance, indexed by component position.
explained_variance_PCA_ha = explained_variance_df_ha["explained variance"].tolist()
explained_variance_PCA_concatenated = explained_variance_df_concatenated["explained variance"].tolist()

### Bases Missing Analysis for PCA 

In [20]:
from Bio import SeqIO

# Read every record of the concatenated alignment once, then split into ids and sequences.
concatenated_records = list(SeqIO.parse("results/aligned_concatenated.fasta", "fasta"))
strains = [str(entry.id) for entry in concatenated_records]
genomes = [str(entry.seq) for entry in concatenated_records]

# Count missing bases per genome: every character other than A/G/C/T is
# rewritten to "5" and then counted, so "N", "-" and any other symbol all count.
genomes_missing_bases = [
    re.sub(r'[^AGCT]', '5', sequence).count("5")
    for sequence in genomes
]

# One row per strain; built by transposing, as before, so the columns keep object dtype.
bases_df = pd.DataFrame(
    [strains, genomes_missing_bases],
    index=["strain", "bases_missing"],
).transpose()

In [21]:
# Read the HA-only alignment: strain ids and their sequences.
strains = []
genomes = []
for record in SeqIO.parse("results/aligned_ha.fasta", "fasta"):
    strains.append(str(record.id))
    genomes.append(str(record.seq))

# Count missing bases per genome. Every character other than A/G/C/T is
# rewritten to "5" and counted; the original note says this is because MERS
# uses both "N" and "-" to delineate missing sequence.
genomes_missing_bases = []
for x in genomes:
    x = re.sub(r'[^AGCT]', '5', x)
    numberOfN = x.count("5")
    genomes_missing_bases.append(numberOfN)

bases_df_ha = pd.DataFrame([strains, genomes_missing_bases]).transpose()
bases_df_ha.columns = ["strain", "bases_missing"]
# Join the HA PCA embedding with the HA missing-base counts. The original merged
# `bases_df` (counts from the concatenated alignment) and left `bases_df_ha` unused.
merged_total_ha = bases_df_ha.merge(merged_pca_df_ha, on="strain")

In [22]:
# Join the concatenated PCA embedding with the concatenated missing-base counts, then display it.
merged_total = pd.merge(bases_df, merged_pca_df_concatenated, on="strain")
merged_total

Unnamed: 0,strain,bases_missing,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,pca10,pca_label,pca_label_default,date,y,clade_membership
0,A/Santiago/op004d0/2017,0,-3.185021,-1.940736,0.257223,4.319822,-0.796626,0.609973,-0.349581,-0.091315,-0.233527,-0.351683,2,5,2017.39,275,A2
1,A/Keelung/0089/2016,0,-3.176534,-1.866684,0.343122,4.143157,-0.763848,0.550454,-0.371115,-0.140355,-0.143543,-0.233923,2,5,2016.94,254,3c2.A
2,A/Linkou/0187/2016,1,-3.186428,-1.932152,0.292599,4.323086,-0.806002,0.624562,-0.346388,-0.138427,-0.145734,-0.257071,2,5,2016.99,285,A2
3,A/Linkou/0192/2017,0,-3.184653,-1.928738,0.279663,4.315856,-0.801493,0.621753,-0.341751,-0.135076,-0.149669,-0.245903,2,5,2017.01,288,A2
4,A/Taipei/0122/2017,0,-3.180151,-1.918730,0.291766,4.236607,-0.768382,0.593673,-0.318777,-0.130006,-0.114051,-0.211754,2,5,2017.09,311,A2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,A/Baltimore/P0222/2017,0,-3.386216,4.776130,-0.810514,0.354236,-0.235261,0.432351,0.801856,-2.734245,2.606473,3.738545,0,0,2017.97,16,3c3.A
740,A/Baltimore/0295/2017,0,-3.427499,7.088185,-0.850191,0.724849,-0.338181,0.722585,0.427667,-2.881333,3.102734,4.801115,0,0,2017.11,14,3c3.A
741,A/Baltimore/0244/2017,0,-3.443411,7.553117,-0.820070,0.345383,0.559628,0.723839,0.359374,-2.635600,2.884864,4.407106,0,0,2017.06,8,3c3.A
742,A/Rochester/U040/2017,26,11.851697,7.380112,-1.533720,0.651940,-0.387770,1.057212,0.399351,-3.303443,2.598185,4.906468,-1,-1,2017.14,13,3c3.A


In [23]:
def _pca_axis_titles(explained_variance, n_components=6):
    """Return 'PCAk (Explained Variance : p%)' titles for the first n_components."""
    return [
        f"PCA{k + 1} (Explained Variance : {round(explained_variance[k] * 100, 2)}%)"
        for k in range(n_components)
    ]


pca_fields = ['pca1','pca2','pca3','pca4', 'pca5', 'pca6']
hover_fields = ['strain', "clade_membership", "bases_missing"]

# HA-only charts, coloured by clade.
domain = sorted(merged_total_ha["clade_membership"].drop_duplicates().values)
palette_row = colors[len(domain):len(domain)+1].dropna(axis=1)
range_ = palette_row.values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(
    merged_total_ha,
    list(pca_fields),
    _pca_axis_titles(explained_variance_PCA_ha),
    "clade_membership:N",
    list(hover_fields),
    domain,
    range_,
)

# Concatenated charts, coloured by clade.
domain = sorted(merged_total["clade_membership"].drop_duplicates().values)
palette_row = colors[len(domain):len(domain)+1].dropna(axis=1)
range_ = palette_row.values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(
    merged_total,
    list(pca_fields),
    _pca_axis_titles(explained_variance_PCA_concatenated),
    "clade_membership:N",
    list(hover_fields),
    domain,
    range_,
)

# Place the first three returned charts side by side for each dataset.
PCAFluBrush_ha = list_of_chart_ha[0]|list_of_chart_ha[1]|list_of_chart_ha[2]
PCAFluBrush_concatenated = list_of_chart_concatenated[0]|list_of_chart_concatenated[1]|list_of_chart_concatenated[2]

alt.vconcat(PCAFluBrush_ha,PCAFluBrush_concatenated)
#PCAFluBrush.save("docs/PCAHaNaBrush.html")

In [24]:
# Two-component PCA chart built from the HA-only data (`merged_total_ha`).
# NOTE(review): the original labelled these HA axes with the concatenated
# explained variance and reused `domain`/`range_` left over from the
# concatenated data. Labels and colours now come from the HA data; the variable
# names are kept for compatibility. If the concatenated embedding was intended,
# switch the data frame to `merged_total` instead.
domain = sorted(merged_total_ha["clade_membership"].drop_duplicates().values)
range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(merged_total_ha,['pca1','pca2'],
                                         ['PCA1 (Explained Variance : {}%'.format(round(explained_variance_PCA_ha[0]*100,2)) + ")",
                                          'PCA2 (Explained Variance : {}%'.format(round(explained_variance_PCA_ha[1]*100,2)) + ")"],
                                         "clade_membership:N",['strain', "clade_membership", "bases_missing"], domain, range_)

PCAFluBrush_concatenated = list_of_chart_concatenated[0]|list_of_chart_concatenated[1]
#PCAFluBrush_concatenated.save("../docs/PCAFluBrushHA_.html")

# Running MDS on the Dataset

In [25]:
#MDS_df = pd.read_csv(mds_df,index_col=0)
# Load the MDS embeddings for HA only and for the concatenated alignment.
MDS_df_ha, MDS_df_concatenated = (
    pd.read_csv(embedding_path, index_col=0)
    for embedding_path in ("results/embed_mds_ha.csv", "results/embed_mds_concatenated.csv")
)

In [26]:
# Attach date, tree y-position and clade (from the HA node table) to each MDS embedding.
merged_mds_df_ha = pd.merge(
    MDS_df_ha,
    node_df_ha[["strain", "date", "y", "clade_membership"]],
    on="strain",
)
merged_mds_df_concatenated = pd.merge(
    MDS_df_concatenated,
    node_df_ha[["strain", "date", "y", "clade_membership"]],
    on="strain",
)

In [27]:
# Display the HA-only MDS embedding joined with its strain metadata.
merged_mds_df_ha

Unnamed: 0,strain,mds1,mds2,mds_label,mds_label_default,date,y,clade_membership
0,A/Santiago/op004d0/2017,-18.302112,2.166639,1,1,2017.39,275,A2
1,A/Keelung/0089/2016,-11.888039,3.038053,1,1,2016.94,254,3c2.A
2,A/Linkou/0187/2016,-11.495330,3.471363,1,1,2016.99,285,A2
3,A/Linkou/0192/2017,-12.452713,3.472346,1,1,2017.01,288,A2
4,A/Taipei/0122/2017,-10.689564,3.519868,1,1,2017.09,311,A2
...,...,...,...,...,...,...,...,...
739,A/Baltimore/P0222/2017,-18.423615,-40.127207,0,0,2017.97,16,3c3.A
740,A/Baltimore/0295/2017,-14.721068,-35.634124,0,0,2017.11,14,3c3.A
741,A/Baltimore/0244/2017,-13.370844,-35.242191,0,0,2017.06,8,3c3.A
742,A/Rochester/U040/2017,-15.018202,-35.556671,0,0,2017.14,13,3c3.A


In [28]:
# Interactive MDS1-vs-MDS2 scatterplot of the HA embedding, coloured by clade.
domain = sorted(merged_mds_df_ha["clade_membership"].unique())
palette_row = colors[len(domain):len(domain)+1].dropna(axis=1)
range_ = palette_row.values.tolist()[0]
chart_12_mds = scatterplot_with_tooltip_interactive(
    merged_mds_df_ha,
    'mds1', 'mds2',
    "mds1", "mds2",
    ['strain','clade_membership'],
    'clade_membership:N',
    domain,
    range_,
)
chart_12_mds

In [29]:
# HA-only row, coloured by clade.
domain = sorted(merged_mds_df_ha["clade_membership"].unique())
palette_row = colors[len(domain):len(domain)+1].dropna(axis=1)
range_ = palette_row.values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(
    merged_mds_df_ha,
    ['mds1','mds2'],
    ["MDS1", "MDS2"],
    'clade_membership:N',
    ['strain','clade_membership'],
    domain,
    range_,
)
chart_ha = list_of_chart_ha[0]|list_of_chart_ha[1]

# Concatenated row, coloured by clade.
domain = sorted(merged_mds_df_concatenated["clade_membership"].unique())
palette_row = colors[len(domain):len(domain)+1].dropna(axis=1)
range_ = palette_row.values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(
    merged_mds_df_concatenated,
    ['mds1','mds2'],
    ["MDS1", "MDS2"],
    'clade_membership:N',
    ['strain','clade_membership'],
    domain,
    range_,
)
chart_concat = list_of_chart_concatenated[0]|list_of_chart_concatenated[1]

# Stack the two rows and display.
alt.vconcat(chart_ha, chart_concat)

# HDBSCAN project:
- cluster on HA, find MCC value, same for HA+NA (from cluster_results script)
- check if HA+NA MCC > HA only

## MDS

In [30]:
# Reload the MDS embeddings for this section, now including the HA+NA+MA one.
MDS_df_ha, MDS_df_concatenated, MDS_df_ma_concatenated = (
    pd.read_csv(embedding_path, index_col=0)
    for embedding_path in (
        "results/embed_mds_ha.csv",
        "results/embed_mds_concatenated.csv",
        "results/embed_mds_ma_concatenated.csv",
    )
)

In [31]:
# Every MDS embedding takes its strain metadata from the HA node table.
node_metadata = node_df_ha[["strain", "date", "y", "clade_membership"]]
merged_mds_df_ha, merged_mds_df_concatenated, merged_mds_df_ma_concatenated = (
    embedding.merge(node_metadata, on="strain")
    for embedding in (MDS_df_ha, MDS_df_concatenated, MDS_df_ma_concatenated)
)

In [32]:
# Reference frame built from the true clade labels on the HA MDS embedding; its
# "clade_status" column is compared with the cluster-based frames below.
# NOTE(review): this call passes embedding="method" while the later cluster calls
# pass embedding="mds" — confirm against Helpers.get_euclidean_data_frame.
KDE_df_normal = get_euclidean_data_frame(sampled_df=merged_mds_df_ha, column_for_analysis="clade_membership", embedding="method", column_list=['mds1', 'mds2'])

In [33]:
# Load the HDBSCAN threshold results (one row per method/threshold entry, with
# training and validation MCC) from the seasonal-flu results directory.
full_output_df = pd.read_csv("../seasonal-flu-nextstrain/results/full_output_HDBSCAN.csv",index_col=0)

In [34]:
# Display the threshold-search results table.
full_output_df

Unnamed: 0,method,distance_threshold_number,confusion_matrix_training,matthews_cc_training,num_undefined_training,confusion_matrix_validation,matthews_cc_validation,threshold,num_undefined_validation
0,t-sne,t-sne_label_0,[[1318585 8838]\n [ 122004 60026]],0.504252,146,[[1328470 9941]\n [ 116214 54828]],0.489772,0.0,138
1,t-sne,t-sne_label_1,[[1314976 12447]\n [ 123966 58064]],0.477784,180,[[1330518 7893]\n [ 119258 51784]],0.482883,2.0,129
2,t-sne,t-sne_label_2,[[1322569 4854]\n [ 66074 115956]],0.760115,0,[[1332791 5620]\n [ 90320 80722]],0.638445,4.0,4
3,t-sne,t-sne_label_3,[[1322462 4961]\n [ 58689 123341]],0.786867,0,[[1325747 12664]\n [ 53306 117736]],0.765971,6.0,0
4,t-sne,t-sne_label_4,[[1315582 11841]\n [ 52189 129841]],0.786531,0,[[1304520 33891]\n [ 65098 105944]],0.649508,8.0,0
...,...,...,...,...,...,...,...,...,...
259,pca,pca_label_259,[[ 236908 1097723]\n [ 1 174821]],0.156155,1,[[ 252479 1078516]\n [ 1 178457]],0.164103,12.0,0
260,pca,pca_label_260,[[ 236908 1097723]\n [ 1 174821]],0.156155,1,[[ 252479 1078516]\n [ 1 178457]],0.164103,14.0,0
261,pca,pca_label_261,[[ 236908 1097723]\n [ 1 174821]],0.156155,1,[[ 252479 1078516]\n [ 1 178457]],0.164103,16.0,0
262,pca,pca_label_262,[[ 236908 1097723]\n [ 1 174821]],0.156155,1,[[ 252479 1078516]\n [ 1 178457]],0.164103,18.0,0


In [35]:
# Pick the HDBSCAN distance threshold with the highest mean validation MCC for MDS.
mds_results = full_output_df.groupby("method").get_group("mds")
mean_mcc_by_threshold = mds_results.groupby("threshold")["matthews_cc_validation"].mean()
# idxmax() returns the winning threshold value itself. The original took
# argmax() (a position among the per-threshold means) and used it as a row
# position in `mds_results`, which is right only when those rows happen to be
# one per threshold in sorted order.
best_threshold = mean_mcc_by_threshold.idxmax()
# Keep `method_dict` as a full result row (the first one at the chosen threshold).
method_dict = dict(mds_results[mds_results["threshold"] == best_threshold].iloc[0])
val = float(method_dict["threshold"])

In [36]:
import hdbscan

In [37]:
# Cluster the HA-only MDS coordinates, using `val` as cluster_selection_epsilon.
clusterer = hdbscan.HDBSCAN(cluster_selection_epsilon=val)
ha_mds_coordinates = merged_mds_df_ha[["mds1", "mds2"]]
clusterer.fit(ha_mds_coordinates)
# Store the labels as strings in a new column next to the existing "mds_label".
merged_mds_df_ha["mds_label_1"] = clusterer.labels_.astype(str)

In [38]:
# Interactive MDS scatterplot of the HA embedding, coloured by the new HDBSCAN labels.
domain = merged_mds_df_ha["mds_label_1"].unique()
palette_row = colors[len(domain)-1:len(domain)].dropna(axis=1)
range_ = palette_row.values.tolist()[0]
chart_12_mds = scatterplot_with_tooltip_interactive(
    merged_mds_df_ha,
    'mds1', 'mds2',
    "mds1", "mds2",
    ['strain','clade_membership'],
    'mds_label_1:N',
    domain,
    range_,
)
chart_12_mds

In [39]:
from sklearn.metrics import confusion_matrix, matthews_corrcoef

In [40]:
# Score the HA "mds_label" clusters against the clade-based reference frame.
cluster_input = merged_mds_df_ha[["mds1", "mds2", "strain", "mds_label"]]
KDE_df_cluster = get_euclidean_data_frame(
    sampled_df=cluster_input,
    column_for_analysis="mds_label",
    embedding="mds",
    column_list=["mds1", "mds2"],
)
reference_status = KDE_df_normal["clade_status"]
cluster_status = KDE_df_cluster["clade_status"]
confusion_matrix_val_ha = confusion_matrix(reference_status, cluster_status)
matthews_cc_val_ha = matthews_corrcoef(reference_status, cluster_status)

In [41]:
# Score the HA+NA "mds_label" clusters against the clade-based reference frame.
cluster_input = merged_mds_df_concatenated[["mds1", "mds2", "strain", "mds_label"]]
KDE_df_cluster = get_euclidean_data_frame(
    sampled_df=cluster_input,
    column_for_analysis="mds_label",
    embedding="mds",
    column_list=["mds1", "mds2"],
)
reference_status = KDE_df_normal["clade_status"]
cluster_status = KDE_df_cluster["clade_status"]
confusion_matrix_val_concatenated = confusion_matrix(reference_status, cluster_status)
matthews_cc_val_concatenated = matthews_corrcoef(reference_status, cluster_status)

In [42]:
# Score the HA+NA+MA "mds_label" clusters against the clade-based reference frame.
cluster_input = merged_mds_df_ma_concatenated[["mds1", "mds2", "strain", "mds_label"]]
KDE_df_cluster = get_euclidean_data_frame(
    sampled_df=cluster_input,
    column_for_analysis="mds_label",
    embedding="mds",
    column_list=["mds1", "mds2"],
)
reference_status = KDE_df_normal["clade_status"]
cluster_status = KDE_df_cluster["clade_status"]
confusion_matrix_val_ma_concatenated = confusion_matrix(reference_status, cluster_status)
matthews_cc_val_ma_concatenated = matthews_corrcoef(reference_status, cluster_status)

In [43]:
# Three stacked rows (HA, HA+NA, HA+NA+MA), each coloured by that dataset's own
# "mds_label" values and titled with its MCC. When label -1 is present it sorts
# first, so grey ("#999999") is prepended to line up with it.

# HA only
domain = sorted(merged_mds_df_ha["mds_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(merged_mds_df_ha,['mds1','mds2'],["MDS1", "MDS2"], 'mds_label:N', ['strain','clade_membership'], domain, range_)
chart_ha = list_of_chart_ha[0]|list_of_chart_ha[1].properties(title="HA only MCC: " + str(round(matthews_cc_val_ha,4)))

# HA + NA
domain = sorted(merged_mds_df_concatenated["mds_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(merged_mds_df_concatenated,['mds1','mds2'],["MDS1", "MDS2"], 'mds_label:N', ['strain','clade_membership'], domain, range_)
chart_concat = list_of_chart_concatenated[0]|list_of_chart_concatenated[1].properties(title="HA + NA MCC: " + str(round(matthews_cc_val_concatenated,4)))

# HA + NA + MA: take the domain from this dataset's own labels. The original
# reused the HA + NA domain, so a differing set of labels here would not match
# the colour scale.
domain = sorted(merged_mds_df_ma_concatenated["mds_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(merged_mds_df_ma_concatenated,['mds1','mds2'],["MDS1", "MDS2"], 'mds_label:N', ['strain','clade_membership'], domain, range_)
chart_concat_ma = list_of_chart_concatenated[0]|list_of_chart_concatenated[1].properties(title="HA + NA + MA MCC: " + str(round(matthews_cc_val_ma_concatenated,4)))

# Independent colour scales, because each row has its own label domain.
final_chart = alt.vconcat(chart_ha, chart_concat, chart_concat_ma).resolve_scale(color='independent')
save(final_chart, "../docs/HANAMAFullChartBrushableMDS.html")
save(final_chart, "../docs/HANAMAFullChartBrushableMDS.png", scale_factor=2.0)

In [44]:
# HA-only row, coloured by clade (domain kept in order of first appearance) and titled with its MCC.
domain = merged_mds_df_ha["clade_membership"].unique()
palette_row = colors[len(domain)-1:len(domain)].dropna(axis=1)
range_ = palette_row.values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(
    merged_mds_df_ha,
    ['mds1','mds2'],
    ["MDS1", "MDS2"],
    'clade_membership:N',
    ['strain','clade_membership'],
    domain,
    range_,
)
ha_title = "MCC: " + str(round(matthews_cc_val_ha,4))
chart_ha = list_of_chart_ha[0]|list_of_chart_ha[1].properties(title=ha_title)

# HA + NA row, built the same way.
domain = merged_mds_df_concatenated["clade_membership"].unique()
palette_row = colors[len(domain)-1:len(domain)].dropna(axis=1)
range_ = palette_row.values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(
    merged_mds_df_concatenated,
    ['mds1','mds2'],
    ["MDS1", "MDS2"],
    'clade_membership:N',
    ['strain','clade_membership'],
    domain,
    range_,
)
concat_title = "MCC: " + str(round(matthews_cc_val_concatenated,4))
chart_concat = list_of_chart_concatenated[0]|list_of_chart_concatenated[1].properties(title=concat_title)

chart_total = alt.vconcat(chart_ha, chart_concat)
chart_total


In [45]:
# Two stacked rows (HA, HA+NA) coloured by "mds_label" and titled with their MCC.
# When label -1 is present it sorts first, so grey ("#999999") is prepended.

# HA only
domain = sorted(merged_mds_df_ha["mds_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(merged_mds_df_ha,['mds1','mds2'],["MDS1", "MDS2"], 'mds_label:N', ['strain','clade_membership'], domain, range_)
chart_ha = list_of_chart_ha[0]|list_of_chart_ha[1].properties(title="MCC: " + str(round(matthews_cc_val_ha,4)))

# HA + NA: the domain must come from the concatenated frame being plotted; the
# original recomputed it from `merged_mds_df_ha` (copy-paste slip).
domain = sorted(merged_mds_df_concatenated["mds_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(merged_mds_df_concatenated,['mds1','mds2'],["MDS1", "MDS2"], 'mds_label:N', ['strain','clade_membership'], domain, range_)
chart_concat = list_of_chart_concatenated[0]|list_of_chart_concatenated[1].properties(title="MCC: " + str(round(matthews_cc_val_concatenated,4)))

final_chart = alt.vconcat(chart_ha, chart_concat)
save(final_chart, "../docs/HANAFullChartBrushableMDS.html")
save(final_chart, "../docs/HANAFullChartBrushableMDS.png", scale_factor=2.0)

## HDBSCAN clustering on t-SNE 

In [46]:
# Load the t-SNE embeddings for HA, HA+NA and HA+NA+MA.
TSNE_df_ha, TSNE_df_concatenated, TSNE_df_ma_concatenated = (
    pd.read_csv(embedding_path, index_col=0)
    for embedding_path in (
        "results/embed_t-sne_ha.csv",
        "results/embed_t-sne_concatenated.csv",
        "results/embed_t-sne_ma_concatenated.csv",
    )
)

In [47]:
# Every t-SNE embedding takes its strain metadata from the HA node table.
node_metadata = node_df_ha[["strain", "date", "y", "clade_membership"]]
merged_tsne_df_ha, merged_tsne_df_concatenated, merged_tsne_df_ma_concatenated = (
    embedding.merge(node_metadata, on="strain")
    for embedding in (TSNE_df_ha, TSNE_df_concatenated, TSNE_df_ma_concatenated)
)

In [48]:
# Reference frame built from the true clade labels on the HA t-SNE embedding; its
# "clade_status" column is compared with the cluster-based frames below.
# NOTE(review): this call passes embedding="method" while the later cluster calls
# pass embedding="tsne" — confirm against Helpers.get_euclidean_data_frame.
KDE_df_normal = get_euclidean_data_frame(sampled_df=merged_tsne_df_ha, column_for_analysis="clade_membership", embedding="method", column_list=['tsne_x', 'tsne_y'])

In [49]:
# Re-read the same HDBSCAN threshold results file that the MDS section loaded,
# so this section does not depend on that cell having been run.
full_output_df = pd.read_csv("../seasonal-flu-nextstrain/results/full_output_HDBSCAN.csv",index_col=0)

In [50]:
# Display the threshold-search results table.
full_output_df

Unnamed: 0,method,distance_threshold_number,confusion_matrix_training,matthews_cc_training,num_undefined_training,confusion_matrix_validation,matthews_cc_validation,threshold,num_undefined_validation
0,t-sne,t-sne_label_0,[[1318585 8838]\n [ 122004 60026]],0.504252,146,[[1328470 9941]\n [ 116214 54828]],0.489772,0.0,138
1,t-sne,t-sne_label_1,[[1314976 12447]\n [ 123966 58064]],0.477784,180,[[1330518 7893]\n [ 119258 51784]],0.482883,2.0,129
2,t-sne,t-sne_label_2,[[1322569 4854]\n [ 66074 115956]],0.760115,0,[[1332791 5620]\n [ 90320 80722]],0.638445,4.0,4
3,t-sne,t-sne_label_3,[[1322462 4961]\n [ 58689 123341]],0.786867,0,[[1325747 12664]\n [ 53306 117736]],0.765971,6.0,0
4,t-sne,t-sne_label_4,[[1315582 11841]\n [ 52189 129841]],0.786531,0,[[1304520 33891]\n [ 65098 105944]],0.649508,8.0,0
...,...,...,...,...,...,...,...,...,...
259,pca,pca_label_259,[[ 236908 1097723]\n [ 1 174821]],0.156155,1,[[ 252479 1078516]\n [ 1 178457]],0.164103,12.0,0
260,pca,pca_label_260,[[ 236908 1097723]\n [ 1 174821]],0.156155,1,[[ 252479 1078516]\n [ 1 178457]],0.164103,14.0,0
261,pca,pca_label_261,[[ 236908 1097723]\n [ 1 174821]],0.156155,1,[[ 252479 1078516]\n [ 1 178457]],0.164103,16.0,0
262,pca,pca_label_262,[[ 236908 1097723]\n [ 1 174821]],0.156155,1,[[ 252479 1078516]\n [ 1 178457]],0.164103,18.0,0


In [51]:
# Pick the HDBSCAN distance threshold with the highest mean validation MCC for t-SNE.
tsne_results = full_output_df.groupby("method").get_group("t-sne")
mean_mcc_by_threshold = tsne_results.groupby("threshold")["matthews_cc_validation"].mean()
# idxmax() returns the winning threshold value itself. The original took
# argmax() (a position among the per-threshold means) and used it as a row
# position in `tsne_results`, which is right only when those rows happen to be
# one per threshold in sorted order.
best_threshold = mean_mcc_by_threshold.idxmax()
# Keep `method_dict` as a full result row (the first one at the chosen threshold).
method_dict = dict(tsne_results[tsne_results["threshold"] == best_threshold].iloc[0])
val = float(method_dict["threshold"])

In [52]:
import hdbscan

In [53]:
# Cluster each t-SNE embedding (HA, HA+NA, HA+NA+MA) with its own HDBSCAN
# instance, all using `val` as cluster_selection_epsilon.
clusterer, clusterer_concat, clusterer_concat_ma = (
    hdbscan.HDBSCAN(cluster_selection_epsilon=val) for _ in range(3)
)
tsne_jobs = (
    (clusterer, merged_tsne_df_ha),
    (clusterer_concat, merged_tsne_df_concatenated),
    (clusterer_concat_ma, merged_tsne_df_ma_concatenated),
)
for tsne_clusterer, tsne_frame in tsne_jobs:
    tsne_clusterer.fit(tsne_frame[["tsne_x", "tsne_y"]])
    # Labels are stored as strings.
    tsne_frame["tsne_label_1"] = tsne_clusterer.labels_.astype(str)

In [54]:
# Interactive t-SNE scatterplot of the HA embedding, coloured by HDBSCAN label.
domain = merged_tsne_df_ha["tsne_label_1"].unique()
palette_row = colors[len(domain)-1:len(domain)].dropna(axis=1)
range_ = palette_row.values.tolist()[0]
chart_12_tsne = scatterplot_with_tooltip_interactive(
    merged_tsne_df_ha,
    'tsne_x', 'tsne_y',
    "tsne_x", "tsne_y",
    ['strain','clade_membership'],
    'tsne_label_1:N',
    domain,
    range_,
)
chart_12_tsne

In [55]:
from sklearn.metrics import confusion_matrix, matthews_corrcoef

In [56]:
# Score the HA t-SNE clusters against the clade-based reference frame.
cluster_input = merged_tsne_df_ha[["tsne_x", "tsne_y", "strain", "tsne_label_1"]]
KDE_df_cluster = get_euclidean_data_frame(
    sampled_df=cluster_input,
    column_for_analysis="tsne_label_1",
    embedding="tsne",
    column_list=["tsne_x", "tsne_y"],
)
reference_status = KDE_df_normal["clade_status"]
cluster_status = KDE_df_cluster["clade_status"]
confusion_matrix_val_ha = confusion_matrix(reference_status, cluster_status)
matthews_cc_val_ha = matthews_corrcoef(reference_status, cluster_status)

In [57]:
# Score the HA+NA t-SNE clusters against the clade-based reference frame.
cluster_input = merged_tsne_df_concatenated[["tsne_x", "tsne_y", "strain", "tsne_label_1"]]
KDE_df_cluster = get_euclidean_data_frame(
    sampled_df=cluster_input,
    column_for_analysis="tsne_label_1",
    embedding="tsne",
    column_list=["tsne_x", "tsne_y"],
)
reference_status = KDE_df_normal["clade_status"]
cluster_status = KDE_df_cluster["clade_status"]
confusion_matrix_val_concatenated = confusion_matrix(reference_status, cluster_status)
matthews_cc_val_concatenated = matthews_corrcoef(reference_status, cluster_status)

In [58]:
# Score the HA+NA+MA t-SNE clusters against the clade-based reference frame.
cluster_input = merged_tsne_df_ma_concatenated[["tsne_x", "tsne_y", "strain", "tsne_label_1"]]
KDE_df_cluster = get_euclidean_data_frame(
    sampled_df=cluster_input,
    column_for_analysis="tsne_label_1",
    embedding="tsne",
    column_list=["tsne_x", "tsne_y"],
)
reference_status = KDE_df_normal["clade_status"]
cluster_status = KDE_df_cluster["clade_status"]
confusion_matrix_val_ma_concatenated = confusion_matrix(reference_status, cluster_status)
matthews_cc_val_ma_concatenated = matthews_corrcoef(reference_status, cluster_status)

In [59]:
# Three stacked rows (HA, HA+NA, HA+NA+MA), each coloured by that dataset's own
# HDBSCAN labels on the t-SNE embedding and titled with its MCC.
#
# Fixes relative to the original cell:
#  * "tsne_label_1" holds strings (labels were cast with astype(str)), so the
#    grey-colour check must look for "-1"; `-1 in domain` was never true.
#  * the HA+NA+MA row now derives its domain from its own labels instead of
#    reusing the HA+NA domain.
#  * axis titles say t-SNE rather than the copy-pasted "MDS1"/"MDS2".
# "-1" sorts before every digit string, so the prepended grey lines up with it.

# HA only
domain = sorted(merged_tsne_df_ha["tsne_label_1"].drop_duplicates().values)
if "-1" in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(merged_tsne_df_ha,['tsne_x','tsne_y'],["t-SNE1", "t-SNE2"], 'tsne_label_1:N', ['strain','clade_membership'], domain, range_)
chart_ha = list_of_chart_ha[0]|list_of_chart_ha[1].properties(title="HA only MCC: " + str(round(matthews_cc_val_ha,4)))

# HA + NA
domain = sorted(merged_tsne_df_concatenated["tsne_label_1"].drop_duplicates().values)
if "-1" in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(merged_tsne_df_concatenated,['tsne_x','tsne_y'],["t-SNE1", "t-SNE2"], 'tsne_label_1:N', ['strain','clade_membership'], domain, range_)
chart_concat = list_of_chart_concatenated[0]|list_of_chart_concatenated[1].properties(title="HA + NA MCC: " + str(round(matthews_cc_val_concatenated,4)))

# HA + NA + MA
domain = sorted(merged_tsne_df_ma_concatenated["tsne_label_1"].drop_duplicates().values)
if "-1" in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(merged_tsne_df_ma_concatenated,['tsne_x','tsne_y'],["t-SNE1", "t-SNE2"], 'tsne_label_1:N', ['strain','clade_membership'], domain, range_)
chart_concat_ma = list_of_chart_concatenated[0]|list_of_chart_concatenated[1].properties(title="HA + NA + MA MCC: " + str(round(matthews_cc_val_ma_concatenated,4)))

# Independent colour scales, because each row has its own label domain.
final_chart = alt.vconcat(chart_ha, chart_concat, chart_concat_ma).resolve_scale(color='independent')
save(final_chart, "../docs/HANAMAFullChartBrushableTSNE.html",)
save(final_chart, "../docs/HANAMAFullChartBrushableTSNE.png", scale_factor=2.0)

In [60]:
# Two-row brushable t-SNE figure: HA only on top, HA + NA below.

# HA only: sorted cluster labels, with grey reserved for the -1 label when present.
domain = sorted(merged_tsne_df_ha["tsne_label_1"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(merged_tsne_df_ha,['tsne_x','tsne_y'],["MDS1", "MDS2"], 'tsne_label_1:N', ['strain','clade_membership'], domain, range_)
chart_ha = list_of_chart_ha[0]|list_of_chart_ha[1].properties(title="MCC: " + str(round(matthews_cc_val_ha,4)))

# HA + NA: build the colour scale the same way as for HA (sorted labels, grey for -1)
# so the domain and range stay position-matched.
domain = sorted(merged_tsne_df_concatenated["tsne_label_1"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(merged_tsne_df_concatenated,['tsne_x','tsne_y'],["MDS1", "MDS2"], 'tsne_label_1:N', ['strain','clade_membership'], domain, range_)
chart_concat = list_of_chart_concatenated[0]|list_of_chart_concatenated[1].properties(title="MCC: " + str(round(matthews_cc_val_concatenated,4)))

# Each row keeps its own colour scale because the label sets differ per dataset.
final_chart = alt.vconcat(chart_ha, chart_concat).resolve_scale(color='independent')
save(final_chart, "../docs/HANAFullChartBrushableTSNE.html")
save(final_chart, "../docs/HANAFullChartBrushableTSNE.png", scale_factor=2.0)

# Running T-SNE on the Dataset 

In [67]:
# Load the t-SNE embedding tables (first CSV column becomes the index).
TSNE_df_ha, TSNE_df_concatenated = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "results/embed_t-sne_ha.csv",
        "results/embed_t-sne_concatenated.csv",
    )
)

In [68]:
# Attach date, tree y position and clade membership from node_df_ha to each
# t-SNE embedding, matching rows on strain.
merged_tsne_df_ha, merged_tsne_df_concatenated = (
    embedding_df.merge(
        node_df_ha[["strain", "date", "y", "clade_membership"]],
        on="strain",
    )
    for embedding_df in (TSNE_df_ha, TSNE_df_concatenated)
)

In [70]:
# Colour scale for the HA t-SNE scatterplot: sorted distinct clade names mapped to
# one row of the notebook-level colors table (grey is prepended if -1 is a label).
domain = sorted(merged_tsne_df_ha["clade_membership"].drop_duplicates().values)
range_ = (
    ["#999999"] + colors[len(domain) - 1:len(domain)].dropna(axis=1).values.tolist()[0]
    if -1 in domain
    else colors[len(domain):len(domain) + 1].dropna(axis=1).values.tolist()[0]
)
scatterplot_with_tooltip_interactive(
    merged_tsne_df_ha,
    "tsne_x",
    "tsne_y",
    "tsne_x",
    "tsne_y",
    ["strain", "clade_membership"],
    "clade_membership:N",
    domain,
    range_,
)

In [72]:
# Colour scale for the concatenated t-SNE scatterplot: sorted distinct clade names mapped
# to one row of the notebook-level colors table (grey is prepended if -1 is a label).
domain = sorted(merged_tsne_df_concatenated["clade_membership"].drop_duplicates().values)
range_ = (
    ["#999999"] + colors[len(domain) - 1:len(domain)].dropna(axis=1).values.tolist()[0]
    if -1 in domain
    else colors[len(domain):len(domain) + 1].dropna(axis=1).values.tolist()[0]
)
scatterplot_with_tooltip_interactive(
    merged_tsne_df_concatenated,
    "tsne_x",
    "tsne_y",
    "tsne_x",
    "tsne_y",
    ["strain", "clade_membership"],
    "clade_membership:N",
    domain,
    range_,
)

In [73]:
# Linked brushable charts for the HA t-SNE embedding, coloured by clade.
# NOTE(review): domain and range_ are not assigned in this cell; they hold whatever
# the most recently run cell left behind — confirm they suit merged_tsne_df_ha.
list_of_chart_ha = linking_tree_with_plots_brush(
    merged_tsne_df_ha,
    ["tsne_x", "tsne_y"],
    ["tsne_x", "tsne_y"],
    "clade_membership:N",
    ["strain:N", "clade_membership:N"],
    domain,
    range_,
)
# Put the first two returned charts side by side.
chart_tsne_ha = alt.hconcat(list_of_chart_ha[0], list_of_chart_ha[1])
chart_tsne_ha

In [74]:
# Linked brushable charts for the concatenated t-SNE embedding, coloured by clade.
# NOTE(review): domain and range_ are not assigned in this cell; they hold whatever
# the most recently run cell left behind — confirm they suit merged_tsne_df_concatenated.
list_of_chart_concatenated = linking_tree_with_plots_brush(
    merged_tsne_df_concatenated,
    ["tsne_x", "tsne_y"],
    ["tsne_x", "tsne_y"],
    "clade_membership:N",
    ["strain:N", "clade_membership:N"],
    domain,
    range_,
)
# Put the first two returned charts side by side.
chart_tsne_concatenated = alt.hconcat(
    list_of_chart_concatenated[0],
    list_of_chart_concatenated[1],
)
chart_tsne_concatenated

In [None]:
# Stack the HA row above the concatenated row.
alt.vconcat(chart_tsne_ha, chart_tsne_concatenated)

# Running UMAP on the Dataset

In [75]:
# Load the UMAP embedding tables (first CSV column becomes the index).
UMAP_df_ha, UMAP_df_concatenated = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "results/embed_umap_ha.csv",
        "results/embed_umap_concatenated.csv",
    )
)

In [76]:
# Display the concatenated UMAP embedding table.
UMAP_df_concatenated

Unnamed: 0_level_0,umap_x,umap_y,umap_label,umap_label_default
strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A/Santiago/op004d0/2017,10.823834,2.324666,1,4
A/Keelung/0089/2016,10.146812,2.419663,1,4
A/Linkou/0187/2016,9.969683,4.434700,1,4
A/Linkou/0192/2017,10.187099,3.684332,1,4
A/Taipei/0122/2017,9.470983,4.036418,1,4
...,...,...,...,...
A/Baltimore/P0222/2017,-2.873133,16.333400,0,0
A/Baltimore/0295/2017,-1.949247,16.234972,0,0
A/Baltimore/0244/2017,-2.276079,16.466290,0,0
A/Rochester/U040/2017,-2.317351,16.291090,0,0


In [77]:
# Attach date, tree y position and clade membership from node_df_ha to each
# UMAP embedding, matching rows on strain.
merged_umap_df_ha, merged_umap_df_concatenated = (
    embedding_df.merge(
        node_df_ha[["strain", "date", "y", "clade_membership"]],
        on="strain",
    )
    for embedding_df in (UMAP_df_ha, UMAP_df_concatenated)
)

In [78]:
# Check that both UMAP tables list the same index values in the same order.
list(UMAP_df_ha.index) == list(UMAP_df_concatenated.index)

True

# Linking all plots together clickable with Tree

In [79]:
# Start from the tree metadata and join the concatenated PCA, MDS, t-SNE and UMAP
# tables onto it one after another, always matching on strain.
merged_df = node_df_ha[["strain", "date", "y", "clade_membership"]]
for embedding_df in (
    principalDf_concatenated,
    MDS_df_concatenated,
    TSNE_df_concatenated,
    UMAP_df_concatenated,
):
    merged_df = merged_df.merge(embedding_df, on="strain")

In [80]:
# Display the merged metadata + embeddings table.
merged_df

Unnamed: 0,strain,date,y,clade_membership,pca1,pca2,pca3,pca4,pca5,pca6,...,mds_label,mds_label_default,tsne_x,tsne_y,t-sne_label,t-sne_label_default,umap_x,umap_y,umap_label,umap_label_default
0,A/MUWRP-Uganda/579/2016,2016.67,1,3c3,-3.721439,6.364175,-0.700366,0.598538,-1.075811,-0.049365,...,-1,-1,-34.307045,27.437191,3,3,-2.405527,16.799713,0,0
1,A/Louisiana/17/2017,2017.17,2,3c3.A,-3.438285,7.695400,-0.849551,0.310337,1.076032,0.758395,...,-1,-1,-25.732840,17.357395,3,3,-2.149100,16.728508,0,0
2,A/Texas/305/2017,2017.91,3,3c3.A,-2.920179,7.818879,-0.818226,0.331052,1.049173,0.821110,...,-1,-1,-25.884304,17.307245,3,3,-2.308701,16.891626,0,0
3,A/Brisbane/1/2017,2017.00,4,3c3.A,11.836096,7.860436,-1.531334,0.276456,0.502111,1.056194,...,0,0,-24.909449,17.660292,3,3,-2.242783,16.581036,0,0
4,A/Colorado/04/2017,2017.06,5,3c3.A,-3.776287,7.395584,-0.751809,-0.342179,0.608877,1.379814,...,0,0,-24.864594,17.525242,3,3,-2.300069,16.396772,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,A/Maryland/47/2017,2017.69,740,A1b/135N,-3.246497,-1.324287,0.179491,-3.190549,-1.582956,1.355593,...,4,4,-2.366085,-23.162296,7,6,12.894024,12.574149,1,1
740,A/Maryland/53/2017,2017.74,741,A1b/135N,-3.575301,-1.250429,0.108631,-3.317520,-1.752572,1.565829,...,4,4,-2.985781,-23.405695,7,6,12.861002,12.835609,1,1
741,A/Missouri/31/2017,2017.80,742,A1b/135N,-3.250741,-1.330341,0.132081,-3.189558,-1.578110,1.361089,...,4,4,-2.592756,-22.667840,7,6,13.056715,12.680791,1,1
742,A/Michigan/276/2017,2017.61,743,A1b/135N,-3.249422,-1.331240,0.131545,-3.191381,-1.578756,1.357296,...,4,4,-2.533548,-22.691792,7,6,12.944910,12.419623,1,1


In [81]:
# Linked brushable charts built from node_df_ha, restricted to strains present in merged_df.
# Columns and axis titles are listed pairwise in the same order.
# NOTE(review): the PCA titles read "Expected Variance" while the numbers come from
# explained_variance_PCA_ha — confirm the intended wording.
# NOTE(review): domain and range_ are inherited from an earlier cell.
data = linking_tree_with_plots_brush(
    node_df_ha.merge(merged_df[["strain"]], on="strain"),
    [
        "mds1", "mds2",
        "tsne_x", "tsne_y",
        "pca1", "pca2",
        "umap_x", "umap_y",
    ],
    [
        "MDS1", "MDS2",
        "TSNE1", "TSNE2",
        "PCA1 (Expected Variance : {}%)".format(round(explained_variance_PCA_ha[0] * 100, 2)),
        "PCA2 (Expected Variance : {}%)".format(round(explained_variance_PCA_ha[1] * 100, 2)),
        "UMAP1", "UMAP2",
    ],
    "clade_membership:N",
    ["strain"],
    domain,
    range_,
)

In [82]:
# Arrange four of the returned charts in a 2x2 grid and place data[0] on top.
# NOTE(review): the names suggest data[3]/data[1] are the PCA/MDS panels and
# data[2]/data[4] the t-SNE/UMAP panels, with data[0] the tree — confirm against the helper.
PCAMDS = data[3] | data[1]
TSNEUMAP = data[2] | data[4]
embeddings = alt.vconcat(PCAMDS, TSNEUMAP)
fullChart = alt.vconcat(data[0], embeddings)
# Only the last expression of a cell is rendered, so display the full chart once here.
fullChart

In [83]:
# Independent copy of the rows whose first principal component is below 10.
filtered_merged_df = merged_df.loc[merged_df["pca1"].lt(10)].copy()

In [84]:
# (rows, columns) remaining after the pca1 filter.
filtered_merged_df.shape

(601, 28)

In [85]:
# (rows, columns) of the unfiltered table, for comparison with filtered_merged_df.
merged_df.shape

(744, 28)

In [86]:
# Linked brushable charts built from merged_df, using PCA components 3 and 4.
# Columns and axis titles are listed pairwise in the same order.
# NOTE(review): the PCA titles read "Expected Variance" while the numbers come from
# explained_variance_PCA_concatenated — confirm the intended wording.
# NOTE(review): domain and range_ are inherited from an earlier cell.
data = linking_tree_with_plots_brush(
    merged_df,
    [
        "mds1", "mds2",
        "tsne_x", "tsne_y",
        "pca3", "pca4",
        "umap_x", "umap_y",
    ],
    [
        "MDS1", "MDS2",
        "TSNE1", "TSNE2",
        "PCA3 (Expected Variance : {}%)".format(round(explained_variance_PCA_concatenated[2] * 100, 2)),
        "PCA4 (Expected Variance : {}%)".format(round(explained_variance_PCA_concatenated[3] * 100, 2)),
        "UMAP1", "UMAP2",
    ],
    "clade_membership:N",
    ["strain", "clade_membership"],
    domain,
    range_,
)

In [87]:
# Arrange four of the returned charts in a 2x2 grid and place data[0] on top.
# NOTE(review): the names suggest data[3]/data[1] are the PCA/MDS panels and
# data[2]/data[4] the t-SNE/UMAP panels, with data[0] the tree — confirm against the helper.
PCAMDS = data[3] | data[1]
TSNEUMAP = data[2] | data[4]
embeddings = alt.vconcat(PCAMDS, TSNEUMAP)
fullChart = alt.vconcat(data[0], embeddings)
# Only the last expression of a cell is rendered, so display the full chart once here.
fullChart

In [88]:
# Final Chart

In [91]:
# List the column names available in merged_df.
merged_df.columns

Index(['strain', 'date', 'y', 'clade_membership', 'pca1', 'pca2', 'pca3',
       'pca4', 'pca5', 'pca6', 'pca7', 'pca8', 'pca9', 'pca10', 'pca_label',
       'pca_label_default', 'mds1', 'mds2', 'mds_label', 'mds_label_default',
       'tsne_x', 'tsne_y', 't-sne_label', 't-sne_label_default', 'umap_x',
       'umap_y', 'umap_label', 'umap_label_default'],
      dtype='object')

In [92]:
# Display the merged metadata + embeddings table.
merged_df

Unnamed: 0,strain,date,y,clade_membership,pca1,pca2,pca3,pca4,pca5,pca6,...,mds_label,mds_label_default,tsne_x,tsne_y,t-sne_label,t-sne_label_default,umap_x,umap_y,umap_label,umap_label_default
0,A/MUWRP-Uganda/579/2016,2016.67,1,3c3,-3.721439,6.364175,-0.700366,0.598538,-1.075811,-0.049365,...,-1,-1,-34.307045,27.437191,3,3,-2.405527,16.799713,0,0
1,A/Louisiana/17/2017,2017.17,2,3c3.A,-3.438285,7.695400,-0.849551,0.310337,1.076032,0.758395,...,-1,-1,-25.732840,17.357395,3,3,-2.149100,16.728508,0,0
2,A/Texas/305/2017,2017.91,3,3c3.A,-2.920179,7.818879,-0.818226,0.331052,1.049173,0.821110,...,-1,-1,-25.884304,17.307245,3,3,-2.308701,16.891626,0,0
3,A/Brisbane/1/2017,2017.00,4,3c3.A,11.836096,7.860436,-1.531334,0.276456,0.502111,1.056194,...,0,0,-24.909449,17.660292,3,3,-2.242783,16.581036,0,0
4,A/Colorado/04/2017,2017.06,5,3c3.A,-3.776287,7.395584,-0.751809,-0.342179,0.608877,1.379814,...,0,0,-24.864594,17.525242,3,3,-2.300069,16.396772,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,A/Maryland/47/2017,2017.69,740,A1b/135N,-3.246497,-1.324287,0.179491,-3.190549,-1.582956,1.355593,...,4,4,-2.366085,-23.162296,7,6,12.894024,12.574149,1,1
740,A/Maryland/53/2017,2017.74,741,A1b/135N,-3.575301,-1.250429,0.108631,-3.317520,-1.752572,1.565829,...,4,4,-2.985781,-23.405695,7,6,12.861002,12.835609,1,1
741,A/Missouri/31/2017,2017.80,742,A1b/135N,-3.250741,-1.330341,0.132081,-3.189558,-1.578110,1.361089,...,4,4,-2.592756,-22.667840,7,6,13.056715,12.680791,1,1
742,A/Michigan/276/2017,2017.61,743,A1b/135N,-3.249422,-1.331240,0.131545,-3.191381,-1.578756,1.357296,...,4,4,-2.533548,-22.691792,7,6,12.944910,12.419623,1,1


In [93]:
# Relabel node_df_ha in place: every column except strain (first) and y (last)
# gets an "_ha" suffix. The assignment is positional, so it relies on node_df_ha
# having exactly these 18 columns in this order.
node_df_ha.columns = (
    ["strain"]
    + [
        column_name + "_ha"
        for column_name in (
            "date",
            "pca1", "pca2", "pca3", "pca4",
            "mds1", "mds2",
            "tsne_x", "tsne_y",
            "umap_x", "umap_y",
            "clade_membership",
            "pca_label", "mds_label", "umap_label", "t-sne_label",
        )
    ]
    + ["y"]
)

In [94]:
# Display node_df_ha with its relabelled columns.
node_df_ha

Unnamed: 0,strain,date_ha,pca1_ha,pca2_ha,pca3_ha,pca4_ha,mds1_ha,mds2_ha,tsne_x_ha,tsne_y_ha,umap_x_ha,umap_y_ha,clade_membership_ha,pca_label_ha,mds_label_ha,umap_label_ha,t-sne_label_ha,y
0,A/MUWRP-Uganda/579/2016,2016.67,5.01,1.20,-0.43,0.12,-27.31,-28.94,43.02,-5.57,-8.66,2.51,3c3,2,-1,0,2,1
1,A/Louisiana/17/2017,2017.17,6.63,1.07,1.18,-1.24,18.08,-41.19,33.79,-8.40,-9.44,2.71,3c3.A,1,-1,0,1,2
2,A/Texas/305/2017,2017.91,6.63,1.08,1.13,-1.37,17.23,-39.21,33.81,-8.37,-9.59,2.84,3c3.A,1,-1,0,1,3
3,A/Brisbane/1/2017,2017.00,6.70,1.14,1.02,-1.32,12.27,-35.29,32.92,-9.46,-8.97,3.17,3c3.A,1,0,0,1,4
4,A/Colorado/04/2017,2017.06,6.54,0.54,1.06,-2.08,13.21,-33.79,32.92,-9.47,-8.91,2.95,3c3.A,1,0,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,A/Maryland/47/2017,2017.69,-0.26,-3.03,-2.03,-1.64,14.38,8.93,17.63,-34.29,5.16,13.31,A1b/135N,3,5,2,0,740
740,A/Maryland/53/2017,2017.74,-0.26,-3.03,-2.02,-1.64,14.38,8.91,17.13,-34.75,5.10,13.53,A1b/135N,3,5,2,0,741
741,A/Missouri/31/2017,2017.80,-0.26,-3.03,-2.03,-1.64,14.34,9.03,17.00,-34.69,5.13,13.39,A1b/135N,3,5,2,0,742
742,A/Michigan/276/2017,2017.61,-0.26,-3.03,-2.02,-1.64,13.34,8.16,17.33,-34.43,4.79,13.01,A1b/135N,3,5,2,0,743


In [95]:
# Join merged_df with the relabelled node_df_ha on strain. Both frames carry a "y"
# column, so the merge suffixes them; the left-hand one is renamed back to "y".
total_df = (
    merged_df
    .merge(node_df_ha, on="strain")
    .rename(columns={"y_x": "y"})
)

In [96]:
# Display the combined table.
total_df

Unnamed: 0,strain,date,y,clade_membership,pca1,pca2,pca3,pca4,pca5,pca6,...,tsne_x_ha,tsne_y_ha,umap_x_ha,umap_y_ha,clade_membership_ha,pca_label_ha,mds_label_ha,umap_label_ha,t-sne_label_ha,y_y
0,A/MUWRP-Uganda/579/2016,2016.67,1,3c3,-3.721439,6.364175,-0.700366,0.598538,-1.075811,-0.049365,...,43.02,-5.57,-8.66,2.51,3c3,2,-1,0,2,1
1,A/Louisiana/17/2017,2017.17,2,3c3.A,-3.438285,7.695400,-0.849551,0.310337,1.076032,0.758395,...,33.79,-8.40,-9.44,2.71,3c3.A,1,-1,0,1,2
2,A/Texas/305/2017,2017.91,3,3c3.A,-2.920179,7.818879,-0.818226,0.331052,1.049173,0.821110,...,33.81,-8.37,-9.59,2.84,3c3.A,1,-1,0,1,3
3,A/Brisbane/1/2017,2017.00,4,3c3.A,11.836096,7.860436,-1.531334,0.276456,0.502111,1.056194,...,32.92,-9.46,-8.97,3.17,3c3.A,1,0,0,1,4
4,A/Colorado/04/2017,2017.06,5,3c3.A,-3.776287,7.395584,-0.751809,-0.342179,0.608877,1.379814,...,32.92,-9.47,-8.91,2.95,3c3.A,1,0,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,A/Maryland/47/2017,2017.69,740,A1b/135N,-3.246497,-1.324287,0.179491,-3.190549,-1.582956,1.355593,...,17.63,-34.29,5.16,13.31,A1b/135N,3,5,2,0,740
740,A/Maryland/53/2017,2017.74,741,A1b/135N,-3.575301,-1.250429,0.108631,-3.317520,-1.752572,1.565829,...,17.13,-34.75,5.10,13.53,A1b/135N,3,5,2,0,741
741,A/Missouri/31/2017,2017.80,742,A1b/135N,-3.250741,-1.330341,0.132081,-3.189558,-1.578110,1.361089,...,17.00,-34.69,5.13,13.39,A1b/135N,3,5,2,0,742
742,A/Michigan/276/2017,2017.61,743,A1b/135N,-3.249422,-1.331240,0.131545,-3.191381,-1.578756,1.357296,...,17.33,-34.43,4.79,13.01,A1b/135N,3,5,2,0,743


In [97]:
# Latest date in the table plus a 0.2 offset.
# NOTE(review): presumably an upper axis bound with padding — confirm where it is used.
total_df["date"].max() + 0.2

2018.17

In [98]:
# Linked brushable charts built from total_df. Each method appears twice: first with
# the unsuffixed columns, then with the "_ha" columns. Columns and axis titles are
# listed pairwise in the same order.
# NOTE(review): the PCA titles read "Expected Variance" while the numbers come from
# explained_variance_PCA_* — confirm the intended wording.
# NOTE(review): domain and range_ are inherited from an earlier cell.
data = linking_tree_with_plots_brush(
    total_df,
    [
        "mds1", "mds2", "mds1_ha", "mds2_ha",
        "tsne_x", "tsne_y", "tsne_x_ha", "tsne_y_ha",
        "pca1", "pca2", "pca1_ha", "pca2_ha",
        "umap_x", "umap_y", "umap_x_ha", "umap_y_ha",
    ],
    [
        "MDS1", "MDS2", "MDS1", "MDS2",
        "TSNE1", "TSNE2", "TSNE1", "TSNE2",
        "PCA1 (Expected Variance : {}%)".format(round(explained_variance_PCA_concatenated[0] * 100, 2)),
        "PCA2 (Expected Variance : {}%)".format(round(explained_variance_PCA_concatenated[1] * 100, 2)),
        "PCA1 (Expected Variance : {}%)".format(round(explained_variance_PCA_ha[0] * 100, 2)),
        "PCA2 (Expected Variance : {}%)".format(round(explained_variance_PCA_ha[1] * 100, 2)),
        "UMAP1", "UMAP2", "UMAP1", "UMAP2",
    ],
    "clade_membership:N",
    ["strain", "clade_membership"],
    domain,
    range_,
)

In [100]:
# Four rows of paired panels, stacked vertically and written out as HTML and PNG.
# NOTE(review): the pairing (data[6]|data[5], data[2]|data[1], ...) presumably puts the
# two variants of each method side by side — confirm against the helper's return order.
chart_embeddings = alt.vconcat(
    data[6] | data[5],
    data[2] | data[1],
    data[4] | data[3],
    data[8] | data[7],
)
chart_embeddings.save("../docs/flu-embeddingsHaNa.html")
save(chart_embeddings, "../docs/flu-embeddingsHaNa.png", scale_factor=2.0)
# Output targets to use instead when the notebook is run through Snakemake:
#chart_embeddings.save(snakemake.output.fullChartHTML)
#save(chart_embeddings, snakemake.output.fullChartPNG, scale_factor=2.0)

## Within- and between-clade Euclidean distances for all embeddings

Use the complete embedding data frame to calculate pairwise Euclidean distances between samples and plot the results in a single figure.

In [123]:
# Genetic-distance KDE inputs: one distance table and one metadata table per dataset.
Genetic_KDE_df_ha, Genetic_KDE_df_concatenated = (
    pd.read_csv(csv_path)
    for csv_path in (
        "results/KDEDensity_genetic_ha.csv",
        "results/KDEDensity_genetic_concatenated.csv",
    )
)
Genetic_KDE_metadata_df_ha, Genetic_KDE_metadata_df_concatenated = (
    pd.read_csv(csv_path)
    for csv_path in (
        "results/KDEDensity_genetic_metadata_ha.csv",
        "results/KDEDensity_genetic_metadata_concatenated.csv",
    )
)

In [124]:
# MDS KDE inputs: one distance table and one metadata table per dataset.
MDS_KDE_df_ha, MDS_KDE_df_concatenated = (
    pd.read_csv(csv_path)
    for csv_path in (
        "results/KDEDensity_mds_ha.csv",
        "results/KDEDensity_mds_concatenated.csv",
    )
)
MDS_KDE_metadata_df_ha, MDS_KDE_metadata_df_concatenated = (
    pd.read_csv(csv_path)
    for csv_path in (
        "results/KDEDensity_mds_metadata_ha.csv",
        "results/KDEDensity_mds_metadata_concatenated.csv",
    )
)

# UMAP KDE inputs, same layout.
UMAP_KDE_df_ha, UMAP_KDE_df_concatenated = (
    pd.read_csv(csv_path)
    for csv_path in (
        "results/KDEDensity_umap_ha.csv",
        "results/KDEDensity_umap_concatenated.csv",
    )
)
UMAP_KDE_metadata_df_ha, UMAP_KDE_metadata_df_concatenated = (
    pd.read_csv(csv_path)
    for csv_path in (
        "results/KDEDensity_umap_metadata_ha.csv",
        "results/KDEDensity_umap_metadata_concatenated.csv",
    )
)

In [125]:
# Display the MDS metadata table for the concatenated dataset.
MDS_KDE_metadata_df_concatenated

Unnamed: 0,MCC,accuracy,median_within,median_between,threshold,embedding,TN,FN,TP,FP
0,0.577,0.87,-1.348,-0.008,-0.838,mds,211893,4531,28469,31503


In [126]:
import matplotlib.gridspec as gridspec

In [127]:
def _plot_kde_panel(ax, kde_df, metadata_df, method, xlabel):
    """Draw one within- vs. between-clade distance density panel.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    kde_df : pandas.DataFrame
        Needs a ``clade_status`` column (1 = same clade, 0 = different clade)
        and a ``scaled_distance`` column.
    metadata_df : pandas.DataFrame
        Its first row supplies ``threshold`` (vertical line) and ``MCC`` (title).
    method : str
        Title prefix, e.g. ``"MDS"``.
    xlabel : str
        Label for the x axis.

    Returns
    -------
    matplotlib.axes.Axes
        The axes returned by seaborn after plotting.
    """
    for status_query, curve_label in (
        ("clade_status == 1", "Same clade"),
        ("clade_status == 0", "Different clade"),
    ):
        ax = sns.kdeplot(kde_df.query(status_query)["scaled_distance"], label=curve_label, ax=ax)
    ax.axvline(x=metadata_df["threshold"].values.tolist()[0], label="SVC threshold", color="#000000", alpha=0.5)
    ax.legend(frameon=False)
    ax.set_title(method + ': MCC Value(' + str(round(metadata_df["MCC"].values.tolist()[0], 2)) + ')')
    ax.set_xlabel(xlabel)
    ax.set_ylabel("KDE density")
    return ax


sns.set_style("ticks")
fig = plt.figure(figsize=(16, 8), constrained_layout=False)
gs = gridspec.GridSpec(2, 3, figure=fig, hspace=0.4, wspace=0.6)

# Top row: HA only. Bottom row: HA and NA. Columns: genetic distance, MDS, UMAP.
ax1 = _plot_kde_panel(
    fig.add_subplot(gs[0, 0]), Genetic_KDE_df_ha, Genetic_KDE_metadata_df_ha,
    "Genetic", "HA-only Scaled Euclidean distance (Genetic)",
)
ax2 = _plot_kde_panel(
    fig.add_subplot(gs[0, 1]), MDS_KDE_df_ha, MDS_KDE_metadata_df_ha,
    "MDS", "HA-only Scaled Euclidean distance (MDS)",
)
ax3 = _plot_kde_panel(
    fig.add_subplot(gs[0, 2]), UMAP_KDE_df_ha, UMAP_KDE_metadata_df_ha,
    "UMAP", "HA-only Scaled Euclidean distance (UMAP)",
)
ax4 = _plot_kde_panel(
    fig.add_subplot(gs[1, 0]), Genetic_KDE_df_concatenated, Genetic_KDE_metadata_df_concatenated,
    "Genetic", "HA and NA Scaled Euclidean distance (Genetic)",
)
ax5 = _plot_kde_panel(
    fig.add_subplot(gs[1, 1]), MDS_KDE_df_concatenated, MDS_KDE_metadata_df_concatenated,
    "MDS", "HA and NA Scaled Euclidean distance (MDS)",
)
ax6 = _plot_kde_panel(
    fig.add_subplot(gs[1, 2]), UMAP_KDE_df_concatenated, UMAP_KDE_metadata_df_concatenated,
    "UMAP", "HA and NA Scaled Euclidean distance (UMAP)",
)

sns.despine()
plt.tight_layout()
#plt.savefig(snakemake.output.KDE_density)
plt.savefig("../docs/FinalKDEPlotHACONCAT.png", dpi=600, bbox_inches='tight')



In [None]:
# Five KDE panels for the concatenated dataset on a 2x4 grid: Genetic, PCA and MDS
# in the top row, TSNE and UMAP in the bottom row (columns 1 and 2).
# NOTE(review): the PCA_KDE_* and TSNE_KDE_* frames are not loaded in the nearby
# cells — confirm they are defined earlier in the notebook before running this.
fig = plt.figure(figsize=(16, 8), constrained_layout=False)
gs = gridspec.GridSpec(2, 4, figure=fig, hspace=0.4, wspace=0.6)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])
ax3 = fig.add_subplot(gs[0, 2])
ax4 = fig.add_subplot(gs[1, 1])
ax5 = fig.add_subplot(gs[1, 2])

for panel_ax, panel_kde_df, panel_metadata_df, panel_method in (
    (ax1, Genetic_KDE_df_concatenated, Genetic_KDE_metadata_df_concatenated, "Genetic"),
    (ax2, PCA_KDE_df_concatenated, PCA_KDE_metadata_df_concatenated, "PCA"),
    (ax3, MDS_KDE_df_concatenated, MDS_KDE_metadata_df_concatenated, "MDS"),
    (ax4, TSNE_KDE_df_concatenated, TSNE_KDE_metadata_df_concatenated, "TSNE"),
    (ax5, UMAP_KDE_df_concatenated, UMAP_KDE_metadata_df_concatenated, "UMAP"),
):
    # Same-clade pairs first, then different-clade pairs, then the threshold line.
    sns.kdeplot(
        panel_kde_df.query("clade_status == 1")["scaled_distance"],
        label="Same clade",
        ax=panel_ax,
    )
    sns.kdeplot(
        panel_kde_df.query("clade_status == 0")["scaled_distance"],
        label="Different clade",
        ax=panel_ax,
    )
    panel_ax.axvline(
        x=panel_metadata_df["threshold"].values.tolist()[0],
        label="SVC threshold",
        color="#000000",
        alpha=0.5,
    )
    panel_ax.legend(frameon=False)
    panel_ax.set_title(panel_method)
    panel_ax.set_xlabel("Scaled Euclidean distance (" + panel_method + ")")
    panel_ax.set_ylabel("KDE density")

fig.suptitle("Total KDE Plot", fontsize=16)
sns.despine()
plt.savefig("../docs/FinalCONCATKDEPlot.png", dpi=600, bbox_inches="tight")

# Procrustes Analysis