# These are the Cartography visuals.

# Imports Section 

In [2]:
# Put the project's script directory on the import path (presumably where the
# `Helpers` module imported in the next cell lives).
import sys
sys.path.append("../notebooks/scripts/")

In [3]:
import altair as alt
from altair_saver import save
from augur.utils import json_to_tree
import json
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import re
#from reportlab.graphics import renderPDF
import seaborn as sns
#from svglib.svglib import svg2rlg

from Helpers import linking_tree_with_plots_clickable, linking_tree_with_plots_brush, scatterplot_with_tooltip_interactive
from Helpers import get_y_positions, get_euclidean_data_frame

#%matplotlib inline

In [4]:
# Set the padding embed option used by the Altair renderer for displayed charts.
alt.renderers.set_embed_options(
    padding=dict(left=0, right=0, bottom=1, top=1)
)

RendererRegistry.enable('default')

In [4]:
# Global plot styling: seaborn "ticks" theme first, then matplotlib defaults
# applied in one update so all tunable values sit in a single place.
sns.set_style("ticks")
mpl.rcParams.update({
    # Disable top and right spines.
    "axes.spines.top": False,
    "axes.spines.right": False,
    # Display and save figures at higher resolution for presentations and manuscripts.
    "savefig.dpi": 300,
    "figure.dpi": 100,
    # Display text at sizes large enough for presentations and manuscripts.
    "font.weight": "normal",
    "axes.labelweight": "normal",
    "font.size": 10,
    "axes.labelsize": 10,
    "legend.fontsize": 8,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "axes.titlesize": 8,
    # Render text without LaTeX.
    "text.usetex": False,
})

In [5]:
# Optional headless-Chrome setup, attempted only when the notebook is executed
# by Snakemake (detected by whether the global `snakemake` object exists).
# NOTE(review): `browser` is presumably used for PNG export of the Altair
# charts -- confirm against the save() calls further down.
try:
    # Raises NameError outside Snakemake, which skips the browser setup below.
    snakemake.input.node_df
    import selenium
    from selenium.webdriver import Chrome
    from selenium import webdriver

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument("--remote-debugging-port=9222")

    browser = webdriver.Chrome(options=chrome_options)
except NameError:
    # `snakemake` is not defined: interactive run, nothing to set up.
    print("not in Snakemake, imports unnecessary")
except Exception as error:
    # Stay best-effort (a missing selenium/Chrome must not stop the notebook),
    # but report the real cause instead of claiming we are outside Snakemake.
    # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
    print("headless Chrome setup skipped: {!r}".format(error))

not in Snakemake, imports unnecessary


## Pathogen-specific variables

Consider consolidating these into a single configuration file that can be passed to the notebook as a command line argument for more scriptable generation of these figures.

# Flu Specific Variables

In [6]:
# Collect the input file paths supplied by Snakemake, when the notebook is run
# by Snakemake. Each name is bound to the path object itself: a trailing comma
# after the right-hand side would turn every value into a 1-tuple, which cannot
# be passed to pd.read_csv.
# NOTE(review): the visible cells below read hard-coded "results/..." paths and
# rebind `node_df_ha` to a DataFrame, so these values are currently unused --
# confirm whether they should replace the hard-coded paths.
try:
    node_df_ha = snakemake.input.node_df_ha

    pca_df_ha = snakemake.input.pca_df_ha
    explained_variance_pca_ha = snakemake.input.explained_variance_pca_ha

    pca_df_concatenated = snakemake.input.pca_df_concatenated
    explained_variance_pca_concatenated = snakemake.input.explained_variance_pca_concatenated

    mds_df_ha = snakemake.input.mds_df_ha
    mds_df_concatenated = snakemake.input.mds_df_concatenated
    mds_df_ma_concatenated = snakemake.input.mds_df_ma_concatenated

    tsne_df_ha = snakemake.input.tsne_df_ha
    tsne_df_concatenated = snakemake.input.tsne_df_concatenated
    tsne_df_ma_concatenated = snakemake.input.tsne_df_ma_concatenated

    umap_df_ha = snakemake.input.umap_df_ha
    umap_df_ma_concatenated = snakemake.input.umap_df_ma_concatenated

except NameError:
    # `snakemake` is not defined: interactive run.
    print("not in Snakemake, imports unnecessary")
except AttributeError as error:
    # Running under Snakemake but an expected input is not declared; keep the
    # names assigned so far and report which lookup failed.
    print("Snakemake input missing: {}".format(error))

not in Snakemake, imports unnecessary


# Reading in all the data from the scripts

In [7]:
# Show the current working directory; the relative "results/..." and
# "../notebooks/..." paths used in this notebook are resolved against it.
import os

os.getcwd()

'/mnt/c/Work/BedfordProjects/cartography/ha-na-nextstrain'

In [8]:
# Colour-scheme table, read without a header; columns are named 0..100.
# Later cells pick one row of this table by the number of labels to colour.
colors = pd.read_csv(
    "../notebooks/config/color_schemes.tsv",
    sep="\t",
    names=list(range(101)),
)

In [9]:
# Per-strain HA table. The path is hard-coded; the Snakemake-provided input
# path is not used here.
node_df_ha = pd.read_csv(
    "results/table_ha.tsv",
    sep="\t",
)

In [10]:
# Rename the columns to the shorter names ("date", "y") that later cells select.
node_df_ha.rename(
    columns=dict(num_date="date", y_value="y"),
    inplace=True,
)

In [11]:
# Preview the first five rows of the renamed table.
node_df_ha.head(5)

Unnamed: 0,strain,date,pca1,pca2,pca3,pca4,mds1,mds2,tsne_x,tsne_y,umap_x,umap_y,clade_membership,pca_label,mds_label,umap_label,t-sne_label,y
0,A/MUWRP-Uganda/579/2016,2016.67,5.01,1.2,-0.43,0.12,-27.31,-28.94,43.02,-5.57,-8.66,2.51,3c3,2,-1,0,2,1
1,A/Louisiana/17/2017,2017.17,6.63,1.07,1.18,-1.24,18.08,-41.19,33.79,-8.4,-9.44,2.71,3c3.A,1,-1,0,1,2
2,A/Texas/305/2017,2017.91,6.63,1.08,1.13,-1.37,17.23,-39.21,33.81,-8.37,-9.59,2.84,3c3.A,1,-1,0,1,3
3,A/Brisbane/1/2017,2017.0,6.7,1.14,1.02,-1.32,12.27,-35.29,32.92,-9.46,-8.97,3.17,3c3.A,1,0,0,1,4
4,A/Colorado/04/2017,2017.06,6.54,0.54,1.06,-2.08,13.21,-33.79,32.92,-9.47,-8.91,2.95,3c3.A,1,0,0,1,5


# HDBSCAN project:
- Cluster on HA and find the MCC value; do the same for HA+NA (from the cluster_results script)
- Check whether the HA+NA MCC is greater than the HA-only MCC

## HDBSCAN clustering on MDS

In [30]:
# Load the MDS embeddings for HA, HA + NA ("concatenated") and
# HA + NA + MA ("ma_concatenated"); the first CSV column becomes the index.
MDS_df_ha, MDS_df_concatenated, MDS_df_ma_concatenated = (
    pd.read_csv(embedding_path, index_col=0)
    for embedding_path in (
        "results/embed_mds_ha.csv",
        "results/embed_mds_concatenated.csv",
        "results/embed_mds_ma_concatenated.csv",
    )
)

In [31]:
# Attach date, tree y-position and clade membership from the HA node table to
# each MDS embedding, joining on "strain".
merged_mds_df_ha, merged_mds_df_concatenated, merged_mds_df_ma_concatenated = (
    embedding_df.merge(
        node_df_ha[["strain", "date", "y", "clade_membership"]],
        on="strain",
    )
    for embedding_df in (MDS_df_ha, MDS_df_concatenated, MDS_df_ma_concatenated)
)

In [32]:
# Clade-based reference table built from the HA MDS embedding; later cells pass
# its `clade_status` column as the first argument of the MCC / confusion matrix.
# NOTE(review): embedding="method" here differs from embedding="mds" used for
# the cluster-label tables below -- confirm this is intended.
KDE_df_normal = get_euclidean_data_frame(
    sampled_df=merged_mds_df_ha,
    column_for_analysis="clade_membership",
    embedding="method",
    column_list=['mds1', 'mds2'],
)

In [38]:
# Interactive HA MDS scatterplot coloured by cluster label.
# The colour range is the row of `colors` at position len(domain) - 1, with
# empty columns dropped.
domain = merged_mds_df_ha["mds_label"].drop_duplicates().to_numpy()
range_ = colors.iloc[len(domain) - 1:len(domain)].dropna(axis=1).values.tolist()[0]
chart_12_mds = scatterplot_with_tooltip_interactive(
    merged_mds_df_ha,
    'mds1',
    'mds2',
    "mds1",
    "mds2",
    ['strain', 'clade_membership'],
    'mds_label:N',
    domain,
    range_,
)
chart_12_mds

In [39]:
from sklearn.metrics import confusion_matrix, matthews_corrcoef

In [40]:
# HA only: compare clade-based status (KDE_df_normal) with cluster-label-based
# status via a confusion matrix and the Matthews correlation coefficient.
KDE_df_cluster = get_euclidean_data_frame(
    sampled_df=merged_mds_df_ha[["mds1", "mds2", "strain", "mds_label"]],
    column_for_analysis="mds_label",
    embedding="mds",
    column_list=["mds1", "mds2"],
)
confusion_matrix_val_ha = confusion_matrix(
    y_true=KDE_df_normal["clade_status"],
    y_pred=KDE_df_cluster["clade_status"],
)
matthews_cc_val_ha = matthews_corrcoef(
    y_true=KDE_df_normal["clade_status"],
    y_pred=KDE_df_cluster["clade_status"],
)

In [41]:
# HA + NA: cluster labels from the concatenated MDS embedding, scored against
# the clade-based status in KDE_df_normal.
# NOTE(review): KDE_df_normal was built from the HA frame; this assumes both
# tables have the same rows in the same order -- confirm.
KDE_df_cluster = get_euclidean_data_frame(
    sampled_df=merged_mds_df_concatenated[["mds1", "mds2", "strain", "mds_label"]],
    column_for_analysis="mds_label",
    embedding="mds",
    column_list=["mds1", "mds2"],
)
confusion_matrix_val_concatenated = confusion_matrix(
    y_true=KDE_df_normal["clade_status"],
    y_pred=KDE_df_cluster["clade_status"],
)
matthews_cc_val_concatenated = matthews_corrcoef(
    y_true=KDE_df_normal["clade_status"],
    y_pred=KDE_df_cluster["clade_status"],
)

In [42]:
# HA + NA + MA: cluster labels from the three-segment MDS embedding, scored
# against the clade-based status in KDE_df_normal.
# NOTE(review): KDE_df_normal was built from the HA frame; this assumes both
# tables have the same rows in the same order -- confirm.
KDE_df_cluster = get_euclidean_data_frame(
    sampled_df=merged_mds_df_ma_concatenated[["mds1", "mds2", "strain", "mds_label"]],
    column_for_analysis="mds_label",
    embedding="mds",
    column_list=["mds1", "mds2"],
)
confusion_matrix_val_ma_concatenated = confusion_matrix(
    y_true=KDE_df_normal["clade_status"],
    y_pred=KDE_df_cluster["clade_status"],
)
matthews_cc_val_ma_concatenated = matthews_corrcoef(
    y_true=KDE_df_normal["clade_status"],
    y_pred=KDE_df_cluster["clade_status"],
)

In [43]:
# Assemble the brushable MDS figure: one row per segment set (HA, HA + NA,
# HA + NA + MA). Each row joins the two charts returned by
# linking_tree_with_plots_brush; the second chart's title reports the MCC
# computed in the preceding cells.
#
# Colours: label -1 (when present) is drawn in grey ("#999999"); the remaining
# colours come from one row of `colors`, chosen by the number of labels.
# NOTE(review): the two branches index different rows of `colors`
# (len(domain)-1 vs len(domain)) -- confirm against color_schemes.tsv.

# HA only.
domain = sorted(merged_mds_df_ha["mds_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(merged_mds_df_ha, ['mds1', 'mds2'], ["MDS1", "MDS2"], 'mds_label:N', ['strain', 'clade_membership'], domain, range_)
chart_ha = list_of_chart_ha[0] | list_of_chart_ha[1].properties(title="HA only MCC: " + str(round(matthews_cc_val_ha, 4)))

# HA + NA.
domain = sorted(merged_mds_df_concatenated["mds_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(merged_mds_df_concatenated, ['mds1', 'mds2'], ["MDS1", "MDS2"], 'mds_label:N', ['strain', 'clade_membership'], domain, range_)
chart_concat = list_of_chart_concatenated[0] | list_of_chart_concatenated[1].properties(title="HA + NA MCC: " + str(round(matthews_cc_val_concatenated, 4)))

# HA + NA + MA. The label domain must come from this data frame, not from the
# HA + NA one above, or the colour scale will not match the plotted labels.
domain = sorted(merged_mds_df_ma_concatenated["mds_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_ma_concatenated = linking_tree_with_plots_brush(merged_mds_df_ma_concatenated, ['mds1', 'mds2'], ["MDS1", "MDS2"], 'mds_label:N', ['strain', 'clade_membership'], domain, range_)
chart_concat_ma = list_of_chart_ma_concatenated[0] | list_of_chart_ma_concatenated[1].properties(title="HA + NA + MA MCC: " + str(round(matthews_cc_val_ma_concatenated, 4)))

# Stack the three rows; each keeps its own colour scale because the label sets differ.
final_chart = alt.vconcat(chart_ha, chart_concat, chart_concat_ma).resolve_scale(color='independent')
save(final_chart, "../docs/HANAMAFullChartBrushableMDS.html")
save(final_chart, "../docs/HANAMAFullChartBrushableMDS.png", scale_factor=2.0)

## HDBSCAN clustering on t-SNE 

In [5]:
# Load the t-SNE embeddings for HA, HA + NA ("concatenated") and
# HA + NA + MA ("ma_concatenated"); the first CSV column becomes the index.
TSNE_df_ha, TSNE_df_concatenated, TSNE_df_ma_concatenated = (
    pd.read_csv(embedding_path, index_col=0)
    for embedding_path in (
        "results/embed_t-sne_ha.csv",
        "results/embed_t-sne_concatenated.csv",
        "results/embed_t-sne_ma_concatenated.csv",
    )
)

In [6]:
# Attach date, tree y-position and clade membership from the HA node table to
# each t-SNE embedding, joining on "strain". Requires `node_df_ha` from the
# data-loading cells near the top of the notebook.
merged_tsne_df_ha, merged_tsne_df_concatenated, merged_tsne_df_ma_concatenated = (
    embedding_df.merge(
        node_df_ha[["strain", "date", "y", "clade_membership"]],
        on="strain",
    )
    for embedding_df in (TSNE_df_ha, TSNE_df_concatenated, TSNE_df_ma_concatenated)
)

NameError: name 'node_df_ha' is not defined

In [48]:
# Clade-based reference table built from the HA t-SNE embedding; later cells
# pass its `clade_status` column as the first argument of the MCC / confusion matrix.
# NOTE(review): embedding="method" here differs from embedding="tsne" used for
# the cluster-label tables below -- confirm this is intended.
KDE_df_normal = get_euclidean_data_frame(
    sampled_df=merged_tsne_df_ha,
    column_for_analysis="clade_membership",
    embedding="method",
    column_list=['tsne_x', 'tsne_y'],
)

In [54]:
# Interactive HA t-SNE scatterplot coloured by cluster label.
# The colour range is the row of `colors` at position len(domain) - 1, with
# empty columns dropped.
domain = merged_tsne_df_ha["t-sne_label"].drop_duplicates().to_numpy()
range_ = colors.iloc[len(domain) - 1:len(domain)].dropna(axis=1).values.tolist()[0]
chart_12_tsne = scatterplot_with_tooltip_interactive(
    merged_tsne_df_ha,
    'tsne_x',
    'tsne_y',
    "tsne_x",
    "tsne_y",
    ['strain', 'clade_membership'],
    't-sne_label:N',
    domain,
    range_,
)
chart_12_tsne

In [55]:
from sklearn.metrics import confusion_matrix, matthews_corrcoef

In [56]:
# HA only: compare clade-based status (KDE_df_normal) with t-SNE cluster-label
# status via a confusion matrix and the Matthews correlation coefficient.
KDE_df_cluster = get_euclidean_data_frame(
    sampled_df=merged_tsne_df_ha[["tsne_x", "tsne_y", "strain", "t-sne_label"]],
    column_for_analysis="t-sne_label",
    embedding="tsne",
    column_list=["tsne_x", "tsne_y"],
)
confusion_matrix_val_ha = confusion_matrix(
    y_true=KDE_df_normal["clade_status"],
    y_pred=KDE_df_cluster["clade_status"],
)
matthews_cc_val_ha = matthews_corrcoef(
    y_true=KDE_df_normal["clade_status"],
    y_pred=KDE_df_cluster["clade_status"],
)

In [57]:
# HA + NA: cluster labels from the concatenated t-SNE embedding, scored against
# the clade-based status in KDE_df_normal.
# NOTE(review): KDE_df_normal was built from the HA frame; this assumes both
# tables have the same rows in the same order -- confirm.
KDE_df_cluster = get_euclidean_data_frame(
    sampled_df=merged_tsne_df_concatenated[["tsne_x", "tsne_y", "strain", "t-sne_label"]],
    column_for_analysis="t-sne_label",
    embedding="tsne",
    column_list=["tsne_x", "tsne_y"],
)
confusion_matrix_val_concatenated = confusion_matrix(
    y_true=KDE_df_normal["clade_status"],
    y_pred=KDE_df_cluster["clade_status"],
)
matthews_cc_val_concatenated = matthews_corrcoef(
    y_true=KDE_df_normal["clade_status"],
    y_pred=KDE_df_cluster["clade_status"],
)

In [58]:
# HA + NA + MA: cluster labels from the three-segment t-SNE embedding, scored
# against the clade-based status in KDE_df_normal.
# NOTE(review): KDE_df_normal was built from the HA frame; this assumes both
# tables have the same rows in the same order -- confirm.
KDE_df_cluster = get_euclidean_data_frame(
    sampled_df=merged_tsne_df_ma_concatenated[["tsne_x", "tsne_y", "strain", "t-sne_label"]],
    column_for_analysis="t-sne_label",
    embedding="tsne",
    column_list=["tsne_x", "tsne_y"],
)
confusion_matrix_val_ma_concatenated = confusion_matrix(
    y_true=KDE_df_normal["clade_status"],
    y_pred=KDE_df_cluster["clade_status"],
)
matthews_cc_val_ma_concatenated = matthews_corrcoef(
    y_true=KDE_df_normal["clade_status"],
    y_pred=KDE_df_cluster["clade_status"],
)

In [59]:
# Assemble the brushable t-SNE figure: one row per segment set (HA, HA + NA,
# HA + NA + MA). Each row joins the two charts returned by
# linking_tree_with_plots_brush; the second chart's title reports the MCC
# computed in the preceding cells.
#
# Colours: label -1 (when present) is drawn in grey ("#999999"); the remaining
# colours come from one row of `colors`, chosen by the number of labels.
# NOTE(review): the two branches index different rows of `colors`
# (len(domain)-1 vs len(domain)) -- confirm against color_schemes.tsv.
# NOTE(review): the third argument is taken to be the axis titles (the MDS
# figure passes "MDS1"/"MDS2" there), so it is set to t-SNE titles here.

# HA only.
domain = sorted(merged_tsne_df_ha["t-sne_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(merged_tsne_df_ha, ['tsne_x', 'tsne_y'], ["t-SNE1", "t-SNE2"], 't-sne_label:N', ['strain', 'clade_membership'], domain, range_)
chart_ha = list_of_chart_ha[0] | list_of_chart_ha[1].properties(title="HA only MCC: " + str(round(matthews_cc_val_ha, 4)))

# HA + NA.
domain = sorted(merged_tsne_df_concatenated["t-sne_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(merged_tsne_df_concatenated, ['tsne_x', 'tsne_y'], ["t-SNE1", "t-SNE2"], 't-sne_label:N', ['strain', 'clade_membership'], domain, range_)
chart_concat = list_of_chart_concatenated[0] | list_of_chart_concatenated[1].properties(title="HA + NA MCC: " + str(round(matthews_cc_val_concatenated, 4)))

# HA + NA + MA. The label domain must come from this data frame, not from the
# HA + NA one above, or the colour scale will not match the plotted labels.
domain = sorted(merged_tsne_df_ma_concatenated["t-sne_label"].drop_duplicates().values)
if -1 in domain:
    range_ = ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
else:
    range_ = colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_ma_concatenated = linking_tree_with_plots_brush(merged_tsne_df_ma_concatenated, ['tsne_x', 'tsne_y'], ["t-SNE1", "t-SNE2"], 't-sne_label:N', ['strain', 'clade_membership'], domain, range_)
chart_concat_ma = list_of_chart_ma_concatenated[0] | list_of_chart_ma_concatenated[1].properties(title="HA + NA + MA MCC: " + str(round(matthews_cc_val_ma_concatenated, 4)))

# Stack the three rows; each keeps its own colour scale because the label sets differ.
final_chart = alt.vconcat(chart_ha, chart_concat, chart_concat_ma).resolve_scale(color='independent')
save(final_chart, "../docs/HANAMAFullChartBrushableTSNE.html")
save(final_chart, "../docs/HANAMAFullChartBrushableTSNE.png", scale_factor=2.0)

# Running T-SNE on the Dataset 

In [67]:
# Load the HA and HA + NA ("concatenated") t-SNE embeddings; the first CSV
# column becomes the index.
TSNE_df_ha, TSNE_df_concatenated = (
    pd.read_csv(embedding_path, index_col=0)
    for embedding_path in (
        "results/embed_t-sne_ha.csv",
        "results/embed_t-sne_concatenated.csv",
    )
)

In [68]:
# Attach date, tree y-position and clade membership from the HA node table to
# both t-SNE embeddings, joining on "strain".
merged_tsne_df_ha, merged_tsne_df_concatenated = (
    embedding_df.merge(
        node_df_ha[["strain", "date", "y", "clade_membership"]],
        on="strain",
    )
    for embedding_df in (TSNE_df_ha, TSNE_df_concatenated)
)

In [70]:
# HA t-SNE scatterplot coloured by clade membership.
# The colour range is one row of `colors` (empty columns dropped), with grey
# prepended when -1 is among the domain values.
domain = sorted(merged_tsne_df_ha["clade_membership"].drop_duplicates().values)
range_ = (
    ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
    if -1 in domain
    else colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
)
scatterplot_with_tooltip_interactive(
    merged_tsne_df_ha,
    'tsne_x',
    'tsne_y',
    'tsne_x',
    'tsne_y',
    ['strain', "clade_membership"],
    'clade_membership:N',
    domain,
    range_,
)

In [72]:
# HA + NA t-SNE scatterplot coloured by clade membership.
# The colour range is one row of `colors` (empty columns dropped), with grey
# prepended when -1 is among the domain values.
domain = sorted(merged_tsne_df_concatenated["clade_membership"].drop_duplicates().values)
range_ = (
    ["#999999"] + colors[len(domain)-1:len(domain)].dropna(axis=1).values.tolist()[0]
    if -1 in domain
    else colors[len(domain):len(domain)+1].dropna(axis=1).values.tolist()[0]
)
scatterplot_with_tooltip_interactive(
    merged_tsne_df_concatenated,
    'tsne_x',
    'tsne_y',
    'tsne_x',
    'tsne_y',
    ['strain', "clade_membership"],
    'clade_membership:N',
    domain,
    range_,
)

In [73]:
# Brushable tree + t-SNE panel for HA, coloured by clade membership.
# The colour domain and range are derived from the HA data frame in this cell
# (under separate names) so the chart does not depend on whichever cell last
# assigned the shared `domain` / `range_` variables.
domain_ha = sorted(merged_tsne_df_ha["clade_membership"].drop_duplicates().values)
if -1 in domain_ha:
    range_ha = ["#999999"] + colors[len(domain_ha)-1:len(domain_ha)].dropna(axis=1).values.tolist()[0]
else:
    range_ha = colors[len(domain_ha):len(domain_ha)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_ha = linking_tree_with_plots_brush(
    merged_tsne_df_ha,
    ['tsne_x','tsne_y'],
    ['tsne_x','tsne_y'],
    'clade_membership:N',
    ["strain:N", "clade_membership:N"],
    domain_ha,
    range_ha
)
# Place the two returned charts side by side and display the result.
chart_tsne_ha = list_of_chart_ha[0]|list_of_chart_ha[1]
chart_tsne_ha

In [74]:
# Brushable tree + t-SNE panel for HA + NA, coloured by clade membership.
# The colour domain and range are derived from the concatenated data frame in
# this cell (under separate names) so the chart does not depend on whichever
# cell last assigned the shared `domain` / `range_` variables.
domain_concatenated = sorted(merged_tsne_df_concatenated["clade_membership"].drop_duplicates().values)
if -1 in domain_concatenated:
    range_concatenated = ["#999999"] + colors[len(domain_concatenated)-1:len(domain_concatenated)].dropna(axis=1).values.tolist()[0]
else:
    range_concatenated = colors[len(domain_concatenated):len(domain_concatenated)+1].dropna(axis=1).values.tolist()[0]
list_of_chart_concatenated = linking_tree_with_plots_brush(
    merged_tsne_df_concatenated,
    ['tsne_x','tsne_y'],
    ['tsne_x','tsne_y'],
    'clade_membership:N',
    ["strain:N", "clade_membership:N"],
    domain_concatenated,
    range_concatenated
)
# Place the two returned charts side by side and display the result.
chart_tsne_concatenated = list_of_chart_concatenated[0]|list_of_chart_concatenated[1]
chart_tsne_concatenated

In [None]:
# Stack the HA panel above the HA + NA panel.
alt.vconcat(chart_tsne_ha, chart_tsne_concatenated)

# Running UMAP on the Dataset

In [75]:
# Load the HA and HA + NA ("concatenated") UMAP embeddings; the first CSV
# column becomes the index.
UMAP_df_ha, UMAP_df_concatenated = (
    pd.read_csv(embedding_path, index_col=0)
    for embedding_path in (
        "results/embed_umap_ha.csv",
        "results/embed_umap_concatenated.csv",
    )
)

In [76]:
# Display the HA + NA UMAP embedding table for inspection.
UMAP_df_concatenated

Unnamed: 0_level_0,umap_x,umap_y,umap_label,umap_label_default
strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A/Santiago/op004d0/2017,10.823834,2.324666,1,4
A/Keelung/0089/2016,10.146812,2.419663,1,4
A/Linkou/0187/2016,9.969683,4.434700,1,4
A/Linkou/0192/2017,10.187099,3.684332,1,4
A/Taipei/0122/2017,9.470983,4.036418,1,4
...,...,...,...,...
A/Baltimore/P0222/2017,-2.873133,16.333400,0,0
A/Baltimore/0295/2017,-1.949247,16.234972,0,0
A/Baltimore/0244/2017,-2.276079,16.466290,0,0
A/Rochester/U040/2017,-2.317351,16.291090,0,0


In [77]:
# Attach date, tree y-position and clade membership from the HA node table to
# both UMAP embeddings, joining on "strain".
merged_umap_df_ha, merged_umap_df_concatenated = (
    embedding_df.merge(
        node_df_ha[["strain", "date", "y", "clade_membership"]],
        on="strain",
    )
    for embedding_df in (UMAP_df_ha, UMAP_df_concatenated)
)

In [78]:
# Check that both UMAP tables list the same index values in the same order.
UMAP_df_ha.index.tolist() == UMAP_df_concatenated.index.tolist()

True