# This is Cartography put together into one place. 

## Modules to add to run this code:

- [BioPython][1]
- [Pandas][2]
- [Numpy][3]
- [Altair][4]
- [Seaborn][5]
- [Scikit-Learn][6]
- [UMAP][7]
- json
- nextstrain-augur
- statsmodels
[1]:https://biopython.org/wiki/Download
[2]:https://pandas.pydata.org/pandas-docs/version/0.23.3/install.html
[3]:https://docs.scipy.org/doc/numpy/user/quickstart.html
[4]:https://altair-viz.github.io/getting_started/installation.html
[5]:https://seaborn.pydata.org/installing.html
[6]:https://scikit-learn.org/stable/install.html
[7]:https://umap-learn.readthedocs.io/en/latest/


# Imports Section 

In [None]:
import pandas as pd
import altair as alt
import numpy as np
from scipy.spatial.distance import squareform, pdist
from Bio import SeqIO
import seaborn as sns
import re
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from augur.utils import json_to_tree
import json
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
import umap
from scipy.stats import linregress
from pathlib import Path
import statsmodels
import statistics
import matplotlib.pyplot as plt
from Helpers import get_euclidean_data_frame, get_hamming_distances, linking_tree_with_plots_brush
from Helpers import linking_tree_with_plots_clickable
from Helpers import scatterplot_xyvalues, scatterplot_tooltips, scatterplot_with_tooltip_interactive
import selenium
from selenium.webdriver import Chrome 
from svglib.svglib import svg2rlg
from reportlab.graphics import renderPDF
%matplotlib inline

In [None]:
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--remote-debugging-port=9222")

browser = webdriver.Chrome(options=chrome_options)

## Pathogen-specific variables

Consider consolidating these into a single configuration file that can be passed to the notebook as a command line argument for more scriptable generation of these figures.

# Flu Specific Variables

In [None]:
# Read the workflow inputs supplied by Snakemake:
#   input[0]      alignment parsed later as FASTA
#   input[1]      one strain name per line; these strains are excluded
#   input[2]      tree JSON
#   input[-2]     (only when more than four inputs) clades to highlight, one per line
#   input[-1]     text file whose full contents become `virus_name`
# Every file is opened in a `with` block so its handle is closed right after reading.
path = snakemake.input[0]
with open(snakemake.input[1], "r") as dropped_strains_file:
    dropped_strains = [line.rstrip('\n') for line in dropped_strains_file]
tree_path = snakemake.input[2]
# `clades_to_plot` is intentionally left undefined when no clades file is given.
if len(snakemake.input) > 4:
    with open(snakemake.input[len(snakemake.input)-2], "r") as clades_file:
        clades_to_plot = [line.rstrip('\n') for line in clades_file]
# NOTE(review): the name is used verbatim in output file names; a trailing
# newline in this file would end up in those names -- confirm the file has none.
with open(snakemake.input[len(snakemake.input)-1], "r") as virus_name_file:
    virus_name = virus_name_file.read()

# Zika Specific Variables 

### Reading in the Fasta File
- I used BioPython to parse the FASTA file into two Python lists: `genomes` and `strains`.

In [None]:
# Read the alignment and keep every record whose id is not in `dropped_strains`.
# `strains` and `genomes` are parallel lists: strains[i] names genomes[i].
# The exclusion list is copied into a set once so each membership test is O(1)
# instead of a scan of the whole list per record.
excluded_strains = set(dropped_strains)
strains = []
genomes = []
for record in SeqIO.parse(path, "fasta"):
    if record.id not in excluded_strains:
        strains.append(str(record.id))
        genomes.append(str(record.seq))

#### Checking that the file I picked is an aligned FASTA file and is the file I wanted

In [None]:
len(strains)
print(len(genomes[0]) == len(genomes[1]))
print(len(genomes))

In [None]:
strains[:5]

In [None]:
genomes_df = pd.DataFrame(genomes)
strains_df = pd.DataFrame(strains)

In [None]:
# For each genome, count the characters that are not exactly A, G, C or T
# (the match is case-sensitive, so gaps, N and lowercase letters all count).
non_acgt_pattern = re.compile(r'[^AGCT]')
genomes_missing_bases = [
    len(non_acgt_pattern.findall(sequence)) for sequence in genomes
]

In [None]:
genomes_missing_bases_df = pd.DataFrame(genomes_missing_bases)
genomes_missing_bases_df = genomes_missing_bases_df.merge(strains_df, how='outer', left_index = True, right_index = True)
genomes_missing_bases_df.columns = ["bases missing", "strain"]

# Missing Bases

In [None]:
genomes_missing_bases_df.head()

In [None]:
alt.Chart(genomes_missing_bases_df).mark_bar().encode(
    alt.X("bases missing:Q", bin=True),
    y="count()"
)

In [None]:
# Also, consider storing the list of dropped strains in a text file outside of the notebook.
dropped_strains.extend(list(genomes_missing_bases_df[genomes_missing_bases_df["bases missing"]>1000]["strain"]))

In [None]:
# Re-read the alignment now that `dropped_strains` may have been extended,
# keeping every record whose id is not in it.
# `strains` and `genomes` are parallel lists: strains[i] names genomes[i].
# The exclusion list is copied into a set once so each membership test is O(1)
# instead of a scan of the whole list per record.
excluded_strains = set(dropped_strains)
strains = []
genomes = []
for record in SeqIO.parse(path, "fasta"):
    if record.id not in excluded_strains:
        strains.append(str(record.id))
        genomes.append(str(record.seq))

In [None]:
len(strains)
print(len(genomes[0]) == len(genomes[1]))
print(len(genomes))

In [None]:
strains[:5]

In [None]:
genomes_df = pd.DataFrame(genomes)
strains_df = pd.DataFrame(strains)
genomes_df.columns = ["strain"]

# Creating the Distance Matrix
- I used Hamming Distance to find the pairwise distance between each genome and each other genome, effectively creating a similarity/distance matrix
    - In my Hamming Distance method, I only counted a site as a difference if it was a mismatch between nucleotides (A, G, C, or T), not gaps (counting gaps was throwing off the algorithm too much for smaller strains)
- I then used Seaborn to generate a heatmap to make sure the matrix looked correct

In [None]:
%%time
# Compute the pairwise distance matrix from scratch on every run.
# Caching to / loading from a CSV (`distance_matrix_file`) is currently disabled;
# the disabled code previously lived here inside a string literal and was never run.

# Calculate Hamming distances.
hamming_distances = get_hamming_distances(genomes)

# Convert distinct pairwise distances into the more redundant but more interpretable
# square matrix, then label rows and columns with strain names. Despite the
# variable name, the values are distances, not similarities.
similarity_matrix = pd.DataFrame(
    squareform(hamming_distances),
    columns=strains,
    index=strains
)

# To re-enable caching, write the frame out with its index so the strain names
# are immediately available on read:
#similarity_matrix.to_csv(distance_matrix_file)
print("done!")

In [None]:
similarity_matrix

In [None]:
sns.heatmap(similarity_matrix)

# Creating the Phylogenetic Tree in Altair
- I used Altair to make this tree (documentation linked [here][1])
- I opened and imported the json from a build from NextStrain ([flu][2], [zika][3], etc)
- The data from the JSON and the Data from the tree are usually a little different, so after merging the two dataframes you may get some errors.

[1]: https://altair-viz.github.io/index.html
[2]: https://github.com/nextstrain/seasonal-flu
[3]: https://github.com/nextstrain/zika

In [None]:
with open(tree_path) as fh:
    json_tree_handle = json.load(fh)

In [None]:
tree = json_to_tree(json_tree_handle)

In [None]:
tree

In [None]:
# One record per tip (terminal node) of the tree, with the attributes needed
# for plotting. `parent_date` / `parent_y` come from the parent node, or from
# the node itself when it has no parent.
# A conditional expression is used instead of `parent and value or fallback`:
# that older form silently fell back to the node's own value whenever the
# parent's value was falsy (for example a y value of 0).
node_data = [
    {
        "strain": node.name,
        "date": node.attr["num_date"],
        "y": node.yvalue,
        "region": node.attr["region"],
        "country": node.attr["country"],
        "parent_date": (node.parent if node.parent is not None else node).attr["num_date"],
        "parent_y": (node.parent if node.parent is not None else node).yvalue,
        "clade_membership" : node.attr['clade_membership']
    }
    for node in tree.find_clades(terminal=True)
]

In [None]:
node_data[10]

In [None]:
node_df = pd.DataFrame(node_data)

In [None]:
node_df.head()

In [None]:
node_df["y"] = node_df["y"].max() - node_df["y"]

In [None]:
node_df["parent_y"] = node_df["parent_y"].max() - node_df["parent_y"]

In [None]:
node_df.shape

In [None]:
node_df.head()

In [None]:
node_df["region"].unique()

In [None]:
# Reannotate clades that we aren't interested in as "other" to simplify color
# assignment in visualizations.
# `clades_to_plot` only exists when a clades file was provided, so the lookup
# inside the lambda raises NameError otherwise; in that case every clade keeps
# its own name. Only NameError is caught so that unrelated failures (such as a
# missing "clade_membership" column) are no longer hidden.
try:
    node_df["clade_membership_color"] = node_df["clade_membership"].apply(
        lambda clade: clade if clade in clades_to_plot else "other"
    )
except NameError:
    node_df["clade_membership_color"] = node_df["clade_membership"]
    print("clades_to_plot undefined")

In [None]:
# Restrict the distance matrix to strains that also appear in node_df["strain"].
# Step 1: collect the rows whose strain is NOT in node_df (rows that are entirely
# NaN are discarded from this selection).
indices_to_drop = similarity_matrix[~similarity_matrix.index.isin(node_df["strain"])].dropna(how = 'all')
# Step 2: keep only the rows whose strain IS in node_df, again discarding
# rows that are entirely NaN.
similarity_matrix = similarity_matrix[similarity_matrix.index.isin(node_df["strain"])].dropna(how = 'all')
# Step 3: remove the columns labelled with the strains collected in step 1 so
# that rows and columns refer to the same strains.
similarity_matrix = similarity_matrix.drop(indices_to_drop.index, axis=1)
# Final expression of the cell: evaluates to the filtered matrix.
similarity_matrix

In [None]:
node_df.head()

## Checking for Outliers in Pairwise Distance

In [None]:
mean_distances = similarity_matrix.mean().reset_index(name="mean_distance").rename(columns={"index": "strain"})

In [None]:
mean_distances.head()

In [None]:
alt.Chart(mean_distances, height=150).mark_boxplot().encode(
    x = alt.X('mean_distance', title="mean of pairwise distances"),
    tooltip = ["strain"]
)

# Running PCA on Scaled and Centered Data
- I treated each nucleotide as a "site", or dimension, and found the probability of having a certain nucleotide given the frequency of that letter at that site.
- I used [this paper][1] as my source 
- The equation is as follows where C is the matrix of dimensions, M is the mean, and p is the frequency of a nucleotide at that given site. 
![](https://journals.plos.org/plosgenetics/article/file?type=thumbnail&id=info:doi/10.1371/journal.pgen.0020190.e003)

In [None]:
# Encode every genome as a list of integers, one per site:
# A=1, G=2, C=3, T=4 and anything else (gaps, N, lowercase, ...) = 5.
base_codes = {"A": 1, "G": 2, "C": 3, "T": 4}
numbers = [[base_codes.get(base, 5) for base in genome] for genome in genomes]

# One row per genome, one column per site. The column count is taken from the
# frame itself rather than from a leftover loop index, so this also works when
# `genomes` is empty.
genomes_df = pd.DataFrame(numbers)
genomes_df.columns = ["Site " + str(k) for k in range(genomes_df.shape[1])]

In [None]:
genomes_df.head()

In [None]:
# Fit PCA on the genomes data frame and project every row onto the components.
pca = PCA(n_components=10,svd_solver='full') # keeps 10 components; later cells hard-code PCA1..PCA10 to match
principalComponents = pca.fit_transform(genomes_df)

In [None]:
# Create a data frame from the PCA embedding.
principalDf = pd.DataFrame(data = principalComponents, columns = ["PCA" + str(i) for i in range(1,11)])

# Annotate rows by their original strain names. PCA rows are in the same order as
# the `genomes` rows which are in the same order as the `strains` rows.
principalDf["strain"] = strains

In [None]:
# Tabulate the share of variance explained by each of the 10 components,
# rounded to four decimals.
explained_variance = [round(ratio, 4) for ratio in pca.explained_variance_ratio_]
df = pd.DataFrame({
    'principal components': np.arange(1, 11),
    'explained variance': explained_variance,
})
df

In [None]:
alt.Chart(df).mark_point().encode(
    x='principal components:Q',
    y='explained variance:Q'
)

In [None]:
merged_pca_df = principalDf.merge(node_df, on="strain")

In [None]:
merged_pca_df.head()

In [None]:
# Axis titles for the first four components, each annotated with the percentage
# of variance it explains (two decimals).
pca_axis_titles = [
    "PCA{} (Explained Variance : {}%)".format(component, round(ratio * 100, 2))
    for component, ratio in enumerate(pca.explained_variance_ratio_[:4], start=1)
]

# Build the linked charts and lay the first three side by side.
list_of_chart = linking_tree_with_plots_brush(
    merged_pca_df,
    ['PCA1', 'PCA2', 'PCA3', 'PCA4'],
    pca_axis_titles,
    "clade_membership:N",
    ['strain', 'region'],
)
PCAFluBrush = list_of_chart[0] | list_of_chart[1] | list_of_chart[2]

PCAFluBrush

In [None]:
PCA_violin_df = get_euclidean_data_frame(merged_pca_df, "PCA1", "PCA2", "PCA")
g = sns.FacetGrid(
    PCA_violin_df,
    col="embedding",
    col_wrap=3,
    sharey=False,
    height=4
)
g = g.map(sns.violinplot, "clade_status", "distance", order=["within", "between"])
g.set_axis_labels("Clade status", "Distance")
plt.savefig("docs/PCAViolinPlot" + virus_name + ".png")

In [None]:
# Import the LOWESS function from its submodule explicitly. A bare
# `import statsmodels` does not load `statsmodels.nonparametric`, so the dotted
# attribute path only resolved when some other package had imported it first.
from statsmodels.nonparametric.smoothers_lowess import lowess

# Pairwise genetic vs. Euclidean (PCA1/PCA2) distances.
total_df = scatterplot_xyvalues(similarity_matrix.index, similarity_matrix, merged_pca_df, "PCA1", "PCA2", "PCA")

# LOWESS fit of Euclidean distance (response) on genetic distance (predictor).
y_values = lowess(
    total_df["euclidean"],
    total_df["genetic"],
    frac=0.6666666666666666,
    it=3,
    delta=0.0,
    is_sorted=False,
    missing='drop',
    return_sorted=True
)

# Two-column result: sorted predictor values and the fitted curve.
PD_Y_values = pd.DataFrame(y_values)
PD_Y_values.columns = ["LOWESS_x", "LOWESS_y"]

# Linear regression; `r_value` is used for the R^2 shown in the plot title.
regression = linregress(total_df["genetic"], total_df["euclidean"])
slope, intercept, r_value, p_value, std_err = regression

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.plot(total_df["genetic"], total_df["euclidean"], "o", alpha=0.25)
ax.plot(PD_Y_values["LOWESS_x"], PD_Y_values["LOWESS_y"], label="LOESS")

ax.set_xlabel("Genetic distance")
ax.set_ylabel("Euclidean distance (PCA)")
ax.set_title("PCA Euclidean distance vs. genetic distance ($R^2=%.3f$)" % (r_value ** 2))

sns.despine()
plt.savefig("docs/PCAScatterplotLOESS" + virus_name + ".png")

# Running MDS on the Dataset

In [None]:
embedding = MDS(n_components=10,metric=True,dissimilarity='precomputed')
X_transformed_mds = embedding.fit_transform(similarity_matrix)

In [None]:
raw_stress = embedding.stress_
normalized_stress = np.sqrt(raw_stress /((similarity_matrix.values.ravel() ** 2).sum() / 2))
print(normalized_stress.round(2))

In [None]:
MDS_df = pd.DataFrame(X_transformed_mds,columns=['MDS' + str(i) for i in range(1,11)])

In [None]:
MDS_df.shape

In [None]:
# Annotate rows by their original strain names. The same logic from PCA holds here
# and for later embeddings.
MDS_df["strain"] = similarity_matrix.index

In [None]:
merged_mds_df = MDS_df.merge(node_df, on="strain")

In [None]:
merged_mds_df.head()

In [None]:
chart_12_mds = scatterplot_with_tooltip_interactive(merged_mds_df,'MDS1','MDS2',"MDS1","MDS2",['strain','clade_membership'],'clade_membership_color')
chart_34_mds = scatterplot_with_tooltip_interactive(merged_mds_df,'MDS3','MDS4',"MDS3","MDS4",['strain','clade_membership'],'clade_membership_color')
chart_56_mds = scatterplot_with_tooltip_interactive(merged_mds_df,'MDS5','MDS6',"MDS5","MDS6",['strain','clade_membership'],'clade_membership_color')
chart_12_mds|chart_34_mds|chart_56_mds

In [None]:
chart_MDS = scatterplot_tooltips(similarity_matrix.index, similarity_matrix, merged_mds_df, "MDS1", "MDS2", "MDS", 4000)
chart_34_scatter = scatterplot_tooltips(similarity_matrix.index, similarity_matrix, merged_mds_df, "MDS3", "MDS4", "MDS", 4000)
chart_56_scatter = scatterplot_tooltips(similarity_matrix.index, similarity_matrix, merged_mds_df, "MDS5", "MDS6", "MDS", 4000)
chart_MDS | chart_34_scatter | chart_56_scatter

In [None]:
# Import the LOWESS function from its submodule explicitly. A bare
# `import statsmodels` does not load `statsmodels.nonparametric`, so the dotted
# attribute path only resolved when some other package had imported it first.
from statsmodels.nonparametric.smoothers_lowess import lowess

# Pairwise genetic vs. Euclidean (MDS1/MDS2) distances.
total_df = scatterplot_xyvalues(similarity_matrix.index, similarity_matrix, merged_mds_df, "MDS1", "MDS2", "MDS")

# LOWESS fit of Euclidean distance (response) on genetic distance (predictor).
y_values = lowess(
    total_df["euclidean"],
    total_df["genetic"],
    frac=0.6666666666666666,
    it=3,
    delta=0.0,
    is_sorted=False,
    missing='drop',
    return_sorted=True
)

# Two-column result: sorted predictor values and the fitted curve.
PD_Y_values = pd.DataFrame(y_values)
PD_Y_values.columns = ["LOWESS_x", "LOWESS_y"]

# Linear regression; `r_value` is used for the R^2 shown in the plot title.
regression = linregress(total_df["genetic"], total_df["euclidean"])
slope, intercept, r_value, p_value, std_err = regression

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.plot(total_df["genetic"], total_df["euclidean"], "o", alpha=0.25)
ax.plot(PD_Y_values["LOWESS_x"], PD_Y_values["LOWESS_y"], label="LOESS")

ax.set_xlabel("Genetic distance")
ax.set_ylabel("Euclidean distance (MDS)")
ax.set_title("MDS Euclidean distance vs. genetic distance ($R^2=%.3f$)" % (r_value ** 2))

sns.despine()
plt.savefig("docs/MDSScatterplotLOESS" + virus_name + ".png")

In [None]:
MDS_violin_df = get_euclidean_data_frame(merged_mds_df, "MDS1", "MDS2", "MDS")
g = sns.FacetGrid(
    MDS_violin_df,
    col="embedding",
    col_wrap=3,
    sharey=False,
    height=4
)
g = g.map(sns.violinplot, "clade_status", "distance", order=["within", "between"])
g.set_axis_labels("Clade status", "Distance")
plt.savefig("docs/MDSViolinPlot" + virus_name + ".png")

In [None]:
list_of_data_and_titles = ['MDS1','MDS2','MDS3','MDS4','MDS5','MDS6']
list_of_chart = linking_tree_with_plots_brush(
    merged_mds_df,
    list_of_data_and_titles,
    list_of_data_and_titles,
    'clade_membership_color',
    ["clade_membership","strain:N"]
)
chart = list_of_chart[0]|list_of_chart[1]|list_of_chart[2]|list_of_chart[3]
chart

In [None]:
chart.save("docs/MDS" + virus_name + "Brush.html")

# Running T-SNE on the Dataset 

In [None]:
# Embed the strains in two dimensions with t-SNE, using the pairwise distance
# matrix directly (metric='precomputed').
# `init='random'` is set explicitly: a precomputed metric cannot be combined
# with PCA initialisation, which is the default in newer scikit-learn releases.
# Older releases already defaulted to random initialisation.
embedding = TSNE(n_components=2, metric='precomputed', perplexity=25.95, init='random')
X_transformed_tsne = embedding.fit_transform(similarity_matrix)

In [None]:
TSNE_df = pd.DataFrame(X_transformed_tsne,columns=['TSNE' + str(i) for i in range(1,3)])

In [None]:
TSNE_df["strain"] = similarity_matrix.index

In [None]:
TSNE_df.head()

In [None]:
merged_tsne_df = TSNE_df.merge(node_df, on="strain")

In [None]:
scatterplot_with_tooltip_interactive(merged_tsne_df,'TSNE1','TSNE2',"TSNE1","TSNE2",['strain','clade_membership'],'clade_membership_color')

In [None]:
list_of_chart = linking_tree_with_plots_brush(
    merged_tsne_df,
    ['TSNE1','TSNE2'],
    ['TSNE1','TSNE2'],
    'clade_membership_color',
    ["clade_membership:N","strain:N"]
)
chart = list_of_chart[0]|list_of_chart[1]
chart
chart.save("docs/TSNE" + virus_name + "Brush.html")

In [None]:
TSNE_violin_df = get_euclidean_data_frame(merged_tsne_df, "TSNE1", "TSNE2", "TSNE")
g = sns.FacetGrid(
    TSNE_violin_df,
    col="embedding",
    col_wrap=3,
    sharey=False,
    height=4
)
g = g.map(sns.violinplot, "clade_status", "distance", order=["within", "between"])
g.set_axis_labels("Clade status", "Distance")
plt.savefig("docs/TSNEViolinPlot" + virus_name + ".png")

In [None]:
# Import the LOWESS function from its submodule explicitly. A bare
# `import statsmodels` does not load `statsmodels.nonparametric`, so the dotted
# attribute path only resolved when some other package had imported it first.
from statsmodels.nonparametric.smoothers_lowess import lowess

# Pairwise genetic vs. Euclidean (TSNE1/TSNE2) distances.
total_df = scatterplot_xyvalues(similarity_matrix.index, similarity_matrix, merged_tsne_df, "TSNE1", "TSNE2", "TSNE")

# LOWESS fit of Euclidean distance (response) on genetic distance (predictor).
y_values = lowess(
    total_df["euclidean"],
    total_df["genetic"],
    frac=0.6666666666666666,
    it=3,
    delta=0.0,
    is_sorted=False,
    missing='drop',
    return_sorted=True
)

# Two-column result: sorted predictor values and the fitted curve.
PD_Y_values = pd.DataFrame(y_values)
PD_Y_values.columns = ["LOWESS_x", "LOWESS_y"]

# Linear regression; `r_value` is used for the R^2 shown in the plot title.
regression = linregress(total_df["genetic"], total_df["euclidean"])
slope, intercept, r_value, p_value, std_err = regression

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.plot(total_df["genetic"], total_df["euclidean"], "o", alpha=0.25)
ax.plot(PD_Y_values["LOWESS_x"], PD_Y_values["LOWESS_y"], label="LOESS")

ax.set_xlabel("Genetic distance")
ax.set_ylabel("Euclidean distance (t-SNE)")
ax.set_title("t-SNE Euclidean distance vs. genetic distance ($R^2=%.3f$)" % (r_value ** 2))

sns.despine()
plt.savefig("docs/TSNEScatterplot" + virus_name + ".png")

# Running UMAP on the Dataset

In [None]:
# Embed the strains in two dimensions with UMAP.
# NOTE(review): unlike the MDS and t-SNE cells, no precomputed metric is passed
# here, so each row of the distance matrix is presumably treated as a feature
# vector rather than as pairwise distances -- confirm this is intended.
reducer = umap.UMAP(n_neighbors=200,
        min_dist=.05,
        n_components=2,
        init="spectral")
# Rows of the result are in the same order as the rows of similarity_matrix.
X_transformed_umap = reducer.fit_transform(similarity_matrix)

In [None]:
UMAP_df = pd.DataFrame(X_transformed_umap,columns=['UMAP' + str(i) for i in range(1,3)])

In [None]:
UMAP_df["strain"] = similarity_matrix.index

In [None]:
UMAP_df.head()

In [None]:
merged_umap_df = UMAP_df.merge(node_df, on="strain")

In [None]:
merged_umap_df.head()

In [None]:
merged_umap_df.shape

In [None]:
scatterplot_with_tooltip_interactive(merged_umap_df,'UMAP1','UMAP2',"UMAP1","UMAP2",['strain','clade_membership'],'clade_membership_color')

In [None]:
list_of_data_and_titles = ['UMAP1','UMAP2']
list_of_chart = linking_tree_with_plots_brush(
    merged_umap_df,
    list_of_data_and_titles,
    list_of_data_and_titles,
    'clade_membership_color',
    ["clade_membership","strain:N"]
)
chart = list_of_chart[0]|list_of_chart[1]
chart
chart.save("docs/UMAP" + virus_name + "Brush.html")
chart.save("docs/UMAP" + virus_name + "Brush.png", scale_factor=2.0)

In [None]:
UMAP_violin_df = get_euclidean_data_frame(merged_umap_df, "UMAP1", "UMAP2", "UMAP")
g = sns.FacetGrid(
    UMAP_violin_df,
    col="embedding",
    col_wrap=3,
    sharey=False,
    height=4
)
g = g.map(sns.violinplot, "clade_status", "distance", order=["within", "between"])
g.set_axis_labels("Clade status", "Distance")
plt.savefig("docs/UMAPViolinPlot" + virus_name + ".png")

In [None]:
# Import the LOWESS function from its submodule explicitly. A bare
# `import statsmodels` does not load `statsmodels.nonparametric`, so the dotted
# attribute path only resolved when some other package had imported it first.
from statsmodels.nonparametric.smoothers_lowess import lowess

# Pairwise genetic vs. Euclidean (UMAP1/UMAP2) distances.
total_df = scatterplot_xyvalues(similarity_matrix.index, similarity_matrix, merged_umap_df, "UMAP1", "UMAP2", "UMAP")

# LOWESS fit of Euclidean distance (response) on genetic distance (predictor).
y_values = lowess(
    total_df["euclidean"],
    total_df["genetic"],
    frac=0.6666666666666666,
    it=3,
    delta=0.0,
    is_sorted=False,
    missing='drop',
    return_sorted=True
)

# Two-column result: sorted predictor values and the fitted curve.
PD_Y_values = pd.DataFrame(y_values)
PD_Y_values.columns = ["LOWESS_x", "LOWESS_y"]

# Linear regression; `r_value` is used for the R^2 shown in the plot title.
regression = linregress(total_df["genetic"], total_df["euclidean"])
slope, intercept, r_value, p_value, std_err = regression

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.plot(total_df["genetic"], total_df["euclidean"], "o", alpha=0.25)
ax.plot(PD_Y_values["LOWESS_x"], PD_Y_values["LOWESS_y"], label="LOESS")

ax.set_xlabel("Genetic distance")
ax.set_ylabel("Euclidean distance (UMAP)")
ax.set_title("UMAP Euclidean distance vs. genetic distance ($R^2=%.3f$)" % (r_value ** 2))

sns.despine()
plt.savefig("docs/UMAPScatterplot" + virus_name + ".png")

# Linking all plots together clickable with Tree

In [None]:
merged_df = node_df.merge(
    principalDf,
    on="strain"
).merge(
    MDS_df,
    on="strain"
).merge(
    TSNE_df,
    on="strain"
).merge(
    UMAP_df,
    on="strain"
)

In [None]:
merged_df.shape

In [None]:
merged_df.head()

In [None]:
# Build the tree plus one clickable scatterplot per embedding from the combined
# data frame. The PCA axis titles carry the percentage of variance explained by
# each component; the wording now matches the "Explained Variance" titles used
# in the PCA brush chart (it previously read "Expected Variance").
data = linking_tree_with_plots_clickable(
    merged_df,
    ['MDS1', 'MDS2','TSNE1', 'TSNE2', 'PCA1', 'PCA2', 'UMAP1', 'UMAP2'],
    [
        'MDS1',
        'MDS2',
        'TSNE1',
        'TSNE2',
        'PCA1 (Explained Variance : {}%)'.format(round(pca.explained_variance_ratio_[0]*100,2)),
        'PCA2 (Explained Variance : {}%)'.format(round(pca.explained_variance_ratio_[1]*100,2)),
        'UMAP1',
        'UMAP2',
    ],
    'clade_membership_color:N',
    ['clade_membership'],
    ['strain','clade_membership']
)

In [None]:
data1 = linking_tree_with_plots_brush(
    merged_df,
    ['TSNE1', 'TSNE2','UMAP1', 'UMAP2'],
    ['TSNE1', 'TSNE2','UMAP1','UMAP2'],
    'clade_membership_color:N',
    ['strain','clade_membership']
)

In [None]:
TSNEUMAP = data1[0]|data1[1]|data1[2]
TSNEUMAP.save("docs/TSNEUMAPClickable" + virus_name + ".html")
TSNEUMAP

In [None]:
PCAMDS = data[3]|data[1]|data[5]
TSNEUMAP = data[2]|data[4]
embeddings = alt.vconcat(PCAMDS,TSNEUMAP)
embeddings
fullChart = alt.hconcat(data[0],embeddings)
fullChart.save("docs/FullLinkedChartClickable" + virus_name + ".html")
fullChart.save("docs/FullLinkedChartClickable" + virus_name + ".svg", scale_factor=2.0)
drawing = svg2rlg("docs/FullLinkedChartClickable" + virus_name + ".svg")
renderPDF.drawToFile(drawing, "docs/FullLinkedChartClickable" + virus_name + ".pdf")

## Scatterplots for all embeddings 
Concatenating all embedding data frames to plot genetic vs Euclidean distance for each embedding

In [None]:
# Import the LOWESS function from its submodule explicitly. A bare
# `import statsmodels` does not load `statsmodels.nonparametric`, so the dotted
# attribute path only resolved when some other package had imported it first.
from statsmodels.nonparametric.smoothers_lowess import lowess


def _fit_lowess(distances):
    """Return the LOWESS fit of Euclidean distance on genetic distance.

    `distances` must provide "euclidean" and "genetic" columns. The smoothing
    settings are identical for every embedding.
    """
    return lowess(
        distances["euclidean"],
        distances["genetic"],
        frac=0.6666666666666666,
        it=3,
        delta=0.0,
        is_sorted=False,
        missing='drop',
        return_sorted=True
    )


# Pairwise genetic vs. Euclidean distances for each embedding.
umap_df = scatterplot_xyvalues(similarity_matrix.index, similarity_matrix, merged_umap_df, "UMAP1", "UMAP2", "UMAP")
tsne_df = scatterplot_xyvalues(similarity_matrix.index, similarity_matrix, merged_tsne_df, "TSNE1", "TSNE2", "TSNE")
mds_df = scatterplot_xyvalues(similarity_matrix.index, similarity_matrix, merged_mds_df, "MDS1", "MDS2", "MDS")
pca_df = scatterplot_xyvalues(similarity_matrix.index, similarity_matrix, merged_pca_df, "PCA1", "PCA2", "PCA")

# LOWESS curves, one per embedding.
y_values_umap = _fit_lowess(umap_df)
y_values_tsne = _fit_lowess(tsne_df)
y_values_mds = _fit_lowess(mds_df)
y_values_pca = _fit_lowess(pca_df)

# For each embedding: the curve as a two-column frame, and the linear
# regression whose r value feeds the R^2 in the plot titles.
PD_Y_values_umap = pd.DataFrame(y_values_umap)
PD_Y_values_umap.columns = ["LOWESS_x", "LOWESS_y"]

regression_umap = linregress(umap_df["genetic"], umap_df["euclidean"])
slope, intercept, r_value_umap, p_value, std_err = regression_umap

PD_Y_values_tsne = pd.DataFrame(y_values_tsne)
PD_Y_values_tsne.columns = ["LOWESS_x", "LOWESS_y"]

regression_tsne = linregress(tsne_df["genetic"], tsne_df["euclidean"])
slope, intercept, r_value_tsne, p_value, std_err = regression_tsne

PD_Y_values_mds = pd.DataFrame(y_values_mds)
PD_Y_values_mds.columns = ["LOWESS_x", "LOWESS_y"]

regression_mds = linregress(mds_df["genetic"], mds_df["euclidean"])
slope, intercept, r_value_mds, p_value, std_err = regression_mds

PD_Y_values_pca = pd.DataFrame(y_values_pca)
PD_Y_values_pca.columns = ["LOWESS_x", "LOWESS_y"]

regression_pca = linregress(pca_df["genetic"], pca_df["euclidean"])
slope, intercept, r_value_pca, p_value, std_err = regression_pca

In [None]:
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(24, 6))
fig.tight_layout(pad=4.0)

ax1.plot(pca_df["genetic"], pca_df["euclidean"], "o", alpha=0.25)
ax1.plot(PD_Y_values_pca["LOWESS_x"], PD_Y_values_pca["LOWESS_y"], label="LOESS")

ax1.set_xlabel("Genetic distance")
ax1.set_ylabel("Euclidean distance (PCA)")
ax1.set_title("PCA Euclidean distance vs. genetic distance ($R^2=%.3f$)" % (r_value_pca ** 2))

ax2.plot(mds_df["genetic"], mds_df["euclidean"], "o", alpha=0.25)
ax2.plot(PD_Y_values_mds["LOWESS_x"], PD_Y_values_mds["LOWESS_y"], label="LOESS")

ax2.set_xlabel("Genetic distance")
ax2.set_ylabel("Euclidean distance (MDS)")
ax2.set_title("MDS Euclidean distance vs. genetic distance ($R^2=%.3f$)" % (r_value_mds ** 2))

ax3.plot(tsne_df["genetic"], tsne_df["euclidean"], "o", alpha=0.25)
ax3.plot(PD_Y_values_tsne["LOWESS_x"], PD_Y_values_tsne["LOWESS_y"], label="LOESS")

ax3.set_xlabel("Genetic distance")
ax3.set_ylabel("Euclidean distance (t-SNE)")
ax3.set_title("t-SNE Euclidean distance vs. genetic distance ($R^2=%.3f$)" % (r_value_tsne ** 2))

ax4.plot(umap_df["genetic"], umap_df["euclidean"], "o", alpha=0.25)
ax4.plot(PD_Y_values_umap["LOWESS_x"], PD_Y_values_umap["LOWESS_y"], label="LOESS")

ax4.set_xlabel("Genetic distance")
ax4.set_ylabel("Euclidean distance (UMAP)")
ax4.set_title("UMAP Euclidean distance vs. genetic distance ($R^2=%.3f$)" % (r_value_umap ** 2))


sns.despine()
plt.savefig("docs/FullScatterplot" + virus_name + ".png")

## Within- and between-clade Euclidean distances for all embeddings

Use the complete embedding data frame to calculate pairwise Euclidean distances between samples and plot the results in a single figure.

In [None]:
data_frames = [
    get_euclidean_data_frame(merged_df, "PCA1", "PCA2", "PCA"),
    get_euclidean_data_frame(merged_df, "MDS1", "MDS2", "MDS"),
    get_euclidean_data_frame(merged_df, "TSNE1", "TSNE2", "t-SNE"),
    get_euclidean_data_frame(merged_df, "UMAP1", "UMAP2", "UMAP"),
]

Extract pairwise genetic (Hamming) distances corresponding to the records sampled above. This step assumes that the original merged data frame is indexed from zero to N for N total samples in the same order as the similarity matrix.

In [None]:
# Condense the square distance matrix into the vector of distinct pairwise
# distances. The matrix is first reordered to the strain order of `merged_df`:
# `merged_df` is built by merging onto `node_df`, so its rows follow the tree
# data frame, whereas the matrix follows the order of the alignment. Without
# the reordering, the distances would be paired with the clade statuses of
# different strain pairs.
# NOTE(review): assumes get_euclidean_data_frame emits pairs in the row order
# of the frame it is given -- confirm against Helpers.
strain_order = merged_df["strain"].tolist()
genetic_distances = squareform(similarity_matrix.loc[strain_order, strain_order].values)

In [None]:
data_frames.append(pd.DataFrame({
    "distance": genetic_distances,
    "clade_status": data_frames[0]["clade_status"].values,
    "embedding": "genetic"
}))

In [None]:
len(data_frames)

In [None]:
euclidean_data_frame = pd.concat(data_frames)

In [None]:
g = sns.FacetGrid(
    euclidean_data_frame,
    col="embedding",
    col_wrap=3,
    col_order=["genetic", "PCA", "MDS", "t-SNE", "UMAP"],
    sharey=False,
    height=4
)
g = g.map(sns.violinplot, "clade_status", "distance", order=["within", "between"])
g.set_axis_labels("Clade status", "Distance")
plt.savefig("docs/FullViolinPlot" + virus_name + ".png")

In [None]:
PCA_df = euclidean_data_frame[euclidean_data_frame.embedding == "PCA"]
MDS_df = euclidean_data_frame[euclidean_data_frame.embedding == "MDS"]
TSNE_df = euclidean_data_frame[euclidean_data_frame.embedding == "t-SNE"]
UMAP_df = euclidean_data_frame[euclidean_data_frame.embedding == "UMAP"]
genetic_df = euclidean_data_frame[euclidean_data_frame.embedding == "genetic"]

In [None]:
genetic_df

In [None]:
genetic_df['distance'].mean()

In [None]:
# Median genetic distance for within-clade and between-clade pairs.
# The "distance" column is selected explicitly so the result is a single number;
# taking the median of the whole frame also covers its string columns, which
# newer pandas versions refuse to aggregate.
median_genetic_within = genetic_df.loc[genetic_df.clade_status == "within", "distance"].median()
median_genetic_between = genetic_df.loc[genetic_df.clade_status == "between", "distance"].median()

In [None]:
# Median PCA Euclidean distance for within-clade and between-clade pairs.
# The "distance" column is selected explicitly so the result is a single number;
# taking the median of the whole frame also covers its string columns, which
# newer pandas versions refuse to aggregate.
median_PCA_within = PCA_df.loc[PCA_df.clade_status == "within", "distance"].median()
median_PCA_between = PCA_df.loc[PCA_df.clade_status == "between", "distance"].median()

In [None]:
# Median MDS Euclidean distance for within-clade and between-clade pairs.
# The "distance" column is selected explicitly so the result is a single number;
# taking the median of the whole frame also covers its string columns, which
# newer pandas versions refuse to aggregate.
median_MDS_within = MDS_df.loc[MDS_df.clade_status == "within", "distance"].median()
median_MDS_between = MDS_df.loc[MDS_df.clade_status == "between", "distance"].median()

In [None]:
# Median t-SNE Euclidean distance for within-clade and between-clade pairs.
# The "distance" column is selected explicitly so the result is a single number;
# taking the median of the whole frame also covers its string columns, which
# newer pandas versions refuse to aggregate.
median_TSNE_within = TSNE_df.loc[TSNE_df.clade_status == "within", "distance"].median()
median_TSNE_between = TSNE_df.loc[TSNE_df.clade_status == "between", "distance"].median()

In [None]:
# Median UMAP Euclidean distance for within-clade and between-clade pairs.
# The "distance" column is selected explicitly so the result is a single number;
# taking the median of the whole frame also covers its string columns, which
# newer pandas versions refuse to aggregate.
median_UMAP_within = UMAP_df.loc[UMAP_df.clade_status == "within", "distance"].median()
median_UMAP_between = UMAP_df.loc[UMAP_df.clade_status == "between", "distance"].median()

In [None]:
def ratioFunction(num1, num2):
    """Return ``num1 / num2`` converted to an ``int``.

    The conversion discards the fractional part (truncation toward zero), so
    for example a ratio of 3.5 is reported as 3.
    """
    return int(num1 / num2)

In [None]:
genetic_ratio = ratioFunction(median_genetic_between,median_genetic_within)
print(genetic_ratio)

In [None]:
PCA_ratio = ratioFunction(median_PCA_between,median_PCA_within)
print(PCA_ratio)

In [None]:
MDS_ratio = ratioFunction(median_MDS_between,median_MDS_within)
print(MDS_ratio)

In [None]:
TSNE_ratio = ratioFunction(median_TSNE_between,median_TSNE_within)
print(TSNE_ratio)

In [None]:
UMAP_ratio = ratioFunction(median_UMAP_between,median_UMAP_within)
print(UMAP_ratio)

## Missing Bases Analysis

(low importance)
- use similarity_matrix.index and cluster by K-means
- find accuracy by taking into account the total amount of relationships between clusters and taking the percentage of them which are STILL correct (ex. within/between stays within/between)
    - this allows you to reuse code if needed

In [None]:
from sklearn.cluster import KMeans
# Cluster the t-SNE coordinates into 7 groups.
# `precompute_distances`, `n_jobs` and `algorithm='auto'` were dropped from the
# call: they only restated defaults and are no longer accepted by current
# scikit-learn releases. `n_init=10` stays explicit to keep ten restarts.
kmeans = KMeans(n_clusters=7, init='k-means++', n_init=10, max_iter=300, tol=0.0001, verbose=0, random_state=None, copy_x=True)
kmeans_data= kmeans.fit(X_transformed_tsne)

In [None]:
merged_tsne_df_copy = merged_tsne_df.copy()
merged_tsne_df_copy["clade_membership"] = kmeans_data.labels_

In [None]:
# Scatterplot of the t-SNE embedding coloured by K-means cluster label.
# NOTE: `brush` is created but not attached to the chart below.
brush = alt.selection(type='interval', resolve='global')
chart = alt.Chart(merged_tsne_df_copy).mark_circle(size=60).encode(
    x=alt.X("TSNE1", title="TSNE1"),
    # The y channel is built with alt.Y (it was previously built with alt.X).
    y=alt.Y("TSNE2", title="TSNE2"),
    # NOTE(review): the cluster labels are integers while the domain lists
    # strings; consider encoding the field as nominal -- confirm rendering.
    color=alt.Color('clade_membership',
               scale=alt.Scale(
        domain = ["0", "1", "2", "3", "4", "5", "6"],
        # "lightblue" is the valid CSS colour name ("light blue" is not).
        range=['blue', 'orange', "red", "lightblue", "green", "yellow", "purple"])),
    tooltip=["strain", "clade_membership"]
).interactive()
chart.display()

In [None]:
chart2 = scatterplot_with_tooltip_interactive(merged_tsne_df,'TSNE1','TSNE2',"TSNE1","TSNE2",['strain','clade_membership'],'clade_membership_color')

In [None]:
chart | chart2

In [None]:
merged_tsne_df_copy = get_euclidean_data_frame(merged_tsne_df_copy, "TSNE1", "TSNE2", "TSNE")

In [None]:
diff_btw_df = TSNE_violin_df["clade_status"] == merged_tsne_df_copy["clade_status"]
diff_btw_df.sum()

In [None]:
((diff_btw_df.sum() / TSNE_violin_df.shape[0])*100)

In [None]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(TSNE_violin_df["clade_status"], merged_tsne_df_copy["clade_status"])
confusion_df = pd.DataFrame(confusion)

In [None]:
confusion_df.columns = ["False", "True"]
confusion_df.index = ["False", "True"]
confusion_df

In [None]:
sns.heatmap(confusion_df, cmap = sns.diverging_palette(220, 10, as_cmap=True))

In [None]:
from sklearn.cluster import KMeans
# Cluster the UMAP coordinates into 7 groups.
# `precompute_distances`, `n_jobs` and `algorithm='auto'` were dropped from the
# call: they only restated defaults and are no longer accepted by current
# scikit-learn releases. `n_init=10` stays explicit to keep ten restarts.
kmeans = KMeans(n_clusters=7, init='k-means++', n_init=10, max_iter=300, tol=0.0001, verbose=0, random_state=None, copy_x=True)
kmeans_data= kmeans.fit(X_transformed_umap)

In [None]:
merged_umap_df_copy = merged_umap_df.copy()
merged_umap_df_copy["clade_membership"] = kmeans_data.labels_

In [None]:
# Scatterplot of the UMAP embedding coloured by K-means cluster label.
# NOTE: `brush` is created but not attached to the chart below.
brush = alt.selection(type='interval', resolve='global')
chart = alt.Chart(merged_umap_df_copy).mark_circle(size=60).encode(
    x=alt.X("UMAP1", title="UMAP1"),
    # The y channel is built with alt.Y (it was previously built with alt.X).
    y=alt.Y("UMAP2", title="UMAP2"),
    # NOTE(review): the cluster labels are integers while the domain lists
    # strings; consider encoding the field as nominal -- confirm rendering.
    color=alt.Color('clade_membership',
               scale=alt.Scale(
        domain = ["0", "1", "2", "3", "4", "5", "6"],
        # "lightblue" is the valid CSS colour name ("light blue" is not).
        range=['blue', 'orange', "red", "lightblue", "green", "yellow", "purple"])),
    tooltip=["strain", "clade_membership"]
).interactive()
chart.display()

In [None]:
chart2 = scatterplot_with_tooltip_interactive(merged_umap_df,'UMAP1','UMAP2',"UMAP1","UMAP2",['strain','clade_membership'],'clade_membership_color')

In [None]:
chart | chart2

In [None]:
merged_umap_df_copy = get_euclidean_data_frame(merged_umap_df_copy, "UMAP1", "UMAP2", "UMAP")

In [None]:
diff_btw_df = UMAP_violin_df["clade_status"] == merged_umap_df_copy["clade_status"]
diff_btw_df.sum()

In [None]:
((diff_btw_df.sum() / UMAP_violin_df.shape[0])*100)

In [None]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(UMAP_violin_df["clade_status"], merged_umap_df_copy["clade_status"])
confusion_df = pd.DataFrame(confusion)

In [None]:
confusion_df.columns = ["False", "True"]
confusion_df.index = ["False", "True"]
confusion_df

In [None]:
sns.heatmap(confusion_df, cmap = sns.diverging_palette(220, 10, as_cmap=True))

In [None]:
from sklearn.cluster import KMeans
# Cluster the MDS coordinates into 7 groups.
# `precompute_distances`, `n_jobs` and `algorithm='auto'` were dropped from the
# call: they only restated defaults and are no longer accepted by current
# scikit-learn releases. `n_init=10` stays explicit to keep ten restarts.
kmeans = KMeans(n_clusters=7, init='k-means++', n_init=10, max_iter=300, tol=0.0001, verbose=0, random_state=None, copy_x=True)
kmeans_data= kmeans.fit(X_transformed_mds)

In [None]:
merged_mds_df_copy = merged_mds_df.copy()
merged_mds_df_copy["clade_membership"] = kmeans_data.labels_

In [None]:
# Scatterplot of the MDS embedding coloured by K-means cluster label.
# NOTE: `brush` is created but not attached to the chart below.
brush = alt.selection(type='interval', resolve='global')
chart = alt.Chart(merged_mds_df_copy).mark_circle(size=60).encode(
    x=alt.X("MDS1", title="MDS1"),
    # The y channel is built with alt.Y (it was previously built with alt.X).
    y=alt.Y("MDS2", title="MDS2"),
    # NOTE(review): the cluster labels are integers while the domain lists
    # strings; consider encoding the field as nominal -- confirm rendering.
    color=alt.Color('clade_membership',
               scale=alt.Scale(
        domain = ["0", "1", "2", "3", "4", "5", "6"],
        # "lightblue" is the valid CSS colour name ("light blue" is not).
        range=['blue', 'orange', "red", "lightblue", "green", "yellow", "purple"])),
    tooltip=["strain", "clade_membership"]
).interactive()
chart.display()

In [None]:
chart2 = scatterplot_with_tooltip_interactive(merged_mds_df,'MDS1','MDS2',"MDS1","MDS2",['strain','clade_membership'],'clade_membership_color')

In [None]:
chart | chart2

In [None]:
merged_mds_df_copy = get_euclidean_data_frame(merged_mds_df_copy, "MDS1", "MDS2", "MDS")

In [None]:
diff_btw_df = MDS_violin_df["clade_status"] == merged_mds_df_copy["clade_status"]
diff_btw_df.sum()

In [None]:
((diff_btw_df.sum() / MDS_violin_df.shape[0])*100)

In [None]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(MDS_violin_df["clade_status"], merged_mds_df_copy["clade_status"])
confusion_df = pd.DataFrame(confusion)

In [None]:
confusion_df.columns = ["False", "True"]
confusion_df.index = ["False", "True"]
confusion_df

In [None]:
sns.heatmap(confusion_df, cmap = sns.diverging_palette(220, 10, as_cmap=True))