# This is Cartography put together into one place. 

## Modules to add to run this code:

- [BioPython][1]
- [Pandas][2]
- [Numpy][3]
- [Altair][4]
- [Seaborn][5]
- [Scikit-Learn][6]
- [UMAP][7]
- json
- nextstrain-augur

[1]:https://biopython.org/wiki/Download
[2]:https://pandas.pydata.org/pandas-docs/version/0.23.3/install.html
[3]:https://docs.scipy.org/doc/numpy/user/quickstart.html
[4]:https://altair-viz.github.io/getting_started/installation.html
[5]:https://seaborn.pydata.org/installing.html
[6]:https://scikit-learn.org/stable/install.html
[7]:https://umap-learn.readthedocs.io/en/latest/


# Imports Section 

In [None]:
import pandas as pd
import altair as alt
import numpy as np
from scipy.spatial.distance import squareform, pdist
import pandas as pd
import numpy as np
from Bio import SeqIO
import seaborn as sns
import re
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from augur.utils import json_to_tree
import json
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
import umap
from scipy.stats import linregress
from pathlib import Path

# METHODS SECTION
- All the methods I use throughout this notebook are collected here.

### Taking the result strains from reading in the metadata and concatenating it to the principal components (where each row is joined with one genome)

In [None]:
"""
principal_Df -- Data from data reduction (-T-SNE, MDS, etc) (pandas DataFrame)
result_metadata -- the metadata that is being read in (Pandas DataFrame)
fields -- the parts of the metadata that should be concatenated with principal_Df (list)
"""
def concatenate_results_with_strain_data(principal_Df, result_metadata, fields):
    """Place selected metadata columns next to an embedding table.

    principal_Df -- data from a dimensionality reduction (pandas DataFrame)
    result_metadata -- the metadata that was read in (pandas DataFrame)
    fields -- names of the metadata columns to attach (list)

    Returns a new DataFrame with the columns of principal_Df followed by the
    requested metadata columns. The two inputs are combined column-wise, so
    rows are paired by index label, not by strain name.
    """
    selected_metadata = result_metadata[fields]
    return pd.concat([principal_Df, selected_metadata], axis="columns")

### Altair method for easy tooltip/brushing below

In [None]:
"""
Defining Fields:
finalDf: The data that is used to generate the scatter plot (pandas DataFrame)
x, the data you want on the x axis (string)
y, the data you want on the y axis (string)
Titlex,the name you want on the x axis (string)
Titley, the name you want on the y axis (string)
Tooltip, when hovering over a point, the data you want available (list)
Color, what the scatterplot is colored by (String)
"""
def scatterplot_with_tooltip_interactive(finalDf,x,y,Titlex,Titley,ToolTip,color):
    """Display an interactive Altair scatterplot with tooltips.

    finalDf -- the data used to generate the scatterplot (pandas DataFrame)
    x -- the field shown on the x axis (string)
    y -- the field shown on the y axis (string)
    Titlex -- the title of the x axis (string)
    Titley -- the title of the y axis (string)
    ToolTip -- the fields shown when hovering over a point (list)
    color -- the field the points are colored by (string)

    The chart is rendered with chart.display(); nothing is returned.
    """
    chart = alt.Chart(finalDf).mark_circle(size=60).encode(
        x=alt.X(x, title=Titlex),
        # The y channel is described with alt.Y (it was built with alt.X before).
        y=alt.Y(y, title=Titley),
        color=color,
        tooltip=ToolTip
    ).interactive()
    chart.display()

### Linking tree with plots brush

In [None]:
"""
dataframe: dataframe including node data and dimensionality reduction data (Pandas Dataframe)
list_of_data: list of all the names of the columns in the dataframe for which you want graphs: goes in the order of [x1,y1,x2,y2,x3,y3] etc.(list)
list_of_titles: list of all the TITLES you want for each axis: goes in order of[x1,y1,x2,y2,x3,y3] etc.(list)
color: what the data should be colored by (string)
ToolTip: when hovering over the data, what data should be shown (list)
"""
def linking_tree_with_plots_brush(dataFrame,list_of_data,list_of_titles,color,ToolTip):
    """Build a tree plot and scatterplots that all share one interval brush.

    dataFrame -- node data plus dimensionality reduction data; its "date" and
        "y" columns position the points of the tree plot (pandas DataFrame)
    list_of_data -- column names to plot, ordered [x1, y1, x2, y2, ...] (list)
    list_of_titles -- axis titles in the same order as list_of_data (list)
    color -- what the brushed points are colored by; points outside the brush
        are drawn gray (string)
    ToolTip -- the fields shown when hovering over a point (list)

    Returns a list of charts: the tree plot first, followed by one scatterplot
    per (x, y) pair of list_of_data.

    Raises ValueError when either list has an odd length or when there are
    fewer titles than columns.
    """
    if len(list_of_data) % 2 != 0 or len(list_of_titles) % 2 != 0:
        raise ValueError('The length of list_of_data and the length of list_of_titles should not be odd.')
    if len(list_of_titles) < len(list_of_data):
        # Fail early with a readable message instead of an IndexError in the loop below.
        raise ValueError('list_of_titles needs one title for every entry of list_of_data.')

    base = alt.Chart(dataFrame)
    brush = alt.selection(type='interval', resolve='global')
    brushed_color = alt.condition(brush, color, alt.ColorValue('gray'))

    tree_name = base.mark_circle().encode(
        x=alt.X(
            "date:Q",
            # Pad the date axis slightly so the outermost tips are not clipped.
            scale=alt.Scale(domain=(dataFrame["date"].min() - 0.2, dataFrame["date"].max() + 0.2)),
            title="Date"
        ),
        y=alt.Y(
            "y:Q",
            title=""
        ),
        color=brushed_color,
        tooltip=ToolTip
    ).add_selection(brush).properties(width=400,height=250)
    list_of_chart = [tree_name]

    # Walk the flat [x1, y1, x2, y2, ...] lists one (x, y) pair at a time.
    for i in range(0, len(list_of_data), 2):
        chart = base.mark_circle(size=60).encode(
            x=alt.X(list_of_data[i], title=list_of_titles[i]),
            y=alt.Y(list_of_data[i + 1], title=list_of_titles[i + 1]),
            color=brushed_color,
            tooltip=ToolTip
        ).add_selection(
            brush
        ).properties(
            width=250,
            height=250
        )
        list_of_chart.append(chart)
    return list_of_chart

### Linking Tree with Plots Clickable

In [None]:
def linking_tree_with_plots_clickable(dataFrame,list_of_data,list_of_titles,colors,fields,ToolTip):
    """Build a tree plot, scatterplots and a clickable legend sharing one selection.

    dataFrame -- node data plus dimensionality reduction data; its "date" and
        "y" columns position the points of the tree plot (pandas DataFrame)
    list_of_data -- column names to plot, ordered [x1, y1, x2, y2, ...] (list)
    list_of_titles -- axis titles in the same order as list_of_data (list)
    colors -- the field used to color selected points and to build the legend
        chart; unselected points are drawn light gray (string)
    fields -- the fields the multi-selection is keyed on (list)
    ToolTip -- the fields shown when hovering over a point (list)

    Returns a list of charts: the tree plot first, then one scatterplot per
    (x, y) pair of list_of_data, and the legend chart last.

    Raises ValueError when either list has an odd length or when there are
    fewer titles than columns.
    """
    if len(list_of_data) % 2 != 0 or len(list_of_titles) % 2 != 0:
        raise ValueError('The length of list_of_data and the length of list_of_titles should not be odd.')
    if len(list_of_titles) < len(list_of_data):
        # Fail early with a readable message instead of an IndexError in the loop below.
        raise ValueError('list_of_titles needs one title for every entry of list_of_data.')

    base = alt.Chart(dataFrame)
    selection = alt.selection_multi(fields=fields)

    color = alt.condition(selection,
                          alt.Color(colors,legend=None),
                          alt.value('lightgray'))
    tree_name = base.mark_circle().encode(
        x=alt.X(
            "date:Q",
            # Pad the date axis slightly so the outermost tips are not clipped.
            scale=alt.Scale(domain=(dataFrame["date"].min() - 0.2, dataFrame["date"].max() + 0.2)),
            title="Date"
        ),
        y=alt.Y(
            "y:Q",
            title=""
        ),
        color=color,
        tooltip=ToolTip
    ).add_selection(selection).properties(width=400,height=250)
    list_of_chart = [tree_name]

    # Walk the flat [x1, y1, x2, y2, ...] lists one (x, y) pair at a time.
    for i in range(0, len(list_of_data), 2):
        chart = base.mark_circle(size=60).encode(
            x=alt.X(list_of_data[i], title=list_of_titles[i]),
            y=alt.Y(list_of_data[i + 1], title=list_of_titles[i + 1]),
            color=color,
            tooltip=ToolTip
        ).add_selection(
            selection
        ).properties(
            width=250,
            height=250
        )
        list_of_chart.append(chart)

    # The built-in legend is switched off above; this small chart stands in for it
    # and carries the same selection.
    legend = base.mark_point().encode(
        y=alt.Y(colors, axis=alt.Axis(orient='right')),
        color=colors
    ).add_selection(
        selection
    )
    list_of_chart.append(legend)

    return list_of_chart

### Making scatterplot of pairwise vs euclidean distance

In [None]:
"""
similarity_matrix: matrix of pairwise differences from data (pandas Dataframe)
df_merged: the merged dataframe between your tree JSON and the FASTA file strains (pandas Dataframe)
column1: one of the data cluster column names in df_merged
column2: the other data cluster column name in df_merged
type_of_embedding: type of embedding (PCA, UMAP, TSNE, MDS)
"""

def scatterplot_tooltips(similarity_matrix, df_merged, column1, column2, type_of_embedding):
    """Plot pairwise distance against Euclidean embedding distance.

    similarity_matrix -- matrix of pairwise differences (pandas DataFrame)
    df_merged -- merged tree/FASTA dataframe; it must contain column1, column2
        and a "strain" column (pandas DataFrame)
    column1, column2 -- the two embedding columns of df_merged (strings)
    type_of_embedding -- label shown in the chart title (string)

    Returns an Altair chart. Its title includes the r_value that linregress
    reports for Euclidean distance regressed on pairwise distance.

    Side effect: disables Altair's max-rows limit for the session.

    NOTE(review): the pairwise distances are read from the leading
    len(rows) x len(rows) corner of similarity_matrix, so this presumably
    expects the rows of df_merged to be the first rows of the matrix, in the
    same order -- confirm for filtered or sampled frames.
    NOTE(review): the regression uses every upper-triangle pair, while the
    plotted frame only keeps the entries whose position matches an index label
    of df_merged -- confirm that is the intended set of points.
    """
    finalDf = pd.concat([df_merged[[column1, column2]], df_merged[['strain']]], axis = 1).dropna()
    finalDf.columns = [column1, column2, 'strain']

    # Upper triangle, diagonal included, of an n x n matrix.
    upper_triangle = np.triu_indices(len(finalDf), k = 0)

    pairwise_distance_array = np.array(similarity_matrix)[upper_triangle]
    pairwise_df = pd.concat([pd.DataFrame(pairwise_distance_array), finalDf[['strain']]], axis = 1)

    euclidean_distance_array = squareform(pdist(finalDf[[column1, column2]]))[upper_triangle]
    euclidean_df = pd.concat([pd.DataFrame(euclidean_distance_array), finalDf[['strain']]], axis = 1).dropna()

    # euclidean_df is the left side of the merge, so its distance column is
    # renamed "0_x" and the pairwise distance column becomes "0_y".
    final_df = euclidean_df.merge(pairwise_df,how='inner',on = 'strain')

    r_value = linregress(pairwise_distance_array,euclidean_distance_array).rvalue

    alt.data_transformers.disable_max_rows()
    chart = alt.Chart(final_df).mark_circle(size=60).encode(
        # Each axis title now sits on the column it describes (they were swapped before).
        x=alt.X('0_y',title = "pairwise distance"),
        y=alt.Y('0_x', title = "Euclidean distance"),
        tooltip=['strain']
    ).properties(title="Pairwise vs. Euclidean scatterplot: " + type_of_embedding + "  (" + str(r_value.round(3)) + ")",height=200,width=300)
    return chart
        

In [None]:
def scatterplot_xyvalues(similarity_matrix, df_merged, column1, column2, type_of_embedding):
    """Return the table of Euclidean and pairwise distances keyed by strain.

    similarity_matrix -- matrix of pairwise differences (pandas DataFrame)
    df_merged -- dataframe containing column1, column2 and "strain"
    column1, column2 -- the two embedding columns of df_merged (strings)
    type_of_embedding -- accepted for a uniform signature; not used here

    Returns a DataFrame with columns "0_x" (Euclidean distance), "strain" and
    "0_y" (pairwise distance).

    NOTE(review): pairwise values come from the leading corner of
    similarity_matrix and are paired with strains by index label -- confirm the
    row order of df_merged matches the matrix.
    """
    coords = pd.concat([df_merged[[column1, column2]], df_merged[['strain']]], axis=1).dropna()
    coords.columns = [column1, column2, 'strain']

    strain_column = coords[['strain']]
    # Upper triangle, diagonal included.
    upper = np.triu_indices(len(coords), k=0)

    genetic_values = np.array(similarity_matrix)[upper]
    genetic = pd.concat([pd.DataFrame(genetic_values), strain_column], axis=1)

    embedded_values = squareform(pdist(coords[[column1, column2]]))[upper]
    embedded = pd.concat([pd.DataFrame(embedded_values), strain_column], axis=1).dropna()

    return embedded.merge(genetic, how='inner', on='strain')

In [None]:
def linregress_data(similarity_matrix, df_merged, column1, column2, type_of_embedding):
    """Return the r_value of Euclidean embedding distance regressed on pairwise distance.

    similarity_matrix -- matrix of pairwise differences (pandas DataFrame)
    df_merged -- dataframe containing column1, column2 and "strain"
    column1, column2 -- the two embedding columns of df_merged (strings)
    type_of_embedding -- accepted for a uniform signature; not used here

    Rows of df_merged with a missing value in column1, column2 or "strain" are
    ignored. Both distance vectors are the upper triangle (diagonal included)
    of their matrices.

    NOTE(review): the pairwise values are read from the leading
    len(rows) x len(rows) corner of similarity_matrix -- confirm df_merged rows
    are in matrix order.
    """
    # Only the two distance vectors feed the regression; the per-strain frames
    # that used to be built here never influenced the result.
    complete_rows = df_merged[[column1, column2, 'strain']].dropna()
    upper_triangle = np.triu_indices(len(complete_rows), k=0)

    pairwise_distance_array = np.array(similarity_matrix)[upper_triangle]
    euclidean_distance_array = squareform(pdist(complete_rows[[column1, column2]]))[upper_triangle]

    return linregress(pairwise_distance_array, euclidean_distance_array).rvalue

In [None]:
def scatterplot_tooltips_df(similarity_matrix, df_merged, column1, column2, type_of_embedding):
    """Return the Euclidean/pairwise distance table labelled with its embedding.

    similarity_matrix -- matrix of pairwise differences (pandas DataFrame)
    df_merged -- dataframe containing column1, column2 and "strain"
    column1, column2 -- the two embedding columns of df_merged (strings)
    type_of_embedding -- value written to the "embedding" column (string)

    Returns a DataFrame with columns "0_x" (Euclidean distance), "strain",
    "0_y" (pairwise distance) and "embedding".

    NOTE(review): pairwise values come from the leading corner of
    similarity_matrix and are paired with strains by index label -- confirm the
    row order of df_merged matches the matrix.
    """
    coords = pd.concat([df_merged[[column1, column2]], df_merged[['strain']]], axis=1).dropna()
    coords.columns = [column1, column2, 'strain']

    strain_column = coords[['strain']]
    # Upper triangle, diagonal included.
    upper = np.triu_indices(len(coords), k=0)

    genetic_values = np.array(similarity_matrix)[upper]
    genetic = pd.concat([pd.DataFrame(genetic_values), strain_column], axis=1)

    embedded_values = squareform(pdist(coords[[column1, column2]]))[upper]
    embedded = pd.concat([pd.DataFrame(embedded_values), strain_column], axis=1).dropna()

    labelled = embedded.merge(genetic, how='inner', on='strain')
    labelled["embedding"] = type_of_embedding
    return labelled

### Creating a method for making the similarity matrix

In [None]:
"""
path: path to file (string)
df_merged: the merged dataframe that contains the metadata/strains that the tree JSON and FASTA file have (pandas dataframe)
"""
def making_scatterplot_pairwise_matrix(path,df_merged):
    """Build the Hamming distance matrix for the FASTA strains present in df_merged.

    path -- path to the FASTA file (string)
    df_merged -- dataframe with a "strain" column naming the strains to keep
        (pandas DataFrame)

    Returns a square float DataFrame of pairwise Hamming distances with columns
    named "strain 1", "strain 2", ... Rows and columns follow the order in
    which the kept strains appear in the FASTA file.
    """
    strains = []
    genomes = []
    for record in SeqIO.parse(path, "fasta"):
        strains.append(str(record.id))
        genomes.append(str(record.seq))
    fasta_df = pd.DataFrame({"genomes": genomes, "strain": strains})

    # Keep only the sequences whose strain is listed in the df_merged argument.
    # The argument used to be overwritten and a notebook-level `merged_df` was
    # consulted instead, so the caller's frame was silently ignored.
    kept_df = fasta_df.merge(df_merged[["strain"]], how="inner", on="strain")
    kept_genomes = kept_df["genomes"].tolist()

    matrixOfNum = [
        [hamming_distance(first, second) for second in kept_genomes]
        for first in kept_genomes
    ]
    return pd.DataFrame(
        matrixOfNum,
        columns=["strain " + str(i) for i in range(1, len(matrixOfNum) + 1)],
        dtype="float",
    )
    
    

### Making within vs between clade boxplots

In [None]:
def get_euclidean_data_frame(sampled_df, column1, column2, embedding):
    """
    Returns a data frame of Euclidean distances for the requested embedding columns.

    The given `sampled_df` MUST include a "clade_membership" column.

    The result has one row per pair of samples with the columns "distance",
    "clade_status" ("within" or "between") and "embedding".
    """
    # Pull the clade labels out once; looking each one up with `iloc` inside the
    # pair loop builds a whole row Series for every comparison.
    clades = sampled_df["clade_membership"].tolist()
    n_samples = len(clades)

    # Traverse pairs of samples from left-to-right, top-to-bottom
    # along the upper triangle of the pairwise matrix and collect
    # the clade status of each pair as either within- or between-clades.
    # This traversal excludes self-self comparisons along the diagonal.
    clade_status = [
        "between" if clades[i] != clades[j] else "within"
        for i in range(n_samples - 1)
        for j in range(i + 1, n_samples)
    ]

    # Calculate pairwise distances between samples for the requested columns.
    # The resulting array is in the same left-to-right, top-to-bottom order
    # as the clade statuses above.
    sampled_distances = pdist(sampled_df[[column1, column2]])

    # Align clade status with pairwise distance for each pairwise comparison.
    sampled_distances_df = pd.DataFrame({"distance": sampled_distances, "clade_status": clade_status})

    # Annotate the requested embedding.
    sampled_distances_df["embedding"] = embedding

    return sampled_distances_df

In [None]:
#merged_df MUST include clade_membership and strain name merged with the respective distances
def making_with_vs_between_boxplots(merged_df, column1, column2, max_points=25000):
    """Boxplot of Euclidean embedding distances, split into within- and between-clade pairs.

    merged_df -- dataframe with a "clade_membership" column plus column1 and
        column2 (pandas DataFrame)
    column1, column2 -- the two embedding columns used for the distances (strings)
    max_points -- upper bound on the number of pairs handed to the chart;
        a random sample is drawn when more pairs exist (int, default 25000)

    Returns an Altair boxplot chart.

    NOTE(review): the upper triangle is taken with k=0, so every sample's
    zero distance to itself is included as a "within_clade" pair -- confirm
    whether those self-comparisons should be part of the plot.
    """
    clades = merged_df['clade_membership'].tolist()
    status_matrix = [
        ["within_clade" if first == second else "between_clade" for second in clades]
        for first in clades
    ]

    upper_triangle = np.triu_indices(len(status_matrix), k = 0)
    color_df = pd.DataFrame(np.array(status_matrix)[upper_triangle])

    euclidean_distance_array = squareform(pdist(merged_df[[column1, column2]]))[upper_triangle]
    euclidean_df = pd.DataFrame(euclidean_distance_array)

    color_similarity = euclidean_df.merge(color_df, right_index = True, left_index = True)
    color_similarity.columns = ['distance','color']

    # DataFrame.sample raises when asked for more rows than exist, so cap the
    # request at the number of available pairs.
    n_plotted = min(max_points, len(color_similarity))
    chart = alt.Chart(color_similarity.sample(n=n_plotted),height=150).mark_boxplot().encode(
        x = alt.X('distance',title="mean distances"),
        y = 'color'
    )
    return chart

### Reading in the Fasta File
- I used BioPython to parse the Fasta file into two numpy Arrays: Genomes and Strains. 

In [None]:
# Parse the aligned FASTA file into two parallel lists: the strain ids and
# their sequences, both as plain strings.
#work on making this work
path = "../zika-nextstrain/results/aligned.fasta"
strains, genomes = [], []
for record in SeqIO.parse(path, "fasta"):
    strains += [str(record.id)]
    genomes += [str(record.seq)]

#### Checking to make sure the file I picked is an aligned Fasta file / is the file I wanted

In [None]:
# In an aligned FASTA file every sequence has the same length. Compare all of
# them rather than only the first two (which also failed on a file with fewer
# than two records).
print(len({len(genome) for genome in genomes}) == 1)
print(len(genomes))

In [None]:
strains[:5]

# Creating the Distance Matrix
- I used Hamming Distance to find the pairwise distance between each genome and each other genome, effectively creating a similarity/distance matrix
    - In my Hamming Distance method, I only counted a position as a difference if it was a mismatch between nucleotides (A, G, C, or T), not gaps (as counting gaps was throwing off the algorithm too much for shorter strains)
- I then used Seaborn to generate a heatmap to make sure the matrix looked correct

In [None]:
# Characters that take part in the distance; any other character (gaps, N,
# ambiguity codes, lowercase letters) is skipped.
listOfNucleotides = ["A","G","C","T"]
def hamming_distance(array1, array2):
    """Return the Hamming distance between two sequences, counting nucleotides only.

    array1, array2 -- the sequences to compare (strings); they should have the
        same length. If they do not, only the positions present in both are
        compared.

    A position adds 1 to the distance when both characters are in
    listOfNucleotides and they differ. Positions where either character is
    anything else are ignored.
    """
    # Set membership is constant-time; the list is only four items long but it
    # is consulted twice per position.
    nucleotides = set(listOfNucleotides)
    return sum(
        1
        for first, second in zip(array1, array2)
        if first != second and first in nucleotides and second in nucleotides
    )

In [None]:
# Reuse the cached distance matrix when it exists; otherwise compute it from
# the genomes and cache it for the next run.
# NOTE(review): the cache is not checked against the FASTA file, so a stale
# CSV from a different alignment would be loaded as-is -- delete it when the
# input changes.
try:
    similarity_matrix = pd.read_csv("DistanceMatrixFluZika.csv")
except FileNotFoundError:
    #using Hamming Distance to create a similarity matrix
    matrixOfNum = [
        [hamming_distance(first, second) for second in genomes]
        for first in genomes
    ]
    similarity_matrix = pd.DataFrame(matrixOfNum, columns = ["strain " + str(i) for i in range(1,len(matrixOfNum) + 1)], dtype="int")
    # to_csv returns None when it writes to a path, so its result must not be
    # assigned back to similarity_matrix.
    similarity_matrix.to_csv("DistanceMatrixFluZika.csv",sep=',',index=False)

In [None]:
print(sns.heatmap(similarity_matrix))

# Reading in the Metadata
- The metadata is used for getting the region, country, etc of different strains. This data is used to color the clusters.
- The metadata contains all of the possible sampled strains, so many of these will not be in the genomes from the aligned file, probably because the strains were corrupted, too short, etc. 
- We merge this metadata with the strains we have in the aligned file to get a list of all of the strains that match between both. It should come out to the number of strains in the aligned file.

In [None]:
# Read the tab-separated strain metadata that sits next to this notebook's
# parent directory, under zika-nextstrain/results.
cwd = Path.cwd()
goal_dir = cwd.parent
pathMeta = goal_dir / "zika-nextstrain" / "results" / "metadata.tsv"
metadata_df = pd.read_csv(pathMeta, sep='\t')

In [None]:
metadata_df.head()

In [None]:
metadata_df.shape

In [None]:
#making a dataframe out of the strains from the alignment file to merge with the metadata
strains_df = pd.DataFrame(strains)
strains_df.columns = ['strain']

In [None]:
strains_df.shape

In [None]:
#Merging strains and metadata 
result_strains = pd.merge(strains_df,metadata_df, on='strain', how= "left")

In [None]:
#checking that no strains were lost
result_strains.shape

In [None]:
np.setdiff1d(strains_df['strain'].unique(),metadata_df['strain'].unique())

# Creating the Phylogenetic Tree in Altair
- I used Altair to make this tree (documentation linked [here][1])
- I opened and imported the json from a build from NextStrain ([flu][2], [zika][3], etc)
- The data from the JSON and the Data from the tree are usually a little different, so after merging the two dataframes you may get some errors.

[1]: https://altair-viz.github.io/index.html
[2]: https://github.com/nextstrain/seasonal-flu
[3]: https://github.com/nextstrain/zika

In [None]:
with open('../zika-nextstrain/auspice/zika-cartography_tree.json') as fh:
    json_tree_handle = json.load(fh)

In [None]:
tree = json_to_tree(json_tree_handle)

In [None]:
tree

In [None]:
# One record per tip (terminal node) of the tree. A node without a parent
# falls back to its own date and y value.
node_data = [
    {
        "strain": node.name,
        "date": node.attr["num_date"],
        "y": node.yvalue,
        "region": node.attr["region"],
        "country": node.attr["country"],
        # A conditional expression is used instead of `cond and a or b`, which
        # silently picked the node's own value whenever the parent's value was
        # falsy (for example a y value of 0).
        "parent_date": node.parent.attr["num_date"] if node.parent is not None else node.attr["num_date"],
        "parent_y": node.parent.yvalue if node.parent is not None else node.yvalue,
        "clade_membership" : node.attr['clade_membership']
    }
    for node in tree.find_clades(terminal=True)
]

In [None]:
node_data[10]

In [None]:
node_df = pd.DataFrame(node_data)

In [None]:
node_df.head()

In [None]:
node_df["y"] = node_df["y"].max() - node_df["y"]

In [None]:
# Mirror parent_y so that larger original values end up at the bottom.
# NOTE(review): parent_y is flipped around its own maximum, whereas "y" is
# flipped around the maximum of "y" -- if the two columns are meant to share
# one axis they would need the same offset; confirm before drawing branches.
node_df["parent_y"] = node_df["parent_y"].max() - node_df["parent_y"]

In [None]:
node_df.shape

In [None]:
node_df.head()

In [None]:
node_df["region"].unique()

In [None]:
# Copy the clade label of every tip into a plain Python list.
node_numpy = node_df['clade_membership'].to_numpy()
clade_ok = [clade for clade in node_numpy]

In [None]:
clade_new = pd.DataFrame(clade_ok)
node_df = node_df.merge(clade_new, how='outer', left_index=True, right_index=True)

In [None]:
node_df.head()

In [None]:
# The merged-in copy of the clade labels arrives as a column named 0; give it
# a proper name. Renaming just that column (instead of relabelling all nine
# columns by position) does not depend on column order and is safe to re-run.
node_df = node_df.rename(columns={0: 'clade_membership_color'})

## Checking for Outliers in Pairwise Distance

In [None]:
result_df = pd.DataFrame(result_strains['strain'])
similarity_matrix.shape

In [None]:
index_df = pd.DataFrame(similarity_matrix.mean().index)

In [None]:
result_index_df = result_df.set_index(similarity_matrix.mean().index)
mean_similarity_matrix = pd.DataFrame(similarity_matrix.mean())

In [None]:
result_index_df = result_index_df.merge(mean_similarity_matrix, left_index=True, right_index=True)
result_index_df.columns = ["strain","mean"]
result_index_df

In [None]:
alt.Chart(result_index_df,height=150).mark_boxplot().encode(
    x = alt.X('mean',title="mean of pairwise distances"),
    tooltip = ["strain"]
)

# Running PCA on Scaled and Centered Data
- I treated each nucleotide as a "site", or dimension, and found the probability of having a certain nucleotide given the frequency of that letter at that site.
- I used [this paper][1] as my source 
- The equation is as follows where C is the matrix of dimensions, M is the mean, and p is the frequency of a nucleotide at that given site. 
![](https://journals.plos.org/plosgenetics/article/file?type=thumbnail&id=info:doi/10.1371/journal.pgen.0020190.e003)

In [None]:
# Turn every genome into a row of integer codes: A=1, G=2, C=3, T=4 and 5 for
# any other character (gaps, N, ambiguity codes).
nucleotide_codes = {'A': 1, 'G': 2, 'C': 3, 'T': 4}
numbers = [[nucleotide_codes.get(base, 5) for base in genome] for genome in genomes]
genomes_df = pd.DataFrame(numbers)
# Name the columns from the frame's own width rather than from a loop variable
# left over after the loop (which was undefined for an empty genome list).
genomes_df.columns = ["Site " + str(k) for k in range(genomes_df.shape[1])]

In [None]:
genomes_df.head()

In [None]:
# Run PCA on the integer-coded genome table and keep the component scores in a
# dataframe with one "principal component N" column per component.
pca = PCA(n_components=10,svd_solver='full') # 10 components are kept; the column labels built below assume exactly 10
principalComponents = pca.fit_transform(genomes_df)
principalDf = pd.DataFrame(data = principalComponents, columns = ["principal component " + str(i) for i in range(1,11)])

In [None]:
# Table of component number (1-10) against its explained variance ratio,
# rounded to four decimals.
df = pd.DataFrame({
    'principal components': np.arange(1, 11),
    'explained variance': [round(ratio, 4) for ratio in pca.explained_variance_ratio_],
})
df

In [None]:
alt.Chart(df).mark_circle().encode(
    x='principal components:Q',
    y='explained variance:Q')

In [None]:
principalDf.head()


# Merging Strain and PCA

In [None]:
strains_df = pd.DataFrame(strains)
strains_df.columns = ['strain']

In [None]:
# Keep one row per strain of the alignment, in alignment order, even when a
# strain has no metadata. The embedding tables are later pasted next to this
# frame by position, so an inner merge that dropped a strain would shift every
# following row onto the wrong embedding coordinates.
result_strains = pd.merge(strains_df, metadata_df, on='strain', how='left')

In [None]:
finalDf = pd.concat([principalDf, result_strains[['region','country','strain']]], axis = 1)

In [None]:
merged_df = pd.merge(finalDf,node_df,on=["strain","region"]).dropna()

In [None]:
merged_df.head()

In [None]:
merged_df = pd.merge(finalDf,node_df,on=["strain","region"]).dropna()

In [None]:
merged_df.head()

In [None]:
# Tree plus the PC1/PC2 and PC3/PC4 scatterplots, linked by a shared brush.
# Each axis title carries that component's explained variance in percent.
pc_columns = ['principal component ' + str(k) for k in range(1, 5)]
pc_titles = [
    'Principal Component {} (Explained Variance : {}%)'.format(
        k, round(pca.explained_variance_ratio_[k - 1]*100,2))
    for k in range(1, 5)
]
list_of_chart = linking_tree_with_plots_brush(merged_df, pc_columns, pc_titles,
                                              "clade_membership:N", ['strain','region'])
chart = list_of_chart[0]|list_of_chart[1]|list_of_chart[2]
chart

In [None]:
alt.data_transformers.disable_max_rows()
chart = making_with_vs_between_boxplots(merged_df,'principal component 1','principal component 2')
chart

In [None]:
merged_df.head()

In [None]:
merged_df.columns

In [None]:
matrix_of_numbers = []
row_of_numbers = []
node_list = merged_df['clade_membership'].tolist()

In [None]:
for index in node_list:
    for index2 in node_list:
        if(index == index2):
            row_of_numbers.append("within_clade")
        else:
            row_of_numbers.append("between_clade")
    matrix_of_numbers.append(row_of_numbers)
    row_of_numbers = []

In [None]:
len(matrix_of_numbers)

In [None]:
len(matrix_of_numbers[0])

In [None]:
matrix_of_numbers[0][:5]

In [None]:
color_distance_array = np.array(matrix_of_numbers)[np.triu_indices(len(matrix_of_numbers), k = 0)]
color_df = pd.DataFrame(color_distance_array)

In [None]:
np.triu_indices(len(matrix_of_numbers), k = 0)

In [None]:
color_distance_array

In [None]:
color_df.head()

In [None]:
color_df.shape

In [None]:
merged_df[['principal component 1', 'principal component 2']].head()

In [None]:
euclidean_distance_array = pdist(merged_df[['principal component 1', 'principal component 2']])

In [None]:
euclidean_distance_array.shape

In [None]:
euclidean_distance_array = squareform(euclidean_distance_array)

In [None]:
euclidean_distance_array.shape

In [None]:
euclidean_distance_array = euclidean_distance_array[np.triu_indices(len(matrix_of_numbers), k = 0)]

In [None]:
euclidean_distance_array

In [None]:
# Pair every Euclidean distance with its within/between clade label and build
# the boxplot from a random sample of the pairs.
euclidean_df = pd.DataFrame(euclidean_distance_array)

color_similarity = euclidean_df.merge(color_df, right_index = True, left_index = True)
color_similarity.columns = ['distance','color']

# DataFrame.sample raises when asked for more rows than exist, so cap the
# request at the number of available pairs.
chart = alt.Chart(color_similarity.sample(n=min(25000, len(color_similarity))),height=150).mark_boxplot().encode(
x = alt.X('distance',title="mean distances"),
y = 'color'
)

# Running MDS on the Dataset

In [None]:
embedding = MDS(n_components=10,metric=True,dissimilarity='precomputed')
X_transformed = embedding.fit_transform(similarity_matrix)

In [None]:
raw_stress = embedding.stress_
normalized_stress = np.sqrt(raw_stress /((similarity_matrix.values.ravel() ** 2).sum() / 2))
print(normalized_stress.round(2))

In [None]:
MDS_df = pd.DataFrame(X_transformed,columns=['MDS cluster ' + str(i) for i in range(1,11)])
print(MDS_df.head())

In [None]:
# MDS_df has one row per row of the similarity matrix, while node_df has one
# row per tree tip in tree traversal order. Pasting node_df on by position
# would therefore attach strain names to the wrong coordinates, so the clade
# labels are looked up by strain name first.
# Assumes result_strains lists the strains in the same order as the rows of
# the similarity matrix -- TODO confirm when the matrix is loaded from the CSV cache.
mds_strain_data = result_strains[['strain']].merge(
    node_df[['strain', 'clade_membership', 'clade_membership_color']],
    on='strain',
    how='left',
)
finalDf = concatenate_results_with_strain_data(MDS_df, mds_strain_data, ['strain','clade_membership', 'clade_membership_color'])

In [None]:
finalDf.columns

In [None]:
finalDf.head()

In [None]:
merged_df = pd.merge(finalDf,node_df,on=["strain",'clade_membership_color','clade_membership']).dropna()

In [None]:
merged_df.columns

In [None]:
scatterplot_with_tooltip_interactive(merged_df,'MDS cluster 1','MDS cluster 2',"MDS cluster 1","MDS cluster 2",['strain','clade_membership'],'clade_membership_color')

In [None]:
scatterplot_with_tooltip_interactive(merged_df,'MDS cluster 3','MDS cluster 4',"MDS cluster 3","MDS cluster 4",['strain','clade_membership'],'clade_membership_color')

In [None]:
chart = scatterplot_tooltips(similarity_matrix, merged_df, "MDS cluster 1", "MDS cluster 2", "MDS")
chart

In [None]:
chart = scatterplot_tooltips(similarity_matrix, merged_df, "MDS cluster 3", "MDS cluster 4", "MDS")
chart

In [None]:
chart = scatterplot_tooltips(similarity_matrix, merged_df, "MDS cluster 5", "MDS cluster 6", "MDS")
chart

In [None]:
scatterplot_with_tooltip_interactive(merged_df,'MDS cluster 5','MDS cluster 6',"MDS cluster 5","MDS cluster 6",['strain','clade_membership'],'clade_membership_color')

# Linking Tree to MDS plot

In [None]:
list_of_data_and_titles = ['MDS cluster 1','MDS cluster 2','MDS cluster 3','MDS cluster 4','MDS cluster 5','MDS cluster 6']
list_of_chart = linking_tree_with_plots_brush(merged_df,list_of_data_and_titles,list_of_data_and_titles,'clade_membership_color',["clade_membership","strain:N"])
chart = list_of_chart[0]|list_of_chart[1]|list_of_chart[2]|list_of_chart[3]
chart

In [None]:
data = linking_tree_with_plots_clickable(merged_df,['MDS cluster 1','MDS cluster 2','MDS cluster 3','MDS cluster 4','MDS cluster 5','MDS cluster 6'],['MDS cluster 1','MDS cluster 2','MDS cluster 3','MDS cluster 4','MDS cluster 5','MDS cluster 6'],'clade_membership_color:N',['clade_membership'],["clade_membership:N","strain:N"])

chart = data[0]|data[1]|data[2]|data[3]|data[4]
chart

In [None]:
alt.data_transformers.disable_max_rows()
chart = making_with_vs_between_boxplots(merged_df,'MDS cluster 1','MDS cluster 2')
chart

In [None]:
chart = making_with_vs_between_boxplots(merged_df, 'MDS cluster 3', 'MDS cluster 4')
chart

# Running T-SNE on the Dataset 

In [None]:
# The input is a precomputed distance matrix. scikit-learn cannot use PCA
# initialisation with precomputed distances, and PCA became the default `init`
# in recent releases, so random initialisation is requested explicitly.
embedding = TSNE(n_components=2,metric='precomputed',perplexity = 25.95,init='random')
X_transformed = embedding.fit_transform(similarity_matrix)

In [None]:
TSNE_df = pd.DataFrame(X_transformed,columns=['TSNE cluster ' + str(i) for i in range(1,3)])
print(TSNE_df.head())

In [None]:
finalDf = concatenate_results_with_strain_data(TSNE_df,result_strains,['strain','region'])

In [None]:
finalDf.shape

In [None]:
merged_df = pd.merge(finalDf,node_df,on=["strain"]).dropna()

In [None]:
merged_df.columns

In [None]:
chart = scatterplot_tooltips(similarity_matrix, merged_df, "TSNE cluster 1", "TSNE cluster 2", "TSNE")
chart

In [None]:
scatterplot_with_tooltip_interactive(merged_df,'TSNE cluster 1','TSNE cluster 2',"TSNE cluster 1","TSNE cluster 2",['strain','clade_membership'],'clade_membership_color')

# Linking Tree to T-SNE plot

In [None]:
list_of_chart = linking_tree_with_plots_brush(merged_df,['TSNE cluster 1','TSNE cluster 2'],['TSNE cluster 1','TSNE cluster 2'],'clade_membership_color',["clade_membership:N","strain:N"])
chart = list_of_chart[0]|list_of_chart[1]
chart

In [None]:
data = linking_tree_with_plots_clickable(merged_df,['TSNE cluster 1','TSNE cluster 2'],['TSNE cluster 1','TSNE cluster 2'],'clade_membership_color:N',['clade_membership'],["clade_membership:N","strain:N"])

chart = data[0]|data[1]|data[2]
chart

In [None]:
alt.data_transformers.disable_max_rows()
chart = making_with_vs_between_boxplots(merged_df,'TSNE cluster 1','TSNE cluster 2')
chart

# Running UMAP on the Dataset

In [None]:
# Embed the strains in two dimensions with UMAP, starting from a random layout.
# NOTE(review): similarity_matrix is passed without metric="precomputed", so
# UMAP presumably treats each row of the distance matrix as a feature vector
# rather than as precomputed distances (the MDS and t-SNE cells declare the
# input as precomputed) -- confirm this is intended.
# NOTE(review): n_neighbors=300 presumably requires more than 300 strains in
# the matrix -- verify against the data set size.
reducer = umap.UMAP(n_neighbors=300,
        min_dist=.5,
        n_components=2,
        init="random")
embedding = reducer.fit_transform(similarity_matrix)

In [None]:
UMAP_df = pd.DataFrame(embedding,columns=['UMAP cluster ' + str(i) for i in range(1,3)])
print(UMAP_df.head())

In [None]:
finalDf = concatenate_results_with_strain_data(UMAP_df, result_strains, ['strain','region','country'])

In [None]:
finalDf.shape

In [None]:
merged_df = pd.merge(finalDf,node_df,on=["strain","region"]).dropna()

In [None]:
merged_df.shape

In [None]:
chart = scatterplot_tooltips(similarity_matrix, merged_df, "UMAP cluster 1", "UMAP cluster 2", "UMAP")
chart

In [None]:
scatterplot_with_tooltip_interactive(merged_df,'UMAP cluster 1','UMAP cluster 2',"UMAP cluster 1","UMAP cluster 2",['strain','clade_membership'],'clade_membership_color')

# Linking Tree with UMAP plot

In [None]:
list_of_data_and_titles = ['UMAP cluster 1','UMAP cluster 2']
list_of_chart = linking_tree_with_plots_brush(merged_df,list_of_data_and_titles,list_of_data_and_titles,'clade_membership_color',["clade_membership","strain:N"])
chart = list_of_chart[0]|list_of_chart[1]
chart

In [None]:
alt.data_transformers.disable_max_rows()
chart = making_with_vs_between_boxplots(merged_df,'UMAP cluster 1','UMAP cluster 2')
chart

# Linking all plots together clickable with Tree

In [None]:
# Put the MDS, t-SNE, PCA and UMAP coordinates side by side, matching rows on
# their index.
together_df = MDS_df
for embedding_frame in (TSNE_df, principalDf, UMAP_df):
    together_df = together_df.merge(embedding_frame, how='outer', left_index=True, right_index=True)
together_df.head()

In [None]:
finalDf = concatenate_results_with_strain_data(together_df, result_strains, ['strain','region','country'])

In [None]:
merged_df = pd.merge(finalDf,node_df,on=["strain","region"]).dropna()

In [None]:
merged_df.columns

In [None]:
data = linking_tree_with_plots_clickable(merged_df,['MDS cluster 1', 'MDS cluster 2','TSNE cluster 1', 'TSNE cluster 2', 'principal component 1', 'principal component 2', 'UMAP cluster 1', 'UMAP cluster 2'], ['MDS cluster 1', 'MDS cluster 2', 'TSNE cluster 1', 'TSNE cluster 2','Principal Component 1 (Expected Variance : {}%'.format(round(pca.explained_variance_ratio_[0]*100,2)) + ")",
'Principal Component 2 (Expected Variance : {}%'.format(round(pca.explained_variance_ratio_[1]*100,2)) + ")",'UMAP cluster 1','UMAP cluster 2'],'clade_membership_color:N',['clade_membership'],['strain','clade_membership'])

In [None]:
PCAMDS = data[3]|data[1]|data[5]
TSNEUMAP = data[2]|data[4]
embeddings = alt.vconcat(PCAMDS,TSNEUMAP)
embeddings
fullChart = alt.hconcat(data[0],embeddings)
fullChart

In [None]:
n_samples = 650

In [None]:
# Draw at most n_samples rows (sample() raises when asked for more rows than
# exist) and restore the original row order.
sampled_df = merged_df.sample(min(n_samples, len(merged_df))).sort_index()

In [None]:
# TODO: work out how to factor the PCA eigenvectors in.
# One frame per embedding, each built by the same helper from the sampled rows;
# the last argument is presumably the label stored with each frame — confirm against the helper.
data_frames = [
    scatterplot_tooltips_df(similarity_matrix, sampled_df, "principal component 1", "principal component 2", "PCA"),
    scatterplot_tooltips_df(similarity_matrix, sampled_df, "MDS cluster 1", "MDS cluster 2", "MDS"),
    scatterplot_tooltips_df(similarity_matrix, sampled_df, "TSNE cluster 1", "TSNE cluster 2", "TSNE"),
    scatterplot_tooltips_df(similarity_matrix, sampled_df, "UMAP cluster 1", "UMAP cluster 2", "UMAP"),
]

In [None]:
# Index labels of the sampled rows, in ascending order, as a NumPy array.
sampled_index = np.array(sorted(sampled_df.index.values))

Extract pairwise genetic (Hamming) distances corresponding to the records sampled above. This step assumes that the original merged data frame is indexed from 0 to N − 1 for N total samples, with rows in the same order as the rows of the similarity matrix.

In [None]:
# Select the sampled rows and columns of the similarity matrix in one indexing
# step, then condense the square sub-matrix into a flat vector of pairwise
# distances.
sampled_block = similarity_matrix.values[np.ix_(sampled_index, sampled_index)]
genetic_distances = squareform(sampled_block)

In [None]:
# Add the genetic distances as a fifth frame, labelled "genetic" in the
# embedding column (the scalar label is broadcast to every row).
data_frames.append(pd.DataFrame({
    "distance": genetic_distances,
    "embedding":"genetic"
}))

In [None]:
# Sanity check: four embedding frames plus the genetic frame appended above.
len(data_frames)

In [None]:
# Stack all frames into one long frame; sort=False leaves the columns unsorted
# when the frames do not share the same set of columns.
euclidean_data_frame = pd.concat(data_frames, sort=False)

In [None]:
# Preview the first rows of the stacked frame.
euclidean_data_frame.head()

In [None]:
# (rows, columns) of the stacked frame.
euclidean_data_frame.shape

In [None]:
# One panel per embedding. The variables carry a _chart suffix so they do not
# rebind the PCA, MDS and TSNE classes imported from scikit-learn at the top
# of the notebook (re-running an earlier cell would otherwise call a chart).
pca_chart = scatterplot_tooltips(similarity_matrix, sampled_df, "principal component 1", "principal component 2", "PCA")
mds_chart = scatterplot_tooltips(similarity_matrix, sampled_df, "MDS cluster 1", "MDS cluster 2", "MDS")
tsne_chart = scatterplot_tooltips(similarity_matrix, sampled_df, "TSNE cluster 1", "TSNE cluster 2", "TSNE")
umap_chart = scatterplot_tooltips(similarity_matrix, sampled_df, "UMAP cluster 1", "UMAP cluster 2", "UMAP")

# Panels are concatenated left to right in this order.
chart = pca_chart|mds_chart|umap_chart|tsne_chart

In [None]:
# Display the most recently built chart.
chart

In [None]:
# Scatter of the "0_y" column against the "0_x" column, one facet per value of
# the embedding column.
chart = alt.Chart(euclidean_data_frame).mark_circle(size=60).encode(
    x="0_y",
    y="0_x",
    facet=alt.Facet(
        "embedding:N",
        # Facet order. The labels have to match the values in the embedding
        # column: the frames in this section are labelled "TSNE" (not "t-SNE"),
        # and the genetic distances are labelled "genetic".
        sort=["PCA", "MDS", "TSNE", "UMAP", "genetic"]
    )
).interactive()

chart.properties(width=150, height=200, columns=5)

# Notes to Self:

- Collapse cells underneath Markdown headers
- Get docstrings above methods to show up when user presses SHIFT + TAB
- link back to the methods section for user each time method is used
- Run more Flu builds (3 years, 6 years, 12 years, H3, H1)
- Try algorithm on MERS
- Try algorithm on other bacterial genomes (unaligned / small snips of genomes)
- Make Zika clades or run automatic clade naming on 12y H3N2 flu with different cutoffs to standardize coloring for graphs
- Write a paper


## Within- and between-clade Euclidean distances for all embeddings

Sample a random subset of the complete embedding data frame, calculate pairwise Euclidean distances between samples, and plot the results in a single figure.

In [None]:
# Number of rows of merged_df to subsample in the next cell.
n_samples = 50

In [None]:
# Draw a random subset of rows and restore index order. The sample size is
# capped at the number of available rows because DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement).
sampled_df = merged_df.sample(min(n_samples, len(merged_df))).copy().sort_index()

In [None]:
# One frame per embedding, each built by the same helper from the sampled rows.
# The last argument is the embedding label; later cells filter on exactly these
# strings ("PCA", "MDS", "t-SNE", "UMAP").
data_frames = [
    get_euclidean_data_frame(sampled_df, "principal component 1", "principal component 2", "PCA"),
    get_euclidean_data_frame(sampled_df, "MDS cluster 1", "MDS cluster 2", "MDS"),
    get_euclidean_data_frame(sampled_df, "TSNE cluster 1", "TSNE cluster 2", "t-SNE"),
    get_euclidean_data_frame(sampled_df, "UMAP cluster 1", "UMAP cluster 2", "UMAP"),
]

In [None]:
# Index labels of the sampled rows, in ascending order, as a NumPy array.
sampled_index = np.array(sorted(sampled_df.index.values))

Extract pairwise genetic (Hamming) distances corresponding to the records sampled above. This step assumes that the original merged data frame is indexed from 0 to N − 1 for N total samples, with rows in the same order as the rows of the similarity matrix.

In [None]:
# Select the sampled rows and columns of the similarity matrix in one indexing
# step, then condense the square sub-matrix into a flat vector of pairwise
# distances.
sampled_block = similarity_matrix.values[np.ix_(sampled_index, sampled_index)]
genetic_distances = squareform(sampled_block)

In [None]:
# Add the genetic distances as a fifth frame labelled "genetic".
# The within/between labels are copied from the first embedding frame.
# NOTE(review): this assumes get_euclidean_data_frame emits pairs in the same
# order as squareform's condensed vector — confirm against the helper.
data_frames.append(pd.DataFrame({
    "distance": genetic_distances,
    "clade_status": data_frames[0]["clade_status"].values,
    "embedding": "genetic"
}))

In [None]:
# Sanity check: four embedding frames plus the genetic frame appended above.
len(data_frames)

In [None]:
# Stack all five frames into one long frame keyed by the embedding column.
euclidean_data_frame = pd.concat(data_frames)

In [None]:
# Split the stacked frame back into one frame of pairwise distances per embedding label.
# NOTE(review): MDS_df, TSNE_df and UMAP_df are also the names of the embedding
# coordinate frames merged in the "Linking all plots together" section; after
# this cell runs, re-running that section picks up these distance frames instead.
PCA_df = euclidean_data_frame[euclidean_data_frame.embedding == "PCA"]
MDS_df = euclidean_data_frame[euclidean_data_frame.embedding == "MDS"]
TSNE_df = euclidean_data_frame[euclidean_data_frame.embedding == "t-SNE"]
UMAP_df = euclidean_data_frame[euclidean_data_frame.embedding == "UMAP"]
genetic_df = euclidean_data_frame[euclidean_data_frame.embedding == "genetic"]

In [None]:
def _clade_boxplot(frame, title, width=200):
    """Box plot of pairwise distance grouped by clade status for one embedding.

    frame -- pairwise distances with "clade_status" and "distance" columns (pandas DataFrame)
    title -- title shown above the panel (string)
    width -- panel width in pixels (int)
    """
    return alt.Chart(frame, width=width).mark_boxplot().encode(
        x="clade_status:N",
        y="distance:Q"
    ).properties(title=title)


# The panels are built through one helper and are not bound to PCA / MDS / TSNE,
# so the scikit-learn classes imported under those names stay usable.
# The genetic panel is wider (300) than the embedding panels (200).
chart = (
    _clade_boxplot(genetic_df, "genetic", width=300)
    | _clade_boxplot(PCA_df, "PCA")
    | _clade_boxplot(MDS_df, "MDS")
    | _clade_boxplot(TSNE_df, "TSNE")
    | _clade_boxplot(UMAP_df, "UMAP")
)

In [None]:
#ratio of within vs between (within shorter than between) - label which metrics + significance level

In [None]:
import statistics

In [None]:
# Mean pairwise genetic distance. The reduction is limited to the numeric
# "distance" column: a frame-wide mean() also visits the string columns
# (clade_status, embedding), which pandas >= 2.0 rejects with a TypeError.
genetic_df["distance"].mean()

In [None]:
# Median pairwise genetic distance within and between clades. Selecting the
# "distance" column yields plain scalars and keeps the string columns out of
# the reduction (a frame-wide median() raises TypeError on pandas >= 2.0).
median_genetic_within = genetic_df.loc[genetic_df.clade_status == "within", "distance"].median()
median_genetic_between = genetic_df.loc[genetic_df.clade_status == "between", "distance"].median()

In [None]:
# Median pairwise PCA distance within and between clades. Selecting the
# "distance" column yields plain scalars and keeps the string columns out of
# the reduction (a frame-wide median() raises TypeError on pandas >= 2.0).
median_PCA_within = PCA_df.loc[PCA_df.clade_status == "within", "distance"].median()
median_PCA_between = PCA_df.loc[PCA_df.clade_status == "between", "distance"].median()

In [None]:
# Median pairwise MDS distance within and between clades. Selecting the
# "distance" column yields plain scalars and keeps the string columns out of
# the reduction (a frame-wide median() raises TypeError on pandas >= 2.0).
median_MDS_within = MDS_df.loc[MDS_df.clade_status == "within", "distance"].median()
median_MDS_between = MDS_df.loc[MDS_df.clade_status == "between", "distance"].median()

In [None]:
# Median pairwise t-SNE distance within and between clades. Selecting the
# "distance" column yields plain scalars and keeps the string columns out of
# the reduction (a frame-wide median() raises TypeError on pandas >= 2.0).
median_TSNE_within = TSNE_df.loc[TSNE_df.clade_status == "within", "distance"].median()
median_TSNE_between = TSNE_df.loc[TSNE_df.clade_status == "between", "distance"].median()

In [None]:
# Median pairwise UMAP distance within and between clades. Selecting the
# "distance" column yields plain scalars and keeps the string columns out of
# the reduction (a frame-wide median() raises TypeError on pandas >= 2.0).
median_UMAP_within = UMAP_df.loc[UMAP_df.clade_status == "within", "distance"].median()
median_UMAP_between = UMAP_df.loc[UMAP_df.clade_status == "between", "distance"].median()

In [None]:
def ratioFunction(num1, num2, ndigits=None):
    """
    Return the ratio num1 / num2.

    num1 -- numerator (number)
    num2 -- denominator (number)
    ndigits -- optional number of decimal places to keep (int or None)

    With the default ndigits=None the ratio is truncated towards zero and
    returned as an int, so ratios such as 1.2 and 1.9 both come back as 1.
    Pass ndigits to get the ratio as a float rounded to that many decimal
    places instead, which is what is needed to compare embeddings whose
    ratios differ by less than one.
    """
    ratio12 = num1 / num2
    if ndigits is None:
        return int(ratio12)
    return round(float(ratio12), ndigits)

In [None]:
# Between-clade over within-clade median genetic distance.
genetic_ratio = ratioFunction(median_genetic_between,median_genetic_within)
print(genetic_ratio)

In [None]:
# Between-clade over within-clade median PCA distance.
PCA_ratio = ratioFunction(median_PCA_between,median_PCA_within)
print(PCA_ratio)

In [None]:
# Between-clade over within-clade median MDS distance.
MDS_ratio = ratioFunction(median_MDS_between,median_MDS_within)
print(MDS_ratio)

In [None]:
# Between-clade over within-clade median t-SNE distance.
TSNE_ratio = ratioFunction(median_TSNE_between,median_TSNE_within)
print(TSNE_ratio)

In [None]:
# Between-clade over within-clade median UMAP distance.
UMAP_ratio = ratioFunction(median_UMAP_between,median_UMAP_within)
print(UMAP_ratio)

In [None]:
# Run the helper once over the full merged frame and reuse its result for both
# columns; the two columns come from the same call, so they stay aligned.
mds_xy = scatterplot_xyvalues(similarity_matrix, merged_df, "MDS cluster 1", "MDS cluster 2", "MDS")
x_values = mds_xy["0_x"]
y_values = mds_xy["0_y"]
x_values

In [None]:
import pandas as pd               # Pandas handles dataframes
import numpy as np
import scipy
import matplotlib                 # Numpy handles lots of basic maths operations
import matplotlib.pyplot as plt   # Matplotlib for plotting
import seaborn as sns             # Seaborn for beautiful plots
import statsmodels

# Two-column frame holding the values to plot against each other.
df = pd.DataFrame({"Xvalue" : x_values,
                    "Yvalue" : y_values
                    })

# Seaborn solution: scatter plot with a LOWESS trend line (lowess=True relies
# on statsmodels). x and y are passed by keyword because seaborn >= 0.12 no
# longer accepts them positionally.
sns.regplot(x="Xvalue", y="Yvalue", data=df,  color="grey",
            line_kws={"color":"r","alpha":1,"lw":1}, lowess=True)
plt.xlabel("X")
plt.ylabel("Y")
plt.title('Test data - with seaborn lowess line')
plt.show()


def loc_eval(x, b):
    """
    Evaluate a polynomial at x.

    x -- the point at which to evaluate (number)
    b -- polynomial coefficients in ascending order, b[k] multiplies x**k (sequence)

    Returns 0 when b is empty.
    """
    return sum(coef * (x ** power) for power, coef in enumerate(b))


def loess(xvals, yvals, data, alpha, poly_degree=1):
    """
    Locally weighted polynomial regression (LOESS) of one column on another.

    xvals -- name of the column holding the x values (string)
    yvals -- name of the column holding the y values (string)
    data -- the frame containing both columns (pandas DataFrame)
    alpha -- fraction of the points used in each local fit; values above 1
             use every point (float)
    poly_degree -- degree of the polynomial fitted in each neighbourhood (int)

    Returns a pandas DataFrame with one row per evaluation point and two
    columns: 'v', the evaluation point, and 'g', the fitted value there. There
    are n + 1 evaluation points for n input rows, evenly spaced from half an
    average interval below the smallest x to half an average interval above
    the largest x.

    Raises ValueError when data has no rows.
    """
    x = data[xvals].to_numpy(dtype=float)
    y = data[yvals].to_numpy(dtype=float)
    n = x.size
    if n == 0:
        raise ValueError("loess requires at least one data point")

    # Work in ascending x order (stable, so ties keep their original order).
    order = np.argsort(x, kind="stable")
    x = x[order]
    y = y[order]

    # Number of neighbours per local fit. The tricube kernel gives the q-th
    # neighbour zero weight, so at least poly_degree + 2 neighbours are needed
    # to determine the polynomial; never ask for more points than exist.
    q = n if alpha > 1.0 else int(np.floor(n * alpha))
    q = min(n, max(q, poly_degree + 2))

    # Evaluation grid.
    half_interval = 0.5 * (x[-1] - x[0]) / n
    grid = np.linspace(start=x[0] - half_interval, stop=x[-1] + half_interval, num=n + 1)

    # Design matrix with columns x**0, x**1, ..., x**poly_degree.
    powers = np.arange(poly_degree + 1)
    X = x[:, np.newaxis] ** powers

    estimates = np.empty(grid.shape, dtype=float)
    for k, v in enumerate(grid):
        dists = np.abs(x - v)
        # Distance to the q-th nearest point sets the kernel bandwidth.
        scale = np.sort(dists)[q - 1]
        if scale > 0:
            scaled = dists / scale
            # Tricube kernel; points beyond the bandwidth get no weight.
            weights = np.where(scaled <= 1, (1 - scaled ** 3) ** 3, 0.0)
        else:
            # At least q points coincide with v: fit those points only.
            weights = (dists == 0).astype(float)

        # Weighted least squares solved on sqrt(w)-scaled rows. This avoids
        # building an n-by-n weight matrix and inverting the normal equations,
        # and still returns a solution when the local fit is rank deficient.
        root_w = np.sqrt(weights)
        b, *_ = np.linalg.lstsq(X * root_w[:, np.newaxis], y * root_w, rcond=None)

        # Evaluate the fitted polynomial at the grid point.
        estimates[k] = (v ** powers) @ b

    return pd.DataFrame({'v': grid, 'g': estimates})


# Fit a local quadratic (poly_degree=2) with alpha=0.7, i.e. roughly 70% of the
# points contribute to each local fit.
evalDF = loess("Xvalue", "Yvalue", data = df, alpha=0.7, poly_degree=2)

fig = plt.figure()
ax1 = fig.add_subplot(111)
# Raw points; the "_nolegend_" label keeps them out of the legend.
ax1.scatter(df["Xvalue"], df["Yvalue"], color="grey", marker="o", s=5, label="_nolegend_")
# Fitted curve: evaluation points against fitted values.
ax1.plot(evalDF['v'], evalDF['g'], color='red', linewidth= 3, label="Test")
# NOTE(review): the title says UMAP, but x_values / y_values were computed from
# the MDS columns earlier in the notebook — confirm which embedding is intended.
plt.title("UMAP pairwise vs Euclidean")
plt.legend()
plt.tight_layout()
plt.show()