## Imports

In [6]:
import sys
# Add the scripts directory to the import path (presumably where the local
# `Helpers` module lives — confirm). The path is relative to the working
# directory the notebook is executed from.
sys.path.extend(["./notebooks/scripts"])

In [7]:
import altair as alt
from altair_saver import save
import pandas as pd

from Helpers import linking_tree_with_plots_brush

import bcubed

In [8]:
# Padding (in pixels) applied around every chart embedded by the renderer.
embed_padding = dict(left=0, right=0, bottom=1, top=1)
alt.renderers.set_embed_options(padding=embed_padding)

RendererRegistry.enable('default')

## Inputs

In [None]:
# Input tables provided by the Snakemake workflow.
colors_path = snakemake.input.colors
embeddings_path = snakemake.input.annotated_embeddings
accuracy_path = snakemake.input.accuracy_table

# Explained variance tables for the HA-only and the concatenated PCA.
explained_variance_pca_ha = snakemake.input.explained_variance_pca_ha
explained_variance_pca_concatenated = snakemake.input.explained_variance_pca_concatenated

# Output paths: one HTML and one PNG per embedding method, plus the combined
# figure with all embeddings.
output_pca_html = snakemake.output.HANAFullChartBrushablePCAHTML
output_pca_png = snakemake.output.HANAFullChartBrushablePCAPNG
output_mds_html = snakemake.output.HANAFullChartBrushableMDSHTML
output_mds_png = snakemake.output.HANAFullChartBrushableMDSPNG
output_tsne_html = snakemake.output.HANAFullChartBrushableTSNEHTML
output_tsne_png = snakemake.output.HANAFullChartBrushableTSNEPNG
output_umap_html = snakemake.output.HANAFullChartBrushableUMAPHTML
output_umap_png = snakemake.output.HANAFullChartBrushableUMAPPNG
output_full_html = snakemake.output.fullChartHTML
output_full_png = snakemake.output.fullChartPNG

In [17]:
# Local fallback paths for interactive development. They are applied only when
# the notebook is NOT executed by Snakemake; otherwise they would silently
# override the workflow-provided inputs and the figures would be built from
# whatever happens to be on the developer's disk.
# Note: the `output_*` paths have no local fallback, so the cells that save
# figures still require a Snakemake run.
if "snakemake" not in globals():
    colors_path = "../notebooks/config/color_schemes.tsv"
    embeddings_path = "../ha-na-nextstrain/results/annotated_embeddings.tsv"
    accuracy_path = "../ha-na-nextstrain/results/full_HDBSCAN_metadata.csv"
    explained_variance_pca_ha = "../ha-na-nextstrain/results/explained_variance_pca_ha.csv"
    explained_variance_pca_concatenated = "../ha-na-nextstrain/results/explained_variance_pca_concatenated.csv"

## Load data

In [18]:
# The color scheme file has no header row; name its columns 0..100 so that
# rows with fewer colors are padded with missing values.
color_column_names = list(range(101))
colors = pd.read_csv(colors_path, sep="\t", names=color_column_names)

In [19]:
# Load the annotated embeddings table (tab-separated).
embeddings_df = pd.read_csv(embeddings_path, sep="\t")

In [20]:
# Shorten two column names: "y_value" becomes "y" and "num_date" becomes "date".
embeddings_df = embeddings_df.rename(
    columns={"y_value": "y", "num_date": "date"},
)

In [21]:
embeddings_df.head()

Unnamed: 0,strain,y,date,MCC,clade_membership,pca1_ha,pca2_ha,pca3_ha,pca4_ha,pca5_ha,...,mds_label_concatenated,mds_label_default_concatenated,tsne_x_concatenated,tsne_y_concatenated,t-sne_label_concatenated,t-sne_label_default_concatenated,umap_x_concatenated,umap_y_concatenated,umap_label_concatenated,umap_label_default_concatenated
0,A/RobatKarim/21210/2016,1,2016.05,unassigned,3c3,4.124323,0.42406,-0.284784,-0.211635,0.39558,...,-1,-1,-21.82644,-19.913363,8,-1,7.126915,6.470109,4,2
1,A/MUWRP-Uganda/579/2016,2,2016.67,unassigned,3c3,4.907026,1.182966,-0.479557,0.101974,0.52729,...,-1,-1,-21.396805,-25.264858,8,17,-6.238563,-1.219417,2,-1
2,A/Sydney/29/2016,3,2016.03,unassigned,3c3,6.520086,1.084436,-0.503306,0.362171,-0.175372,...,-1,-1,-21.905928,-20.044443,8,-1,-6.796391,-1.107938,2,8
3,A/Arkansas/02/2016,4,2016.08,unassigned,3c3,6.520956,1.087413,-0.500636,0.362182,-0.165772,...,-1,-1,-21.906374,-20.0345,8,-1,-6.69848,-1.16888,2,-1
4,A/Pennsylvania/08/2016,5,2016.02,unassigned,3c3,6.549953,1.092052,-0.508169,0.366696,-0.183761,...,-1,-1,-21.906868,-20.034521,8,-1,-7.058081,-1.003462,2,-1


In [22]:
# Name of the column holding the reference group of each strain. It is used
# below for coloring, in the chart field lists, and as the reference partition
# for the BCubed and VI comparisons.
clade_membership = "MCC"

In [23]:
# Load the accuracy table: one row per embedding method and analysis name,
# including the MCC value shown in the chart titles below.
accuracy_df = pd.read_csv(accuracy_path)

In [24]:
accuracy_df

Unnamed: 0,MCC,embedding,threshold,TN,FN,TP,FP,analysis_name
0,0.357,mds,4.0,692237,22642,32570,78806,ha
1,0.413,pca,2.0,692243,17862,37350,78800,concatenated
2,0.505,umap,2.0,651268,1160,54052,119775,ha
3,0.506,pca,2.0,655909,1905,53307,115134,ha
4,0.544,umap,2.0,671175,1216,53996,99868,concatenated
5,0.571,mds,4.0,738031,17509,37703,33012,concatenated
6,0.654,t-sne,4.0,747607,14997,40215,23436,ha
7,0.833,t-sne,4.0,758417,5783,49429,12626,concatenated


In [25]:
# Explained variance per principal component for the HA-only PCA.
explained_variance_df_ha = pd.read_csv(explained_variance_pca_ha)

In [26]:
explained_variance_df_ha

Unnamed: 0,explained variance,principal components
0,0.2652,1
1,0.2039,2
2,0.0826,3
3,0.0528,4
4,0.0334,5
5,0.0262,6
6,0.0209,7
7,0.0146,8
8,0.0116,9
9,0.0113,10


In [27]:
# Explained variance values as a plain Python list, in file order.
explained_variance_pca_ha_values = explained_variance_df_ha["explained variance"].tolist()

In [28]:
explained_variance_pca_ha_values

[0.2652,
 0.2039,
 0.0826,
 0.0528,
 0.0334,
 0.0262,
 0.0209,
 0.0146,
 0.0116,
 0.0113]

In [29]:
# Explained variance per principal component for the concatenated PCA.
explained_variance_df_concatenated = pd.read_csv(explained_variance_pca_concatenated)

In [30]:
explained_variance_df_concatenated

Unnamed: 0,explained variance,principal components
0,0.4392,1
1,0.1335,2
2,0.0751,3
3,0.0492,4
4,0.0251,5
5,0.0179,6
6,0.0165,7
7,0.0151,8
8,0.0109,9
9,0.0095,10


In [31]:
# Explained variance values as a plain Python list, in file order.
explained_variance_pca_concatenated_values = explained_variance_df_concatenated["explained variance"].tolist()


In [32]:
explained_variance_pca_concatenated_values

[0.4392,
 0.1335,
 0.0751,
 0.0492,
 0.0251,
 0.0179,
 0.0165,
 0.0151,
 0.0109,
 0.0095]

## Build color scales

In [33]:
def build_color_range_for_domain(domain, colors, value_for_unassigned=None):
    """Return one color per domain value from a table of color schemes.

    Parameters
    ----------
    domain : sequence
        Ordered values of the color scale domain. Any sequence is accepted
        (list, tuple, numpy array, ...).
    colors : pandas.DataFrame
        Table of color schemes in which the row labeled ``N - 1`` holds the
        scheme with N colors; missing cells in that row are dropped.
    value_for_unassigned : optional
        Domain value that marks "unassigned" records. When it is present in
        the domain, its color is replaced with gray (``#999999``).

    Returns
    -------
    list
        Colors positionally matching ``domain``; empty for an empty domain.
    """
    # Work on a list so the position lookup below (`list.index`) also works
    # for tuples and numpy arrays.
    domain = list(domain)

    # No values means no colors. Without this guard the lookup below would ask
    # for row -1, which is not a color scheme.
    if not domain:
        return []

    # Rows are zero-indexed, so to get N colors, we select row N - 1.
    range_ = colors.loc[len(domain) - 1].dropna().values.tolist()

    # Replace known values for "unassigned" clade or cluster labels.
    if value_for_unassigned is not None and value_for_unassigned in domain:
        range_[domain.index(value_for_unassigned)] = "#999999"

    return range_

In [34]:
# Distinct reference group labels, in order of first appearance.
clade_color_domain = embeddings_df[clade_membership].unique().tolist()

In [35]:
def _mcc_sort_key(value):
    """Sort key that puts "unassigned" first, then MCCs by their trailing number."""
    if value == "unassigned":
        return -1
    return int(value.split("_")[-1])


# Order MCCs with "unassigned" always listed first followed by MCCs
# in numerical order.
clade_color_domain = sorted(clade_color_domain, key=_mcc_sort_key)

In [36]:
# One color per reference group, with "unassigned" drawn in gray.
clade_color_range = build_color_range_for_domain(clade_color_domain, colors, value_for_unassigned="unassigned")

## PCA

In [37]:
# Sorted distinct cluster labels of the HA-only PCA embedding.
pca_ha_label_color_domain = sorted(embeddings_df["pca_label_ha"].unique())

In [38]:
# One color per cluster label; label -1 is treated as unassigned (gray).
pca_ha_label_color_range = build_color_range_for_domain(
    pca_ha_label_color_domain,
    colors,
    value_for_unassigned=-1,
)

In [39]:
# MCC of the first accuracy row matching PCA on the HA-only analysis.
pca_ha_accuracy_rows = accuracy_df.query(
    "(embedding == 'pca') & (analysis_name == 'ha')"
)
accuracy_pca_ha = pca_ha_accuracy_rows.iloc[0]["MCC"]

In [40]:
# Axis titles report the explained variance of the first two components.
pca_ha_axis_titles = [
    f"PC 1 (Explained variance: {round(explained_variance_pca_ha_values[0] * 100, 2)}%)",
    f"PC 2 (Explained variance: {round(explained_variance_pca_ha_values[1] * 100, 2)}%)",
]

# NOTE(review): arguments are positional; confirm their meaning against the
# signature of `linking_tree_with_plots_brush` in Helpers.
pca_ha_list_of_chart = linking_tree_with_plots_brush(
    embeddings_df,
    ["pca1_ha", "pca2_ha"],
    pca_ha_axis_titles,
    "pca_label_ha:N",
    ["strain:N", clade_membership, "pca_label_ha:N"],
    pca_ha_label_color_domain,
    pca_ha_label_color_range,
)

# Put the first returned chart beside the second one, titled with the MCC.
pca_ha_title = "MCC: " + str(round(accuracy_pca_ha, 4))
pca_ha_chart = pca_ha_list_of_chart[0] | pca_ha_list_of_chart[1].properties(title=pca_ha_title)

In [41]:
# Sorted distinct cluster labels of the concatenated PCA embedding.
pca_concatenated_label_color_domain = sorted(embeddings_df["pca_label_concatenated"].unique())

In [42]:
# One color per cluster label; label -1 is treated as unassigned (gray).
pca_concatenated_label_color_range = build_color_range_for_domain(
    pca_concatenated_label_color_domain,
    colors,
    value_for_unassigned=-1,
)

In [43]:
# MCC of the first accuracy row matching PCA on the concatenated analysis.
pca_concatenated_accuracy_rows = accuracy_df.query(
    "(embedding == 'pca') & (analysis_name == 'concatenated')"
)
accuracy_pca_concatenated = pca_concatenated_accuracy_rows.iloc[0]["MCC"]

In [44]:
# Axis titles report the explained variance of the first two components.
pca_concatenated_axis_titles = [
    f"PC 1 (Explained variance: {round(explained_variance_pca_concatenated_values[0] * 100, 2)}%)",
    f"PC 2 (Explained variance: {round(explained_variance_pca_concatenated_values[1] * 100, 2)}%)",
]

# NOTE(review): arguments are positional; confirm their meaning against the
# signature of `linking_tree_with_plots_brush` in Helpers.
pca_concatenated_list_of_chart = linking_tree_with_plots_brush(
    embeddings_df,
    ["pca1_concatenated", "pca2_concatenated"],
    pca_concatenated_axis_titles,
    "pca_label_concatenated:N",
    ["strain:N", clade_membership, "pca_label_concatenated:N"],
    pca_concatenated_label_color_domain,
    pca_concatenated_label_color_range,
)

# Put the first returned chart beside the second one, titled with the MCC.
pca_concatenated_title = "MCC: " + str(round(accuracy_pca_concatenated, 4))
pca_concatenated_chart = (
    pca_concatenated_list_of_chart[0]
    | pca_concatenated_list_of_chart[1].properties(title=pca_concatenated_title)
)

In [45]:
# Stack the HA-only row above the concatenated row; each row keeps its own
# color scale.
pca_rows = [pca_ha_chart, pca_concatenated_chart]
pca_final_chart = alt.vconcat(*pca_rows).resolve_scale(color="independent")
pca_final_chart

In [46]:
embeddings_df.columns

Index(['strain', 'y', 'date', 'MCC', 'clade_membership', 'pca1_ha', 'pca2_ha',
       'pca3_ha', 'pca4_ha', 'pca5_ha', 'pca6_ha', 'pca7_ha', 'pca8_ha',
       'pca9_ha', 'pca10_ha', 'pca_label_ha', 'pca_label_default_ha',
       'mds1_ha', 'mds2_ha', 'mds3_ha', 'mds4_ha', 'mds5_ha', 'mds6_ha',
       'mds7_ha', 'mds8_ha', 'mds9_ha', 'mds10_ha', 'mds_label_ha',
       'mds_label_default_ha', 'tsne_x_ha', 'tsne_y_ha', 't-sne_label_ha',
       't-sne_label_default_ha', 'umap_x_ha', 'umap_y_ha', 'umap_label_ha',
       'umap_label_default_ha', 'pca1_concatenated', 'pca2_concatenated',
       'pca3_concatenated', 'pca4_concatenated', 'pca5_concatenated',
       'pca6_concatenated', 'pca7_concatenated', 'pca8_concatenated',
       'pca9_concatenated', 'pca10_concatenated', 'pca_label_concatenated',
       'pca_label_default_concatenated', 'mds1_concatenated',
       'mds2_concatenated', 'mds3_concatenated', 'mds4_concatenated',
       'mds5_concatenated', 'mds6_concatenated', 'mds7_concatena

#### BCubed

In [101]:
# Map each strain to a one-element set holding its PCA (HA) cluster label,
# the input shape passed to the bcubed functions below.
cdict = {
    strain: {label}
    for strain, label in zip(embeddings_df["strain"], embeddings_df["pca_label_ha"])
}

In [102]:
# Map each strain to a one-element set holding its reference group label.
ldict = {
    strain: {label}
    for strain, label in zip(embeddings_df["strain"], embeddings_df[clade_membership])
}

In [103]:
# BCubed precision and recall of the cluster labels (cdict) against the
# reference groups (ldict), combined into one F-score for PCA.
precision = bcubed.precision(cdict, ldict)
recall = bcubed.recall(cdict, ldict)
fscore_pca = bcubed.fscore(precision, recall)

In [None]:
fscore_pca

#### Variation of information (VI) score

In [98]:
from math import log 
from scipy.stats import entropy
from sklearn.metrics import mutual_info_score
import numpy as np
def variation_of_information(X, Y):
    """Compute the variation of information (VI) between two partitions.

    Parameters
    ----------
    X, Y : iterable of sized iterables
        Each partition is a collection of clusters and each cluster is a
        collection of hashable members. Both partitions are expected to cover
        the same members; the total member count is taken from ``X``.

    Returns
    -------
    float
        The VI in bits (logarithms are base 2). It is 0.0 when both partitions
        are identical, and also when ``X`` is empty.
    """
    # Build each cluster's set once instead of once per (x, y) pair. The
    # original lengths are kept so the cluster probabilities are unchanged.
    x_clusters = [(len(x), set(x)) for x in X]
    y_clusters = [(len(y), set(y)) for y in Y]

    n = float(sum(size for size, _ in x_clusters))
    sigma = 0.0
    for x_size, x_members in x_clusters:
        p = x_size / n  # probability of this cluster in partition one
        for y_size, y_members in y_clusters:
            q = y_size / n  # probability of this cluster in partition two
            r = len(x_members & y_members) / n  # joint probability of both clusters
            if r > 0.0:
                sigma += r * (log(r / p, 2) + log(r / q, 2))
    # sigma is the negated sum of the two conditional entropies.
    return abs(sigma)
# The VI of two partitions of a set is equal to the difference
# between the sum of the entropies of the two partitions and 
# their mutual information times two.
# def variation_of_information_srav(X, Y):
#     n = float(sum([len(x) for x in X]))
#     r = [[len(set(x) & set(y)) / n for x in X] for y in Y]
#     #r = mutual_info_score(X, Y)
#     p = [len(x) / n for x in X]
#     q = [len(y) / n for y in Y]
#     return entropy(r, qk = p, base = 2) + entropy(r, qk= q, base=2)

In [99]:
# Sanity check of the VI function on two small partitions of the items 1..10.
X2 = [ [1,2,3,4], [5,6,7,8,9,10] ]
Y2 = [ [1,2,3,4,5,6], [7,8,9,10] ]
print(variation_of_information(X2, Y2))

1.1019550008653873


In [123]:
# Predicted partition: one list of strain names per PCA (HA) cluster label.
# Grouping on the column name and iterating the groups avoids the deprecated
# `groupby([col])` + `get_group(scalar)` round trip.
predicted_values = [
    list(strains)
    for _, strains in embeddings_df.groupby("pca_label_ha")["strain"]
]

# Reference partition: one list of strain names per reference group. It is
# reused by the VI cells of the other embeddings.
actual_values = [
    list(strains)
    for _, strains in embeddings_df.groupby(clade_membership)["strain"]
]

In [126]:
# VI between the predicted partition built above and the reference groups.
print(variation_of_information(predicted_values, actual_values))

2.6671728028013404


In [127]:
# NOTE(review): this displays the whole table in the middle of the PCA section;
# it looks like a leftover inspection cell — consider removing it or showing
# only `.head()`.
embeddings_df

Unnamed: 0,strain,y,date,MCC,clade_membership,pca1_ha,pca2_ha,pca3_ha,pca4_ha,pca5_ha,...,mds_label_concatenated,mds_label_default_concatenated,tsne_x_concatenated,tsne_y_concatenated,t-sne_label_concatenated,t-sne_label_default_concatenated,umap_x_concatenated,umap_y_concatenated,umap_label_concatenated,umap_label_default_concatenated
0,A/RobatKarim/21210/2016,1,2016.05,unassigned,3c3,4.124323,0.424060,-0.284784,-0.211635,0.395580,...,-1,-1,-21.826440,-19.913363,8,-1,7.126915,6.470109,4,2
1,A/MUWRP-Uganda/579/2016,2,2016.67,unassigned,3c3,4.907026,1.182966,-0.479557,0.101974,0.527290,...,-1,-1,-21.396805,-25.264858,8,17,-6.238563,-1.219417,2,-1
2,A/Sydney/29/2016,3,2016.03,unassigned,3c3,6.520086,1.084436,-0.503306,0.362171,-0.175372,...,-1,-1,-21.905928,-20.044443,8,-1,-6.796391,-1.107938,2,8
3,A/Arkansas/02/2016,4,2016.08,unassigned,3c3,6.520956,1.087413,-0.500636,0.362182,-0.165772,...,-1,-1,-21.906374,-20.034500,8,-1,-6.698480,-1.168880,2,-1
4,A/Pennsylvania/08/2016,5,2016.02,unassigned,3c3,6.549953,1.092052,-0.508169,0.366696,-0.183761,...,-1,-1,-21.906868,-20.034521,8,-1,-7.058081,-1.003462,2,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,A/Texas/104/2017,1639,2017.25,MCC_25,A1b,-0.114784,-2.494947,-1.570096,-1.786759,0.918412,...,7,7,0.488722,-13.286780,9,53,19.048512,-1.680724,1,10
1639,A/Santiago/RMS_op013d0/2017,1640,2017.45,MCC_25,A1b,-0.165256,-2.658074,-1.691115,-1.755117,0.824991,...,7,7,7.264900,-11.130849,9,45,18.903261,-2.425540,1,10
1640,A/Arkansas/14/2017,1641,2017.14,MCC_25,A1b,-0.165256,-2.658074,-1.691115,-1.755117,0.824991,...,7,7,0.514482,-12.837511,9,53,18.724056,-2.047344,1,10
1641,A/Linkou/0186/2016,1642,2016.98,unassigned,A1b,-0.165256,-2.658074,-1.691115,-1.755117,0.824991,...,7,7,7.034021,-11.752489,9,45,18.553830,-2.414502,1,10


In [None]:
# Write the PCA figure as HTML and as a PNG (scale_factor=2.0).
pca_final_chart.save(output_pca_html)
save(pca_final_chart, output_pca_png, scale_factor=2.0)

## MDS

In [None]:
# Sorted distinct cluster labels of the HA-only MDS embedding.
mds_ha_label_color_domain = sorted(embeddings_df["mds_label_ha"].unique())

In [None]:
# One color per cluster label; label -1 is treated as unassigned (gray).
mds_ha_label_color_range = build_color_range_for_domain(
    mds_ha_label_color_domain,
    colors,
    value_for_unassigned=-1,
)

In [None]:
# MCC of the first accuracy row matching MDS on the HA-only analysis.
mds_ha_accuracy_rows = accuracy_df.query(
    "(embedding == 'mds') & (analysis_name == 'ha')"
)
accuracy_mds_ha = mds_ha_accuracy_rows.iloc[0]["MCC"]

In [None]:
mds_ha_axis_titles = ["MDS 1", "MDS 2"]

# NOTE(review): arguments are positional; confirm their meaning against the
# signature of `linking_tree_with_plots_brush` in Helpers.
mds_ha_list_of_chart = linking_tree_with_plots_brush(
    embeddings_df,
    ["mds1_ha", "mds2_ha"],
    mds_ha_axis_titles,
    "mds_label_ha:N",
    ["strain:N", clade_membership, "mds_label_ha:N"],
    mds_ha_label_color_domain,
    mds_ha_label_color_range,
)

# Put the first returned chart beside the second one, titled with the MCC.
mds_ha_title = "MCC: " + str(round(accuracy_mds_ha, 4))
mds_ha_chart = mds_ha_list_of_chart[0] | mds_ha_list_of_chart[1].properties(title=mds_ha_title)

In [None]:
# Sorted distinct cluster labels of the concatenated MDS embedding.
mds_concatenated_label_color_domain = sorted(embeddings_df["mds_label_concatenated"].unique())

In [None]:
# One color per cluster label; label -1 is treated as unassigned (gray).
mds_concatenated_label_color_range = build_color_range_for_domain(
    mds_concatenated_label_color_domain,
    colors,
    value_for_unassigned=-1,
)

In [None]:
# MCC of the first accuracy row matching MDS on the concatenated analysis.
mds_concatenated_accuracy_rows = accuracy_df.query(
    "(embedding == 'mds') & (analysis_name == 'concatenated')"
)
accuracy_mds_concatenated = mds_concatenated_accuracy_rows.iloc[0]["MCC"]

In [None]:
mds_concatenated_axis_titles = ["MDS 1", "MDS 2"]

# NOTE(review): arguments are positional; confirm their meaning against the
# signature of `linking_tree_with_plots_brush` in Helpers.
mds_concatenated_list_of_chart = linking_tree_with_plots_brush(
    embeddings_df,
    ["mds1_concatenated", "mds2_concatenated"],
    mds_concatenated_axis_titles,
    "mds_label_concatenated:N",
    ["strain:N", clade_membership, "mds_label_concatenated:N"],
    mds_concatenated_label_color_domain,
    mds_concatenated_label_color_range,
)

# Put the first returned chart beside the second one, titled with the MCC.
mds_concatenated_title = "MCC: " + str(round(accuracy_mds_concatenated, 4))
mds_concatenated_chart = (
    mds_concatenated_list_of_chart[0]
    | mds_concatenated_list_of_chart[1].properties(title=mds_concatenated_title)
)

In [None]:
# Stack the HA-only row above the concatenated row; each row keeps its own
# color scale.
mds_rows = [mds_ha_chart, mds_concatenated_chart]
mds_final_chart = alt.vconcat(*mds_rows).resolve_scale(color="independent")
mds_final_chart

In [None]:
# Map each strain to a one-element set holding its MDS (HA) cluster label.
cdict = {
    strain: {label}
    for strain, label in zip(embeddings_df["strain"], embeddings_df["mds_label_ha"])
}

In [None]:
# Map each strain to a one-element set holding its reference group label.
ldict = {
    strain: {label}
    for strain, label in zip(embeddings_df["strain"], embeddings_df[clade_membership])
}

In [None]:
# BCubed precision and recall of the cluster labels (cdict) against the
# reference groups (ldict), combined into one F-score for MDS.
precision = bcubed.precision(cdict, ldict)
recall = bcubed.recall(cdict, ldict)
fscore_mds = bcubed.fscore(precision, recall)

In [None]:
fscore_mds

In [128]:
# Predicted partition: one list of strain names per MDS (HA) cluster label.
# Grouping on the column name and iterating the groups avoids the deprecated
# `groupby([col])` + `get_group(scalar)` round trip.
predicted_values = [
    list(strains)
    for _, strains in embeddings_df.groupby("mds_label_ha")["strain"]
]

# `actual_values` (the reference partition) comes from the PCA VI cell above.
print(variation_of_information(predicted_values, actual_values))

3.3752222627175925


In [None]:
# Write the MDS figure as HTML and as a PNG (scale_factor=2.0).
mds_final_chart.save(output_mds_html)
save(mds_final_chart, output_mds_png, scale_factor=2.0)

## t-SNE 

In [None]:
# Sorted distinct cluster labels of the HA-only t-SNE embedding.
tsne_ha_label_color_domain = sorted(embeddings_df["t-sne_label_ha"].unique())

In [None]:
# One color per cluster label; label -1 is treated as unassigned (gray).
tsne_ha_label_color_range = build_color_range_for_domain(
    tsne_ha_label_color_domain,
    colors,
    value_for_unassigned=-1,
)

In [None]:
# MCC of the first accuracy row matching t-SNE on the HA-only analysis.
tsne_ha_accuracy_rows = accuracy_df.query(
    "(embedding == 't-sne') & (analysis_name == 'ha')"
)
accuracy_tsne_ha = tsne_ha_accuracy_rows.iloc[0]["MCC"]

In [None]:
tsne_ha_axis_titles = ["t-SNE 1", "t-SNE 2"]

# NOTE(review): arguments are positional; confirm their meaning against the
# signature of `linking_tree_with_plots_brush` in Helpers.
tsne_ha_list_of_chart = linking_tree_with_plots_brush(
    embeddings_df,
    ["tsne_x_ha", "tsne_y_ha"],
    tsne_ha_axis_titles,
    "t-sne_label_ha:N",
    ["strain:N", clade_membership, "t-sne_label_ha:N"],
    tsne_ha_label_color_domain,
    tsne_ha_label_color_range,
)

# Put the first returned chart beside the second one, titled with the MCC.
tsne_ha_title = "MCC: " + str(round(accuracy_tsne_ha, 4))
tsne_ha_chart = tsne_ha_list_of_chart[0] | tsne_ha_list_of_chart[1].properties(title=tsne_ha_title)

In [None]:
# Sorted distinct cluster labels of the concatenated t-SNE embedding.
tsne_concatenated_label_color_domain = sorted(embeddings_df["t-sne_label_concatenated"].unique())

In [None]:
# One color per cluster label; label -1 is treated as unassigned (gray).
tsne_concatenated_label_color_range = build_color_range_for_domain(
    tsne_concatenated_label_color_domain,
    colors,
    value_for_unassigned=-1,
)

In [None]:
# MCC of the first accuracy row matching t-SNE on the concatenated analysis.
tsne_concatenated_accuracy_rows = accuracy_df.query(
    "(embedding == 't-sne') & (analysis_name == 'concatenated')"
)
accuracy_tsne_concatenated = tsne_concatenated_accuracy_rows.iloc[0]["MCC"]

In [None]:
tsne_concatenated_axis_titles = ["t-SNE 1", "t-SNE 2"]

# NOTE(review): arguments are positional; confirm their meaning against the
# signature of `linking_tree_with_plots_brush` in Helpers.
tsne_concatenated_list_of_chart = linking_tree_with_plots_brush(
    embeddings_df,
    ["tsne_x_concatenated", "tsne_y_concatenated"],
    tsne_concatenated_axis_titles,
    "t-sne_label_concatenated:N",
    ["strain:N", clade_membership, "t-sne_label_concatenated:N"],
    tsne_concatenated_label_color_domain,
    tsne_concatenated_label_color_range,
)

# Put the first returned chart beside the second one, titled with the MCC.
tsne_concatenated_title = "MCC: " + str(round(accuracy_tsne_concatenated, 4))
tsne_concatenated_chart = (
    tsne_concatenated_list_of_chart[0]
    | tsne_concatenated_list_of_chart[1].properties(title=tsne_concatenated_title)
)

In [None]:
# Stack the HA-only row above the concatenated row; each row keeps its own
# color scale.
tsne_rows = [tsne_ha_chart, tsne_concatenated_chart]
tsne_final_chart = alt.vconcat(*tsne_rows).resolve_scale(color="independent")
tsne_final_chart

In [None]:
# Map each strain to a one-element set holding its t-SNE (HA) cluster label.
cdict = {
    strain: {label}
    for strain, label in zip(embeddings_df["strain"], embeddings_df["t-sne_label_ha"])
}

In [None]:
# Map each strain to a one-element set holding its reference group label.
ldict = {
    strain: {label}
    for strain, label in zip(embeddings_df["strain"], embeddings_df[clade_membership])
}

In [None]:
# BCubed precision and recall of the cluster labels (cdict) against the
# reference groups (ldict), combined into one F-score for t-SNE.
precision = bcubed.precision(cdict, ldict)
recall = bcubed.recall(cdict, ldict)
fscore_tsne = bcubed.fscore(precision, recall)

In [None]:
fscore_tsne

In [130]:
# Predicted partition: one list of strain names per t-SNE (HA) cluster label.
# Grouping on the column name and iterating the groups avoids the deprecated
# `groupby([col])` + `get_group(scalar)` round trip.
predicted_values = [
    list(strains)
    for _, strains in embeddings_df.groupby("t-sne_label_ha")["strain"]
]

# `actual_values` (the reference partition) comes from the PCA VI cell above.
print(variation_of_information(predicted_values, actual_values))

2.273363969835358


In [None]:
# Write the t-SNE figure as HTML and as a PNG (scale_factor=2.0).
# NOTE(review): the PCA and MDS sections write their HTML with `chart.save(...)`
# while this cell uses altair_saver's `save(...)` — confirm the difference is
# intended.
save(tsne_final_chart, output_tsne_html)
save(tsne_final_chart, output_tsne_png, scale_factor=2.0)

## UMAP

In [None]:
# Sorted distinct cluster labels of the HA-only UMAP embedding.
umap_ha_label_color_domain = sorted(embeddings_df["umap_label_ha"].unique())

In [None]:
# One color per cluster label; label -1 is treated as unassigned (gray).
umap_ha_label_color_range = build_color_range_for_domain(
    umap_ha_label_color_domain,
    colors,
    value_for_unassigned=-1,
)

In [None]:
# MCC of the first accuracy row matching UMAP on the HA-only analysis.
umap_ha_accuracy_rows = accuracy_df.query(
    "(embedding == 'umap') & (analysis_name == 'ha')"
)
accuracy_umap_ha = umap_ha_accuracy_rows.iloc[0]["MCC"]

In [None]:
umap_ha_axis_titles = ["UMAP 1", "UMAP 2"]

# NOTE(review): arguments are positional; confirm their meaning against the
# signature of `linking_tree_with_plots_brush` in Helpers.
umap_ha_list_of_chart = linking_tree_with_plots_brush(
    embeddings_df,
    ["umap_x_ha", "umap_y_ha"],
    umap_ha_axis_titles,
    "umap_label_ha:N",
    ["strain:N", clade_membership, "umap_label_ha:N"],
    umap_ha_label_color_domain,
    umap_ha_label_color_range,
)

# Put the first returned chart beside the second one, titled with the MCC.
umap_ha_title = "MCC: " + str(round(accuracy_umap_ha, 4))
umap_ha_chart = umap_ha_list_of_chart[0] | umap_ha_list_of_chart[1].properties(title=umap_ha_title)

In [None]:
# Sorted distinct cluster labels of the concatenated UMAP embedding.
umap_concatenated_label_color_domain = sorted(embeddings_df["umap_label_concatenated"].unique())

In [None]:
# One color per cluster label; label -1 is treated as unassigned (gray).
umap_concatenated_label_color_range = build_color_range_for_domain(
    umap_concatenated_label_color_domain,
    colors,
    value_for_unassigned=-1,
)

In [None]:
# MCC of the first accuracy row matching UMAP on the concatenated analysis.
umap_concatenated_accuracy_rows = accuracy_df.query(
    "(embedding == 'umap') & (analysis_name == 'concatenated')"
)
accuracy_umap_concatenated = umap_concatenated_accuracy_rows.iloc[0]["MCC"]

In [None]:
umap_concatenated_axis_titles = ["UMAP 1", "UMAP 2"]

# NOTE(review): arguments are positional; confirm their meaning against the
# signature of `linking_tree_with_plots_brush` in Helpers.
umap_concatenated_list_of_chart = linking_tree_with_plots_brush(
    embeddings_df,
    ["umap_x_concatenated", "umap_y_concatenated"],
    umap_concatenated_axis_titles,
    "umap_label_concatenated:N",
    ["strain:N", clade_membership, "umap_label_concatenated:N"],
    umap_concatenated_label_color_domain,
    umap_concatenated_label_color_range,
)

# Put the first returned chart beside the second one, titled with the MCC.
umap_concatenated_title = "MCC: " + str(round(accuracy_umap_concatenated, 4))
umap_concatenated_chart = (
    umap_concatenated_list_of_chart[0]
    | umap_concatenated_list_of_chart[1].properties(title=umap_concatenated_title)
)

In [None]:
# Stack the HA-only row above the concatenated row; each row keeps its own
# color scale.
umap_rows = [umap_ha_chart, umap_concatenated_chart]
umap_final_chart = alt.vconcat(*umap_rows).resolve_scale(color="independent")
umap_final_chart

In [None]:
# Map each strain to a one-element set holding its UMAP (HA) cluster label.
cdict = {
    strain: {label}
    for strain, label in zip(embeddings_df["strain"], embeddings_df["umap_label_ha"])
}

In [None]:
# Map each strain to a one-element set holding its reference group label.
ldict = {
    strain: {label}
    for strain, label in zip(embeddings_df["strain"], embeddings_df[clade_membership])
}

In [None]:
# BCubed precision and recall of the cluster labels (cdict) against the
# reference groups (ldict), combined into one F-score for UMAP.
precision = bcubed.precision(cdict, ldict)
recall = bcubed.recall(cdict, ldict)
fscore_umap = bcubed.fscore(precision, recall)

In [None]:
fscore_umap

In [131]:
# Predicted partition: one list of strain names per UMAP (HA) cluster label.
# Grouping on the column name and iterating the groups avoids the deprecated
# `groupby([col])` + `get_group(scalar)` round trip.
predicted_values = [
    list(strains)
    for _, strains in embeddings_df.groupby("umap_label_ha")["strain"]
]

# `actual_values` (the reference partition) comes from the PCA VI cell above.
print(variation_of_information(predicted_values, actual_values))

2.7802122337506563


In [None]:
# Write the UMAP figure as HTML and as a PNG (scale_factor=2.0).
# NOTE(review): the PCA and MDS sections write their HTML with `chart.save(...)`
# while this cell uses altair_saver's `save(...)` — confirm the difference is
# intended.
save(umap_final_chart, output_umap_html)
save(umap_final_chart, output_umap_png, scale_factor=2.0)

In [132]:
# Print the VI between each HA-only embedding's clusters and the reference
# groups (`actual_values`, built in the PCA VI cell above).
for value in ["pca", "mds", "t-sne", "umap"]:
    label_column = value + "_label_ha"
    # One list of strain names per cluster label. Grouping on the column name
    # and iterating the groups avoids the deprecated `groupby([col])` +
    # `get_group(scalar)` round trip.
    predicted_values = [
        list(strains)
        for _, strains in embeddings_df.groupby(label_column)["strain"]
    ]

    print(value)
    print(variation_of_information(predicted_values, actual_values))

pca
2.6671728028013404
mds
3.3752222627175925
t-sne
2.273363969835358
umap
2.7802122337506563


## All embeddings by clade membership

In [None]:
# Build all embedding panels colored by the reference group. The MCC titles
# are attached when the panels are arranged into the combined figure.
# NOTE(review): arguments are positional; confirm their meaning against the
# signature of `linking_tree_with_plots_brush` in Helpers.
charts = linking_tree_with_plots_brush(
    embeddings_df,
    [
        'mds1_concatenated',
        'mds2_concatenated',
        'mds1_ha',
        'mds2_ha',
        'tsne_x_concatenated',
        'tsne_y_concatenated',
        'tsne_x_ha',
        'tsne_y_ha',
        'pca1_concatenated',
        'pca2_concatenated',
        'pca1_ha',
        'pca2_ha',
        'umap_x_concatenated',
        'umap_y_concatenated',
        'umap_x_ha',
        'umap_y_ha',
    ],
    [
        'MDS 1',
        'MDS 2',
        'MDS 1',
        'MDS 2',
        't-SNE 1',
        't-SNE 2',
        't-SNE 1',
        't-SNE 2',
        # Same wording as the axis titles in the PCA section ("Explained
        # variance", not "Expected Variance").
        f"PC 1 (Explained variance: {round(explained_variance_pca_concatenated_values[0] * 100, 2)}%)",
        f"PC 2 (Explained variance: {round(explained_variance_pca_concatenated_values[1] * 100, 2)}%)",
        f"PC 1 (Explained variance: {round(explained_variance_pca_ha_values[0] * 100, 2)}%)",
        f"PC 2 (Explained variance: {round(explained_variance_pca_ha_values[1] * 100, 2)}%)",
        'UMAP 1',
        'UMAP 2',
        'UMAP 1',
        'UMAP 2',
    ],
    clade_membership+":N",
    ['strain', clade_membership],
    clade_color_domain,
    clade_color_range,
)

In [None]:
def _mcc_title(accuracy):
    """Format an MCC value as a chart title."""
    return "MCC: " + str(round(accuracy, 4))


# One row per embedding method: the HA-only panel on the left and the
# concatenated panel on the right.
pca_row = (
    charts[6].properties(title=["HA only", _mcc_title(accuracy_pca_ha)])
    | charts[5].properties(title=["HA and NA", _mcc_title(accuracy_pca_concatenated)])
)
mds_row = (
    charts[2].properties(title=_mcc_title(accuracy_mds_ha))
    | charts[1].properties(title=_mcc_title(accuracy_mds_concatenated))
)
tsne_row = (
    charts[4].properties(title=_mcc_title(accuracy_tsne_ha))
    | charts[3].properties(title=_mcc_title(accuracy_tsne_concatenated))
)
umap_row = (
    charts[8].properties(title=_mcc_title(accuracy_umap_ha))
    | charts[7].properties(title=_mcc_title(accuracy_umap_concatenated))
)

# The first chart goes on top, followed by the four embedding rows.
chart_embeddings = alt.vconcat(charts[0], pca_row, mds_row, tsne_row, umap_row)
chart_embeddings

In [None]:
# Write the combined figure as HTML and as a PNG (scale_factor=2.0).
chart_embeddings.save(output_full_html)
save(chart_embeddings, output_full_png, scale_factor=2.0)

In [None]:
# Summary of the HA-only results: BCubed F-score, then MCC, per embedding.
print("the FIRST value is the fscore, the second is the MCC.")
ha_scores = [
    ("pca_ha:", fscore_pca, accuracy_pca_ha),
    ("mds_ha:", fscore_mds, accuracy_mds_ha),
    ("t-sne_ha:", fscore_tsne, accuracy_tsne_ha),
    ("umap_ha:", fscore_umap, accuracy_umap_ha),
]
for heading, embedding_fscore, embedding_mcc in ha_scores:
    print(heading)
    print(embedding_fscore)
    print(embedding_mcc)