In [1]:
import numpy as np
import pandas as pd

In [2]:
from ema import EmbeddingHandler

In [3]:
# Notebook configuration: everything a reader might want to tune lives here.
# SPLIT_ID and AGG_ID pick the "split-<id>" directory and "agg-<id>" file that
# are compared against the baseline split; MODEL_NAME is the model directory.
SPLIT_ID = 1
AGG_ID = 1
MODEL_NAME = "esm2_t30_150M_UR50D"

# Split used as the reference; named here instead of a literal "split-0"
# buried inside the individual paths.
BASELINE_SPLIT_ID = 0

# Directory prefix shared by every aggregated-embedding file of this model.
AGG_EMB_DIR = f"../data/aggregated-embeddings/{MODEL_NAME}"

# Embedding arrays (.npy) for the baseline split (0) and the selected split (1).
FP_AGG_EMB_0 = f"{AGG_EMB_DIR}/split-{BASELINE_SPLIT_ID}/agg-{AGG_ID}.npy"
FP_AGG_EMB_1 = f"{AGG_EMB_DIR}/split-{SPLIT_ID}/agg-{AGG_ID}.npy"

# Matching per-sample metadata tables (.csv) for the same two splits.
FP_AGG_VARIANT_META_0 = (
    f"{AGG_EMB_DIR}/split-{BASELINE_SPLIT_ID}/agg-{AGG_ID}-meta.csv"
)
FP_AGG_VARIANT_META_1 = f"{AGG_EMB_DIR}/split-{SPLIT_ID}/agg-{AGG_ID}-meta.csv"

## Load data

In [4]:
# Load the aggregated embedding arrays of both splits and the sample metadata
# table of the baseline split.
emb_0 = np.load(FP_AGG_EMB_0)
emb_1 = np.load(FP_AGG_EMB_1)
df_meta_data = pd.read_csv(FP_AGG_VARIANT_META_0)

# Both arrays are attached to the same metadata table further down, so each
# must provide exactly one row per metadata row.
# NOTE(review): FP_AGG_VARIANT_META_1 is configured but never loaded; this
# assumes both splits list their samples in the same order -- confirm.
assert emb_0.shape[0] == emb_1.shape[0] == len(df_meta_data), (
    "sample count mismatch: "
    f"emb_0={emb_0.shape[0]}, emb_1={emb_1.shape[0]}, "
    f"meta={len(df_meta_data)}"
)

## Initialise ema embedding object

In [5]:
# Create the handler around the sample metadata table.
emb = EmbeddingHandler(sample_meta_data=df_meta_data)

# Register each embedding array under its embedding-space name, in order:
# first "full_length" (emb_0), then "chopped" (emb_1).
for space_name, space_values in (
    ("full_length", emb_0),
    ("chopped", emb_1),
):
    emb.add_emb_space(embeddings=space_values, emb_space_name=space_name)

27 samples loaded.
Meta data columns: Index(['variant', 'gene', 'family'], dtype='object')


## Explore embedding spaces

In [6]:
# Histogram of the embedding value distributions of the different embeddings.
emb_hist_fig = emb.plot_emb_hist()
emb_hist_fig.show()

In [7]:
# TODO: compute the mean and standard deviation of every embedding space

In [9]:
# Box plot of the embedding values with group="sample", i.e. one distribution
# per sample, to compare samples within an embedding space.
sample_box_fig = emb.plot_emb_box(group="sample")
sample_box_fig.show()

In [13]:
# Same box plot as for the samples, but grouped by the "gene" metadata column.
gene_box_fig = emb.plot_emb_box(group="gene")
gene_box_fig.show()

In [None]:
# TODO: visualise the embedding spaces with PCA, t-SNE and UMAP

In [12]:
# t-SNE view of the "full_length" embedding space, coloured by "family".
tsne_full_length_fig = emb.visualise_emb_tsne(
    emb_space_name="full_length",
    colour="family",
)
tsne_full_length_fig.show()

In [10]:
# t-SNE view of the "chopped" embedding space, coloured by "family".
tsne_chopped_fig = emb.visualise_emb_tsne(
    emb_space_name="chopped",
    colour="family",
)
tsne_chopped_fig.show()

## Explore sample distances

In [None]:
# Cosine-distance heatmap, grouped by "family", for each embedding space.
# Every figure is shown right after it is built so that the first one is not
# overwritten by the second and the cell actually renders both.
# NOTE(review): assumes plot_emb_dist_heatmap returns a figure object with
# .show(), like the other plotting calls in this notebook -- confirm.
for heatmap_space in ("full_length", "chopped"):
    fig = emb.plot_emb_dist_heatmap(
        emb_space_name=heatmap_space,
        group="family",
        distance_metric="cosine",
    )
    fig.show()

In [9]:
# TODO: plot the distribution of sample distances between the two embeddings
# (e.g. euclidean distance, cosine similarity)

In [10]:
# Sample distances within the "full_length" embedding space, requested with the
# "cosine" metric.
# NOTE(review): presumably a square sample-by-sample matrix -- the shape shown
# in the following cell is (27, 27), matching the 27 loaded samples; confirm.
full_length_dis = emb.get_sample_distance("full_length", "cosine")

In [13]:
# Inspect the dimensions of the distance result as a quick sanity check.
full_length_dis.shape

(27, 27)

In [None]:
# TODO: heatmap of the distances between samples within one embedding space,
# ordered by metadata