In [None]:
 # Clone the ema repository into ./ema. Later cells import EmbeddingHandler
 # from this checkout and read the example data from ema/examples/.
 ! git clone https://github.com/pia-francesca/ema.git

Cloning into 'ema'...
remote: Enumerating objects: 203, done.[K
remote: Counting objects: 100% (203/203), done.[K
remote: Compressing objects: 100% (140/140), done.[K
remote: Total 203 (delta 99), reused 160 (delta 60), pack-reused 0[K
Receiving objects: 100% (203/203), 33.35 MiB | 5.68 MiB/s, done.
Resolving deltas: 100% (99/99), done.


In [None]:
! pip install umap

Collecting umap
  Downloading umap-0.1.1.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap
  Building wheel for umap (setup.py) ... [?25l[?25hdone
  Created wheel for umap: filename=umap-0.1.1-py3-none-any.whl size=3543 sha256=d2c82e2ed73864880a400afb6648c05c3dd30f85fde1877af0ac63fbdff81c29
  Stored in directory: /root/.cache/pip/wheels/15/f1/28/53dcf7a309118ed35d810a5f9cb995217800f3f269ab5771cb
Successfully built umap
Installing collected packages: umap
Successfully installed umap-0.1.1


In [None]:
import numpy as np
import pandas as pd

from ema.ema.ema import EmbeddingHandler

In [None]:
# Example data shipped inside the cloned `ema` checkout (paths are relative
# to the notebook's working directory).
DATA_DIR = 'ema/examples/ion-channel-proteins/'

# Metadata table.
FP_METADATA = f"{DATA_DIR}metadata.csv"

# Saved embedding matrices, one .npy file per ESM model.
FP_EMB_ESM1b = f"{DATA_DIR}esm1b_t33_650M_UR50S-embeddings.npy"
FP_EMB_ESM2 = f"{DATA_DIR}esm2_t33_650M_UR50D-embeddings.npy"
FP_EMB_ESM1v = f"{DATA_DIR}esm1v_t33_650M_UR90S_1-embeddings.npy"

In [None]:
# Load the metadata table and the three saved embedding matrices.
metadata = pd.read_csv(FP_METADATA)
emb_esm1b = np.load(FP_EMB_ESM1b)
emb_esm2 = np.load(FP_EMB_ESM2)
emb_esm1v = np.load(FP_EMB_ESM1v)

# Sanity check: show the shape of every loaded matrix and fail early if any
# of them does not have exactly one row per metadata row.
print(emb_esm1b.shape, emb_esm2.shape, emb_esm1v.shape)
for _name, _emb in (("esm1b", emb_esm1b), ("esm2", emb_esm2), ("esm1v", emb_esm1v)):
    assert _emb.shape[0] == len(metadata), (
        f"{_name}: {_emb.shape[0]} embeddings for {len(metadata)} metadata rows"
    )

# Last expression of the cell: preview of the metadata table.
metadata.head()

(102, 1280) (102, 1280)


Unnamed: 0,gene_name,family
0,KCNA1,Kv
1,KCNA2,Kv
2,KCNA3,Kv
3,KCNA4,Kv
4,KCNA5,Kv


In [None]:
# Create the handler from the metadata table.
emb_handler = EmbeddingHandler(metadata)

# Register each embedding matrix under its model name (esm1b, esm2, esm1v).
for space_name, space_emb in (
    ("esm1b", emb_esm1b),
    ("esm2", emb_esm2),
    ("esm1v", emb_esm1v),
):
    emb_handler.add_emb_space(embeddings=space_emb, emb_space_name=space_name)

102 samples loaded.
Categories in meta data: ['family']
Numerical columns in meta data: []




8 clusters calculated for esm1b.
Embedding space esm1b added.
Embeddings have length 1280.




8 clusters calculated for esm2.
Embedding space esm2 added.
Embeddings have length 1280.




8 clusters calculated for esm1v.
Embedding space esm1v added.
Embeddings have length 1280.


## Unsupervised Clustering x Metadata

By default ema computes a number of clusters equal to the mean number of categories in the metadata. This is a good starting point, but you can also specify the number of clusters you want to compute.

In [None]:
# Override the default cluster count: recompute 5 clusters in every
# registered embedding space.
for space_name in ("esm1b", "esm2", "esm1v"):
    emb_handler.recalculate_clusters(n_clusters=5, emb_space_name=space_name)





5 clusters calculated for esm1b.
5 clusters calculated for esm2.








5 clusters calculated for esm1v.


### Unsupervised Clusters

### Overlap of unsupervised clusters between embedding spaces

In [None]:
# Overlap of the ESM1b clusters with the cluster labels computed in ESM2.
emb_handler.plot_feature_cluster_overlap(
    emb_space_name="esm1b",
    feature="cluster_esm2",
)

In [None]:
# Overlap of the ESM1v clusters with the cluster labels computed in ESM1b.
emb_handler.plot_feature_cluster_overlap(
    emb_space_name="esm1v",
    feature="cluster_esm1b",
)

In [None]:
# Overlap of the ESM2 clusters with the cluster labels computed in ESM1b.
emb_handler.plot_feature_cluster_overlap(
    emb_space_name="esm2",
    feature="cluster_esm1b",
)

## Pairwise distances

### Similarities between ESM1b, ESM1v and ESM2 embeddings

In [None]:
# Pairwise Euclidean distances: ESM1b against ESM2.
emb_handler.plot_emb_dis_scatter(
    emb_space_name_1="esm1b",
    emb_space_name_2="esm2",
    distance_metric="euclidean",
)

For this set of sequences, the pairwise distances between embeddings correlate more strongly between ESM1b and ESM1v than between ESM1b and ESM2.

In [None]:
# Pairwise Euclidean distances: ESM1b against ESM1v.
emb_handler.plot_emb_dis_scatter(
    emb_space_name_1="esm1b",
    emb_space_name_2="esm1v",
    distance_metric="euclidean",
)

The same comparison using the normalised Euclidean distance, which accounts for the different scales of the embeddings:

In [None]:
# Pairwise normalised Euclidean distances: ESM1b against ESM1v.
emb_handler.plot_emb_dis_scatter(
    emb_space_name_1="esm1b",
    emb_space_name_2="esm1v",
    distance_metric="euclidean_normalised",
)

### Unsupervised clusters

We can inspect how close the initial clusters from one embedding space are in the other embedding spaces.

In [None]:
# ESM1b vs ESM2 pairwise Euclidean distances, coloured by ESM1b cluster "1".
emb_handler.plot_emb_dis_scatter(
    emb_space_name_1="esm1b",
    emb_space_name_2="esm2",
    distance_metric="euclidean",
    colour_group="cluster_esm1b",
    colour_value_1="1",
)

In [None]:
# ESM1b vs ESM1v pairwise Euclidean distances, coloured by ESM1b cluster "1".
emb_handler.plot_emb_dis_scatter(
    emb_space_name_1="esm1b",
    emb_space_name_2="esm1v",
    distance_metric="euclidean",
    colour_group="cluster_esm1b",
    colour_value_1="1",
)

In [None]:
# ESM1b vs ESM2 pairwise Euclidean distances, coloured by ESM1b clusters
# "2" and "4".
emb_handler.plot_emb_dis_scatter(
    emb_space_name_1="esm1b",
    emb_space_name_2="esm2",
    distance_metric="euclidean",
    colour_group="cluster_esm1b",
    colour_value_1="2",
    colour_value_2="4",
)

In [None]:
# ESM1b vs ESM1v pairwise Euclidean distances, coloured by ESM1b clusters
# "2" and "4".
emb_handler.plot_emb_dis_scatter(
    emb_space_name_1="esm1b",
    emb_space_name_2="esm1v",
    distance_metric="euclidean",
    colour_group="cluster_esm1b",
    colour_value_1="2",
    colour_value_2="4",
)

## Visualisation of dimensionality reduction x Unsupervised Clustering

### PCA

#### ESM1b coloured by unsupervised clusters in ESM1b

In [None]:
# PCA view of the ESM1b embeddings, coloured by the ESM1b cluster labels.
emb_handler.visualise_emb_pca(
    emb_space_name="esm1b",
    colour="cluster_esm1b",
)

#### ESM1v coloured by unsupervised clusters in ESM1b

In [None]:
# PCA view of the ESM1v embeddings, coloured by the ESM1b cluster labels.
emb_handler.visualise_emb_pca(
    emb_space_name="esm1v",
    colour="cluster_esm1b",
)

#### ESM2 coloured by unsupervised clusters in ESM1b

In [None]:
# PCA view of the ESM2 embeddings, coloured by the ESM1b cluster labels.
emb_handler.visualise_emb_pca(
    emb_space_name="esm2",
    colour="cluster_esm1b",
)

## Adding Metadata

### Unsupervised Clustering x Metadata

We can check whether the unsupervised clusters are related to the provided metadata.

In [None]:
# Overlap of the ESM1b clusters with the `family` metadata column.
emb_handler.plot_feature_cluster_overlap(
    emb_space_name="esm1b",
    feature="family",
)

In [None]:
# Overlap of the ESM1v clusters with the `family` metadata column.
emb_handler.plot_feature_cluster_overlap(
    emb_space_name="esm1v",
    feature="family",
)

In [None]:
# Overlap of the ESM2 clusters with the `family` metadata column.
emb_handler.plot_feature_cluster_overlap(
    emb_space_name="esm2",
    feature="family",
)

### Pairwise distance x Metadata

In [None]:
# Euclidean pairwise distances in ESM1b, broken down by `family`.
emb_handler.plot_emb_dis_dif_dis_per_group(
    emb_space_name="esm1b",
    distance_metric="euclidean",
    group="family",
)

In [None]:
# Euclidean pairwise distances in ESM2, broken down by `family`.
emb_handler.plot_emb_dis_dif_dis_per_group(
    emb_space_name="esm2",
    distance_metric="euclidean",
    group="family",
)

In [None]:
# ESM1b vs ESM1v pairwise Euclidean distances, coloured by family "Kir".
emb_handler.plot_emb_dis_scatter(
    emb_space_name_1="esm1b",
    emb_space_name_2="esm1v",
    distance_metric="euclidean",
    colour_group="family",
    colour_value_1="Kir",
)

In [None]:
# ESM1b vs ESM2 pairwise Euclidean distances, coloured by families
# "CNG" and "HCN".
emb_handler.plot_emb_dis_scatter(
    emb_space_name_1="esm1b",
    emb_space_name_2="esm2",
    distance_metric="euclidean",
    colour_group="family",
    colour_value_1="CNG",
    colour_value_2="HCN",
)