## LSM Embedding Analysis
##### Colab Kernel (Brainframe GPU)
##### Dataset (Electrodes)

Grants command for Access on Demand (AoD):

https://grants.corp.google.com/#/grants?request=20h%2Fchr-ards-electrodes-deid-colab-jobs&reason=b%2F314799341

### About This Notebook:
Visualizes embeddings of the training data produced by the ViT MAE encoder.
This notebook explores the effect of two pre-train data sizes (1K, 1.3M) and the effect of fine-tuning, as compared to the pre-trained embedding.


## Setup

In [None]:
# @title Imports

import os
import collections
from collections import Counter
import itertools
from typing import Sequence

import jax.numpy as jnp
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import tensorflow as tf

from google3.learning.deepmind.xmanager2.client import xmanager_api
from google3.pyglib import gfile



In [None]:
# import

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

In [None]:
# @title Embedding Plotting Helper Functions

# Helper Functions
def plot_embeddings(Xd, yd, colors, names):
  """Projects embeddings to 2-D with t-SNE and scatter-plots them per class.

  Only the t-SNE view is produced. The projection uses a fixed
  `random_state=0`.

  Args:
    Xd: Array-like of shape (n_samples, n_features), one embedding per row.
      Passed directly to `TSNE.fit_transform`.
    yd: Array-like of length n_samples holding the class index of each row.
      Rows whose label equals `i` are drawn with `colors[i]` and labeled
      `names[i]`; rows whose label matches no `i` in `range(len(names))` are
      not drawn.
    colors: Sequence of matplotlib colors, one per class.
    names: Sequence of legend labels, one per class.

  Returns:
    The t-SNE projection of `Xd` (two components per sample).

  Raises:
    ValueError: If `names` and `colors` differ in length.
  """
  if len(names) != len(colors):
    raise ValueError(f'names ({len(names)}) and colors ({len(colors)}) must have the same length.')

  # Coerce the labels to an ndarray so that `yd == i` below is always an
  # element-wise mask. With a plain Python list the comparison collapses to a
  # single `False`, which would silently select no points at all.
  yd = np.asarray(yd)

  # TSNE
  tsne = TSNE(n_components=2, random_state=0)
  Xt = tsne.fit_transform(Xd)

  plt.figure(figsize=(6, 6))
  for i, (name, color) in enumerate(zip(names, colors)):
    class_mask = yd == i
    plt.scatter(Xt[class_mask, 0], Xt[class_mask, 1], label=name, alpha=0.3, c=color)

  plt.xlabel('t-SNE Dim 1')
  plt.ylabel('t-SNE Dim 2')
  # Anchor the legend outside the axes so it does not cover any points.
  plt.legend(
      loc='upper left',
      bbox_to_anchor=(1, 1),
      shadow=False,
      scatterpoints=1
  )
  plt.show()

  return Xt


def reshape_time_crop_patch_embeddings(
    x,
    patch_reorder_shape,
    start=None,
    end=None,
):
  """Reshapes per-token embeddings into a patch grid and crops it in time.

  Args:
    x: Array of shape (n_batch, n_tokens, embedding_dim).
    patch_reorder_shape: Pair `(n_h, n_w)` giving the patch grid; `x` is
      reshaped to (n_batch, n_h, n_w, embedding_dim), so `n_h * n_w` has to
      equal n_tokens. The `n_h` axis is the one that gets cropped (treated as
      the time axis).
    start: Fraction in [0, 1) of the `n_h` axis where the crop begins.
      `None` means 0.
    end: Fraction in (0, 1] of the `n_h` axis where the crop ends (exclusive).
      `None` means 1.

  Returns:
    The cropped grid of shape
    (n_batch, int(end * n_h) - int(start * n_h), n_w, embedding_dim). The
    fractional bounds are truncated to whole rows, so a window narrower than
    one row can yield an empty crop.

  Raises:
    ValueError: If `start` is negative, `start` is not strictly less than
      `end`, or either bound exceeds 1.
  """
  # Get patch and input shape.
  n_h, n_w = patch_reorder_shape
  n_batch, _, embedding_dim = x.shape

  # Get start and end crop (along time axis).
  if end is None:
    end = 1
  if start is None:
    start = 0
  # A negative bound would become a negative slice index and silently crop
  # from the wrong end of the time axis, so reject it up front.
  if start < 0:
    raise ValueError(f'start {start} cannot be negative.')
  if start >= end:
    raise ValueError(f'start {start} must be less than end {end}.')
  if start > 1 or end > 1:
    raise ValueError(f'start {start} and end {end} cannot be greater than 1.')

  # reorganize patches into image:
  x = jnp.reshape(x, [n_batch, n_h, n_w, embedding_dim])

  # Time Crop image based on horizon
  start_idx = int(start * n_h)
  end_idx = int(end * n_h)
  return x[:, start_idx:end_idx, :, :]

In [None]:
# @title Embedding Constants and Setup

# Constants
# Ordered names of the input features.
ALL_FEATURE_NAMES = [
    'sclValue',
    'sclSlope',
    'skinTempValue',
    'hr',
    'hrvPercentGood',
    'hrvRR80thPercentile',
    'hrvRR20thPercentile',
    'hrvRRMedian',
    'hrvRRMean',
    'hrvShannonEntropyRR',
    'hrvShannonEntropyRRDiffs',
    'hrvPNN30',
    'hrvRMSSD',
    'hrvSDNN',
    'sleepCoefficient',
    'onWrist',
    'jerkAuto',
    'stepCount',
    'logEnergy',
    'grok_covariance',
    'logEnergyRatio',
    'zeroCrossingStd',
    'zeroCrossingAvg',
    'axisMean',
    'altimStdNorm',
    'grok_kurtosis',
]

# Activity name -> activity ID, for every known activity.
actDict = {
    'Yoga': 52000,
    'Pilates': 53000,
    'Bike': 90001,
    'Run': 90009,
    'Hike': 90012,
    'Walk': 90013,
    'Elliptical': 90017,
    'Treadmill': 90019,
    'Swim': 90024,
    'HIIT': 91040,
    'Weightlifting': 91043,
    'Core training': 91046,
}

# Activity name -> class index used by the one-hot-encoded (OHE) label.
# NOTE(review): 'Strength training' has no entry in actDict — confirm the name.
actOHEDict = {
    name: index
    for index, name in enumerate([
        'Weightlifting',
        'Swim',
        'Elliptical',
        'Walk',
        'Run',
        'Bike',
        'HIIT',
        'Strength training',
    ])
}

# Pretrain data sizes.
datasizes = [1_000, 10_000, 100_000, 750_000, 1_321_235]

# Embedding-dump jobs of the linear-probe runs, keyed by XID / WID. Every job
# shares the same pretrain step, fine-tune step and probe type; only the
# pretrain data size differs.
embedding_dump_xm_dict = {
    job_id: {
        'pretrain_datasize': pretrain_datasize,
        'pretrain_step': 50000,
        'ft_step': 300,
        'ft_probe': 'linear_probe',
    }
    for job_id, pretrain_datasize in (
        ('126388131/1', 1321235),  # Train size: 1.3 M
        ('127490536/4', 750000),   # Train size: 750 K
        ('127490536/3', 100000),   # Train size: 100 K
        ('127490536/2', 10000),    # Train size: 10 K
        ('127490536/1', 1000),     # Train size: 1 K
    )
}

# Embedding-dump jobs of the fine-tuned runs, keyed by XID / WID.
finetune_embedding_dump_xm_dict = {
    job_id: {
        'pretrain_datasize': pretrain_datasize,
        'pretrain_step': 50000,
        'ft_step': 300,
        'ft_probe': 'finetune',
    }
    for job_id, pretrain_datasize in (
        ('126268296/1', 1321235),  # Train size: 1.3 M
        ('127526958/1', 1000),     # Train size: 1 K
    )
}

# Setup XM Client
# Creates the XManager API client for the 'alphabet' deployment environment.
# NOTE(review): `xm_client` is not referenced by any other cell visible in
# this notebook — confirm it is still needed.
xm_client = xmanager_api.XManagerApi(xm_deployment_env='alphabet')

## Non-Finetune 1.3M Pretrain

In [None]:
# @title Load Embeddings (~20 mins to load embeddings to RAM).

# XID / WID of the embedding-dump job to load; the full list of pairs lives in
# the constants cell (this is the 1.3 M pretrain, linear-probe entry).
xid_wid = '126388131/1'

# Directory holding this job's dumps.
file_name = os.path.join('/cns/dz-d/home/xliucs/lsm/xm/', xid_wid)
# The pooled dump is loaded here; for the full (unpooled) dump use
# 'full_train_embeddeding_300.npy' instead.
embedding_dump_fname = os.path.join(file_name, 'pooled_train_embeddeding_300.npy')
metadata_name = os.path.join(file_name, 'metadata_300.npy')

print('Reading Full (Pooled) Embedding File:', embedding_dump_fname)
with gfile.Open(embedding_dump_fname, 'rb') as f:
  embedding_arr = np.load(f)

print('Reading Metadata File:', metadata_name)
with gfile.Open(metadata_name, 'rb') as f:
  metadata_arr = np.load(f)

print('\nPooled Embedding shape', embedding_arr.shape)
print('Metadata shape', metadata_arr.shape)

# Rows 0-5 of the metadata array, bound in this order.
targets, preds, subj_id, age, weight, gender = (
    metadata_arr[row, :] for row in range(6)
)

# Report the total number of examples, then the count per activity class.
labels = targets.tolist()
print(f'\nTotal Count {len(labels)}\n')
# NOTE(review): `y` and `X` are initialized but never filled in the visible cells.
y, X = [], []
for actName, act in actOHEDict.items():
  indices = [idx for idx, label in enumerate(labels) if label == act]
  print(actName, len(indices))


In [None]:
# @title (UNUSED) Crop and Pool Embeddings
# NOTE: This is to be used if you are loading the FULL (unpooled) embeddings.
# This can then be used to reshape the spatio-temporal characteristics of the
# patches, and then select a time window of interest.

# Xd = embedding_arr

# # Each 0.1 of [start, end] represents 30 mins.
# # Eg. 1: [start, end] = [0.8, 0.9] = [240 min, 270 min]
# # Eg. 2: [start, end] = [None, None] = [0, 1] = [0 min, 300 min]
# # A single patch is 0.033333 = 10 mins
# start = None
# end = None

# print('Full Embedding shape', Xd.shape)
# # Takes the embeddings from the last (end - start) percentage of 300 min window.
# Xd = reshape_time_crop_patch_embeddings(Xd, patch_reorder_shape=(30, 6), start=start, end=end)
# print('Cropped Embedding shape', Xd.shape)
# # These selected embeddings are average pooled.
# Xd = np.mean(Xd, axis=(1, 2))
# print('Pooled Embedding shape', Xd.shape)
# Xd.shape

In [None]:
# @title ACTIVITY RECOGNITION

# t-SNE view of the loaded embeddings, colored by activity label.
Xd = embedding_arr
yd = targets
names = list(actOHEDict.keys())
# Size the palette from the class list so `colors` and `names` always have the
# same length (plot_embeddings raises ValueError otherwise).
colors = sns.color_palette("Set2", n_colors=len(names)).as_hex()
embedding_xt_1M = plot_embeddings(Xd, yd, colors, names)
# Keep this run's labels under a run-specific name; `yd` is overwritten by the
# later sections.
yd_1M = yd

In [None]:
# @title Embedding Quality

# Cluster the 2-D projection with k-means, one cluster per activity class.
# NOTE(review): the scores below are computed on the t-SNE projection returned
# by plot_embeddings, not on the raw embeddings — confirm this is intended.
num_classes = 8
kmeans_1M = KMeans(n_clusters=num_classes, random_state=42).fit_predict(embedding_xt_1M)

# Internal metrics: judge the clustering from the projected points alone.
sil_score = silhouette_score(embedding_xt_1M, kmeans_1M)
db_score = davies_bouldin_score(embedding_xt_1M, kmeans_1M)
ch_score = calinski_harabasz_score(embedding_xt_1M, kmeans_1M)
# External metrics: agreement between the clusters and the activity labels.
ari = adjusted_rand_score(yd_1M, kmeans_1M)
nmi = normalized_mutual_info_score(yd_1M, kmeans_1M)

print('1.3 M Pretrain / No Finetune')
print(f"Silhouette Score: {sil_score}")
print(f"Davies-Bouldin Score: {db_score}")
print(f"Calinski-Harabasz Score: {ch_score}")
print(f"Adjusted Rand Index: {ari}")
print(f"Normalized Mutual Information: {nmi}")

data = [
    ['Silhouette Score', sil_score],
    ['Davies-Bouldin Score', db_score],
    ['Calinski-Harabasz Score', ch_score],
    ['Adjusted Rand Index', ari],
    ['Normalized Mutual Information', nmi],
]

df = pd.DataFrame(data, columns=['Metric', 'Value'])
df

## Non-Finetune 1K Pretrain

In [None]:
# @title Load Embeddings (~20 mins to load embeddings to RAM).

# XID / WID of the embedding-dump job to load; the full list of pairs lives in
# the constants cell (this is the 1 K pretrain, linear-probe entry).
xid_wid = '127490536/1'

# Directory holding this job's dumps.
file_name = os.path.join('/cns/dz-d/home/xliucs/lsm/xm/', xid_wid)
# The pooled dump is loaded here; for the full (unpooled) dump use
# 'full_train_embeddeding_300.npy' instead.
embedding_dump_fname = os.path.join(file_name, 'pooled_train_embeddeding_300.npy')
metadata_name = os.path.join(file_name, 'metadata_300.npy')

print('Reading Full (Pooled) Embedding File:', embedding_dump_fname)
with gfile.Open(embedding_dump_fname, 'rb') as f:
  embedding_arr = np.load(f)

print('Reading Metadata File:', metadata_name)
with gfile.Open(metadata_name, 'rb') as f:
  metadata_arr = np.load(f)

print('\nPooled Embedding shape', embedding_arr.shape)
print('Metadata shape', metadata_arr.shape)

# Rows 0-5 of the metadata array, bound in this order.
targets, preds, subj_id, age, weight, gender = (
    metadata_arr[row, :] for row in range(6)
)

# Report the total number of examples, then the count per activity class.
labels = targets.tolist()
print(f'\nTotal Count {len(labels)}\n')
# NOTE(review): `y` and `X` are initialized but never filled in the visible cells.
y, X = [], []
for actName, act in actOHEDict.items():
  indices = [idx for idx, label in enumerate(labels) if label == act]
  print(actName, len(indices))


In [None]:
# @title ACTIVITY RECOGNITION

# t-SNE view of the loaded embeddings, colored by activity label.
Xd = embedding_arr
yd = targets
names = list(actOHEDict.keys())
# Size the palette from the class list so `colors` and `names` always have the
# same length (plot_embeddings raises ValueError otherwise).
colors = sns.color_palette("Set2", n_colors=len(names)).as_hex()
embedding_xt_1K = plot_embeddings(Xd, yd, colors, names)
# Keep this run's labels under a run-specific name; `yd` is overwritten by the
# later sections.
yd_1K = yd

In [None]:
# @title Embedding Quality

# Cluster the 2-D projection with k-means, one cluster per activity class.
# NOTE(review): the scores below are computed on the t-SNE projection returned
# by plot_embeddings, not on the raw embeddings — confirm this is intended.
num_classes = 8
kmeans_1K = KMeans(n_clusters=num_classes, random_state=42).fit_predict(embedding_xt_1K)

# Internal metrics: judge the clustering from the projected points alone.
sil_score = silhouette_score(embedding_xt_1K, kmeans_1K)
db_score = davies_bouldin_score(embedding_xt_1K, kmeans_1K)
ch_score = calinski_harabasz_score(embedding_xt_1K, kmeans_1K)
# External metrics: agreement between the clusters and the activity labels.
ari = adjusted_rand_score(yd_1K, kmeans_1K)
nmi = normalized_mutual_info_score(yd_1K, kmeans_1K)

# Header names this section's run (1 K pretrain, no fine-tuning).
print('1 K Pretrain / No Finetune')
print(f"Silhouette Score: {sil_score}")
print(f"Davies-Bouldin Score: {db_score}")
print(f"Calinski-Harabasz Score: {ch_score}")
print(f"Adjusted Rand Index: {ari}")
print(f"Normalized Mutual Information: {nmi}")

data = [
    ['Silhouette Score', sil_score],
    ['Davies-Bouldin Score', db_score],
    ['Calinski-Harabasz Score', ch_score],
    ['Adjusted Rand Index', ari],
    ['Normalized Mutual Information', nmi],
]

df = pd.DataFrame(data, columns=['Metric', 'Value'])
df

## Finetune 1.3M

In [None]:
# @title Load Embeddings.

# XID / WID of the embedding-dump job to load; the full list of pairs lives in
# the constants cell (this is the 1.3 M pretrain, fine-tuned entry).
xid_wid = '126268296/1'

# Directory holding this job's dumps.
file_name = os.path.join('/cns/dz-d/home/xliucs/lsm/xm/', xid_wid)
# The pooled dump is loaded here; for the full (unpooled) dump use
# 'full_train_embeddeding_300.npy' instead.
embedding_dump_fname = os.path.join(file_name, 'pooled_train_embeddeding_300.npy')
metadata_name = os.path.join(file_name, 'metadata_300.npy')

print('Reading Full (Pooled) Embedding File:', embedding_dump_fname)
with gfile.Open(embedding_dump_fname, 'rb') as f:
  embedding_arr = np.load(f)

print('Reading Metadata File:', metadata_name)
with gfile.Open(metadata_name, 'rb') as f:
  metadata_arr = np.load(f)

print('\nPooled Embedding shape', embedding_arr.shape)
print('Metadata shape', metadata_arr.shape)

# Rows 0-5 of the metadata array, bound in this order.
targets, preds, subj_id, age, weight, gender = (
    metadata_arr[row, :] for row in range(6)
)

# Report the total number of examples, then the count per activity class.
labels = targets.tolist()
print(f'\nTotal Count {len(labels)}\n')
# NOTE(review): `y` and `X` are initialized but never filled in the visible cells.
y, X = [], []
for actName, act in actOHEDict.items():
  indices = [idx for idx, label in enumerate(labels) if label == act]
  print(actName, len(indices))


In [None]:
# @title ACTIVITY RECOGNITION

# t-SNE view of the loaded embeddings, colored by activity label.
Xd = embedding_arr
yd = targets
names = list(actOHEDict.keys())
# Size the palette from the class list so `colors` and `names` always have the
# same length (plot_embeddings raises ValueError otherwise).
colors = sns.color_palette("Set2", n_colors=len(names)).as_hex()
embedding_xt_FT_1M = plot_embeddings(Xd, yd, colors, names)

# Keep this run's labels under a run-specific name; `yd` is overwritten by the
# later sections.
yd_FT_1M = yd

In [None]:
# @title Embedding Quality

# Cluster the 2-D projection with k-means, one cluster per activity class.
# NOTE(review): the scores below are computed on the t-SNE projection returned
# by plot_embeddings, not on the raw embeddings — confirm this is intended.
num_classes = 8
kmeans_FT_1M = KMeans(n_clusters=num_classes, random_state=42).fit_predict(embedding_xt_FT_1M)

# Internal metrics: judge the clustering from the projected points alone.
sil_score = silhouette_score(embedding_xt_FT_1M, kmeans_FT_1M)
db_score = davies_bouldin_score(embedding_xt_FT_1M, kmeans_FT_1M)
ch_score = calinski_harabasz_score(embedding_xt_FT_1M, kmeans_FT_1M)
# External metrics: agreement between the clusters and the activity labels.
ari = adjusted_rand_score(yd_FT_1M, kmeans_FT_1M)
nmi = normalized_mutual_info_score(yd_FT_1M, kmeans_FT_1M)

# Header names this section's run (1.3 M pretrain, fine-tuned).
print('1.3 M Pretrain / Finetune')
print(f"Silhouette Score: {sil_score}")
print(f"Davies-Bouldin Score: {db_score}")
print(f"Calinski-Harabasz Score: {ch_score}")
print(f"Adjusted Rand Index: {ari}")
print(f"Normalized Mutual Information: {nmi}")

data = [
    ['Silhouette Score', sil_score],
    ['Davies-Bouldin Score', db_score],
    ['Calinski-Harabasz Score', ch_score],
    ['Adjusted Rand Index', ari],
    ['Normalized Mutual Information', nmi],
]

df = pd.DataFrame(data, columns=['Metric', 'Value'])
df

## Finetune 1K

In [None]:
# @title Load Embeddings.

# XID / WID of the embedding-dump job to load; the full list of pairs lives in
# the constants cell (this is the 1 K pretrain, fine-tuned entry).
xid_wid = '127526958/1'

# Directory holding this job's dumps.
file_name = os.path.join('/cns/dz-d/home/xliucs/lsm/xm/', xid_wid)
# The pooled dump is loaded here; for the full (unpooled) dump use
# 'full_train_embeddeding_300.npy' instead.
embedding_dump_fname = os.path.join(file_name, 'pooled_train_embeddeding_300.npy')
metadata_name = os.path.join(file_name, 'metadata_300.npy')

print('Reading Full (Pooled) Embedding File:', embedding_dump_fname)
with gfile.Open(embedding_dump_fname, 'rb') as f:
  embedding_arr = np.load(f)

print('Reading Metadata File:', metadata_name)
with gfile.Open(metadata_name, 'rb') as f:
  metadata_arr = np.load(f)

print('\nPooled Embedding shape', embedding_arr.shape)
print('Metadata shape', metadata_arr.shape)

# Rows 0-5 of the metadata array, bound in this order.
targets, preds, subj_id, age, weight, gender = (
    metadata_arr[row, :] for row in range(6)
)

# Report the total number of examples, then the count per activity class.
labels = targets.tolist()
print(f'\nTotal Count {len(labels)}\n')
# NOTE(review): `y` and `X` are initialized but never filled in the visible cells.
y, X = [], []
for actName, act in actOHEDict.items():
  indices = [idx for idx, label in enumerate(labels) if label == act]
  print(actName, len(indices))


In [None]:
# @title ACTIVITY RECOGNITION

# t-SNE view of the loaded embeddings, colored by activity label.
Xd = embedding_arr
yd = targets
names = list(actOHEDict.keys())
# Size the palette from the class list so `colors` and `names` always have the
# same length (plot_embeddings raises ValueError otherwise).
colors = sns.color_palette("Set2", n_colors=len(names)).as_hex()
embedding_xt_FT_1K = plot_embeddings(Xd, yd, colors, names)
# Keep this run's labels under a run-specific name.
yd_FT_1K = yd

In [None]:
# @title Embedding Quality

# Cluster the 2-D projection with k-means, one cluster per activity class.
# NOTE(review): the scores below are computed on the t-SNE projection returned
# by plot_embeddings, not on the raw embeddings — confirm this is intended.
num_classes = 8
kmeans_FT_1K = KMeans(n_clusters=num_classes, random_state=42).fit_predict(embedding_xt_FT_1K)

# Internal metrics: judge the clustering from the projected points alone.
sil_score = silhouette_score(embedding_xt_FT_1K, kmeans_FT_1K)
db_score = davies_bouldin_score(embedding_xt_FT_1K, kmeans_FT_1K)
ch_score = calinski_harabasz_score(embedding_xt_FT_1K, kmeans_FT_1K)
# External metrics: agreement between the clusters and the activity labels.
ari = adjusted_rand_score(yd_FT_1K, kmeans_FT_1K)
nmi = normalized_mutual_info_score(yd_FT_1K, kmeans_FT_1K)

# Header names this section's run (1 K pretrain, fine-tuned).
print('1 K Pretrain / Finetune')
print(f"Silhouette Score: {sil_score}")
print(f"Davies-Bouldin Score: {db_score}")
print(f"Calinski-Harabasz Score: {ch_score}")
print(f"Adjusted Rand Index: {ari}")
print(f"Normalized Mutual Information: {nmi}")

data = [
    ['Silhouette Score', sil_score],
    ['Davies-Bouldin Score', db_score],
    ['Calinski-Harabasz Score', ch_score],
    ['Adjusted Rand Index', ari],
    ['Normalized Mutual Information', nmi],
]

df = pd.DataFrame(data, columns=['Metric', 'Value'])
df

# SANDBOX

In [None]:
# Scratch: eight hand-entered counts and their sum.
# NOTE(review): presumably one count per activity class (there are eight
# entries) — confirm which data split each list comes from.
# x = [1191, 152, 332, 229, 2332, 1860, 6887, 669 ]
x = [412, 49, 104, 425, 441, 315, 1301, 98]
total = sum(x)

In [None]:
# Display the sum of `x`.
total

In [None]:
# Print each entry of `x` as a percentage of `total`, truncated (not rounded)
# to two decimals, and accumulate the truncated percentages in `count`.
# Because of the truncation the accumulated value can fall slightly below 100.
count = 0
for i in x:
  z = int(i/total * 10000) / 100
  print(z)
  count += z

In [None]:
# Display the accumulated (truncated) percentage total.
count

In [None]:
# Scratch: sum of five hand-entered values.
# NOTE(review): the origin of these values is not shown in this notebook.
21.17 + 13.37 + 11.63 + 22.25 + 31.57

In [None]:
# Scratch: sum of five hand-entered values.
# NOTE(review): the origin of these values is not shown in this notebook.
20.71 + 13.22 + 12.15 + 22.04 + 31.87