In [1]:
# Make the ViDA-SSL project importable from this notebook.
# Shell alternative: export PYTHONPATH=<project root> before launching Jupyter.
import os
import sys

# The checkout location can be overridden with the VIDA_SSL_ROOT environment
# variable; the default keeps the original hard-coded path.
PROJECT_ROOT = os.environ.get("VIDA_SSL_ROOT", "/Users/piyush/projects/ViDA-SSL/")

# Guard against duplicate sys.path entries when this cell is re-run in the
# same kernel.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
from os.path import join
import pandas as pd
from tqdm import tqdm
import numpy as np

import torch
from fast_pytorch_kmeans import KMeans
from transformers import AutoTokenizer, AutoModel

from misc.local_utils import get_phrase_embedding
from utils.viz import bokeh_2d_scatter, bokeh_2d_scatter_new

In [3]:
# Root of the local Something-Something dataset directory.
DATA_DIR = "/Users/piyush/datasets/Something-Something/"

# Class-list CSV, located relative to DATA_DIR.
_LABELS_RELPATH = "annotations/action-clf/coarse_grained_classes.csv"
labels_file = join(DATA_DIR, _LABELS_RELPATH)

In [4]:
# Load the class table and map each class index to its class phrase.
df = pd.read_csv(labels_file)

# Pair the two columns row by row. The previous form, df['class'][k], looked
# the phrase up by DataFrame index label, which is only correct when
# class_index happens to equal the default 0..n-1 row index.
class_label_dict = dict(zip(df['class_index'].values, df['class'].values))

In [5]:
# Indices of the last four hidden-state layers.
# NOTE(review): `layers` is not referenced again in the visible cells —
# confirm it is still needed.
layers = list(range(-4, 0))

# Tokenizer and model are loaded from the same checkpoint name; the model is
# asked to expose its hidden states via output_hidden_states=True.
MODEL_NAME = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Smoke-test the embedding helper on a sample phrase. `eg_embedding` is not
# referenced again in the visible cells.
eg_embedding = get_phrase_embedding(model, tokenizer, "hello I am a boy")

In [7]:
# Embed every class phrase once, keyed by class id (tqdm shows progress).
embeddings = {}
for class_id, class_phrase in tqdm(class_label_dict.items()):
    embeddings[class_id] = get_phrase_embedding(model, tokenizer, class_phrase)

# Dicts preserve insertion order, so the ids and the stacked rows line up.
class_ids = list(embeddings)

# Stack the per-class embeddings along a new leading dimension
# (equivalent to unsqueeze(0) on each followed by cat on dim 0).
embeddings_tensor = torch.stack(list(embeddings.values()), dim=0)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 21.33it/s]


In [8]:
# Inspect the shape of the stacked embeddings (one row per class phrase).
embeddings_tensor.shape

torch.Size([50, 768])

In [16]:
# Seed the global RNGs so the clustering and the 2-D projection are repeatable
# across "Restart & Run All": k-means initialisation and torch.pca_lowrank are
# both randomized.
# NOTE(review): which RNG fast_pytorch_kmeans draws from is not visible here,
# so both NumPy and torch are seeded.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Group the class-phrase embeddings into 6 clusters (Euclidean mode).
kmeans = KMeans(n_clusters=6, mode='euclidean', verbose=1)
labels = kmeans.fit_predict(embeddings_tensor)

# Low-rank PCA of the embedding matrix; V holds the principal directions.
(U, S, V) = torch.pca_lowrank(embeddings_tensor)

# Project onto the first K principal directions for plotting.
# NOTE(review): pca_lowrank presumably centers its input by default, while this
# projection uses the uncentered embeddings — if so, the 2-D points are offset
# by a constant relative to true PCA scores; confirm that is acceptable here.
K = 2
Z = torch.matmul(embeddings_tensor, V[:, :K])

used 2 iterations (0.0026s) to cluster 50 items into 6 clusters


In [17]:
# Assemble the plotting table in one constructor call: 2-D coordinates,
# k-means cluster assignment, and each class id with its phrase.
df = pd.DataFrame(
    {
        "x": Z[:, 0].numpy(),
        "y": Z[:, 1].numpy(),
        "cluster_label": labels.numpy(),
        "class_id": np.array(class_ids),
        "class_desc": [class_label_dict[cid] for cid in class_ids],
    }
)

In [18]:
# Scatter-plot the projected embeddings with the project's bokeh helper;
# `hue` and `label` point at the cluster and class-description columns.
bokeh_2d_scatter_new(
    df=df,
    x="x",
    y="y",
    hue="cluster_label",
    label="class_desc",
    use_nb=True,
    title="BERT-based embeddings for Something-something action classes.",
)