In [1]:
# Make the ViDA-SSL project importable from this notebook (the in-notebook
# equivalent of exporting PYTHONPATH before launching Jupyter).
# Set the VIDA_SSL_ROOT environment variable to point at a different checkout;
# the original location is kept as the default.
import os
import sys

PROJECT_ROOT = os.environ.get("VIDA_SSL_ROOT", "/Users/piyush/projects/ViDA-SSL/")
# Guard the append so re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
from os.path import join
import pandas as pd
from tqdm import tqdm
import numpy as np

import torch
from fast_pytorch_kmeans import KMeans
from transformers import AutoTokenizer, AutoModel

from misc.local_utils import get_phrase_embedding
from utils.viz import bokeh_2d_scatter, bokeh_2d_scatter_new

In [15]:
# Root of the local EPIC-KITCHENS-100 copy.
DATA_DIR = "/Users/piyush/datasets/EPIC-KITCHENS-100/"

# CSV listing the verb classes used for action classification.
labels_file = join(
    DATA_DIR,
    "annotations",
    "action-clf",
    "EPIC_100_verb_classes.csv",
)

In [16]:
# Load the class table; the code below relies on its `id` and `key` columns.
df = pd.read_csv(labels_file)

# Map class id -> class key by pairing the two columns row by row.
# (Indexing `df['key'][class_id]` would be a *label* lookup on the row index
# and is only correct when every id happens to equal its row position.)
class_label_dict = dict(zip(df["id"].values, df["key"].values))

In [17]:
# NOTE(review): `layers` is not referenced in any visible cell; presumably it
# names the hidden-state indices to pool for the embedding. Confirm or remove.
layers = list(range(-4, 0))

# Single source of truth for the checkpoint so tokenizer and model stay in sync.
BERT_CHECKPOINT = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
# output_hidden_states=True asks the model to return the per-layer hidden states.
model = AutoModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Smoke-test the embedding helper on an arbitrary sentence before the full run.
example_phrase = "hello I am a boy"
eg_embedding = get_phrase_embedding(model, tokenizer, example_phrase)

In [19]:
# Embed every class description once, keyed by class id (progress shown by tqdm).
embeddings = {
    class_id: get_phrase_embedding(model, tokenizer, class_desc)
    for class_id, class_desc in tqdm(class_label_dict.items())
}

# Dicts preserve insertion order, so ids and stacked rows stay aligned.
class_ids = list(embeddings.keys())

# Stack the per-class vectors along a new leading dimension: one row per class.
embeddings_tensor = torch.stack(list(embeddings.values()), dim=0)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 97/97 [00:03<00:00, 27.08it/s]


In [20]:
# Sanity check: one row per class, one column per embedding dimension.
embeddings_tensor.size()

torch.Size([97, 768])

In [24]:
# Seed the RNGs so that clustering and the randomized PCA below give the same
# result on every "Restart & Run All".
# NOTE(review): KMeans initialisation presumably draws from NumPy and/or torch
# random state; both are seeded to be safe. Confirm against the library.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Group the class embeddings into a fixed number of clusters.
N_CLUSTERS = 30
kmeans = KMeans(n_clusters=N_CLUSTERS, mode='euclidean', verbose=1)
labels = kmeans.fit_predict(embeddings_tensor)

# torch.pca_lowrank is a randomized low-rank PCA (hence the seed above).
# The columns of V are the principal directions.
(U, S, V) = torch.pca_lowrank(embeddings_tensor)

# Project onto the first K principal directions for a 2-D scatter plot.
# The raw (uncentered) embeddings are projected, so every point carries the
# same constant offset; the relative layout of the points is unaffected.
K = 2
Z = torch.matmul(embeddings_tensor, V[:, :K])

used 3 iterations (0.0783s) to cluster 97 items into 30 clusters


In [25]:
# Assemble the plotting table in one shot: 2-D coordinates, cluster assignment,
# and the class id / description used for hover labels.
# Note: this rebinds `df`, which previously held the raw class table.
df = pd.DataFrame(
    {
        "x": Z[:, 0].numpy(),
        "y": Z[:, 1].numpy(),
        "cluster_label": labels.numpy(),
        "class_id": np.array(class_ids),
        "class_desc": [class_label_dict[class_id] for class_id in class_ids],
    }
)

In [26]:
# Scatter plot of the 2-D projection, coloured by k-means cluster and labelled
# with the class description. The data comes from EPIC_100_verb_classes.csv,
# so the title refers to verb classes.
bokeh_2d_scatter_new(
    df=df, x="x", y="y", hue="cluster_label", label="class_desc", use_nb=True,
    title="BERT-based embeddings for EPIC-KITCHENS-100 verb classes."
)