In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Put the ViDA-SSL checkout on the import path (the in-notebook equivalent of
# exporting PYTHONPATH before launching Jupyter). Presumably this is where the
# `misc`, `utils` and `datasets` packages imported below live -- confirm.
# Set the VIDA_SSL_ROOT environment variable to point at a different checkout;
# the default keeps the original location.
import os
import sys

PROJECT_ROOT = os.environ.get("VIDA_SSL_ROOT", "/Users/piyush/projects/ViDA-SSL/")
if PROJECT_ROOT not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(PROJECT_ROOT)

In [3]:
from os.path import join
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

import torch
from fast_pytorch_kmeans import KMeans
from transformers import AutoTokenizer, AutoModel

from misc.local_utils import get_phrase_embedding
from utils.viz import bokeh_2d_scatter, bokeh_2d_scatter_new
from datasets.ntu import NTU
from datasets.epic import EPIC
from datasets.something_something import SomethingSomething
from datasets.finegym import FineGym
from datasets.ucf import UCF
from datasets.kinetics import Kinetics

### Load BERT model

In [4]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
# output_hidden_states=True asks the model to also return per-layer hidden states.
BERT_CHECKPOINT = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def get_sentence_embedding(model, tokenizer, sentence):
    """Embed one sentence as the final-layer hidden state of its first token.

    Args:
        model: Encoder that is called with the tokenizer's outputs as keyword
            arguments and returns an object exposing ``last_hidden_state``.
            If it has ``config.hidden_size``, the output width is checked
            against it.
        tokenizer: Object providing ``encode_plus(sentence, return_tensors="pt")``.
        sentence: Text to embed.

    Returns:
        ``last_hidden_state[0, 0]``: the vector at the first token position of
        the single encoded sequence.

    Raises:
        AssertionError: If the model output does not have batch size 1, or its
            last dimension disagrees with ``model.config.hidden_size``.
    """
    encoded = tokenizer.encode_plus(sentence, return_tensors="pt")

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        output = model(**encoded)

    last_hidden_state = output.last_hidden_state
    batch_size = last_hidden_state.shape[0]
    hidden_dim = last_hidden_state.shape[-1]
    assert batch_size == 1, f"expected a single sequence, got batch size {batch_size}"

    # Validate against the model's own hidden size instead of a hard-coded 768,
    # so encoders of other widths can be used with this helper.
    expected_dim = getattr(getattr(model, "config", None), "hidden_size", None)
    if expected_dim is not None:
        assert hidden_dim == expected_dim, (
            f"hidden size mismatch: got {hidden_dim}, model config says {expected_dim}"
        )

    # Only pick the first-token ([CLS] for BERT tokenizers) embedding,
    # which is used here as the sentence embedding.
    return last_hidden_state[0, 0]

### Load dataset

In [6]:
# Dataset selection: the class name to instantiate, the keyword arguments for
# its constructor, a colour (presumably for plots -- confirm), and the
# annotation file from which class labels are read.
dataset_args = dict(
    name="UCF",
    args=dict(data_dir="../../data/UCF-101/"),
    color="red",
    filename="classes_cleaned.txt",
)

In [7]:
# Resolve the dataset class through an explicit registry instead of eval()-ing
# a formatted string. eval() only works when the repr of every constructor
# argument is valid Python, and it would execute whatever text is in "name".
DATASET_CLASSES = {
    "NTU": NTU,
    "EPIC": EPIC,
    "SomethingSomething": SomethingSomething,
    "FineGym": FineGym,
    "UCF": UCF,
    "Kinetics": Kinetics,
}
dataset = DATASET_CLASSES[dataset_args["name"]](**dataset_args["args"])

# Read the class labels from the configured annotation file. The result is used
# below as a mapping (indexed by string keys such as "100", iterated with .items()).
class_label_dict = dataset._load_annotations(filename=dataset_args["filename"])

### Get embedding for sample action

In [8]:
# Pick the label stored under key "100" as a sample action for a quick sanity check.
sample_action = class_label_dict["100"]

In [9]:
# Display the selected label.
sample_action

'Writing On Board'

In [10]:
# Embed the sample label with the BERT model and tokenizer loaded above.
sample_action_embedding = get_sentence_embedding(model, tokenizer, sample_action)

In [11]:
# Inspect the shape: a single 1-D vector is expected for one sentence.
sample_action_embedding.shape

torch.Size([768])

### Get embeddings for all actions

In [12]:
# Build a mapping from each key of class_label_dict to the BERT sentence
# embedding of its label text, with a progress bar over the classes.
class_label_embeddings = {
    class_id: get_sentence_embedding(model, tokenizer, class_name)
    for class_id, class_name in tqdm(
        class_label_dict.items(),
        desc="Extracting BERT embeddings",
    )
}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Extracting BERT embeddings: 100%|██████████████████████████████████████████████████████████| 101/101 [00:04<00:00, 24.88it/s]


In [15]:
# Spot-check one entry: shape of the embedding stored under key "1".
class_label_embeddings["1"].shape

torch.Size([768])