In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the project's .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Make the project checkout importable from this notebook.
# Set the VIDA_SSL_ROOT environment variable to point at your own checkout;
# when it is unset (or empty) the original author's path is used as fallback.
# Shell alternative: export PYTHONPATH=<checkout root> before starting Jupyter.
import os
import sys

PROJECT_ROOT = os.environ.get("VIDA_SSL_ROOT") or "/Users/piyush/projects/ViDA-SSL/"

# Guard the append so re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
from os.path import join
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

import torch
from fast_pytorch_kmeans import KMeans
from transformers import AutoTokenizer, AutoModel

from misc.local_utils import get_phrase_embedding
from utils.viz import bokeh_2d_scatter, bokeh_2d_scatter_new
from datasets.ntu import NTU
from datasets.epic import EPIC
from datasets.something_something import SomethingSomething
from datasets.finegym import FineGym
from datasets.ucf import UCF
from datasets.kinetics import Kinetics

In [10]:
# Negative layer indices -4..-1 (i.e. the last four).
# NOTE(review): `layers` is not referenced by any other visible cell --
# presumably meant for selecting hidden states; confirm before relying on it.
layers = list(range(-4, 0))

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
# output_hidden_states=True is forwarded to the model so that per-layer hidden
# states are part of its outputs.
tokenizer, model = (
    AutoTokenizer.from_pretrained("bert-base-cased"),
    AutoModel.from_pretrained("bert-base-cased", output_hidden_states=True),
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
def get_embeddings_for_dataset(class_label_dict):
    """Embed every class label of a dataset and stack the results into one tensor.

    Each label is passed to ``get_phrase_embedding`` together with the
    notebook-level ``model`` and ``tokenizer``; a tqdm progress bar tracks the
    loop.

    Args:
        class_label_dict: Mapping of class id -> class label. Only the labels
            are embedded; the rows of the result follow the mapping's
            iteration order.

    Returns:
        torch.Tensor whose first dimension indexes the classes. The remaining
        dimensions are whatever ``get_phrase_embedding`` returns for a single
        phrase (presumably a 1-D vector -- TODO confirm against the helper).

    Raises:
        RuntimeError: raised by ``torch.stack`` when ``class_label_dict`` is
            empty, since there is nothing to stack.
    """
    phrase_embeddings = [
        get_phrase_embedding(model, tokenizer, label)
        for label in tqdm(class_label_dict.values())
    ]
    # stack() adds the leading class dimension directly, which is what the
    # previous unsqueeze(0) + cat(dim=0) combination did by hand.
    return torch.stack(phrase_embeddings, dim=0)

In [21]:
# One entry per dataset to embed and plot. Each compact row below is
# (name, data directory, plot color, label file name) and is expanded into the
# dict layout the later cells read: "name", "args", "color", "filename".
datasets = [
    {
        "name": name,
        "args": {"data_dir": data_dir},
        "color": color,
        "filename": filename,
    }
    for name, data_dir, color, filename in (
        ("Kinetics", "../data/Kinetics-400/", "limegreen", "kinetics_400_labels.csv"),
        ("UCF", "../data/UCF-101/", "red", "classes_cleaned.txt"),
        ("NTU", "../data/NTU/", "blue", "class_labels.txt"),
        ("SomethingSomething", "../data/Something-Something/", "yellow", "fine_grained_classes_cleaned.csv"),
        ("FineGym", "../data/FineGym/", "gray", "gym99_categories_cleaned.txt"),
        ("EPIC", "../data/EPIC-KITCHENS-100/", "magenta", "EPIC_100_verb_classes.csv"),
    )
]

In [13]:
# Build one metadata frame and one embedding tensor per dataset. The two lists
# are appended to in lockstep, so dfs[i] describes the rows of all_embeddings[i].
dfs = []
all_embeddings = []

# Explicit name -> class lookup. This replaces eval() on a formatted string:
# no code is executed from config values, and an unknown dataset name fails
# with a KeyError naming the offending entry.
dataset_classes = {
    "Kinetics": Kinetics,
    "UCF": UCF,
    "NTU": NTU,
    "SomethingSomething": SomethingSomething,
    "FineGym": FineGym,
    "EPIC": EPIC,
}

for datadict in datasets:
    dataset = dataset_classes[datadict["name"]](**datadict["args"])
    # NOTE(review): _load_annotations is underscore-prefixed (private by
    # convention); here it is used as the source of the id -> label mapping.
    class_label_dict = dataset._load_annotations(filename=datadict["filename"])

    class_embeddings = get_embeddings_for_dataset(class_label_dict)

    # One row per class; the scalar color / dataset values are broadcast to
    # every row by the DataFrame constructor.
    df = pd.DataFrame(
        {
            "class_id": list(class_label_dict.keys()),
            "class_label": list(class_label_dict.values()),
            "color": datadict["color"],
            "dataset": datadict["name"],
        }
    )
    dfs.append(df)

    all_embeddings.append(class_embeddings)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:21<00:00, 18.87it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 26.67it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 120/120 [00:05<00:00, 22.47it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 174/174 [00:10<00:00, 17.22it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 99/99 [00:07<00:00, 13.56it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████

In [14]:
# Fit the principal directions on the class embeddings of all datasets
# together, so that every dataset is projected with the same V.
embeddings = torch.cat(all_embeddings, dim=0)

# torch.pca_lowrank is a randomized algorithm; fix the seed so the projection
# (and the scatter plot built from it) is repeatable across runs.
torch.manual_seed(0)
(U, S, V) = torch.pca_lowrank(embeddings)

# Number of leading principal directions kept (Z1 and Z2 for a 2-D scatter).
K = 2

# dfs and all_embeddings must be index-aligned for the zip below to pair each
# frame with its own embeddings.
assert len(dfs) == len(all_embeddings), "dfs and all_embeddings are out of sync"

for dataset_df, X in zip(dfs, all_embeddings):
    # NOTE: pca_lowrank centers its input by default, while X is projected
    # here without centering; the coordinates therefore carry a constant
    # offset, which does not change the relative layout of the points.
    Z = torch.matmul(X, V[:, :K])
    # Hand pandas plain NumPy arrays rather than relying on implicit
    # tensor -> array conversion during column assignment.
    coords = Z.detach().cpu().numpy()
    dataset_df["Z1"] = coords[:, 0]
    dataset_df["Z2"] = coords[:, 1]

In [15]:
# Stack the per-dataset frames row-wise into a single table for plotting.
# Each frame keeps its own row labels, so index values repeat across datasets.
all_df = pd.concat(objs=dfs, axis=0)

In [16]:
# Sanity check: (total number of rows across all datasets, number of columns).
all_df.shape

(990, 6)

In [17]:
# Preview the first rows of the combined table (labels, color, dataset, Z1/Z2).
all_df.head()

Unnamed: 0,class_id,class_label,color,dataset,Z1,Z2
0,0,abseiling,limegreen,Kinetics,2.220908,10.92487
1,1,air drumming,limegreen,Kinetics,18.985031,12.142136
2,2,answering questions,limegreen,Kinetics,19.361813,11.023711
3,3,applauding,limegreen,Kinetics,6.32196,5.507947
4,4,applying cream,limegreen,Kinetics,14.840065,4.924805


In [18]:
# Scatter plot of the 2-D projection of all class-label embeddings: Z1/Z2 are
# the axes, "dataset" is passed as the hue column, "class_label" as the label
# column and "color" as the color column.
bokeh_2d_scatter_new(
    df=all_df,
    x="Z1",
    y="Z2",
    hue="dataset",
    label="class_label",
    use_nb=True,
    color_column="color",
    title="BERT-based embeddings for action classes (phrases) in various datasets.",
    legend_loc="top_right",
)