# Prepare the regular split: the full dataset, but with class embeddings added too

In [1]:
import json

In [2]:
import numpy as np

In [3]:
import torch

In [4]:
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.modeling.language_backbone.transformers import BERT

In [5]:
# Read the train2017 instance annotations and parse them into a Python object.
with open('../datasets/coco/annotations/instances_train2017.json', 'r') as train_file:
    coco_train_anno_all = json.loads(train_file.read())

In [6]:
# Read the val2017 instance annotations and parse them into a Python object.
with open('../datasets/coco/annotations/instances_val2017.json', 'r') as val_file:
    coco_val_anno_all = json.loads(val_file.read())

In [7]:
# Class names, in the order the validation annotations list their categories.
labels_all = [category['name'] for category in coco_val_anno_all['categories']]

In [8]:
# Vocabulary of distinct whitespace-separated words across all class names
# (multi-word names contribute each of their words).
labels_all_tokens = set()
for item in labels_all:
    labels_all_tokens.update(item.split())

In [9]:
# Sanity check: number of class names vs. number of distinct words they contain.
len(labels_all), len(labels_all_tokens)

(80, 92)

In [10]:
# Collect the GloVe vector of every word that occurs in a class name.
# Each line of the file is parsed as: the token, followed by its
# whitespace-separated vector components.
class_token_to_glove = {}
# Pass the encoding explicitly: the GloVe vocabulary contains non-ASCII
# tokens, so relying on the platform's default locale encoding can raise
# UnicodeDecodeError (or mis-decode) on non-UTF-8 systems.
with open('../datasets/coco/zero-shot/glove.6B.300d.txt', 'r', encoding='utf-8') as fin:
    for row in fin:
        row_tk = row.split()
        # Skip blank / whitespace-only lines instead of failing on row_tk[0].
        if not row_tk:
            continue
        if row_tk[0] in labels_all_tokens:
            class_token_to_glove[row_tk[0]] = [float(num) for num in row_tk[1:]]


In [11]:
# Number of class-name words for which a GloVe vector was found in the file.
len(class_token_to_glove)

92

In [12]:
# One GloVe embedding per class name: the average of the vectors of the
# words in the name (a single-word name keeps its own vector).
embeddings = []
for item in labels_all:
    words = item.split()
    # Accumulate in a float32 buffer, then divide by the word count.
    emb = np.zeros((300,), dtype=np.float32)
    for tk in words:
        emb += class_token_to_glove[tk]
    emb /= len(words)
    embeddings.append(emb)

In [14]:
# Map each class name to its averaged GloVe vector, converted to a plain
# Python list so it can be JSON-serialized.
class_name_to_glove = dict(zip(labels_all, (vec.tolist() for vec in embeddings)))

In [15]:
# Instantiate the project's BERT language backbone from the maskrcnn_benchmark
# config object; cfg is used exactly as imported (no config file is merged in
# the visible cells).
bert = BERT(cfg)

In [16]:
# Move the model to the GPU. The return value is bound to `_`, presumably to
# keep the notebook from echoing it.
_ = bert.to('cuda')

In [17]:
# Run all class names through the BERT wrapper in one call. The result is
# indexed by the keys 'special_tokens_mask' and 'input_embeddings' in the
# following cells.
encoded_class_list = bert(labels_all)

In [18]:
# Invert the special-tokens mask and make it float32 so it can be used as
# multiplicative weights when pooling the embeddings.
mask = (1 - encoded_class_list['special_tokens_mask']).float()

In [19]:
# Per class name, the total of the inverted mask along the last axis, i.e. how
# many positions contribute to that class's pooled embedding.
mask.sum(-1)

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 2., 2., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 3., 1., 1., 2., 1., 1., 3., 2., 2., 2., 1., 2., 2.,
        2., 2., 3., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 1., 2., 1.,
        2., 1., 1., 1., 3., 1., 2., 1., 1., 1., 1., 1., 1., 2., 1., 1., 2., 1.,
        1., 1., 1., 1., 1., 2., 3., 2.], device='cuda:0')

In [20]:
# Masked mean over axis 1: weight each position's embedding by the mask, sum
# over positions, and divide by the number of contributing positions.
# Note: this rebinds `embeddings`, which previously held the GloVe vectors.
embeddings = (
    (encoded_class_list['input_embeddings'] * mask.unsqueeze(-1)).sum(dim=1)
    / mask.sum(dim=1, keepdim=True)
)

In [21]:
# Convert the pooled embeddings to a NumPy array on the host.
# detach() first: Tensor.numpy() raises a RuntimeError for tensors that
# require grad, so this keeps the conversion working whether or not the
# BERT embeddings are part of an autograd graph. It is a no-op otherwise.
embeddings = embeddings.detach().cpu().numpy()

In [22]:
# Inspect the shape of the pooled BERT embedding array.
embeddings.shape

(80, 768)

In [23]:
# Map each class name to its pooled BERT embedding as a plain Python list
# (row i of `embeddings` belongs to labels_all[i]).
class_name_to_bertemb = dict(zip(labels_all, embeddings.tolist()))

In [24]:
# Sanity check: both embedding lookups should have one entry per class name.
len(class_name_to_bertemb), len(class_name_to_glove), len(labels_all)

(80, 80, 80)

In [25]:
# Attach both embeddings to every category entry of the training annotations,
# looked up by the category's name.
for category in coco_train_anno_all['categories']:
    category['embedding'] = {
        'GloVE': class_name_to_glove[category['name']],
        'BertEmb': class_name_to_bertemb[category['name']],
    }


In [26]:
# Attach both embeddings to every category entry of the validation annotations,
# looked up by the category's name.
for category in coco_val_anno_all['categories']:
    category['embedding'] = {
        'GloVE': class_name_to_glove[category['name']],
        'BertEmb': class_name_to_bertemb[category['name']],
    }


In [27]:
# Serialize the training annotation object to the zero-shot directory as the
# "_full" variant.
with open('../datasets/coco/zero-shot/instances_train2017_full.json', 'w') as train_out:
    json.dump(coco_train_anno_all, fp=train_out)

In [28]:
# Serialize the validation annotation object to the zero-shot directory as the
# "_full" variant.
with open('../datasets/coco/zero-shot/instances_val2017_full.json', 'w') as val_out:
    json.dump(coco_val_anno_all, fp=val_out)