# Prepare a set of nouns for open-vocabulary inference

In [1]:
import json
import pickle
import numpy as np

In [2]:
from transformers.tokenization_bert import BasicTokenizer

In [3]:
import nltk

In [23]:
import torch

In [24]:
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.modeling.language_backbone.transformers import BERT

In [4]:
# Load the COCO train2017 caption annotations.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default locale encoding.
with open('../datasets/coco/annotations/captions_train2017.json', 'r', encoding='utf-8') as fin:
    coco_train_anno_all = json.load(fin)

In [5]:
# Lower-casing tokenizer used to split each caption into word tokens.
tokenizer = BasicTokenizer(do_lower_case=True)

In [6]:
# Gather every caption token that NLTK tags as 'NN' (singular noun in the
# Penn Treebank tagset). Duplicates are kept so they can be counted later.
nouns = [
    token
    for annotation in coco_train_anno_all['annotations']
    for token, tag in nltk.pos_tag(tokenizer.tokenize(annotation['caption']))
    if tag == 'NN'
]

In [7]:
# Count how often each noun occurs.
# Counter does this in a single pass and, unlike seeding the dict from
# set(nouns), keeps keys in first-seen order, so the ordering of everything
# derived from `counts` no longer depends on per-run string-hash randomisation.
from collections import Counter

counts = Counter(nouns)

In [8]:
# Number of distinct nouns found in the captions.
len(counts)

14312

In [9]:
# One [noun, count] row per distinct noun.
counts_mtx = [[noun, freq] for noun, freq in counts.items()]

In [10]:
# Convert to a 2-D array. Each row mixes a str and an int, so NumPy stores
# every entry as a string; the counts must be cast back to int before use.
counts_mtx = np.asarray(counts_mtx)

In [11]:
# Row order for descending frequency: negate the counts so that argsort's
# ascending order puts the largest count first. The counts are stored as
# strings, hence the cast. kind='stable' keeps tied counts in their existing
# row order instead of the arbitrary order the default sort may produce.
sort_idx = np.argsort(-counts_mtx[:, 1].astype(np.int32), kind='stable')

In [12]:
# Peek at the 20 most frequent nouns.
counts_mtx[sort_idx][:20]

array([['man', '73340'],
       ['woman', '34210'],
       ['street', '30182'],
       ['table', '27975'],
       ['person', '24745'],
       ['group', '21599'],
       ['top', '21228'],
       ['field', '20779'],
       ['tennis', '19409'],
       ['front', '19057'],
       ['train', '18431'],
       ['plate', '18324'],
       ['room', '18321'],
       ['dog', '18064'],
       ['cat', '17041'],
       ['water', '16513'],
       ['baseball', '15704'],
       ['bathroom', '14433'],
       ['sign', '13946'],
       ['food', '13242']], dtype='<U18')

In [13]:
# Inspect ranks 1160-1169 to see where the frequency drops below 100.
counts_mtx[sort_idx][1160:1170]

array([['celery', '100'],
       ['pigeon', '100'],
       ['maker', '100'],
       ['individual', '99'],
       ['mid', '99'],
       ['asphalt', '98'],
       ['coast', '98'],
       ['drawing', '98'],
       ['ad', '98'],
       ['hipster', '98']], dtype='<U18')

In [14]:
# Map every counted noun that appears in the GloVe vocabulary to its vector
# (a list of floats parsed from the rest of the line).
class_name_to_emb = {}
# The GloVe text file is UTF-8; decode explicitly rather than relying on the
# platform's default encoding.
with open('../datasets/coco/zero-shot/glove.6B.300d.txt', 'r', encoding='utf-8') as fin:
    for row in fin:
        row_tk = row.split()
        if not row_tk:
            # Tolerate blank lines instead of failing on row_tk[0].
            continue
        word = row_tk[0]
        if word in counts:
            class_name_to_emb[word] = [float(num) for num in row_tk[1:]]


In [15]:
# Number of counted nouns that have a GloVe vector.
len(class_name_to_emb)

11678

In [16]:
# Walk the rows in descending-frequency order and keep only those whose noun
# has a GloVe vector.
counts_filtered = [row for row in counts_mtx[sort_idx] if row[0] in class_name_to_emb]

In [17]:
# Keep the 1161 most frequent nouns. Judging by the cell outputs, this is the
# point where the count reaches 100, so it acts as a "count >= 100" cut-off.
# NOTE(review): the number is hard-coded and tied to this exact dataset.
counts_filtered = counts_filtered[:1161]

In [18]:
# Check the least frequent nouns that survived the cut.
counts_filtered[-10:]

[array(['wheelchair', '102'], dtype='<U18'),
 array(['gravy', '101'], dtype='<U18'),
 array(['cutter', '101'], dtype='<U18'),
 array(['loaf', '101'], dtype='<U18'),
 array(['pajamas', '100'], dtype='<U18'),
 array(['lying', '100'], dtype='<U18'),
 array(['porcelain', '100'], dtype='<U18'),
 array(['celery', '100'], dtype='<U18'),
 array(['pigeon', '100'], dtype='<U18'),
 array(['maker', '100'], dtype='<U18')]

In [19]:
# Look up the GloVe vector of each selected noun, preserving list order.
embeddings = [class_name_to_emb[name] for name, _count in counts_filtered]

In [20]:
# Stack the per-noun vectors into a single 2-D array (one row per noun).
embeddings = np.asarray(embeddings)

In [21]:
# Sanity check: (number of nouns, GloVe vector length).
embeddings.shape

(1161, 300)

In [22]:
# Noun strings in the same order as the rows of `embeddings`.
class_names = [name for name, _count in counts_filtered]

In [58]:
# Save the noun list together with its GloVe embedding matrix as one
# (class_names, embeddings) tuple.
with open('../datasets/coco/zero-shot/ov_nouns.pkl', 'wb') as fout:
    pickle.dump((class_names, embeddings), fout)

In [25]:
# Instantiate the project's BERT language backbone from the imported cfg.
bert = BERT(cfg)

In [26]:
# Move the model to the GPU. The result is assigned to `_`, presumably so the
# notebook cell does not echo it.
_ = bert.to('cuda')

In [27]:
# Run every class name through BERT. This is inference only, so disable
# autograd to avoid building a graph and holding activations in GPU memory.
# NOTE(review): if BERT applies dropout, bert.eval() should be called first
# so the saved embeddings are deterministic -- confirm against the BERT class.
with torch.no_grad():
    encoded_class_list = bert(class_names)

In [28]:
# Invert the special-tokens mask and cast to float so it can be used as a
# weight: 1.0 where the original mask is 0, 0.0 where it is 1.
# Presumably 'special_tokens_mask' is 1 at special-token positions -- TODO confirm.
mask = (1 - encoded_class_list['special_tokens_mask']).to(torch.float32)

In [29]:
# Masked mean over the token axis: zero out the positions where mask is 0,
# sum the remaining token embeddings, then divide by the number of kept tokens.
token_emb = encoded_class_list['input_embeddings']
masked_sum = (token_emb * mask.unsqueeze(-1)).sum(dim=1)
kept_tokens = mask.sum(dim=1).unsqueeze(-1)
bertembeddings = masked_sum / kept_tokens

In [30]:
# Move the result to host memory and convert it to NumPy for pickling.
# detach() first: Tensor.numpy() raises on a tensor that requires grad.
bertembeddings = bertembeddings.detach().cpu().numpy()

In [31]:
# Sanity check: (number of nouns, BERT embedding width).
bertembeddings.shape

(1161, 768)

In [32]:
# Save the noun list together with its BERT embedding matrix as one
# (class_names, bertembeddings) tuple.
with open('../datasets/coco/zero-shot/ov_nouns_bertemb.pkl', 'wb') as fout:
    pickle.dump((class_names, bertembeddings), fout)