In [70]:
'''
File: generate_vocab_dict.py
Author: Yutong Dai (rothdyt@gmail.com)
File Created: Saturday, 2018-11-17 14:39
Last Modified: Saturday, 2018-11-17 14:39
--------------------------------------------
Description: Build a word-level vocabulary from COCO caption annotations and pickle it.
'''
import argparse
import logging
import pickle
from collections import Counter

import nltk
import numpy as np
from pycocotools.coco import COCO

# ---- Logging ---------------------------------------------------------------
# Records of level INFO and above that reach the root logger are written to
# "data-preprocess.log" (relative to the current working directory).
log_level = logging.INFO
logger = logging.getLogger()
logger.setLevel(log_level)
handler = logging.FileHandler("data-preprocess.log")
handler.setLevel(log_level)
formatter = logging.Formatter('%(asctime)s - [%(levelname)s] - %(message)s')
handler.setFormatter(formatter)
# Re-running this cell in a live kernel would otherwise stack one more
# FileHandler on the root logger per execution and duplicate every log line,
# so drop (and close) handlers from earlier runs that target the same file.
for _stale in [h for h in logger.handlers
               if isinstance(h, logging.FileHandler)
               and h.baseFilename == handler.baseFilename]:
    logger.removeHandler(_stale)
    _stale.close()
logger.addHandler(handler)

# ---- Configuration ---------------------------------------------------------
# Path to the COCO caption annotation file. NOTE(review): the name `json`
# would shadow the stdlib module if that were ever imported; it is kept
# because the vocabulary-building cell reads it under this name.
json = "../../../../Desktop/annotations/captions_val2017.json"
# Cumulative share of all token occurrences used to pick the frequency
# cut-off for words that enter the vocabulary.
portion = 0.95
# Destination of the pickled vocabulary object.
save_path = "./vocab.pkl"

In [74]:
# Reference <https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/03-advanced/image_captioning/build_vocab.py#L8>
class Vocabulary(object):
    """Two-way lookup table between words and consecutive integer ids."""

    def __init__(self):
        # Next id to hand out; ids start at 0 and grow by one per new word.
        self.idx = 0
        self.word2idx = {}
        self.idx2word = {}

    def add_word(self, word):
        """Register `word` under the next free id; known words are ignored."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __call__(self, word):
        """Return the id of `word`, falling back to the id of '<unk>'.

        Raises KeyError if `word` is unknown and '<unk>' was never added.
        """
        key = word if word in self.word2idx else '<unk>'
        return self.word2idx[key]

    def __len__(self):
        """Number of distinct words currently stored."""
        return len(self.word2idx)


# construct coco instance
# Reference: <https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/03-advanced/image_captioning/build_vocab.py#L31>
# Count how often each lower-cased token occurs across all captions.
coco = COCO(json)
ids = coco.anns.keys()
counter = Counter()
# `ann_id` rather than `id`: the loop variable outlives the loop at notebook
# scope and would otherwise shadow the builtin `id` in every later cell.
for i, ann_id in enumerate(ids):
    caption = str(coco.anns[ann_id]['caption'])
    tokens = nltk.tokenize.word_tokenize(caption.lower())
    counter.update(tokens)
    if (i+1) % 5000 == 0:
        print("Tokenization Process: {}%.".format((i+1)*100/len(ids)))

# Keep the most frequent words. With counts sorted in descending order, find
# the first position where the cumulative share of token occurrences exceeds
# `portion`; the count at that position is the minimum frequency a word needs.
counts = sorted(counter.values(), reverse=True)
if not counts:
    # np.argmax on an empty array fails with an unhelpful message.
    raise ValueError("No tokens found in '{}'; cannot build a vocabulary.".format(json))
cum_ratio = np.cumsum(counts) / np.sum(counts)
reached = cum_ratio > portion
# np.argmax returns 0 when no entry is True (e.g. portion >= 1), which would
# select the *largest* count as threshold and discard nearly every word. In
# that case keep everything by using the smallest count instead.
threshold = counts[np.argmax(reached)] if reached.any() else counts[-1]
words = sorted(word for word, count in counter.items() if count >= threshold)

vocab = Vocabulary()
# Special tokens come first so they receive ids 0-3; '<pad>' is for padding.
for special in ('<start>', '<end>', '<unk>', '<pad>'):
    vocab.add_word(special)
# Add the selected words in alphabetical order.
for word in words:
    vocab.add_word(word)

# NOTE(review): pickle stores instances by reference to their class, so code
# that loads this file needs a compatible `Vocabulary` class importable under
# the same module path - confirm against the loader.
with open(save_path, 'wb') as f:
    pickle.dump(vocab, f)
print("Total vocabulary size: {}".format(len(vocab)))
print("Saved the vocabulary wrapper to '{}'".format(save_path))

loading annotations into memory...
Done (t=0.06s)
creating index...
index created!
Tokenization Process: 19.988806268489647%.
Tokenization Process: 39.977612536979294%.
Tokenization Process: 59.96641880546894%.
Tokenization Process: 79.95522507395859%.
Tokenization Process: 99.94403134244823%.
Total vocabulary size: 1532
Saved the vocabulary wrapper to './vocab.pkl'


In [75]:
# Display the id -> word mapping of the vocabulary as a sanity check
# (special tokens are expected first, followed by the words in sorted order).
vocab.idx2word

{0: '<start>',
 1: '<end>',
 2: '<unk>',
 3: '<pad>',
 4: "''",
 5: "'s",
 6: ',',
 7: '.',
 8: '2',
 9: '``',
 10: 'a',
 11: 'about',
 12: 'above',
 13: 'across',
 14: 'action',
 15: 'adult',
 16: 'adults',
 17: 'advertisement',
 18: 'advertising',
 19: 'aerial',
 20: 'after',
 21: 'against',
 22: 'air',
 23: 'airplane',
 24: 'airplanes',
 25: 'airport',
 26: 'all',
 27: 'almost',
 28: 'alone',
 29: 'along',
 30: 'alongside',
 31: 'also',
 32: 'american',
 33: 'among',
 34: 'an',
 35: 'and',
 36: 'animal',
 37: 'animals',
 38: 'another',
 39: 'antique',
 40: 'apartment',
 41: 'appears',
 42: 'apple',
 43: 'apples',
 44: 'appliances',
 45: 'approaching',
 46: 'are',
 47: 'area',
 48: 'arm',
 49: 'arms',
 50: 'around',
 51: 'arranged',
 52: 'art',
 53: 'as',
 54: 'asian',
 55: 'asleep',
 56: 'assorted',
 57: 'assortment',
 58: 'at',
 59: 'atop',
 60: 'attached',
 61: 'attempting',
 62: 'audience',
 63: 'away',
 64: 'baby',
 65: 'back',
 66: 'background',
 67: 'backpack',
 68: 'backs',
 