In [1]:
! pip install pycocotools

Collecting pycocotools
  Using cached pycocotools-2.0.1.tar.gz (23 kB)
Building wheels for collected packages: pycocotools
  Building wheel for pycocotools (setup.py) ... [?25ldone
[?25h  Created wheel for pycocotools: filename=pycocotools-2.0.1-cp36-cp36m-linux_x86_64.whl size=280629 sha256=bcc9801420aa65263a5c51b6fbe9a7d5d20c208582a4309a3be6dee0e42178c9
  Stored in directory: /home/ubuntu/.cache/pip/wheels/3f/ef/f8/0335de365305b04082b274251f59bdc6805238a9fe33cc17ae
Successfully built pycocotools
Installing collected packages: pycocotools
Successfully installed pycocotools-2.0.1


In [2]:
from collections import Counter, defaultdict
import cv2
from pathlib import Path
from pycocotools.coco import COCO
import numpy as np
import nltk
import spacy
import re
from tqdm import tqdm
import pickle

In [3]:
# Root data directory, and the annotations folder that lives inside it.
PATH = Path('data')
ANNOT_PATH = PATH.joinpath('annotations')

In [4]:
def get_annot_index(annot_path, dataset='train', year=2017):
    """Build a COCO index for a captions annotation file.

    The file is looked up inside `annot_path` under the name
    `captions_<dataset><year>.json` and handed to the `COCO` constructor.
    """
    filename = f'captions_{dataset}{year}.json'
    return COCO(annot_path / filename)

## GloVe embeddings and vocabulary

Adapted from class demo notebooks.

In [7]:
def unpack_glove():
    ! wget http://nlp.stanford.edu/data/glove.6B.zip
    ! mkdir data
    ! unzip glove.6B.zip -C data

def loadGloveModel(gloveFile=PATH/"glove.6B.50d.txt"):
    """Load word vectors from a GloVe text file into a dictionary.

    Parameters
    ----------
    gloveFile : path-like
        Text file with one entry per line: the word followed by its
        whitespace-separated vector components.

    Returns
    -------
    dict
        Maps each word to a 1-D numpy array holding its vector.
    """
    word_vecs = {}
    # `with` closes the handle even if parsing fails; the encoding is explicit
    # so reading does not depend on the platform default (the Stanford GloVe
    # release is UTF-8 text).
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line: nothing to parse, and splitLine[0] would raise.
                continue
            word = splitLine[0]
            word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

def get_word_count(content):
    """Tally, for every word, how many lines of `content` contain it.

    Each line is split on whitespace and de-duplicated before counting, so a
    word repeated inside one line still contributes only 1 for that line.
    Returns a defaultdict(float) mapping word -> number of lines.
    """
    doc_freq = defaultdict(float)
    for unique_tokens in (set(text.split()) for text in content):
        for token in unique_tokens:
            doc_freq[token] += 1
    return doc_freq

def delete_rare_words(word_vecs, word_count, min_df=5):
    """Prune rare, vector-less words from `word_count`.

    A word is removed only when both conditions hold: its count is below
    `min_df` and it is absent from `word_vecs`. The dictionary is modified
    in place and the same object is returned.
    """
    # Collect first, delete afterwards: a dict cannot change size while iterated.
    to_drop = [
        token
        for token, freq in word_count.items()
        if freq < min_df and token not in word_vecs
    ]
    for token in to_drop:
        del word_count[token]
    return word_count

def create_embedding_matrix(word_vecs, word_count, min_df=4, emb_size=50):
    """Assemble an embedding matrix plus vocabulary lookups.

    `word_count` is first pruned with `delete_rare_words` (in place). Rows 0-3
    are reserved for "<pad>", "<unk>", "<start>" and "<end>": the padding row
    is all zeros, the other three are drawn uniformly from [-0.25, 0.25).
    Every remaining word gets its vector from `word_vecs` when available and
    a random uniform vector otherwise.

    Returns a tuple `(W, vocab, vocab2index)`: the float32 matrix, a numpy
    array of the words in row order, and a dict mapping word -> row index.
    """
    kept_counts = delete_rare_words(word_vecs, word_count, min_df)
    vocab = ["<pad>", "<unk>", "<start>", "<end>"]
    n_special = len(vocab)

    # np.zeros already leaves row 0 (padding) as the zero vector.
    W = np.zeros((len(kept_counts) + n_special, emb_size), dtype="float32")
    # Random initial vectors for <unk>, <start> and <end>, in that order.
    for row in range(1, n_special):
        W[row] = np.random.uniform(-0.25, 0.25, emb_size)

    vocab2index = {token: idx for idx, token in enumerate(vocab)}
    for row, word in enumerate(kept_counts, start=n_special):
        if word in word_vecs:
            W[row] = word_vecs[word]
        else:
            # No pretrained vector: fall back to a random one.
            W[row] = np.random.uniform(-0.25, 0.25, emb_size)
        vocab2index[word] = row
        vocab.append(word)
    return W, np.array(vocab), vocab2index

In [8]:
# Build the COCO index over the captions annotation file
# (get_annot_index defaults: dataset='train', year=2017).
coco = get_annot_index(ANNOT_PATH)

loading annotations into memory...
Done (t=1.43s)
creating index...
index created!


In [11]:
# NOTE(review): `import spacy` already runs in the imports cell near the top,
# so on a fresh kernel this install comes too late — consider moving it up.
! pip install spacy



In [16]:
# Matches HTML line-break tags such as <br>, <br/> or < BR /> (case-insensitive).
re_br = re.compile(r'<\s*br\s*/?>', flags=re.IGNORECASE)


def sub_br(x):
    """Return `x` with every HTML <br> tag replaced by a newline."""
    return re_br.sub("\n", x)

# spaCy pipeline loaded under the name 'en'; only its tokenizer is used here.
my_tok = spacy.load('en')


def spacy_tok(x):
    """Tokenize `x` with spaCy after turning <br> tags into newlines.

    Returns the list of token strings.
    """
    cleaned = sub_br(x)
    return [token.text for token in my_tok.tokenizer(cleaned)]

In [17]:
# Load the captions index and collect the ids of all annotations.
# NOTE(review): `coco` is already built by an identical call in an earlier
# cell; this reload looks redundant.
coco = get_annot_index(ANNOT_PATH)
annot_ids = coco.anns.keys()

loading annotations into memory...
Done (t=1.03s)
creating index...
index created!


In [18]:
# Pull the caption text of every annotation, then count in how many captions
# each whitespace-separated word occurs.
all_captions = [
    str(coco.anns[annot_id]['caption'])
    for annot_id in tqdm(annot_ids)
]
word_count = get_word_count(all_captions)

100%|██████████| 591753/591753 [00:00<00:00, 1109052.06it/s]


In [20]:
# Read the default GloVe file (PATH/"glove.6B.50d.txt") into a word -> vector dict.
word_vecs = loadGloveModel()

In [21]:
# Number of GloVe entries vs. number of distinct words found in the captions.
print(len(word_vecs), len(word_count))

400000 53155


In [22]:
# Drop words that occur in fewer than 10 captions AND have no GloVe vector.
# delete_rare_words prunes the dict in place and returns the same object.
word_count = delete_rare_words(word_vecs, word_count, min_df=10)
len(word_count)

24178

In [23]:
# Build the embedding matrix and vocabulary lookups (defaults: min_df=4, emb_size=50).
# NOTE(review): rows without a GloVe vector are drawn from np.random and no seed
# is set in the visible cells, so they will differ between runs.
pretrained_weights, vocab, vocab2index = create_embedding_matrix(word_vecs, word_count)

In [24]:
# Peek at the first 20 vocabulary entries; the four special tokens come first.
vocab[:20]

array(['<pad>', '<unk>', '<start>', '<end>', 'replica', 'a', 'wheel.',
       'bicycle', 'A', 'the', 'front', 'clock', 'as', 'with', 'door.',
       'white', 'walls', 'sink', 'room', 'and'], dtype='<U19')

In [25]:
# Persist the embedding matrix, the vocabulary array and the word -> index map.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling raises.
with open(PATH/'pretrained_weights.pkl', 'wb') as f:
    pickle.dump(pretrained_weights, f)
with open(PATH/'vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)
with open(PATH/'vocab2index.pkl', 'wb') as f:
    pickle.dump(vocab2index, f)