In [38]:
from collections import Counter, defaultdict
import cv2
from pathlib import Path
from pycocotools.coco import COCO
import numpy as np
import nltk
import spacy
import re
from tqdm import tqdm

In [2]:
# Base data folder; the GloVe file and the annotations folder are resolved against it.
PATH = Path('data')
# Folder that get_annot_index reads the captions JSON from.
ANNOT_PATH = PATH.joinpath('annotations')

In [3]:
def get_annot_index(annot_path, dataset='train', year=2017):
    """Build a COCO index over a captions annotation file.

    Args:
        annot_path: directory (path-like supporting `/`) holding the
            annotation JSON files.
        dataset: split name used in the file name (default 'train').
        year: year used in the file name (default 2017).

    Returns:
        The `COCO` object constructed from
        `<annot_path>/captions_<dataset><year>.json`.
    """
    file_name = f'captions_{dataset}{year}.json'
    return COCO(annot_path / file_name)

## GloVe embeddings and vocabulary

Adapted from class demo notebooks.

In [34]:
def unpack_glove():
    ! wget http://nlp.stanford.edu/data/glove.6B.zip
    ! mkdir data
    ! unzip glove.6B.zip -C data

def loadGloveModel(gloveFile=PATH/"glove.6B.50d.txt"):
    """Load word vectors from a GloVe text file into a dictionary.

    Every non-empty line is split on whitespace; the first field is taken as
    the word and the remaining fields are parsed as floats.

    Args:
        gloveFile: path of the GloVe text file to read.

    Returns:
        dict mapping each word to a 1-D `np.ndarray` of floats.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    word_vecs = {}
    # The context manager guarantees the handle is closed, and the explicit
    # encoding keeps decoding independent of the platform's default locale.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word = splitLine[0]
            word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

def get_word_count(content):
    """Count, for every word, how many documents contain it.

    Each element of `content` is split on whitespace and its distinct words
    are counted once, so repeated words inside one document do not add up.

    Returns:
        A `defaultdict(float)` mapping word -> number of documents.
    """
    doc_freq = defaultdict(float)
    for document in content:
        # set() collapses repeats within a single document.
        for token in set(document.split()):
            doc_freq[token] += 1
    return doc_freq

def delete_rare_words(word_vecs, word_count, min_df=5):
    """Remove rare, vector-less words from `word_count` in place.

    A word is removed only when both conditions hold: its count is below
    `min_df` and it has no entry in `word_vecs`.

    Returns:
        The same `word_count` object, after pruning.
    """
    # Collect first: a dict must not change size while it is being iterated.
    rare_unknown = [
        token
        for token, freq in word_count.items()
        if freq < min_df and token not in word_vecs
    ]
    for token in rare_unknown:
        word_count.pop(token)
    return word_count

def create_embedding_matrix(word_vecs, word_count, min_df=4, emb_size=50):
    """Build an embedding matrix, vocabulary array and word->row lookup.

    `word_count` is first pruned with `delete_rare_words` (which modifies it
    in place). Rows 0-3 belong to the special tokens `<pad>`, `<unk>`,
    `<start>` and `<end>`; the remaining rows follow the iteration order of
    the pruned `word_count`.

    Returns:
        (W, vocab, vocab2index) where `W` is a float32 array of shape
        (vocab size, emb_size), `vocab` is a numpy array of the words in row
        order, and `vocab2index` maps each word to its row.
    """
    kept_counts = delete_rare_words(word_vecs, word_count, min_df)
    special_tokens = ["<pad>", "<unk>", "<start>", "<end>"]
    n_special = len(special_tokens)

    W = np.zeros((len(kept_counts) + n_special, emb_size), dtype="float32")
    # Row 0 (<pad>) keeps its zeros; <unk>, <start> and <end> get random rows.
    for row in range(1, n_special):
        W[row] = np.random.uniform(-0.25, 0.25, emb_size)

    vocab = special_tokens
    vocab2index = {token: row for row, token in enumerate(special_tokens)}
    for row, word in enumerate(kept_counts, start=n_special):
        # Use the pretrained vector when there is one, otherwise a random one.
        W[row] = (
            word_vecs[word]
            if word in word_vecs
            else np.random.uniform(-0.25, 0.25, emb_size)
        )
        vocab2index[word] = row
        vocab.append(word)
    return W, np.array(vocab), vocab2index

In [30]:
# Index the captions file selected by get_annot_index's defaults (train, 2017).
coco = get_annot_index(ANNOT_PATH)

loading annotations into memory...
Done (t=0.83s)
creating index...
index created!


In [5]:
# Case-insensitive pattern for HTML line-break tags such as <br>, <BR/> or < br />.
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
def sub_br(x):
    """Return `x` with every HTML line-break tag replaced by a newline."""
    return re_br.sub("\n", x)

# Load the English pipeline by its full package name: the 'en' shortcut link
# is not available in spaCy v3, while the package name loads on v2 and v3.
my_tok = spacy.load('en_core_web_sm')
def spacy_tok(x):
    """Tokenize `x` with spaCy's tokenizer and return the token texts.

    HTML <br> tags are first turned into newlines via `sub_br`. Only the
    tokenizer component of the loaded pipeline is run.
    """
    return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

In [36]:
# NOTE(review): an identical call already builds `coco` in an earlier cell;
# this one loads and indexes the same annotation file again.
coco = get_annot_index(ANNOT_PATH)
# Keys of the index's `anns` mapping; used below to look up each caption.
annot_ids = coco.anns.keys()

loading annotations into memory...
Done (t=0.77s)
creating index...
index created!


In [39]:
# Gather every caption as a plain string, then count in how many captions
# each whitespace-separated word occurs.
all_captions = [str(coco.anns[ann_id]['caption']) for ann_id in tqdm(annot_ids)]
word_count = get_word_count(all_captions)

100%|██████████| 591753/591753 [00:00<00:00, 1902873.39it/s]


In [12]:
# Read the GloVe vectors from loadGloveModel's default file (PATH/"glove.6B.50d.txt").
word_vecs = loadGloveModel()

In [43]:
# Number of GloVe entries vs. number of distinct caption words.
print(len(word_vecs), len(word_count))

400000 53155


In [44]:
# Drop words that occur in fewer than 10 captions and have no GloVe vector.
# delete_rare_words prunes the dict in place and returns that same object, so
# re-running this cell works on the already-pruned counts.
word_count = delete_rare_words(word_vecs, word_count, min_df=10)
len(word_count)

24178

In [45]:
# Use `word_count`, the dict built and pruned in the cells above; the name
# `word_counts` is never defined in this notebook and raises NameError on a
# fresh kernel.
pretrained_weights, vocab, vocab2index = create_embedding_matrix(word_vecs, word_count)

In [46]:
# Display the embedding matrix (numpy elides the middle rows and columns).
pretrained_weights

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.24465083, -0.11952231, -0.20686829, ...,  0.22003843,
         0.05354511, -0.21504742],
       [-0.03769932, -0.1310914 ,  0.0603082 , ..., -0.01164583,
         0.12908918,  0.1663074 ],
       ...,
       [ 0.11737   ,  0.25231   ,  0.071165  , ..., -0.94342   ,
         0.71416   ,  0.45284   ],
       [-0.12779   , -0.049853  , -0.77253   , ...,  0.7053    ,
         0.79276   ,  0.090612  ],
       [-0.99742   ,  0.25117   , -0.02051   , ...,  0.024489  ,
        -0.71593   ,  1.5538    ]], dtype=float32)

In [49]:
# First 20 vocabulary entries: the four special tokens, then caption words.
vocab[:20]

array(['<pad>', '<unk>', '<start>', '<end>', 'a', 'bicycle', 'replica',
       'with', 'clock', 'as', 'the', 'front', 'wheel', '.', 'room',
       'blue', 'walls', 'and', 'white', 'sink'], dtype='<U18')

In [51]:
# NOTE(review): this displays the entire word -> row mapping; consider showing
# only a small slice to keep the notebook output short.
vocab2index

{'<pad>': 0,
 '<unk>': 1,
 '<start>': 2,
 '<end>': 3,
 'a': 4,
 'bicycle': 5,
 'replica': 6,
 'with': 7,
 'clock': 8,
 'as': 9,
 'the': 10,
 'front': 11,
 'wheel': 12,
 '.': 13,
 'room': 14,
 'blue': 15,
 'walls': 16,
 'and': 17,
 'white': 18,
 'sink': 19,
 'door': 20,
 'car': 21,
 'that': 22,
 'seems': 23,
 'to': 24,
 'be': 25,
 'parked': 26,
 'illegally': 27,
 'behind': 28,
 'legally': 29,
 'large': 30,
 'passenger': 31,
 'airplane': 32,
 'flying': 33,
 'through': 34,
 'air': 35,
 'there': 36,
 'is': 37,
 'gol': 38,
 'plane': 39,
 'taking': 40,
 'off': 41,
 'in': 42,
 'partly': 43,
 'cloudy': 44,
 'sky': 45,
 'color': 46,
 'scheme': 47,
 'small': 48,
 'bathroom': 49,
 'this': 50,
 'wall': 51,
 'lifesaver': 52,
 'on': 53,
 'boat': 54,
 'themed': 55,
 'life': 56,
 'preserver': 57,
 'bike': 58,
 'has': 59,
 'tire': 60,
 'two': 61,
 'cars': 62,
 'sidewalk': 63,
 'street': 64,
 'an': 65,
 ',': 66,
 'either': 67,
 'landing': 68,
 'or': 69,
 'just': 70,
 'are': 71,
 'painted': 72,
 'baby': 