In [8]:
import numpy as np
vocab = {'<PAD>': 0, 'is': 1, 'it': 2, 'too': 3, 'late': 4, 'now': 5, 'say': 6, 'sorry': 7, 'ooh': 8, 'yeah': 9} 
X = [[0, 1, 2, 3, 4, 5, 6], 
    [7, 7], 
    [6, 8]]
# get the length of each sentence
X_lengths = [len(sentence) for sentence in X]
# create an empty matrix with padding tokens
pad_token = vocab['<PAD>']
longest_sent = max(X_lengths)
batch_size = len(X)
padded_X = np.ones((batch_size, longest_sent)) * pad_token
# copy over the actual sequences
for i, x_len in enumerate(X_lengths):
    sequence = X[i] 
    padded_X[i, 0:x_len] = sequence[:x_len]

In [9]:
padded_X

array([[0., 1., 2., 3., 4., 5., 6.],
       [7., 7., 0., 0., 0., 0., 0.],
       [6., 8., 0., 0., 0., 0., 0.]])

In [11]:
XX = [x+1 for sublist in X for x in sublist]

In [14]:
X[0] + 1

TypeError: can only concatenate list (not "int") to list

In [1]:
from pycocotools.coco import COCO
coco = COCO('captions_train2014.json')

loading annotations into memory...
Done (t=0.92s)
creating index...
index created!


In [2]:
ids = coco.anns.keys()

In [7]:
for key, item in coco.anns.items():
    print (key, item)
    break

48 {'image_id': 318556, 'id': 48, 'caption': 'A very clean and well decorated empty bathroom'}


In [9]:
coco.anns[48]

{'image_id': 318556,
 'id': 48,
 'caption': 'A very clean and well decorated empty bathroom'}

In [4]:
len(ids)

414113

In [26]:
from generate_vocab_dict import Vocabulary
import pickle
with open('./vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

In [15]:
import nltk
from collections import Counter
from pycocotools.coco import COCO
import logging
import numpy as np
coco = COCO("./captions_train2014.json")
ids = coco.anns.keys()
counter = Counter()
for i, id in enumerate(ids):
    caption = str(coco.anns[id]['caption'])
    tokens = nltk.tokenize.word_tokenize(caption.lower())
    counter.update(tokens)
    if (i+1) % 5000 == 0:
        print("Tokenization Process: {0:.2f}%.".format((i+1)*100/len(ids)))
        #logger.info("Tokenization Process: {0:.2f}%.".format((i+1)*100/len(ids)))
# Keep the most frequently appeared words
counts = []
for _, count in counter.items():
    counts.append(count)
counts.sort(reverse=True)

loading annotations into memory...
Done (t=0.83s)
creating index...
index created!
Tokenization Process: 1.21%.
Tokenization Process: 2.41%.
Tokenization Process: 3.62%.
Tokenization Process: 4.83%.
Tokenization Process: 6.04%.
Tokenization Process: 7.24%.
Tokenization Process: 8.45%.
Tokenization Process: 9.66%.
Tokenization Process: 10.87%.
Tokenization Process: 12.07%.
Tokenization Process: 13.28%.
Tokenization Process: 14.49%.
Tokenization Process: 15.70%.
Tokenization Process: 16.90%.
Tokenization Process: 18.11%.
Tokenization Process: 19.32%.
Tokenization Process: 20.53%.
Tokenization Process: 21.73%.
Tokenization Process: 22.94%.
Tokenization Process: 24.15%.
Tokenization Process: 25.36%.
Tokenization Process: 26.56%.
Tokenization Process: 27.77%.
Tokenization Process: 28.98%.
Tokenization Process: 30.18%.
Tokenization Process: 31.39%.
Tokenization Process: 32.60%.
Tokenization Process: 33.81%.
Tokenization Process: 35.01%.
Tokenization Process: 36.22%.
Tokenization Process: 37.

In [17]:
len(counts)

25129

In [42]:
portion = 0.993
cum_ratio = np.cumsum(counts) / np.sum(counts)
threshold = min(4,counts[np.argmax(cum_ratio > portion)])

In [43]:
threshold

4

In [44]:
words = []
for word, count in counter.items():
    if count >= threshold:
        words.append(word)
words.sort()

In [45]:
len(words)

9952

In [2]:
import pickle
from generate_vocab_dict import Vocabulary
vocab_path = './vocab.pkl'
with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

## beam search

In [1]:
import torch
k = 3; vocab_size=6
k_prev_words = torch.LongTensor([[0]] * k)
complete_seqs = list()
complete_seqs_scores = list()
seqs = k_prev_words 
seqs

tensor([[0],
        [0],
        [0]])

In [2]:
top_k_scores = torch.zeros(k, 1)
top_k_scores

tensor([[0.],
        [0.],
        [0.]])

###  image inputs

In [3]:
#  repeat k times
scores = torch.FloatTensor([[1,2,3,4,5,1], [1,2,3,4,5,1],[1,2,3,4,5,1]])
scores = top_k_scores.expand_as(scores) + scores 
scores

tensor([[1., 2., 3., 4., 5., 1.],
        [1., 2., 3., 4., 5., 1.],
        [1., 2., 3., 4., 5., 1.]])

In [4]:
top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)
top_k_scores, top_k_words

(tensor([5., 4., 3.]), tensor([4, 3, 2]))

In [5]:
prev_word_inds = top_k_words / vocab_size 
next_word_inds = top_k_words % vocab_size
prev_word_inds, next_word_inds

(tensor([0, 0, 0]), tensor([4, 3, 2]))

In [6]:
seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)
seqs

tensor([[0, 4],
        [0, 3],
        [0, 2]])

In [7]:
incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                           next_word != 5]
complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))
incomplete_inds,complete_inds

([0, 1, 2], [])

In [8]:
if len(complete_inds) > 0:
    complete_seqs.extend(seqs[complete_inds].tolist())
    complete_seqs_scores.extend(top_k_scores[complete_inds])
k -= len(complete_inds)  # reduce beam length accordingly

In [9]:
seqs = seqs[incomplete_inds]
seqs

tensor([[0, 4],
        [0, 3],
        [0, 2]])

In [10]:
top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

top_k_scores, k_prev_words

(tensor([[5.],
         [4.],
         [3.]]), tensor([[4],
         [3],
         [2]]))

## text input

In [11]:
scores = torch.FloatTensor([[1,2,5,4,3,1], [1,5,3,4,2,1],[1,2,3,5,4,1]])
scores = top_k_scores.expand_as(scores) + scores 
scores

tensor([[ 6.,  7., 10.,  9.,  8.,  6.],
        [ 5.,  9.,  7.,  8.,  6.,  5.],
        [ 4.,  5.,  6.,  8.,  7.,  4.]])

**row: previous**, **column: next**

In [12]:
top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)
top_k_scores, top_k_words

(tensor([10.,  9.,  9.]), tensor([2, 3, 7]))

In [13]:
prev_word_inds = top_k_words / vocab_size 
next_word_inds = top_k_words % vocab_size
prev_word_inds, next_word_inds

(tensor([0, 0, 1]), tensor([2, 3, 1]))

In [14]:
incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                           next_word != 5]
complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))
incomplete_inds,complete_inds

([0, 1, 2], [])

In [15]:
if len(complete_inds) > 0:
    complete_seqs.extend(seqs[complete_inds].tolist())
    complete_seqs_alpha.extend(seqs_alpha[complete_inds].tolist())
    complete_seqs_scores.extend(top_k_scores[complete_inds])
k -= len(complete_inds)  # reduce beam length accordingly

In [16]:
seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)
seqs

tensor([[0, 4, 2],
        [0, 4, 3],
        [0, 3, 1]])

In [17]:
top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

top_k_scores, k_prev_words

(tensor([[10.],
         [ 9.],
         [ 9.]]), tensor([[2],
         [3],
         [1]]))

## Meet the `<<end>>`

In [18]:
scores = torch.FloatTensor([[1,2,5,4,3,100], [1,5,3,4,2,1],[1,2,3,5,4,1]])
scores = top_k_scores.expand_as(scores) + scores 
scores

tensor([[ 11.,  12.,  15.,  14.,  13., 110.],
        [ 10.,  14.,  12.,  13.,  11.,  10.],
        [ 10.,  11.,  12.,  14.,  13.,  10.]])

In [19]:
top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)
top_k_scores, top_k_words

(tensor([110.,  15.,  14.]), tensor([5, 2, 3]))

In [20]:
prev_word_inds = top_k_words / vocab_size 
next_word_inds = top_k_words % vocab_size
prev_word_inds, next_word_inds

(tensor([0, 0, 0]), tensor([5, 2, 3]))

In [21]:
incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                           next_word != 5]
complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))
incomplete_inds,complete_inds

([1, 2], [0])

In [22]:
if len(complete_inds) > 0:
    complete_seqs.extend(seqs[complete_inds].tolist())
    complete_seqs_scores.extend(top_k_scores[complete_inds])
k -= len(complete_inds)  # reduce beam length accordingly
k

2

In [23]:
complete_seqs, complete_seqs_scores

([[0, 4, 2]], [tensor(110.)])

In [24]:
seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)
seqs

tensor([[0, 4, 2, 5],
        [0, 4, 2, 2],
        [0, 4, 2, 3]])

In [None]:
def beam_search(img_feature_embedding, decoder_path, beam_size=3, vocab=vocab):
    k = beam_size
    vocab_size=len(vocab)
    decoder = torch.load(decoder_path)
    k_prev_words = torch.LongTensor([[vocab.word2idx["<<start>>"]]] * k)
    complete_seqs = list()
    complete_seqs_scores = list()
    seqs = k_prev_words 
    seqs

In [29]:
vocab.word2idx["<<start>>"]

1

In [28]:
len(vocab)

5910