In [28]:
import torch
import torch.nn as nn
import numpy as np
import torchvision.models as models
from torchvision import transforms
from PIL import Image
from torch.nn.utils.rnn import pack_padded_sequence

<h3>Encoder</h3>

In [29]:
cuda = torch.device('cuda')
resnet = models.resnet152(pretrained=True)
modules = list(resnet.children())[:-1]      # delete the last fc layer.
resnet = nn.Sequential(*modules)
# en_linear = nn.Linear(2048, 1024).to(cuda)
# bn = nn.BatchNorm1d(1024, momentum=0.01).to(cuda)

class EncoderCNN(nn.Module):
    def __init__(self):
       super(EncoderCNN, self).__init__()
       self.en_linear = nn.Linear(2048, 1024)
       self.bn = nn.BatchNorm1d(1024, momentum=0.01)
        
    def forward(self, images):
        features = self.bn(self.en_linear(images))
        return features


<h3>Pre-Processing</h3>

In [30]:
image_names = ['blow','eat','race','sleep','dance','drink','guitar','cook','fight','football']

images = []
for name in image_names: 
    input_image = Image.open('E:\jupyterNotebook\data\images\\'+name+'.jpg')
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    input_tensor = preprocess(input_image)
    images.append(input_tensor)
    
images = torch.stack(images)

<h3>Vocabulary</h3>

In [31]:
class Vocabulary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0
        
    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1
            
    def __len__(self):
        return len(self.word2idx)

            
vocab = Vocabulary()

vocab.add_word('<start>')
vocab.add_word('<end>')
vocab.add_word('<unk>')
vocab.add_word('<pad>')

captions = ['a girl is blowing candles from a birthday cake',
           'a boy is having meal on a table',
           'horses are racing on a race track',
            'a man is sleeping on the bed',
           'three people are dancing on the floor',
           'two person are drinking orange juice',
           'a girl is playing guitar',
            'a man is cutting capsicum',
           'two man are fighting',
           'men are playing football']

for caption in captions:
    words = caption.split()
    for word in words :
        vocab.add_word(word)
vocab.word2idx

{'<end>': 1,
 '<pad>': 3,
 '<start>': 0,
 '<unk>': 2,
 'a': 4,
 'are': 18,
 'bed': 25,
 'birthday': 10,
 'blowing': 7,
 'boy': 12,
 'cake': 11,
 'candles': 8,
 'capsicum': 38,
 'cutting': 37,
 'dancing': 28,
 'drinking': 32,
 'fighting': 39,
 'floor': 29,
 'football': 41,
 'from': 9,
 'girl': 5,
 'guitar': 36,
 'having': 13,
 'horses': 17,
 'is': 6,
 'juice': 34,
 'man': 22,
 'meal': 14,
 'men': 40,
 'on': 15,
 'orange': 33,
 'people': 27,
 'person': 31,
 'playing': 35,
 'race': 20,
 'racing': 19,
 'sleeping': 23,
 'table': 16,
 'the': 24,
 'three': 26,
 'track': 21,
 'two': 30}

<h3>Building Input and Target tensors from the captions</h3>

In [32]:
tensor_list_of_inputs = []
tensor_list_of_targets = []

for caption in captions:
    words = caption.split()
    
    input_ids = []
    target_ids = []
    input_ids.append(vocab.word2idx['<start>'])
    for word in words :
        input_ids.append(vocab.word2idx[word])
        target_ids.append(vocab.word2idx[word])
    target_ids.append(vocab.word2idx['<end>'])
    tensor_list_of_inputs.append(torch.tensor(input_ids))
    tensor_list_of_targets.append(torch.tensor(target_ids))
    
inputs = nn.utils.rnn.pad_sequence(tensor_list_of_inputs,batch_first = True, padding_value = vocab.word2idx['<pad>']).to(cuda)
targets = nn.utils.rnn.pad_sequence(tensor_list_of_targets,batch_first = True, padding_value = vocab.word2idx['<pad>']).to(cuda)

<h3>Decoder, Loss and Optimizer</h3>

In [33]:
embed = nn.Embedding(42,512).to(cuda)  # (vocabulary_size,embedding_dimension )
lstm1 = nn.LSTMCell(512,1024).to(cuda)     # (input_size, output/hidden size)
lstm2 = nn.LSTMCell(1024,1024).to(cuda)
linear = nn.Linear(1024,42).to(cuda)    # (hidden_size, vocab_size)
loss = nn.CrossEntropyLoss().to(cuda)

encoder = EncoderCNN().to(cuda)
params = list(lstm1.parameters())+list(lstm2.parameters()) + list(embed.parameters())+ list(linear.parameters())+ list(encoder.parameters())
optim = torch.optim.Adam(params, lr= 0.01)

<h3>Feature Extraction and Training</h3>

In [59]:
with torch.no_grad():
    resnet_features = resnet(images).squeeze()

resnet_features = resnet_features.to(cuda)

iteration = 1
error = 1000
while error > 0.001 :
    features = encoder(resnet_features)
    embedding = embed(inputs).permute(1,0,2)
    hidden1 = features.to(cuda)
    cell1 = features.to(cuda)

    outputs = []
    for i in range(10):
        hidden1, cell1 = lstm1(embedding[i], (hidden1,cell1))
        hidden2, _ = lstm2(hidden1)
        outputs.append(hidden2)
        hidden1 = hidden1+features

    outputs = torch.stack(outputs)
    outputs = outputs.to(cuda)
    outputs = linear(outputs)
    outputs = outputs.permute(1,2,0)
    error = loss(outputs,targets)

    lstm1.zero_grad()
    lstm2.zero_grad()
    embed.zero_grad()
    linear.zero_grad()
    encoder.zero_grad()

    error.backward()
    optim.step()
    print(iteration,' ',error)
    iteration += 1
    torch.cuda.empty_cache() 


1   tensor(4.1669, device='cuda:0', grad_fn=<NllLoss2DBackward>)
2   tensor(0.6005, device='cuda:0', grad_fn=<NllLoss2DBackward>)
3   tensor(0.4770, device='cuda:0', grad_fn=<NllLoss2DBackward>)
4   tensor(0.2919, device='cuda:0', grad_fn=<NllLoss2DBackward>)
5   tensor(0.1380, device='cuda:0', grad_fn=<NllLoss2DBackward>)
6   tensor(0.0761, device='cuda:0', grad_fn=<NllLoss2DBackward>)
7   tensor(0.0395, device='cuda:0', grad_fn=<NllLoss2DBackward>)
8   tensor(0.0220, device='cuda:0', grad_fn=<NllLoss2DBackward>)
9   tensor(0.0140, device='cuda:0', grad_fn=<NllLoss2DBackward>)
10   tensor(0.0093, device='cuda:0', grad_fn=<NllLoss2DBackward>)
11   tensor(0.0067, device='cuda:0', grad_fn=<NllLoss2DBackward>)
12   tensor(0.0046, device='cuda:0', grad_fn=<NllLoss2DBackward>)
13   tensor(0.0034, device='cuda:0', grad_fn=<NllLoss2DBackward>)
14   tensor(0.0027, device='cuda:0', grad_fn=<NllLoss2DBackward>)
15   tensor(0.0021, device='cuda:0', grad_fn=<NllLoss2DBackward>)
16   tensor(0.0017,

<h3>Test</h3>

In [62]:
name = 'guitar' 
input_image = Image.open('E:\jupyterNotebook\data\images\\'+name+'.jpg')
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
input_tensor = preprocess(input_image)
input_tensor = input_tensor.unsqueeze(0)
test_feature = resnet(input_tensor).view(1,-1).to(cuda)
test_encoder = EncoderCNN().eval().to(cuda)
image_feature = test_encoder(test_feature)
print(image_feature)
test_hidden1 = image_feature.clone().detach()
test_cell1 = image_feature.clone().detach()
test_embedding = embed(torch.tensor(vocab.word2idx['<start>']).to(cuda)).unsqueeze(0)


outputs = []
for i in range(10):
    test_hidden1, test_cell1 = lstm1(test_embedding,(test_hidden1,test_cell1))
    test_hidden2, _ = lstm2(test_hidden1)
    out = linear(test_hidden2)
    outputs.append(out)
    _, predicted = out.max(1)
    test_embedding = embed(predicted)
    test_hidden1 = test_hidden1+image_feature

outputs = torch.cat(outputs)
print(outputs.shape)

ids = torch.max(outputs,1)[1].tolist()
#print(ids)
for item in ids:
    word = vocab.idx2word[item] 
    print(word)
    if word == '<end>':
        break

tensor([[-0.2549, -0.5863,  0.5078,  ...,  0.1732,  0.3482, -0.0046]],
       device='cuda:0', grad_fn=<CudnnBatchNormBackward>)
torch.Size([10, 42])
a
man
is
sleeping
on
the
bed
<end>
