In [1]:
import torch
import torch.nn as nn 
from torchvision import transforms, utils, models
from torch.nn.utils.rnn import pack_padded_sequence

In [2]:
'''torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0,)

Parameters

in_channels (int) – Number of channels in the input image
out_channels (int) – Number of channels produced by the convolution
kernel_size (int or tuple) – Size of the convolving kernel
stride (int or tuple, optional) – Stride of the convolution. (Default: 1)
padding (int or tuple, optional) – Zero-padding added to both sides of the input (Default: 0)'''


'torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0,)\n\nParameters\n\nin_channels (int) – Number of channels in the input image\nout_channels (int) – Number of channels produced by the convolution\nkernel_size (int or tuple) – Size of the convolving kernel\nstride (int or tuple, optional) – Stride of the convolution. (Default: 1)\npadding (int or tuple, optional) – Zero-padding added to both sides of the input (Default: 0)'

In [19]:
# vgg19 = models.vgg19()

In [18]:
# print(vgg19)

In [5]:
class VGG19(nn.Module):
    """VGG-19 style image encoder producing an ``embed_size``-wide feature vector.

    The convolutional stack is the standard VGG-19 layout: sixteen 3x3
    convolutions (each followed by ReLU) arranged in five stages, every stage
    closed by a 2x2 / stride-2 max-pool.  The result is adaptively pooled to
    7x7, flattened, and passed through a three-layer fully connected head whose
    last layer has ``embed_size`` outputs followed by a ReLU.

    Args:
        embed_size (int): Number of features emitted per image.

    Attributes:
        model (nn.Sequential): Convolutional feature extractor.
        avgpool (nn.AdaptiveAvgPool2d): Pools any spatial size down to 7x7.
        classifier (nn.Sequential): Fully connected head ending in ``embed_size``.
    """

    # Output channels of each 3x3 convolution; "M" marks a 2x2 stride-2 max-pool.
    _LAYOUT = (
        64, 64, "M",
        128, 128, "M",
        256, 256, 256, 256, "M",
        512, 512, 512, 512, "M",
        512, 512, 512, 512, "M",
    )

    def __init__(self, embed_size):
        super(VGG19, self).__init__()
        self.embed_size = embed_size

        # Every convolution is 3x3 with padding 1, so convolutions preserve the
        # spatial size and only the pools down-sample.  Building the stack from
        # the table keeps all sixteen layers consistent (a hand-written 2x2
        # kernel with padding 1 would enlarge the feature map by one pixel and
        # give that layer a weight shape that differs from standard VGG-19).
        layers = []
        in_channels = 3
        for spec in self._LAYOUT:
            if spec == "M":
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                layers.append(nn.Conv2d(in_channels, spec, kernel_size=3, padding=1))
                layers.append(nn.ReLU(inplace=True))
                in_channels = spec
        self.model = nn.Sequential(*layers)

        self.avgpool = nn.AdaptiveAvgPool2d(output_size=(7, 7))

        # 512 channels * 7 * 7 spatial positions = 25088 flattened features.
        self.classifier = nn.Sequential(
            nn.Linear(in_features=512 * 7 * 7, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5, inplace=False),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=4096, out_features=self.embed_size),
            nn.ReLU(inplace=True),
        )

    def forward(self, image):
        """Encode a batch of images.

        Args:
            image (torch.Tensor): Batch of 3-channel images, ``(N, 3, H, W)``.

        Returns:
            torch.Tensor: Features of shape ``(N, embed_size)``.
        """
        features = self.model(image)
        features = self.avgpool(features)
        # Flatten everything except the batch dimension for the linear head.
        features = features.view(features.shape[0], -1)
        features = self.classifier(features)
        return features

In [6]:
class Custom_RNN(nn.Module):
    """LSTM caption decoder conditioned on an image feature vector.

    The image features are prepended to the embedded caption tokens as the
    first time step, the padded batch is packed, run through the LSTM, and the
    packed hidden states are projected to vocabulary logits.

    Args:
        embed_size (int): Width of token embeddings (and of the image features).
        hidden_size (int): LSTM hidden state width.
        vocab_size (int): Number of tokens in the vocabulary.
        num_layers (int): Number of stacked LSTM layers.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(Custom_RNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        # Not applied in forward(); kept so the attribute remains available.
        self.dropout = nn.Dropout(0.5)
        # Normalises over the vocabulary axis (last dim of the linear output).
        # Not applied in forward(), which returns raw logits.
        self.l_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, features, captions, lengths):
        """Compute vocabulary logits for every valid time step.

        Args:
            features (torch.Tensor): Image features, ``(batch, embed_size)``.
            captions (torch.Tensor): Integer token indices, ``(batch, T)``.
            lengths: Per-sample number of valid steps in the concatenated
                sequence (image step + caption tokens), as accepted by
                ``pack_padded_sequence``.  Need not be sorted.

        Returns:
            torch.Tensor: Logits of shape ``(sum(lengths), vocab_size)``; rows
            follow the packed-sequence ordering, not ``(batch, time)`` order.
        """
        embeddings = self.embed(captions)
        # Image features act as the first input step of each sequence.
        embeddings = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        packed = pack_padded_sequence(
            embeddings, lengths, batch_first=True, enforce_sorted=False
        )
        hiddens, _ = self.lstm(packed)
        # hiddens is a PackedSequence; .data holds the flattened valid steps.
        return self.linear(hiddens.data)

In [7]:
class RCNN(nn.Module):
    """Image-captioning model: a VGG19 encoder feeding an LSTM decoder.

    Args:
        embed_size (int): Width of the image features / token embeddings.
        hidden_size (int): LSTM hidden state width.
        vocab_size (int): Number of tokens in the vocabulary.
        num_layers (int): Number of stacked LSTM layers.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(RCNN, self).__init__()
        self.encoderCNN = VGG19(embed_size)
        self.decoderRNN = Custom_RNN(embed_size, hidden_size, vocab_size, num_layers)
        # All parameters are trainable.  To freeze part of the encoder, set
        # ``param.requires_grad = False`` (note the trailing "s") on the
        # parameters that should stay fixed.

    def forward(self, images, captions, lenghts):
        """Encode ``images`` and decode them against ``captions``.

        Args:
            images (torch.Tensor): Image batch passed to the encoder.
            captions (torch.Tensor): Token indices passed to the decoder.
            lenghts: Per-sample sequence lengths forwarded to the decoder.
                (The misspelt name is kept for keyword-argument compatibility.)

        Returns:
            torch.Tensor: Whatever the decoder returns for these inputs.
        """
        features = self.encoderCNN(images)
        return self.decoderRNN(features, captions, lenghts)

    def caption_image(self, image, vocabulary, max_length=50):
        """Greedily decode a caption for a single image.

        The model is switched to evaluation mode for the duration of the call
        (so dropout does not randomise the result) and every sub-module's
        original train/eval flag is restored afterwards.

        Args:
            image (torch.Tensor): A batch containing exactly one image
                (``predicted.item()`` requires a single prediction per step).
            vocabulary: Mapping from token index to word, indexable by ``int``.
            max_length (int): Maximum number of tokens to generate.

        Returns:
            list: Generated words, including ``"<end>"`` if it was produced.
        """
        result_caption = []

        # Remember each module's mode individually so that e.g. a deliberately
        # frozen (eval) encoder is not flipped back to train afterwards.
        saved_modes = [(module, module.training) for module in self.modules()]
        self.eval()
        try:
            with torch.no_grad():
                # (1, embed) -> (1, 1, embed): one sequence, one time step.
                x = self.encoderCNN(image).unsqueeze(0)
                states = None

                for _ in range(max_length):
                    hiddens, states = self.decoderRNN.lstm(x, states)
                    output = self.decoderRNN.linear(hiddens.squeeze(0))
                    predicted = output.argmax(1)
                    token = predicted.item()
                    result_caption.append(token)

                    if vocabulary[token] == "<end>":
                        break

                    # Feed the chosen token back in as the next input step.
                    x = self.decoderRNN.embed(predicted).unsqueeze(0)
        finally:
            for module, was_training in saved_modes:
                module.training = was_training

        return [vocabulary[idx] for idx in result_caption]

    def get_bs_pred(self, features, hidden=None):
        """Run one decoder step; helper for beam search.

        Args:
            features (torch.Tensor): On the first step (``hidden is None``) this
                is fed to the LSTM unchanged; assumed to be
                ``(batch, 1, embed_size)`` -- TODO confirm against the caller.
                On later steps it is treated as token indices and embedded.
            hidden: LSTM state from the previous step, or ``None`` to start.

        Returns:
            tuple: ``(output, hidden)`` -- vocabulary logits for this step and
            the updated LSTM state.
        """
        # The embedding, LSTM and projection layers live on the decoder.
        decoder = self.decoderRNN
        if hidden is not None:
            features = decoder.embed(features).unsqueeze(1)
        output, hidden = decoder.lstm(features, hidden)
        output = decoder.linear(output.squeeze(1))
        return output, hidden

In [8]:
# Deliberately tiny hyper-parameters -- presumably for a quick local check of
# the wiring rather than for real training.
embed_size, hidden_size = 2, 2
vocab_size = 10
layers = 1

# Arguments are positional in the constructor's declared order:
# (embed_size, hidden_size, vocab_size, num_layers).
rcnn_local = RCNN(embed_size, hidden_size, vocab_size, layers)


In [17]:
# x = torch.rand((1,3,224,224))
# cap = torch.randn((32,10))
# cap = torch.tensor([[0,1,0,0,0,0,0,0,0,0]])
# # op = rcnn_local(x, cap.int(), [10])
# # print(op.shape)
# vocab = {0:'hi', 1:'hello', 2:'h',3:'g',4:'f',5:'e',6:'d',7:'c',8:'b',9:'a',10:'is'}
# op2 = rcnn_local.caption_image(x, vocab, 10)
# print(op2)

Feature Shape: torch.Size([1, 25088])
['hello', 'hello', 'hello', 'hello', 'hello', 'hello', 'hello', 'hello', 'hello', 'hello']
