In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, models
from torch.nn.utils.rnn import pack_padded_sequence

In [17]:
# model = models.vgg11()

In [18]:
# model.classifier[6] = nn.Linear(in_features=4096, out_features=10, bias=True)

In [19]:
# model.classifier[6]

In [20]:
class Inc_CNN(nn.Module):
    """Image encoder built on a pretrained VGG-16.

    The final classifier layer of the VGG-16 is replaced by a fresh
    ``nn.Linear(4096, embed_size)`` so the network emits one
    ``embed_size``-wide vector per image; ReLU and dropout (p=0.5) are
    applied on top of that vector.

    Args:
        embed_size: Width of the feature vector produced per image.
        train_CNN: Stored as ``self.train_CNN``. This class only records the
            flag; it does not read it anywhere.
    """

    def __init__(self, embed_size, train_CNN=False):
        super().__init__()
        self.train_CNN = train_CNN

        # Build the backbone first, swap its head, then register it.
        backbone = models.vgg16(pretrained=True)
        backbone.classifier[6] = nn.Linear(4096, embed_size)
        self.cnn = backbone

        self.relu = nn.ReLU()
        self.times = []  # never read inside this class
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, images):
        """Return ``dropout(relu(cnn(images)))``."""
        activated = self.relu(self.cnn(images))
        return self.dropout(activated)


In [21]:
# enc_c = Inc_CNN(10)

In [22]:
# x = torch.rand((1,3,244,244))
# print(x.shape)
# print(x)
# enc_c(x)

In [35]:
class Custom_RNN(nn.Module):
    """LSTM caption decoder conditioned on an image feature vector.

    The image feature is prepended to the embedded caption as the first time
    step, the resulting sequence is packed according to ``lengths`` and run
    through the LSTM, and every packed time step is projected to vocabulary
    logits.

    Args:
        embed_size: Size of the token embeddings. ``features`` passed to
            ``forward`` must share this last dimension because both are
            concatenated along the time axis.
        hidden_size: LSTM hidden-state size.
        vocab_size: Number of tokens (embedding rows and output logits).
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(Custom_RNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        # Registered but not applied in forward().
        self.dropout = nn.Dropout(0.5)
        # LogSoftmax expects a single integer axis; normalise over the
        # vocabulary (last) axis of the logits. Not applied in forward().
        self.l_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, features, captions, lengths):
        """Compute vocabulary logits for every non-padded time step.

        Args:
            features: ``(batch, embed_size)`` image features. Cast to the
                embedding dtype before concatenation, so integer or
                double-precision inputs are accepted.
            captions: ``(batch, seq_len)`` integer token ids.
            lengths: Per-sample number of time steps to keep, counted on the
                sequence *after* the feature step has been prepended (so at
                most ``seq_len + 1``).

        Returns:
            Tensor of shape ``(sum(lengths), vocab_size)`` holding raw
            (un-normalised) logits, laid out in ``PackedSequence.data`` order.
        """
        embeddings = self.embed(captions)
        # The image feature becomes time step 0; align its dtype with the
        # embeddings so torch.cat never has to mix integer and float tensors.
        image_step = features.unsqueeze(1).to(embeddings.dtype)
        sequence = torch.cat((image_step, embeddings), dim=1)
        packed = pack_padded_sequence(
            sequence, lengths, batch_first=True, enforce_sorted=False
        )
        hiddens, _ = self.lstm(packed)
        # `hiddens` is a PackedSequence; `.data` is the flattened tensor of
        # all kept time steps.
        return self.linear(hiddens.data)

In [36]:
# Small decoder instance for shape experiments in the following cell.
crnn = Custom_RNN(embed_size=10, hidden_size=10, vocab_size=15, num_layers=1)

In [None]:
# Decoder smoke test: 4 dummy captions of 10 token ids each (all ids < 15).
cap = torch.LongTensor([[1, 2, 2, 0, 1, 0, 0, 0, 0, 0],
                        [0, 0, 9, 0, 1, 0, 0, 5, 0, 3],
                        [8, 9, 0, 0, 1, 0, 0, 0, 4, 0],
                        [9, 8, 1, 0, 5, 0, 0, 7, 0, 0]])
# Stand-in image features, one row per caption. The encoder emits
# floating-point features, so the dummy input is float as well; this keeps
# the concatenation with the float embeddings free of mixed dtypes.
fet = torch.tensor([[0, 1, 1, 0, 0, 0, 0, 0, 0, 1],
                    [0, 1, 1, 0, 0, 0, 0, 0, 0, 1],
                    [0, 1, 1, 0, 0, 0, 0, 0, 0, 1],
                    [0, 1, 1, 0, 0, 0, 0, 0, 0, 1]], dtype=torch.float32)
# Lengths are counted after the feature step is prepended, so 5 keeps the
# feature step plus the first 4 tokens of each caption.
result = crnn(fet, cap, [5, 5, 5, 5])
print('Result:',result.shape)

In [11]:
# cap = torch.LongTensor([[1, 2, 2, 4, 1, 0, 1, 1, 1, 1]])
# fet = torch.LongTensor([[0, 1, 1, 0, 0, 0, 0, 0, 0, 1]])
# # print('cap:', cap)
# # print('fet:', fet)
# # print('unsqz', fet.unsqueeze(0))
# # cap = pack_padded_sequence(cap, [10], batch_first=True, enforce_sorted=False)
# result = crnn(fet, cap, [10])
# print('Result:',result.shape)
# print(result)

In [12]:
class RCNN(nn.Module):
    """Image-captioning model: an ``Inc_CNN`` encoder feeding a
    ``Custom_RNN`` decoder.

    On construction every encoder parameter is frozen except those of the
    encoder's replaced final classifier layer (``cnn.classifier[6]``), which
    stays trainable together with the whole decoder.

    Args:
        embed_size: Width of the image feature / token embedding.
        hidden_size: LSTM hidden-state size of the decoder.
        vocab_size: Number of tokens in the vocabulary.
        num_layers: Number of stacked LSTM layers in the decoder.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(RCNN, self).__init__()
        self.encoderCNN = Inc_CNN(embed_size)
        self.decoderRNN = Custom_RNN(embed_size, hidden_size, vocab_size, num_layers)
        # Freeze the encoder, then re-enable only its projection head.
        # The flag autograd reads is `requires_grad`; assigning to any other
        # attribute name is silently accepted and freezes nothing.
        for params in self.encoderCNN.parameters():
            params.requires_grad = False
        for params in self.encoderCNN.cnn.classifier[6].parameters():
            params.requires_grad = True

    def forward(self, images, captions, lenghts):
        """Encode ``images`` and decode them against ``captions``.

        Args:
            images: Batch of images passed to the encoder.
            captions: Token ids passed to the decoder.
            lenghts: Per-sample sequence lengths forwarded to the decoder
                (parameter name kept as is so keyword callers keep working).

        Returns:
            Whatever the decoder's ``forward`` returns.
        """
        features = self.encoderCNN(images)
        # might need to change the dtype of `features` for the decoder
        outputs = self.decoderRNN(features, captions, lenghts)
        return outputs

    def caption_image(self, image, vocabulary, max_length=50):
        """Greedily decode a caption for a single image.

        Args:
            image: Image batch of size 1 (each step calls ``.item()`` on the
                predicted token, which requires exactly one prediction).
            vocabulary: Object whose ``itos`` maps token ids to strings.
            max_length: Maximum number of tokens to generate.

        Returns:
            List of token strings, ending with ``"<EOS>"`` if that token was
            produced within ``max_length`` steps.

        Note:
            Runs under ``torch.no_grad()`` but does not switch the model to
            eval mode; that is left to the caller.
        """
        result_caption = []

        with torch.no_grad():
            # (1, embed) -> (1, 1, embed): a one-step sequence for the
            # batch-first LSTM.
            x = self.encoderCNN(image).unsqueeze(1)
            states = None

            for _ in range(max_length):
                hiddens, states = self.decoderRNN.lstm(x, states)
                output = self.decoderRNN.linear(hiddens.squeeze(1))
                predicted = output.argmax(1)
                token_id = predicted.item()
                result_caption.append(token_id)
                # Feed the chosen token back in as the next input step.
                x = self.decoderRNN.embed(predicted).unsqueeze(1)

                if vocabulary.itos[token_id] == "<EOS>":
                    break

        return [vocabulary.itos[idx] for idx in result_caption]

    def get_bs_pred(self, features, hidden=None):
        """Helper for beam search: advance the decoder LSTM by one step.

        Args:
            features: When ``hidden`` is ``None`` this is fed to the LSTM
                unchanged (presumably the ``(batch, 1, embed)`` image
                feature - TODO confirm against the beam-search caller).
                Otherwise it is treated as a tensor of token ids and is
                embedded and given a length-1 time axis first.
            hidden: LSTM state from the previous step, or ``None`` on the
                first step.

        Returns:
            Tuple ``(output, hidden)``: the vocabulary logits for this step
            and the updated LSTM state.
        """
        decoder = self.decoderRNN
        if hidden is not None:
            features = decoder.embed(features).unsqueeze(1)
        output, hidden = decoder.lstm(features, hidden)
        output = decoder.linear(output.squeeze(1))
        return output, hidden

In [13]:
# Deliberately tiny hyper-parameters: just enough to build the full model
# locally and check that encoder and decoder are wired together.
embed_size = hidden_size = 2
vocab_size = 10
layers = 1
rcnn_local = RCNN(embed_size, hidden_size, vocab_size, layers)


In [14]:
# caption_l = torch.tensor([[0,1,2,1,0]])
# print(type(caption_l))
# op = rcnn_local(x, caption_l)
# print('Output shape',op.shape)
# print(op)