In [1]:
import numpy as np
import os
from data_loader import build_vocab, get_loader
from model import EncoderCNN, DecoderRNN 
from attn_model import AttnEncoder
from model import ResNet, ResidualBlock
import torch
from torch.autograd import Variable 
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms
import pickle
import torch.nn as nn 
from Attention import Attn

# Random 128x26 scratch tensor; no other visible cell reads `h_dec`.
h_dec = torch.rand(128,26)

def to_var(x, volatile=False):
    """Wrap ``x`` in an autograd ``Variable``.

    When CUDA is available the tensor is first moved to GPU device 1;
    otherwise it is wrapped where it already lives.

    Args:
        x: tensor to wrap.
        volatile: forwarded unchanged to ``Variable``.

    Returns:
        The ``Variable`` built from ``x``.
    """
    on_device = x.cuda(1) if torch.cuda.is_available() else x
    return Variable(on_device, volatile=volatile)

def rearrange_tensor(x, batch_size, caption_size):
    """Regroup a step-major stack of rows into one row per batch element.

    ``x`` is read as ``caption_size`` consecutive chunks of ``batch_size``
    rows each. Every chunk is flattened to ``(batch_size, -1)`` and the
    chunks are laid side by side along dim 1, so row ``b`` of the result
    holds the data of element ``b`` from chunk 0, then chunk 1, and so on.

    Args:
        x: tensor whose first dimension holds at least
            ``batch_size * caption_size`` rows.
        batch_size: number of rows in each chunk.
        caption_size: number of chunks to read; must be at least 1.

    Returns:
        Tensor with ``batch_size`` rows. For ``caption_size == 1`` this is
        the flattened view of the single chunk (no copy).

    Raises:
        ValueError: if ``caption_size`` is smaller than 1 (there would be
            nothing to concatenate).
    """
    if caption_size < 1:
        raise ValueError(
            "caption_size must be at least 1, got {}".format(caption_size))

    chunks = [
        x[step * batch_size:(step + 1) * batch_size].view(batch_size, -1)
        for step in range(caption_size)
    ]
    if len(chunks) == 1:
        # Nothing to join: hand back the view itself.
        return chunks[0]
    # One concatenation instead of re-allocating the result on every step.
    return torch.cat(chunks, 1)

# Paths and hyper-parameters shared by the cells below.
# root_path is handed to get_loader; vocab_path is the pickle opened below.
root_path ='data/circle_and_rect/'
vocab_path ='data/vocab.pkl'
# batch_size and num_workers are forwarded to get_loader.
batch_size= 64
num_workers = 2 
# NOTE(review): presumably the caption embedding width — confirm.
embed_size = 256
# hidden_size and num_layers are passed to the nn.LSTM / nn.Linear layers built later.
hidden_size = 512
num_layers =1 

# Image preprocessing: convert each image to a tensor, then normalize the
# three channels with the per-channel means (first tuple) and standard
# deviations (second tuple).
# NOTE(review): these constants look like the commonly used ImageNet
# statistics — confirm they are appropriate for this dataset.
transform = transforms.Compose([
    transforms.ToTensor(), 
    transforms.Normalize((0.485, 0.456, 0.406), 
                         (0.229, 0.224, 0.225))])

# Load vocabulary wrapper.
# SECURITY: pickle.load can execute arbitrary code; only open vocabulary
# files that come from a trusted source.
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)
# Vocabulary size taken from the wrapper's `idx` attribute.
# NOTE(review): the DecoderRNN construction below passes len(vocab) instead —
# confirm that both expressions give the same number.
len_vocab = vocab.idx

# Build the batch iterator; the loop in the next cell unpacks each item as
# (images, captions, lengths).
data_loader = get_loader(root_path, vocab, 
                         transform, batch_size,
                         shuffle=True, num_workers=num_workers) 

# Instantiate the networks. ResNet and AttnEncoder are both built from
# ResidualBlock with the same [3, 3, 3] layer configuration.
encoder = ResNet(ResidualBlock, [3, 3, 3],len_vocab)
decoder = DecoderRNN(len_vocab, len_vocab, 
                     len(vocab), num_layers)

attn_encoder = AttnEncoder(ResidualBlock, [3,3,3])

# Move all three models to GPU index 1 when CUDA is available (the same
# device to_var uses). The hard-coded index requires at least two GPUs.
if torch.cuda.is_available():
    attn_encoder.cuda(1)
    encoder.cuda(1)
    decoder.cuda(1)

In [2]:
# Caption embedding table. It is moved to GPU 1 only when CUDA is available,
# mirroring the guard in to_var and the model setup, so the cell also runs
# on a CPU-only machine.
embedding = nn.Embedding(len_vocab, embed_size)
if torch.cuda.is_available():
    embedding = embedding.cuda(1)

# Push the first two batches through the attention encoder. The names bound
# in the last iteration (features, embed, captions2, captions_, leng) are
# what the later cells operate on.
for i, (images, captions, lengths) in enumerate(data_loader):
    if i > 1:
        break

    # One-hot encode the caption token ids along a new last dimension.
    # The buffer is sized from the batch actually received, so a final
    # batch smaller than `batch_size` does not leave padding rows behind.
    cap_ = torch.unsqueeze(captions, 2)
    one_hot_ = torch.zeros(captions.size(0), captions.size(1), len_vocab)
    one_hot_caption = one_hot_.scatter_(2, cap_, 1)
    leng = lengths

    images = to_var(images)
    captions2 = to_var(captions)
    embed = embedding(captions2)
    captions_ = to_var(one_hot_caption)
    features = attn_encoder(images)
    # NOTE: the decoder forward pass is not exercised in this cell.

In [4]:
features[0]

Variable containing:
 0.5076  1.0960  0.0000  ...   0.0000  0.0000  0.0976
 0.2090  0.4434  0.5210  ...   0.3823  0.3628  0.7257
 0.7278  0.5846  1.3169  ...   0.2342  0.4350  1.7801
          ...             ⋱             ...          
 1.0747  0.0385  0.0000  ...   0.6958  0.5573  0.0000
 0.0815  0.0000  0.0000  ...   0.1402  0.9188  0.2954
 0.1069  0.2299  0.1398  ...   0.1414  0.2787  0.0000
[torch.cuda.FloatTensor of size 128x256 (GPU 1)]

In [12]:
# Swap dimensions 1 and 2 of `features`.
f_t= features.transpose(2,1)

In [14]:
f_t.transpose(2,1)[0]

Variable containing:
 0.5076  1.0960  0.0000  ...   0.0000  0.0000  0.0976
 0.2090  0.4434  0.5210  ...   0.3823  0.3628  0.7257
 0.7278  0.5846  1.3169  ...   0.2342  0.4350  1.7801
          ...             ⋱             ...          
 1.0747  0.0385  0.0000  ...   0.6958  0.5573  0.0000
 0.0815  0.0000  0.0000  ...   0.1402  0.9188  0.2954
 0.1069  0.2299  0.1398  ...   0.1414  0.2787  0.0000
[torch.cuda.FloatTensor of size 128x256 (GPU 1)]

In [4]:
# Width of the last dimension of `features` (the printed slices are 128x256).
feature_size =256
# Linear map from feature width to LSTM hidden width; init_lstm below uses it.
# Both layers are placed on GPU 1 unconditionally, so this cell needs CUDA.
init_layer = nn.Linear(feature_size, hidden_size).cuda(1)
attn = Attn('general', feature_size, hidden_size).cuda(1)
def init_lstm(features):
    """Build the initial LSTM hidden and cell state from encoder features.

    The features are averaged over dim 1 and projected through the
    module-level ``init_layer``. The projection gets a leading axis of size 1
    and is returned twice, so hidden and cell state are the same tensor.

    Args:
        features: encoder output; assumed to be
            (batch, locations, feature_size) — TODO confirm against
            AttnEncoder.

    Returns:
        Tuple ``(h0, c0)``, each ``(1, batch, out_features of init_layer)``.
    """
    batch = features.size(0)
    # torch.mean avoids the hand-written `1/features.size(1)` factor, which
    # is integer division (i.e. 0) on Python 2. Viewing to (batch, -1) gives
    # the same 2-D result whether or not the reduction keeps the reduced
    # dimension, and never squeezes away a feature axis of size 1.
    mean_feature = torch.mean(features, 1).view(batch, -1)
    state = init_layer(mean_feature).unsqueeze(0)
    return state, state
# Initial LSTM state; init_lstm returns the same tensor twice, so h_ and c_
# refer to one object.
h_, c_ = init_lstm(features)
# Attention context computed by the Attn module from the state and features.
context = attn(h_,features)

In [9]:
# Scratch 2x5x10 random tensor for trying out tensor operations.
temp = torch.rand(2,5,10)

In [13]:
# Embed the caption token ids of the last loaded batch.
embed = embedding(captions2)
# Join, along dim 1, the attention context with the embedding of the token
# at position 1 of every caption (unsqueeze restores the dim-1 axis).
lstm_input = torch.cat((context, embed[:,1].unsqueeze(1)),1)

In [14]:
# Fresh, randomly initialised LSTM with input width 256, placed on GPU 1.
lstm = nn.LSTM(256, hidden_size, num_layers, batch_first=True).cuda(1)
# Run it over lstm_input starting from the (h_, c_) state built by init_lstm.
lstm_out, (h,c)  = lstm(lstm_input,(h_,c_))

In [19]:
# Same LSTM, but without an explicit initial state.
hiddens, states = lstm(lstm_input) 

In [27]:
features[0]

Variable containing:
 1.1240  0.9178  1.0267  ...   0.4510  0.0968  0.6079
 0.0058  0.0327  0.0026  ...   0.0016  0.1136  0.0467
 0.0670  0.0000  0.0000  ...   0.0000  0.0017  0.0026
          ...             ⋱             ...          
 0.1510  0.3688  0.4252  ...   0.1172  0.3290  0.0373
 0.0000  0.2017  0.1676  ...   0.2163  0.3762  0.0000
 0.4142  0.3087  0.5983  ...   0.2978  0.1381  0.1541
[torch.cuda.FloatTensor of size 128x256 (GPU 1)]

In [28]:
features.transpose(2,1)[0]

Variable containing:
 1.1240  0.0058  0.0670  ...   0.1510  0.0000  0.4142
 0.9178  0.0327  0.0000  ...   0.3688  0.2017  0.3087
 1.0267  0.0026  0.0000  ...   0.4252  0.1676  0.5983
          ...             ⋱             ...          
 0.4510  0.0016  0.0000  ...   0.1172  0.2163  0.2978
 0.0968  0.1136  0.0017  ...   0.3290  0.3762  0.1381
 0.6079  0.0467  0.0026  ...   0.0373  0.0000  0.1541
[torch.cuda.FloatTensor of size 256x128 (GPU 1)]

In [15]:
h.size()

torch.Size([1, 64, 512])

In [None]:
lstm_out.size()

In [None]:
# NOTE(review): `hiddens_temp` is only assigned in the In [22] cell further
# down, so on a fresh top-to-bottom run this line raises NameError.
hiddens_temp.size()

In [None]:
# Single token id (1) wrapped as a Variable on GPU 1, used to probe the
# embedding table in the next cell.
x_int = Variable(torch.LongTensor([1])).cuda(1)

In [None]:
embedding(x_int)

In [None]:
# Scratch 2x5x10 random tensor; the following cells display it and its
# view(-1, 10) reshape.
temp = torch.rand(2,5,10)

In [None]:
temp

In [None]:
temp.view(-1,10)

In [None]:
features.size()

In [None]:
embed[:,1].unsqueeze(1)

In [22]:
# Rebinds `lstm` to a new, randomly initialised LSTM (input width 256) on GPU 1.
lstm = nn.LSTM(256, hidden_size, num_layers, batch_first=True).cuda(1)
# Join the encoder features and the caption embeddings along dim 1 and feed
# the result to the LSTM as one sequence.
# NOTE: this `temp_cat` is a notebook global, unrelated to the local of the
# same name inside rearrange_tensor.
temp_cat = torch.cat((features,embed),1)
# No initial state is passed; `h` and `c` are overwritten with the final state.
hiddens_temp, (h,c) = lstm(temp_cat)

In [None]:
# NOTE(review): `hi` is not assigned in any visible cell — this looks like an
# unfinished identifier and would raise NameError when run.
hi

In [None]:
# Sum `features` over dim 1 (step-by-step replay of the first line of init_lstm).
sums = torch.sum(features, 1)

In [None]:
# Scale the summed features by 1/128 (128 matches the number of rows shown
# for each sample of `features`). The float literal keeps this a true
# division even on interpreters where 1/128 between ints evaluates to 0.
out = torch.mul(sums, 1.0 / 128)

In [None]:
out.squeeze(1)

In [33]:
# NOTE(review): `attn_weight` is assigned in the In [29] cell that follows,
# so this cell only works after that one has been executed.
attn_weight.size()

torch.Size([64, 128, 1])

In [29]:
import torch.nn.functional as F

# Hand-rolled attention over the encoder features.
# Drop the leading axis of the LSTM hidden state (rebinds the global `h_`).
h_ = h.squeeze(0)
# Project the hidden state from width 512 to width 256 so it can be
# multiplied with `features`; the trailing unsqueeze makes it a column
# vector per batch element for bmm.
align_linear = nn.Linear(512, 256).cuda(1)
de_h = align_linear(h_).unsqueeze(2)
# One raw score per feature row of each batch element.
attn_weight = torch.bmm(features, de_h)
# Normalise over dim 1 (the feature rows of one sample). Without an explicit
# `dim`, softmax on a 3-D tensor falls back to dim 0 and would normalise
# across the batch instead.
attn_weight = F.softmax(attn_weight, dim=1)

In [31]:
# Weighted sum of the feature rows using the attention weights; rebinds the
# global `context` (the size printed two cells below is [64, 1, 256]).
context = torch.bmm(attn_weight.transpose(2,1),features)

In [32]:
context.size()

torch.Size([64, 1, 256])

In [None]:
# Alternative to concatenation: add the context (dim 1 squeezed away) to the
# embedding of the token at position 1.
temp = context.squeeze(1) + embed[:,1]

In [None]:
temp.size()

In [None]:
# Rebuild lstm_input from the current `context`: context and the embedding
# of the token at position 1, joined along dim 1.
lstm_input = torch.cat((context, embed[:,1].unsqueeze(1)),1)

In [None]:
# Run the LSTM with no initial state (None); overwrites the globals
# `out`, `h` and `c`.
out, (h,c) = lstm(lstm_input, None)

In [None]:
h

In [None]:
states

In [None]:
# Scratch 512 -> 256 linear layer (left on the CPU); no other visible cell uses it.
temp_layer = nn.Linear(512,256)

In [None]:
# Scratch 64x1x512 random tensor; rebinds the global `temp`.
temp = torch.rand(64,1,512)

In [None]:
# Drop the leading axis of the hidden state; rebinds the global `h_`.
h_ = h.squeeze(0)

In [None]:
# New randomly initialised 512 -> 256 projection on GPU 1, applied to the
# squeezed hidden state.
align_linear = nn.Linear(512, 256).cuda(1)
de_h = align_linear(h_)

In [None]:
# Add a trailing axis so the projected state can be the right operand of bmm.
de_hidden = de_h.unsqueeze(2)

In [None]:
# Raw attention scores via batched matrix product.
# NOTE(review): this rebinds `attn`, which earlier held the Attn module, so
# the `context = attn(h_,features)` cell fails if re-run after this one.
attn= torch.bmm(features, de_hidden)

In [None]:
attn.squeeze(2)

In [None]:
import torch.nn.functional as F
# Normalise the scores of each batch element after dropping the trailing
# axis. `dim=1` is the axis the implicit rule picks for a 2-D input; naming
# it avoids relying on the deprecated implicit choice.
# NOTE: `attn` is overwritten with its own transform, so this cell cannot be
# run twice in a row without re-running the bmm cell first.
attn = F.softmax(attn.squeeze(2), dim=1)

In [None]:
features[0]

In [None]:
features.transpose(1,2)[0]

In [None]:
features

In [None]:
# NOTE(review): dot needs operands with the same number of elements (and 1-D
# operands in current PyTorch). `features` and `h` are multi-dimensional in
# the other cells, so this call is expected to fail — the neighbouring cells
# use torch.bmm for this product instead.
features.dot(h.squeeze(0))

In [None]:
h

In [None]:
h.squeeze(0)

In [None]:
features

In [None]:
features.transpose(1,0)

In [None]:
# LSTM over the one-hot captions. It is placed on GPU 1 explicitly: a bare
# .cuda() selects the current default device, while `captions_` was moved to
# GPU 1 by to_var, and the forward pass needs both on the same device.
# Rebinds the globals `lstm` and `lstm_out`.
lstm = nn.LSTM(len_vocab, hidden_size, num_layers, batch_first=True).cuda(1)
# Pack the padded one-hot captions using the per-caption lengths from the
# last loaded batch.
# NOTE(review): pack_padded_sequence classically expects lengths sorted in
# decreasing order — confirm the data loader provides that.
packed = pack_padded_sequence(captions_, leng, batch_first=True)
lstm_out, _ = lstm(packed)

In [None]:
lstm_out