In [1]:
import numpy as np
import os
from data_loader import build_vocab, get_loader
from model import EncoderCNN, DecoderRNN 
from model import ResNet, ResidualBlock
import torch
from torch.autograd import Variable 
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms
import pickle
import torch.nn as nn 
# Scratch random tensor of shape (128, 26); no other visible cell reads `h_dec`.
h_dec = torch.rand(128,26)

def to_var(x, volatile=False):
    """Wrap a tensor in an autograd ``Variable``, using the GPU when one is available.

    Args:
        x: Tensor to wrap.
        volatile: Passed through unchanged to ``Variable``.

    Returns:
        A ``Variable`` built from ``x``, or from its CUDA copy when
        ``torch.cuda.is_available()`` is true.
    """
    tensor = x.cuda() if torch.cuda.is_available() else x
    return Variable(tensor, volatile=volatile)

def rearrange_tensor(x, batch_size, caption_size):
    """Regroup consecutive row-chunks of ``x`` into one wide row per batch element.

    ``x`` is read as ``caption_size`` consecutive chunks of ``batch_size`` rows
    along dim 0. Each chunk is reshaped to ``(batch_size, -1)`` and the chunks
    are joined along dim 1, so row ``b`` of the result holds the ``b``-th row
    of chunk 0, then of chunk 1, and so on.

    Args:
        x: Tensor whose dim 0 is expected to hold ``batch_size * caption_size``
            rows (not checked here).
        batch_size: Number of rows in each chunk.
        caption_size: Number of chunks to take; must be at least 1.

    Returns:
        Tensor with ``batch_size`` rows containing the chunks side by side.

    Raises:
        ValueError: If ``caption_size`` is less than 1 (there would be nothing
            to concatenate).
    """
    if caption_size < 1:
        raise ValueError("caption_size must be >= 1, got %r" % (caption_size,))

    # Collect every chunk first and concatenate once, instead of growing the
    # result with a fresh torch.cat on each iteration.
    chunks = [
        x[step * batch_size:(step + 1) * batch_size].view(batch_size, -1)
        for step in range(caption_size)
    ]
    return torch.cat(chunks, 1)

# Data locations.
root_path = 'data/circle_and_rect/'
vocab_path = 'data/vocab.pkl'

# Data-loader settings.
batch_size, num_workers = 128, 2

# Model dimensions.
embed_size, hidden_size, num_layers = 256, 512, 1

In [2]:
# Image preprocessing: convert to a tensor, then normalise each of the three
# channels with the fixed mean / std triples below.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        (0.485, 0.456, 0.406),
        (0.229, 0.224, 0.225)
    )
])

# Load the pickled vocabulary wrapper.
# NOTE: pickle.load can execute arbitrary code, so vocab_path must point at a
# trusted file.
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)
len_vocab = vocab.idx

# Batches come from the project's get_loader helper (see data_loader.py).
data_loader = get_loader(
    root_path, vocab, transform, batch_size,
    shuffle=True, num_workers=num_workers
)

# Encoder / decoder pair; both are built around len_vocab.
encoder = ResNet(ResidualBlock, [3, 3, 3], len_vocab)
decoder = DecoderRNN(len_vocab, hidden_size, len(vocab), num_layers)

# Move both models to the GPU when one is present.
if torch.cuda.is_available():
    encoder.cuda()
    decoder.cuda()

In [3]:
# Smoke test: push the first two mini-batches through the encoder and decoder.
# The loop variables are deliberately left in the kernel namespace because the
# cells below inspect them.
for i, (images, captions, lengths) in enumerate(data_loader):
    if i > 1:
        break

    # One-hot encode the caption ids along a new last dimension of size
    # len_vocab. The buffer is sized from the batch actually received rather
    # than from the configured batch_size, so a short batch still yields a
    # tensor whose batch dimension matches `features`.
    cap_ = torch.unsqueeze(captions, 2)
    one_hot_ = torch.FloatTensor(captions.size(0), captions.size(1), len_vocab).zero_()
    one_hot_caption = one_hot_.scatter_(2, cap_, 1)
    leng = lengths  # kept under a second name for the later packing cells

    images = to_var(images)
    captions = to_var(captions)
    captions_ = to_var(one_hot_caption)
    features = encoder(images)
    # [0] keeps only the flattened data of the packed sequence.
    targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
    outputs = decoder(features, captions_, lengths)
    


In [15]:
# Inspect the encoder output left over from the last batch the loop cell processed.
features

Variable containing:
 0.1514 -0.6299 -0.5498  ...   0.2470 -0.4758 -0.0528
 0.2110  0.3993  0.0387  ...  -0.2643 -0.3959  0.0107
 0.2059  0.5353  0.1640  ...  -0.3173 -0.2057 -0.2928
          ...             ⋱             ...          
 0.2427 -0.1323  0.2085  ...  -0.5680 -0.4116  1.1060
-0.8971  0.6601  1.3120  ...  -1.2357 -0.2281 -0.1544
 0.0348  0.0856 -0.0078  ...  -0.2732 -0.3577  0.1062
[torch.cuda.FloatTensor of size 128x26 (GPU 0)]

In [13]:
# Stand-alone LSTM used to reproduce the decoder's recurrent step by hand.
# `nn` is already imported in the first cell. The module goes to the default
# CUDA device (when available) so it sits with the tensors produced by
# to_var(), instead of being pinned to GPU index 1.
lstm = nn.LSTM(len_vocab, hidden_size, num_layers, batch_first=True)
if torch.cuda.is_available():
    lstm.cuda()

# Prepend the image features as an extra first time step, then run the LSTM.
temp_cat = torch.cat((features.unsqueeze(1), captions_), 1)
hiddens_temp, _ = lstm(temp_cat)

In [14]:
# Feed the one-hot captions alone (no image-feature step) through the stand-alone LSTM and show what it returns.
lstm(captions_)

(Variable containing:
 ( 0 ,.,.) = 
  -2.9497e-03 -1.2480e-02 -1.5294e-02  ...   1.6388e-02 -8.7888e-03  1.4579e-02
   1.1192e-02 -7.6828e-03 -3.0252e-02  ...   1.0948e-02  3.4776e-04  3.1918e-03
   1.1136e-02  9.9110e-05 -2.3322e-02  ...   1.4504e-02  4.7733e-03  2.1861e-03
   1.8795e-02  2.4935e-03 -2.1392e-02  ...   1.1449e-02  1.3152e-03  1.1527e-02
   2.2420e-02  3.3504e-03 -2.0521e-02  ...   1.0018e-02 -2.1358e-03  1.5562e-02
   1.5326e-02 -1.0396e-02 -3.0186e-02  ...   1.8326e-02 -1.2069e-02  9.6916e-03
 
 ( 1 ,.,.) = 
  -2.9497e-03 -1.2480e-02 -1.5294e-02  ...   1.6388e-02 -8.7888e-03  1.4579e-02
   3.0426e-03 -7.2670e-03 -2.3224e-02  ...   1.8275e-02 -6.5073e-03  1.7082e-02
   6.6743e-03 -4.1104e-04 -2.1789e-02  ...   1.9862e-02  3.2658e-04  1.0506e-02
   6.4547e-03 -1.0442e-02 -2.0746e-02  ...   1.3157e-02  8.1170e-04  1.3963e-02
   7.1653e-03 -1.6244e-02 -1.8721e-02  ...   9.0862e-03  1.2164e-04  1.5033e-02
   9.0729e-03 -2.1111e-02 -2.7527e-02  ...   1.7913e-02 -1.0420e-02 

In [None]:
# Output projection from LSTM hidden size to vocabulary size. The GPU move is
# guarded the same way as to_var() and the model-setup cell, so this cell also
# runs on a CPU-only host.
linear = nn.Linear(hidden_size, len_vocab)
if torch.cuda.is_available():
    linear.cuda()

In [None]:
# Flatten the LSTM outputs to 2-D with hidden_size columns (all leading dims merged into the rows).
temp4= hiddens_temp.contiguous().view(-1,hidden_size)

In [None]:
# Pack the one-hot captions using the saved lengths (batch-first layout), then display the packed result.
packed2 = pack_padded_sequence(captions_, leng, batch_first=True)
packed2

In [None]:
# Flatten the LSTM outputs to (-1, hidden_size). The configured hidden_size
# replaces the literal 512 so this stays correct if the setting changes.
hiddens_temp.contiguous().view(-1, hidden_size)

In [None]:
# Scratch random tensor of shape (128, 7, 512); not read by any other visible cell.
temp3 = torch.rand(128,7,512)

In [None]:
# Output projection from hidden size to vocabulary size.
# (The original line was missing its closing parenthesis and did not parse.)
linear = nn.Linear(hidden_size, len_vocab)

In [None]:
# Probe the projection's output size with a single random hidden vector.
# Module and input are moved together to the default CUDA device (when there
# is one) rather than to a hard-coded GPU index 1, which fails on hosts with
# fewer than two GPUs.
linear = nn.Linear(hidden_size, len_vocab)
temp2 = torch.rand(hidden_size)
if torch.cuda.is_available():
    linear.cuda()
    temp2 = temp2.cuda()
linear(temp2).size()

In [None]:
# Project the stand-alone LSTM outputs to vocabulary size. The layer follows
# the default-device convention used by to_var() instead of being pinned to
# GPU index 1.
linear = nn.Linear(hidden_size, len_vocab)
if torch.cuda.is_available():
    linear.cuda()
linear(hiddens_temp)

In [4]:
# Put the image features in front of the one-hot caption steps (joined along
# dim 1), then pack the result with the saved caption lengths.
embeddings = torch.cat([features.unsqueeze(1), captions_], 1)
packed = pack_padded_sequence(embeddings, leng, batch_first=True)

In [6]:
# Inspect the concatenated tensor: image features first, one-hot caption steps after.
embeddings

Variable containing:
( 0 ,.,.) = 
  0.1514 -0.6299 -0.5498  ...   0.2470 -0.4758 -0.0528
  0.0000  1.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
           ...             ⋱             ...          
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  1.0000  ...   0.0000  0.0000  0.0000

( 1 ,.,.) = 
  0.2110  0.3993  0.0387  ...  -0.2643 -0.3959  0.0107
  0.0000  1.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
           ...             ⋱             ...          
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  1.0000  ...   0.0000  0.0000  0.0000

( 2 ,.,.) = 
  0.2059  0.5353  0.1640  ...  -0.3173 -0.2057 -0.2928
  0.0000  1.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
   

In [7]:
# First element of the packed sequence (the flattened data tensor; shown below as 709x26).
packed[0]

Variable containing:
 0.1514 -0.6299 -0.5498  ...   0.2470 -0.4758 -0.0528
 0.2110  0.3993  0.0387  ...  -0.2643 -0.3959  0.0107
 0.2059  0.5353  0.1640  ...  -0.3173 -0.2057 -0.2928
          ...             ⋱             ...          
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
[torch.cuda.FloatTensor of size 709x26 (GPU 0)]

In [None]:
# Fresh stand-alone LSTM over vocabulary-sized inputs. `nn` is already
# imported in the first cell. The module goes to the default CUDA device
# (when available): the packed input displayed above lives on GPU 0, so
# pinning the LSTM to GPU index 1 would put module and input on different
# devices.
lstm = nn.LSTM(len_vocab, hidden_size, num_layers, batch_first=True)
if torch.cuda.is_available():
    lstm.cuda()

In [None]:
# Run the packed sequence through the stand-alone LSTM; the second return value is discarded.
hiddens, _ = lstm(packed)

In [None]:
# Inspect the current encoder output.
features

In [None]:
# First element of the LSTM output for the packed input (presumably the packed data tensor — confirm).
hiddens[0]

In [None]:
# Inspect the packed sequence object.
packed

In [None]:
# Build a 1x26 float row of zeros and switch on column 1 by slice assignment
# (an earlier scatter_ attempt with a scalar index was left disabled).
# `one_hot` is the same tensor object as `y_onehot`, not a copy.
y_onehot = torch.FloatTensor(1, 26).zero_()
one_hot = y_onehot
one_hot[:, 1] = 1

In [None]:
# 1x1 float tensor whose only row is set to 1.
tep = torch.FloatTensor(1, 1)
tep[0, :] = 1

In [None]:
# View the first row of `tep` as a NumPy array (via .cpu() first).
tep[0].cpu().numpy()

In [None]:
# In-place scatter of 1.0 into y_onehot along dim 0 at the positions in `inx`.
# NOTE(review): `inx` is not assigned in any visible cell, so this raises NameError on a fresh kernel — confirm where it should come from.
y_onehot.scatter_(0,inx, 1.0)

In [None]:
# Embedding table with 26 entries of dimension 512.
# NOTE(review): `nn` is already imported in the first cell; this import is redundant.
import torch.nn as nn 
embed = nn.Embedding(26,512)

In [None]:
# 26 random integer indices in [0, 26), for trying out the 26-entry embedding
# table. torch.rand() samples from [0, 1), so casting it straight to long
# (as the original did) truncates every value to 0; scale first so the
# indices actually vary.
rand = Variable((torch.rand(26) * 26).long())

In [None]:
# Look up the embedding rows for the index tensor `rand`.
embed(rand)

In [None]:
# NOTE(review): `captions2` is not assigned in any visible cell; this fails with NameError on a fresh kernel.
captions2

In [None]:
# NOTE(review): repeat of the previous cell; `captions2` is not assigned in any visible cell.
captions2

In [None]:
# Concatenate the image features (as an extra step) with `captions1` along dim 1.
# NOTE(review): `captions1` is not assigned in any visible cell — confirm its source.
torch.cat((features.unsqueeze(1), captions1), 1)

In [None]:
# Wrap the current images with to_var() and recompute the encoder output; this overwrites `features`.
temp_images = to_var(images)
features= encoder(temp_images)


In [None]:
# Check whether the encoder output tracks gradients.
features.requires_grad

In [None]:
# Manual walk through the decoder's input pipeline for the current batch.
temp_images = to_var(images)
temp_caption = to_var(captions)
# Caption embeddings come from the decoder's `embed` attribute (defined in model.py, not visible here).
embedding =decoder.embed(temp_caption)
features= encoder(temp_images)
# Prepend the image features as an extra first step along dim 1.
# NOTE(review): torch.cat on dim 1 needs `features` and `embedding` to agree on their last dimension — confirm against model.py.
cat_embeddings = torch.cat((features.unsqueeze(1), embedding), 1)       
# Pack with the saved caption lengths, then run the decoder's own LSTM; its second return value is discarded.
packed = pack_padded_sequence(cat_embeddings, leng, batch_first=True) 
hidden, _ = decoder.lstm(packed)

In [None]:
# Rebuild the one-hot captions and encoder features for the current batch.
# The buffer takes its batch and sequence sizes from the caption tensor and
# its depth from len_vocab (as the batch-loop cell does), instead of the
# hard-coded (128, 5, 26), which only fits one particular batch shape.
cap_ = torch.unsqueeze(captions, 2)
one_hot_ = torch.FloatTensor(cap_.size(0), cap_.size(1), len_vocab).zero_()
one_hot_caption = one_hot_.scatter_(2, cap_, 1)
temp_images = to_var(images)
features = encoder(temp_images)
one_hot_caption_ = to_var(one_hot_caption)

In [None]:
# Prepend the image features as an extra step in front of the one-hot captions (joined along dim 1).
torch.cat((features.unsqueeze(1), one_hot_caption_), 1) 

In [None]:
# Run the decoder on the features, the one-hot captions and the saved lengths, and show its output.
decoder(features, one_hot_caption_, leng)

In [None]:
# Inspect the packed sequence object.
packed

In [None]:
# Inspect the current `captions` value.
captions

In [None]:
# Add a trailing singleton dimension to the captions so they can serve as a scatter_ index along dim 2.
tmep_cap_ = captions.unsqueeze(2)

In [None]:
# Displays the result of requesting CUDA device 1 for tmep_cap_; the result is not assigned, so the name `tmep_cap_` still refers to the original tensor.
tmep_cap_.cuda(1)

In [None]:
# Zero-filled float buffer of shape (128, 5, 26), created and then sent to CUDA device index 1.
one_hot = torch.FloatTensor(128,5,26).zero_().cuda(1)

In [None]:
# In-place scatter of 1s along dim 2 at the positions given by tmep_cap_.
# NOTE(review): one_hot was sent to CUDA device 1, while tmep_cap_ was never reassigned to that device — confirm both tensors are on the same device before running.
one_hot.scatter_(2,tmep_cap_, 1)

In [None]:
# Integer (LongTensor) variant of the one-hot encoding. The buffer takes its
# batch and sequence sizes from the caption tensor and its depth from
# len_vocab, instead of the hard-coded (128, 5, 26).
cap_ = torch.unsqueeze(captions, 2)
one_hot_ = torch.LongTensor(cap_.size(0), cap_.size(1), len_vocab).zero_()
ddas = one_hot_.scatter_(2, cap_, 1)

In [None]:
# Display the integer one-hot tensor after moving it to the default CUDA device (result not assigned).
ddas.cuda()

In [None]:
# Self-contained one-hot demo: encode a (16, 28) grid of class ids in [0, n)
# as a (16, 28, n) float tensor.
n = 10

# Sample the ids explicitly. torch.LongTensor(16, 28) only allocates memory
# without initialising it, so taking `% n` of it depends on leftover bytes.
inp = (torch.rand(16, 28) * n).long()
inp_ = torch.unsqueeze(inp, 2)

# Start from zeros and write a 1 at each id's position along the last dim.
one_hot = torch.FloatTensor(16, 28, n).zero_()
one_hot.scatter_(2, inp_, 1)

print(inp)
print(one_hot)

In [None]:
# NOTE(review): torch.dot takes two 1-D tensors; this call passes only the torch.Tensor class, so it raises as written. Looks like an unfinished scratch cell.
torch.dot(torch.Tensor)