# Show, Attend and Tell

- Reference: https://github.com/parksunwoo/show_attend_and_tell_pytorch

In [1]:
"""
Creates a MobileNetV2 model as defined in the paper: M. Sandler, 
A. Howard, M. Zhu, A. Zhmoginov, L.-C. Chen. "MobileNetV2: Inverted 
Residuals and Linear Bottlenecks.", arXiv:1801.04381, 2018."

Code reference: https://github.com/tonylins/pytorch-mobilenet-v2
ImageNet pretrained weights: https://drive.google.com/file/d/1jlto6HRVD3ipNkAl1lNhDbkBp7HylaqR
"""
import math
import torch
import torch.nn as nn



def conv_bn(inp, oup, stride):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU6(inplace=True)
    )


def conv_1x1_bn(inp, oup):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU6(inplace=True)
    )


class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = round(inp * expand_ratio)
        self.use_res_connect = self.stride == 1 and inp == oup

        if expand_ratio == 1:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)


class MobileNetV2(nn.Module):
    def __init__(self, n_class=1000, input_size=224, width_mult=1.):
        super(MobileNetV2, self).__init__()
        block = InvertedResidual
        input_channel = 32
        last_channel = 1280
        interverted_residual_setting = [
            # t, c, n, s
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        # building first layer
        assert input_size % 32 == 0
        input_channel = int(input_channel * width_mult)
        self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
        self.features = [conv_bn(3, input_channel, 2)]
        # building inverted residual blocks
        for t, c, n, s in interverted_residual_setting:
            output_channel = int(c * width_mult)
            for i in range(n):
                if i == 0:
                    self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
                else:
                    self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
                input_channel = output_channel
        # building last several layers
        self.features.append(conv_1x1_bn(input_channel, self.last_channel))
        # make it nn.Sequential
        self.features = nn.Sequential(*self.features)

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, n_class),
        )

        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = x.mean(3).mean(2)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
                
def MobileNet(pretrained=True, **kwargs):
    """
    Constructs a MobileNet V2 model.
    
    Parameters
    ----------
    pretrained: bool, use ImageNet pretrained model or not.
    n_class: int, 1000 classes in ImageNet data.
    weight_file: str, path to pretrained weights
    """
    weight_file = kwargs.pop('weight_file', '')
    model = MobileNetV2(**kwargs)
    if pretrained:
        state_dict = torch.load(weight_file)
        model.load_state_dict(state_dict)
    return model

In [2]:
import torch

# Load weights pretrained on ImageNet data
net = MobileNetV2(n_class=1000)
state_dict = torch.load('./mobilenet_v2.pth.tar')
net.load_state_dict(state_dict)

In [3]:
# Load weights pretrained on ImageNet data using function
model = MobileNet(pretrained=True, n_class=1000, weight_file='./mobilenet_v2.pth.tar')

x = torch.randn(1, 3, 256, 256)
y = model(x)

print('Feature map size: ', y.size())

Feature map size:  torch.Size([1, 1000])


In [4]:
# Check ResNet
import torchvision
resnet = torchvision.models.resnet101(pretrained=True)
modules = list(resnet.children())[:-2]
print(len(modules))

ResNet = nn.Sequential(*modules)

for s in [224, 256, 448, 512, 1024]:
    x = torch.randn(1, 3, s, s)
    y = ResNet(x)
    print('Feature map size for input size {}: {}'.format(s, y.size()))

8
Feature map size for input size 224: torch.Size([1, 2048, 7, 7])
Feature map size for input size 256: torch.Size([1, 2048, 8, 8])
Feature map size for input size 448: torch.Size([1, 2048, 14, 14])
Feature map size for input size 512: torch.Size([1, 2048, 16, 16])
Feature map size for input size 1024: torch.Size([1, 2048, 32, 32])


In [5]:
# Load weights pretrained on ImageNet data using function
model = MobileNet(pretrained=True, n_class=1000, weight_file='./mobilenet_v2.pth.tar')

# Use only CONV layers. NOTE: input_size=224
modules = list(model.children())[:-1]
print(len(modules))

mobilenet = nn.Sequential(*modules)

for s in [224, 256, 448, 512, 1024]:
    x = torch.randn(1, 3, s, s)
    y = mobilenet(x)
    print('Feature map size for input size {}: {}'.format(s, y.size()))

1
Feature map size for input size 224: torch.Size([1, 1280, 7, 7])
Feature map size for input size 256: torch.Size([1, 1280, 8, 8])
Feature map size for input size 448: torch.Size([1, 1280, 14, 14])
Feature map size for input size 512: torch.Size([1, 1280, 16, 16])
Feature map size for input size 1024: torch.Size([1, 1280, 32, 32])


In [6]:
# Adaptive Pool
feat_size = 14
adaptive_pool = nn.AdaptiveAvgPool2d((feat_size, feat_size))

# Load weights pretrained on ImageNet data using function
model = MobileNet(pretrained=True, n_class=1000, weight_file='./mobilenet_v2.pth.tar')

# Use only CONV layers. NOTE: input_size=224
modules = list(model.children())[:-1]
print(len(modules))

mobilenet = nn.Sequential(*modules)

for s in [224, 256, 448, 512, 1024]:
    x = torch.randn(1, 3, s, s)
    y = mobilenet(x)
    out = adaptive_pool(y)
    print('Feature map size for input size {}: {}'.format(s, out.size()))

1
Feature map size for input size 224: torch.Size([1, 1280, 14, 14])
Feature map size for input size 256: torch.Size([1, 1280, 14, 14])
Feature map size for input size 448: torch.Size([1, 1280, 14, 14])
Feature map size for input size 512: torch.Size([1, 1280, 14, 14])
Feature map size for input size 1024: torch.Size([1, 1280, 14, 14])


##### Fine-tune

In [7]:
# Disable gradient update
print(list(mobilenet.parameters())[5])

for param in mobilenet.parameters():
    param.requires_grad = False

print(list(mobilenet.parameters())[5])

Parameter containing:
tensor([ 1.3784,  0.1473,  1.4864,  0.0514, -0.1557,  0.8790,  0.8830,  1.6455,
         0.1455,  0.1629, -0.0010,  0.7300,  0.8758,  0.7512, -0.0252,  1.3816,
         1.2248, -0.0075,  1.1364,  1.2654, -0.0028,  1.0171, -0.0015,  1.6248,
        -0.0977,  0.2228,  0.0906, -0.2924,  0.3492, -0.0314,  0.0851,  1.2641],
       requires_grad=True)
Parameter containing:
tensor([ 1.3784,  0.1473,  1.4864,  0.0514, -0.1557,  0.8790,  0.8830,  1.6455,
         0.1455,  0.1629, -0.0010,  0.7300,  0.8758,  0.7512, -0.0252,  1.3816,
         1.2248, -0.0075,  1.1364,  1.2654, -0.0028,  1.0171, -0.0015,  1.6248,
        -0.0977,  0.2228,  0.0906, -0.2924,  0.3492, -0.0314,  0.0851,  1.2641])


In [8]:
# Enable gradient update for a few parameters
for c in list(mobilenet.children())[0][15:]:
    for p in c.parameters():
        p.requires_grad = True

# Encoder CNN

In [9]:
class EncoderCNN(nn.Module):
    """
    Convolutional Neural Network (MobileNetV2) that encodes input image 
    into encoded feature representations.
    """
    def __init__(self, weight_file, feature_size=14, tune_layer=None, finetune=False):
        """
        Parameters
        ----------
        weight_file: str, path to MobileNetV2 pretrained weights.
        feature_size: int, encoded feature map size to be used.
        tune_layer: int, tune layers from this layer onwards. For
            MobileNetV2 select integer from 0 (early) to 18 (final)
        finetune: bool, fine tune layers
        """
        super(EncoderCNN, self).__init__()
        self.weight_file = weight_file
        self.feature_size = feature_size
        self.tune_layer = tune_layer
        self.finetune = finetune
        self.pretrained = True
        
        # MobileNetV2 pretrained on ImageNet
        cnn = MobileNet(pretrained=self.pretrained, weight_file=self.weight_file)
        
        # Remove classification layer
        modules = list(cnn.children())[:-1]
        self.cnn = nn.Sequential(*modules)
        
        # Resize feature maps to fixed size to allow input images of variable size
        self.adaptive_pool = nn.AdaptiveAvgPool2d((self.feature_size, self.feature_size))
        
        # Fine-tune
        self.fine_tune()
        
    def forward(self, images):
        """
        Parameters
        ----------
        images: PyTorch tensor, size: [M, 3, H, W]
        """
        features = self.cnn(images) # size: [M, 1280, H/32, W/32]
        features = self.adaptive_pool(features) # size: [M, 1280, fs, fs]
        features = features.permute(0, 2, 3, 1) # size: [M, fs, fs, 1280]
        return features
    
    def fine_tune(self):
        """
        Fine-tuning CNN.
        """
        # Disable gradient computation
        for param in self.cnn.parameters():
            param.requires_grad = False
            
        # Enable gradient computation for few layers
        for child in list(self.cnn.children())[0][self.tune_layer:]:
            for param in child.parameters():
                param.requires_grad = self.finetune

In [10]:
# Check encoder with 3 images
encoder = EncoderCNN(weight_file='./mobilenet_v2.pth.tar')
encoder = encoder.to('cuda:1')
imgs = torch.randn(3, 3, 512, 512)
imgs = imgs.to('cuda:1')
feats = encoder(imgs)
print(feats.shape)

torch.Size([3, 14, 14, 1280])


In [11]:
# Check encoder with 3 images
encoder = EncoderCNN(weight_file='./mobilenet_v2.pth.tar')
imgs = torch.randn(3, 3, 512, 512)
feats = encoder(imgs)
print(feats.shape)

torch.Size([3, 14, 14, 1280])


# Attention Mechanism

In [12]:
class AttentionMechanism(nn.Module):
    """
    Attention Mechanism.
    """
    def __init__(self, encoder_size, decoder_size, attention_size):
        """
        Parameters
        ----------
        encoder_size: int, number of channels in encoder CNN output feature
            map (for MobileNetV2 it is 1280)
        decoder_size: int, number of features in the hidden state, i.e. LSTM 
            output size
        attention_size: int, size of MLP used to compute attention scores
        """
        super(AttentionMechanism, self).__init__()
        self.encoder_size = encoder_size
        self.decoder_size = decoder_size
        self.attention_size = attention_size
        
        # Linear layer to transform encoded features to attention size
        self.encoder_attn = nn.Linear(in_features=self.encoder_size, 
                                      out_features=self.attention_size)
        
        # Linear layer to transform decoders (hidden state) output to attention size
        self.decoder_attn = nn.Linear(in_features=self.decoder_size, 
                                      out_features=self.attention_size)
        
        # ReLU non-linearity
        self.relu = nn.ReLU()
        
        # Linear layer to compute attention scores at time t for L locations
        self.fc_attn = nn.Linear(in_features=self.attention_size, out_features=1)
        
        # Softmax layer to compute attention weights
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, encoder_out, decoder_out):
        """
        Parameters
        ----------
        encoder_out: PyTorch tensor, size: [M, L, D] where, L is feature
            map locations, and D is channels of encoded CNN feature map.
        decoder_out: PyTorch tensor, size: [M, h], where h is hidden
            dimension of the previous step output from decoder
            
        NOTE: M is batch size. k is attention size (see comments)
        
        Returns
        -------
        attn_weighted_encoding: PyTorch tensor, size: [M, D], attention weighted 
            annotation vector
        alpha: PyTorch tensor, size: [M, L], attention weights 
        """
        enc_attn = self.encoder_attn(encoder_out)  # size: [M, L, k]
        dec_attn = self.decoder_attn(decoder_out)  # size: [M, k]
        
        enc_dec_sum = enc_attn + dec_attn.unsqueeze(1)  # size: [M, L, k]
        
        # Compute attention scores for L locations at time t (Paper eq: 4)
        attn_scores = self.fc_attn(self.relu(enc_dec_sum))  # size: [M, L]
        
        # Compute for each location the probability that location i is the right 
        # place to focus for generating next word (Paper eq: 5)
        alpha = self.softmax(attn_scores.squeeze(2))  # size: [M, L]
        
        # Compute attention weighted annotation vector (Paper eq: 6)
        attn_weighted_encoding = torch.sum(encoder_out * alpha.unsqueeze(2), dim=1) # size: [M, D]
        
        return attn_weighted_encoding, alpha

### Attention Scratch

In [13]:
enc_size = 1280
attn_size = 512
dec_size = 1024

enc_attn = nn.Linear(enc_size, attn_size)
dec_attn = nn.Linear(dec_size, attn_size)
attn = nn.Linear(attn_size, 1)
relu = nn.ReLU()
sftmx = nn.Softmax(dim=1)

In [14]:
# Forward
enc_out = feats.view(-1, feats.size(1) * feats.size(2), enc_size)
print(enc_out.shape)

torch.Size([3, 196, 1280])


In [15]:
e_attn = enc_attn(enc_out)
print(e_attn.shape)

torch.Size([3, 196, 512])


In [16]:
dec_out = torch.randn(3, dec_size)
print(dec_out.shape)

torch.Size([3, 1024])


In [17]:
d_attn = dec_attn(dec_out)
print(d_attn.shape)
print(d_attn.unsqueeze(1).shape)

torch.Size([3, 512])
torch.Size([3, 1, 512])


In [18]:
ele_sum = d_attn.unsqueeze(1) + e_attn
print(ele_sum.shape)

torch.Size([3, 196, 512])


In [19]:
non_lin = relu(ele_sum)
print(non_lin.shape)

torch.Size([3, 196, 512])


In [20]:
f_attn = attn(non_lin)
print(f_attn.shape)
print(f_attn.squeeze(2).shape)

torch.Size([3, 196, 1])
torch.Size([3, 196])


In [21]:
smx = sftmx(f_attn.squeeze(2))
print(smx.shape)
print(smx.unsqueeze(2).shape)

torch.Size([3, 196])
torch.Size([3, 196, 1])


In [22]:
print((enc_out * smx.unsqueeze(2)).shape)
print(torch.sum((enc_out * smx.unsqueeze(2)), dim=1).shape)

torch.Size([3, 196, 1280])
torch.Size([3, 1280])


In [23]:
a = torch.Tensor([[[2, 4], [2, 4]], [[4, 4], [4, 4]], [[4, 8], [4, 8]]])
b = torch.tensor([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]])
print(a.shape, b.shape)
c = a * b.unsqueeze(2)
c, c.shape

torch.Size([3, 2, 2]) torch.Size([3, 2])


(tensor([[[1., 2.],
          [1., 2.]],
 
         [[2., 2.],
          [2., 2.]],
 
         [[2., 4.],
          [2., 4.]]]), torch.Size([3, 2, 2]))

# Decoder RNN /w Attention

In [24]:
class DecoderAttentionRNN(nn.Module):
    """
    RNN (LSTM) decoder to decode encoded images and generate sequences.
    """
    def __init__(self, encoder_size, decoder_size, attention_size, embedding_size, vocab_size, dropout=0.5):
        """
        encoder_size: int, number of channels in encoder CNN output feature
            map (for MobileNetV2 it is 1280)
        decoder_size: int, number of features in the hidden state, i.e. LSTM 
            output size
        attention_size: int, size of MLP used to compute attention scores
        embedding_size: int, size of embedding
        vocab_size: int, vocabulary size
        dropout: float, dropout probability
        """
        super(DecoderAttentionRNN, self).__init__()
        self.encoder_size = encoder_size
        self.decoder_size = decoder_size
        self.attention_size = attention_size
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.drop_prob = dropout
        
        # Create attention mechanism
        self.attention = AttentionMechanism(self.encoder_size, self.decoder_size, self.attention_size)
        
        # Create embedding layer
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)  # size: [V, E]
        
        # Create dropout module
        self.dropout = nn.Dropout(p=self.drop_prob)
        
        # Create LSTM cell (uses for loop) for decoding
        self.rnn = nn.LSTMCell(input_size=self.embedding_size + self.encoder_size, 
                               hidden_size=self.decoder_size, bias=True)
        
        # MLPs for LSTM cell's initial states
        self.init_h = nn.Linear(self.encoder_size, self.decoder_size)
        self.init_c = nn.Linear(self.encoder_size, self.decoder_size)
        
        # MLP to compute beta (gating scalar, paper section 4.2.1)
        self.f_beta = nn.Linear(self.decoder_size, 1) # scalar
        
        # Sigmoid to compute beta
        self.sigmoid = nn.Sigmoid()
        
        # FC layer to compute scores over vocabulary
        self.fc = nn.Linear(self.decoder_size, self.vocab_size)
        
    def init_lstm_states(self, encoder_out):
        """
        Initialize LSTM's initial hidden and cell memory states based on encoded
        feature representation. NOTE: Encoded feature map locations mean is used.
        """
        # Compute mean of encoder output locations
        mean_encoder_out = torch.mean(encoder_out, dim=1)  # size: [M, L, D] -> [M, D]
        
        # Initialize LSTMs hidden state
        h0 = self.init_h(mean_encoder_out)  # size: [M, h]
        
        # Initialize LSTMs cell memory state
        c0 = self.init_c(mean_encoder_out)  # size: [M, h]
        
        return h0, c0
    
    def forward(self, encoder_out, encoded_captions, caption_lengths):
        """
        Parameters
        ----------
        encoder_out: PyTorch tensor, size: [M, fs, fs, D] where, fs is feature
            map size, and D is channels of encoded CNN feature map.
        encoded_captions: PyTorch long tensor
        caption_lengths: PyTorch tensor
        """
        batch_size = encoder_out.size(0)
        
        # Flatten encoded feature maps from size [M, fs, fs, D] to [M, L, D]
        encoder_out = encoder_out.view(batch_size, -1, self.encoder_size)
        num_locations = encoder_out.size(1)
        
        # Sort caption lengths in descending order
        caption_lengths, sorted_idx = torch.sort(caption_lengths.squeeze(1), dim=0, 
                                                 descending=True)
        
        # Compute decode lengths to decode. Sequence generation ends when <END> token
        # is generated. A typical caption is [<START>, ..., <END>, <PAD>, <PAD>], caption
        # lengths only considers [<START>, ..., <END>], so when <END> is generated there
        # is no need to decode further. Decode lengths = caption lengths - 1
        decode_lengths = (caption_lengths - 1).tolist()
        
        # Sort encoded feature maps and captions as per caption lengths. REASON: Since a 
        # batch contains different caption lengths (and decode lengths). At each time step 
        # up to max decode length T in a batch we need to apply attention mechanism to only 
        # those images in batch whose decode length is greater than current time step
        encoder_out = encoder_out[sorted_idx]
        encoded_captions = encoded_captions[sorted_idx]
        
        # Get embeddings for encoded captions
        embeddings = self.embedding(encoded_captions) # size: [M, T, E]
        
        # Initialize LSTM's states
        h, c = self.init_lstm_states(encoder_out) # sizes: [M, h], [M, h]
        
        # Compute max decode length
        T = int(max(decode_lengths))
        
        # Create placeholders to store predicted scores and alphas (alphas for doubly stochastic attention)
        pred_scores = torch.zeros(batch_size, T, self.vocab_size) # size: [M, T, V]
        alphas = torch.zeros(batch_size, T, num_locations) # size: [M, T, L]
        
        # Decoding step: (1) Compute attention weighted encoding and attention weights
        # using encoder output, and initial hidden state; (2) Generate a new encoded word
        for t in range(T):
            # Compute batch size at step t (At step t how many decoding lengths are greater than t)
            batch_size_t = sum([dl > t for dl in decode_lengths])
            
            # Encoder output and encoded captions are already sorted by caption lengths
            # in descending order. So based on the number of decoding lengths that are 
            # greater than current t, extract data from encoded output and initial hidden state
            # as input to attention mechanism. 
            attn_weighted_encoding, alpha = self.attention(encoder_out[:batch_size_t],
                                                           h[:batch_size_t])
                        
            # Compute gating scalar beta (paper section: 4.2.1)
            beta_t = self.sigmoid(self.f_beta(h[:batch_size_t])) # size: [M, 1]
                        
            # Multiply gating scalar beta to attention weighted encoding
            context_vector = beta_t * attn_weighted_encoding  # size: [M, D]
                        
            # Concatenate embeddings and context vector, size: [M, E] and [M, D] -> [M, E+D]
            concat_input = torch.cat([embeddings[:batch_size_t, t, :], context_vector], dim=1) # size: [M, E+D]
                        
            # LSTM input states from time step t-1
            previous_states = (h[:batch_size_t], c[:batch_size_t])
                        
            # Generate decoded word
            h, c = self.rnn(concat_input, previous_states)
            
            print('batch_size_t: ', batch_size_t)
            print('encoder_out[:batch_size_t] shape: ', encoder_out[:batch_size_t].shape)
            print('h[:batch_size_t] shape: ', h[:batch_size_t].shape)
            print('attn_weighted_encoding shape: ', attn_weighted_encoding.shape)
            print('aplha shape: ', alpha.shape)
            print('beta_t shape: ', beta_t.shape)
            print('embeddings[:batch_size_t, t, :] shape: ', embeddings[:batch_size_t, t, :].shape)
            print('context_vector shape: ', context_vector.shape)
            print('concat_input shape: ', concat_input.shape)
            print('h (after rnn) shape: ', h.shape)
            print('c (after rnn) shape: ', c.shape)
            print('rnn run')
            print('step "{}" done!'.format(t))
            print('#####' * 10)
            
            # Compute scores over vocabulary
            scores = self.fc(self.dropout(h)) # size: [M, V]
            
            # Populate placeholders for predicted scores and alphas
            pred_scores[:batch_size_t, t, :] = scores
            alphas[:batch_size_t, t, :] = alpha # alpha size: [M, L]
            
        return pred_scores, encoded_captions, decode_lengths, alphas, sorted_idx

In [25]:
# Check encoder with 3 images: GPU
encoder_size = 1280
decoder_size = 1024
attention_size = 512
embedding_size = 256
vocab_size = 10000

imgs = torch.randn(3, 3, 512, 512)

encoder = EncoderCNN(weight_file='./mobilenet_v2.pth.tar')
decoder = DecoderAttentionRNN(encoder_size, decoder_size, attention_size, embedding_size, vocab_size)

encoder = encoder.to('cuda:1')
decoder = decoder.to('cuda:1')

encoder_out = encoder(imgs.to('cuda:1'))
encoded_captions = torch.randint(1, 9999, size=(3, 15)).long().to('cuda:1')
caption_lengths = torch.randint(1, 15, size=(3, 1)).to('cuda:1')

print(encoder_out.shape)
print(encoded_captions)
print(caption_lengths)

decoder_out = decoder(encoder_out, encoded_captions, caption_lengths)

torch.Size([3, 14, 14, 1280])
tensor([[9902, 2996, 3133, 9635, 3315, 6483, 7494, 3461, 3994, 6708, 2103, 5506,
         2180, 6515,  606],
        [6601, 2879, 5645,  163, 7685, 1172, 3993, 3136, 7757, 7350, 8597,  900,
         1815, 7622, 6741],
        [7990, 3099, 1810, 5251, 9167, 2080, 7723, 5033, 5226, 2909, 3793,  504,
         5083, 3416, 5549]], device='cuda:1')
tensor([[ 5.],
        [13.],
        [ 1.]], device='cuda:1')
batch_size_t:  2
encoder_out[:batch_size_t] shape:  torch.Size([2, 196, 1280])
h[:batch_size_t] shape:  torch.Size([2, 1024])
attn_weighted_encoding shape:  torch.Size([2, 1280])
aplha shape:  torch.Size([2, 196])
beta_t shape:  torch.Size([2, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([2, 256])
context_vector shape:  torch.Size([2, 1280])
concat_input shape:  torch.Size([2, 1536])
h (after rnn) shape:  torch.Size([2, 1024])
c (after rnn) shape:  torch.Size([2, 1024])
rnn run
step "0" done!
##################################################
bat

In [26]:
# Check encoder with 3 images: CPU
encoder_size = 1280
decoder_size = 1024
attention_size = 512
embedding_size = 256
vocab_size = 10000

imgs = torch.randn(3, 3, 512, 512)

encoder = EncoderCNN(weight_file='./mobilenet_v2.pth.tar')
decoder = DecoderAttentionRNN(encoder_size, decoder_size, attention_size, embedding_size, vocab_size)

encoder_out = encoder(imgs)
encoded_captions = torch.randint(1, 9999, size=(3, 15)).long()
caption_lengths = torch.randint(1, 15, size=(3, 1))

print(encoder_out.shape)
print(encoded_captions)
print(caption_lengths)

decoder_out = decoder(encoder_out, encoded_captions, caption_lengths)

torch.Size([3, 14, 14, 1280])
tensor([[ 939, 1561, 1254, 4585, 2522, 9261, 8944, 1827, 1502,  218, 1279, 8239,
          290, 4980, 5505],
        [1994, 5959, 3623, 9418, 8750, 7638, 1805, 5160, 5294, 6303, 7210, 9020,
         9034, 8980, 6152],
        [3993, 5221, 5264, 5449, 5647, 5083, 1632, 3724, 9803, 2194,  367, 3429,
         4531, 7973, 2079]])
tensor([[ 8.],
        [ 2.],
        [10.]])
batch_size_t:  3
encoder_out[:batch_size_t] shape:  torch.Size([3, 196, 1280])
h[:batch_size_t] shape:  torch.Size([3, 1024])
attn_weighted_encoding shape:  torch.Size([3, 1280])
aplha shape:  torch.Size([3, 196])
beta_t shape:  torch.Size([3, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([3, 256])
context_vector shape:  torch.Size([3, 1280])
concat_input shape:  torch.Size([3, 1536])
h (after rnn) shape:  torch.Size([3, 1024])
c (after rnn) shape:  torch.Size([3, 1024])
rnn run
step "0" done!
##################################################
batch_size_t:  2
encoder_out[:batch_s

In [27]:
params = []
for k, v in dict(decoder.named_parameters()).items():
    if v.requires_grad:
        print(k)

fc.weight
attention.fc_attn.weight
attention.encoder_attn.weight
init_h.weight
init_c.bias
rnn.weight_hh
f_beta.weight
fc.bias
rnn.bias_hh
init_h.bias
attention.encoder_attn.bias
attention.decoder_attn.bias
f_beta.bias
embedding.weight
attention.fc_attn.bias
rnn.bias_ih
rnn.weight_ih
init_c.weight
attention.decoder_attn.weight


### Decoder Scratch

In [28]:
enc_size = 1280
attn_size = 512
dec_size = 1024
emb_size = 300
voc_size = 1000

In [29]:
attn = AttentionMechanism(enc_size, dec_size, attn_size)
embd = nn.Embedding(voc_size, emb_size)
drpt = nn.Dropout(0.5)
rnn = nn.LSTMCell(emb_size + enc_size, dec_size)

$$\begin{array}{ll}
        i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
        f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
        g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\
        o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
        c' = f * c + i * g \\
        h' = o \tanh(c') \\
        \end{array}$$

In [30]:
# Embedding test
emb = nn.Embedding(4, 5) 
inp = torch.LongTensor([[1, 0, 3], [3, 2, 0]])
emb(inp)

tensor([[[-1.5058,  0.8413, -0.3112,  0.5813, -0.1170],
         [ 1.8942,  1.4617,  1.0215, -1.0060, -0.4453],
         [-0.2335, -0.9802, -0.6816,  0.7181,  0.4305]],

        [[-0.2335, -0.9802, -0.6816,  0.7181,  0.4305],
         [-1.8149, -0.3531, -0.9202, -0.3187,  0.1318],
         [ 1.8942,  1.4617,  1.0215, -1.0060, -0.4453]]],
       grad_fn=<EmbeddingBackward>)

In [31]:
# Initial hidden state initialized by encoder output
init_h = nn.Linear(enc_size, dec_size)

In [32]:
# Initial cell state of LSTMCell
init_c = nn.Linear(enc_size, dec_size)

In [33]:
# Gating scalar beta
f_beta = nn.Linear(dec_size, 1) # enc_size)
sig = nn.Sigmoid()

In [34]:
# For predicting words from vocab
fc = nn.Linear(dec_size, voc_size)  

In [35]:
# Initialize LSTM Cell's hidden and cell state with transformed encoded feature map
print(enc_out.shape)

mean_enc_out = torch.mean(enc_out, dim=1)
print(mean_enc_out.shape)

torch.Size([3, 196, 1280])
torch.Size([3, 1280])


In [36]:
h = init_h(mean_enc_out)
print(h.shape)

c = init_c(mean_enc_out)
print(c.shape)

torch.Size([3, 1024])
torch.Size([3, 1024])


In [37]:
# Check encoder with 3 images
encoder = EncoderCNN(weight_file='./mobilenet_v2.pth.tar')
imgs = torch.randn(3, 3, 512, 512)
feats = encoder(imgs)
print(feats.shape)

torch.Size([3, 14, 14, 1280])


In [38]:
# Decoder forward propagation
bs = feats.size(0) 
enc_size = feats.size(-1)

# Flatten feature representation
print(feats.shape)
enc_out = feats.view(bs, -1, enc_size)
print(enc_out.shape)

num_encoded_pixels = enc_out.size(1)
print(num_encoded_pixels)  # 14 x 14

torch.Size([3, 14, 14, 1280])
torch.Size([3, 196, 1280])
196


In [39]:
# 3 captions of max length 15 to test (3 because batch size is 3)
enc_cap = torch.randint(1, 1000, size=(3, 15)).long()
print(enc_cap)
print(enc_cap.shape)

tensor([[292, 331, 747, 483, 908, 727,  31, 691,  43, 796,  68, 991, 308, 111,
         521],
        [487, 960,  93, 828, 994, 308, 147, 371, 749, 723, 503, 728, 440, 172,
         472],
        [770, 346, 148, 640, 161, 734, 413, 931, 419, 671, 317, 293, 661, 954,
         279]])
torch.Size([3, 15])


In [40]:
# Embeddings (input must be long tensor)
embds = embd(enc_cap)
print(embds.shape)

torch.Size([3, 15, 300])


In [41]:
# Caption lengths for 3 captions to test
cap_len = torch.randint(1, 15, size=(3, 1))
print(cap_len)
print(cap_len.shape)

tensor([[14.],
        [ 7.],
        [ 1.]])
torch.Size([3, 1])


In [42]:
# Sort caption lengths in descending order
cap_len, idx = torch.sort(cap_len.squeeze(1), dim=0, descending=True)
print(cap_len)
print(idx)

tensor([14.,  7.,  1.])
tensor([0, 1, 2])


In [43]:
# Sort encoded feature maps and captions as per caption lengths
enc_cap = enc_cap[idx]
enc_out = enc_out[idx]

In [44]:
ebs = embd(enc_cap.type(torch.LongTensor))
print(ebs.shape)

torch.Size([3, 15, 300])


In [45]:
# No decoding at <END> position...seq generation ends when <END> is generated
dec_len = (cap_len - 1).tolist()
print(dec_len)

[13.0, 6.0, 0.0]


In [46]:
# Create placeholder tensors for prediction scores and alphas
preds = torch.zeros(bs, int(max(dec_len)), voc_size)
print(preds.shape) # softmax over vocab size scores

alpha = torch.zeros(bs, int(max(dec_len)), num_encoded_pixels)
print(alpha.shape) # distribution over feature locations L

torch.Size([3, 13, 1000])
torch.Size([3, 13, 196])


In [47]:
max_dec_len = int(max(dec_len))
print(max_dec_len)

13


In [48]:
print(enc_out.shape)
print(enc_out[:2].shape)
print(h.shape)
print(h[:2].shape)

torch.Size([3, 196, 1280])
torch.Size([2, 196, 1280])
torch.Size([3, 1024])
torch.Size([2, 1024])


In [49]:
# Computations at time step 0
t = 0
bs_t = 3

print(enc_out[:bs_t].shape)
print(h[:bs_t].shape)
attn_wtd_enc, alp = attn(enc_out[:bs_t], h[:bs_t])
print(attn_wtd_enc.shape, alp.shape)

torch.Size([3, 196, 1280])
torch.Size([3, 1024])
torch.Size([3, 1280]) torch.Size([3, 196])


In [50]:
# Check
beta_f = nn.Linear(dec_size, 1)
bta = beta_f(h[:bs_t])
print(bta.shape)
bs = sig(bta) # batch_size_t x 1
print(bs.shape)

print('awe: ', attn_wtd_enc.shape)
(bs * attn_wtd_enc).shape

torch.Size([3, 1])
torch.Size([3, 1])
awe:  torch.Size([3, 1280])


torch.Size([3, 1280])

In [51]:
# Linear layer: [M, 512] -> [M, 1280]
bb = f_beta(h[:bs_t])
print(bb.shape)

# Map values in range [0, 1], why not 0 or 1 based on value?
ss = sig(bb)
print(ss.shape)

torch.Size([3, 1])
torch.Size([3, 1])


In [52]:
# Compute attention weighted encoding: element wise multiplication
print(attn_wtd_enc.shape)
attn_wtd_enc = ss * attn_wtd_enc
print(attn_wtd_enc.shape)

torch.Size([3, 1280])
torch.Size([3, 1280])


In [53]:
# Decode using RNN
print(embds.shape)  # M x 300, each row is word t in caption for M images
print(embds[:bs_t].shape)
print(embds[:bs_t, t, :].shape)

torch.Size([3, 15, 300])
torch.Size([3, 15, 300])
torch.Size([3, 300])


In [54]:
print('embed size at t: ', embds[:bs_t, t, :].shape)
print('attn wtd enc size at t: ', attn_wtd_enc.shape)

# Concatenate the embeddings and attention weighted encoding
rnn_input = torch.cat([embds[:bs_t, t, :], attn_wtd_enc], dim=1)
print(rnn_input.shape)

# LSTM input states from time step t-1
prev_states = (h, c)

# RNN decode step
h, c = rnn(rnn_input, prev_states)

embed size at t:  torch.Size([3, 300])
attn wtd enc size at t:  torch.Size([3, 1280])
torch.Size([3, 1580])


In [55]:
# Predictions
print(h.shape)

# Linear layer: [m, 512] --> [m, vocab_size]
pred = fc(drpt(h))
print(pred.shape)

# Populate predictions tensor (pred tensor for each batch)
print(preds.shape)  # [m, max_decode_len, vocab_size]
preds[:bs_t, t, :] = pred
print(torch.min(preds).item(), torch.max(preds).item())

torch.Size([3, 1024])
torch.Size([3, 1000])
torch.Size([3, 13, 1000])
-0.3142455220222473 0.3750569224357605


In [56]:
# Populate alpha
print(alpha.shape)  # [m, max_decode_len, enc_num_pix]
alpha[:bs_t, t, :] = alp
print(torch.min(alpha).item(), torch.max(alpha).item())

torch.Size([3, 13, 196])
0.0 0.007868754677474499


##### Extract subtensors according to time step

In [57]:
# At time step t how many decoding lengths are greater than t
# then extract those many from flattened encoder output and hidden state
# IDEA: Since in a batch of size 3, caption lengths (and decode lengths) 
# are different, so at each time step up to max decode length in a batch,
# we need to apply attention only for those images in batch whose decode
# length is greater than current time step


d_len = [8, 5, 2]
print(d_len)
print('---' * 10)

t = 0
print(sum([l > t for l in d_len]), [l > t for l in d_len])
print(enc_out[:sum([l > t for l in d_len])].shape)
print(h[:sum([l > t for l in d_len])].shape)
print('---' * 10)

t = 1
print(sum([l > t for l in d_len]), [l > t for l in d_len])
print(enc_out[:sum([l > t for l in d_len])].shape)
print(h[:sum([l > t for l in d_len])].shape)
print('---' * 10)

t = 2
print(sum([l > t for l in d_len]), [l > t for l in d_len])
print(enc_out[:sum([l > t for l in d_len])].shape)
print(h[:sum([l > t for l in d_len])].shape)

t = 3
print(sum([l > t for l in d_len]), [l > t for l in d_len])
print(enc_out[:sum([l > t for l in d_len])].shape)
print(h[:sum([l > t for l in d_len])].shape)

t = 4
print(sum([l > t for l in d_len]), [l > t for l in d_len])
print(enc_out[:sum([l > t for l in d_len])].shape)
print(h[:sum([l > t for l in d_len])].shape)

[8, 5, 2]
------------------------------
3 [True, True, True]
torch.Size([3, 196, 1280])
torch.Size([3, 1024])
------------------------------
3 [True, True, True]
torch.Size([3, 196, 1280])
torch.Size([3, 1024])
------------------------------
2 [True, True, False]
torch.Size([2, 196, 1280])
torch.Size([2, 1024])
2 [True, True, False]
torch.Size([2, 196, 1280])
torch.Size([2, 1024])
2 [True, True, False]
torch.Size([2, 196, 1280])
torch.Size([2, 1024])


In [58]:
print(d_len)
for t in range(max(d_len)):
    bst = sum([l > t for l in d_len])
    print('time step: {} and extracted: {}'.format(t, bst))

[8, 5, 2]
time step: 0 and extracted: 3
time step: 1 and extracted: 3
time step: 2 and extracted: 2
time step: 3 and extracted: 2
time step: 4 and extracted: 2
time step: 5 and extracted: 1
time step: 6 and extracted: 1
time step: 7 and extracted: 1


In [59]:
x = torch.randint(1, 9, size=(3, 2, 5))
print(x.shape)
print('---' * 10)
print(x)
print('---' * 10)
print(x[1:])
print('---' * 10)
print(x[:1])

torch.Size([3, 2, 5])
------------------------------
tensor([[[6., 6., 1., 4., 4.],
         [8., 3., 3., 7., 4.]],

        [[5., 7., 5., 1., 8.],
         [2., 1., 4., 3., 8.]],

        [[8., 7., 4., 2., 8.],
         [3., 6., 1., 1., 2.]]])
------------------------------
tensor([[[5., 7., 5., 1., 8.],
         [2., 1., 4., 3., 8.]],

        [[8., 7., 4., 2., 8.],
         [3., 6., 1., 1., 2.]]])
------------------------------
tensor([[[6., 6., 1., 4., 4.],
         [8., 3., 3., 7., 4.]]])


In [60]:
x = torch.randint(1, 9, size=(3, 4))
print(x.shape)
print('---' * 10)
print(x)
print('---' * 10)
print(x[1:])
print('---' * 10)
print(x[:1])

torch.Size([3, 4])
------------------------------
tensor([[3., 5., 4., 2.],
        [1., 3., 3., 7.],
        [4., 8., 3., 2.]])
------------------------------
tensor([[1., 3., 3., 7.],
        [4., 8., 3., 2.]])
------------------------------
tensor([[3., 5., 4., 2.]])


##### Extract Embeddings at time step `t`

In [61]:
x = torch.randint(1, 9, size=(3, 2, 5))
print(x)

print(x[:3, 0, :])

tensor([[[7., 6., 7., 1., 3.],
         [3., 1., 3., 6., 4.]],

        [[6., 1., 3., 6., 1.],
         [5., 6., 7., 6., 3.]],

        [[1., 4., 4., 6., 4.],
         [7., 1., 3., 2., 7.]]])
tensor([[7., 6., 7., 1., 3.],
        [6., 1., 3., 6., 1.],
        [1., 4., 4., 6., 4.]])


In [62]:
for t in range(max_dec_len):
    bs_t = sum([l > t for l in dec_len])
    print(embds[:bs_t, t, :].shape)

torch.Size([2, 300])
torch.Size([2, 300])
torch.Size([2, 300])
torch.Size([2, 300])
torch.Size([2, 300])
torch.Size([2, 300])
torch.Size([1, 300])
torch.Size([1, 300])
torch.Size([1, 300])
torch.Size([1, 300])
torch.Size([1, 300])
torch.Size([1, 300])
torch.Size([1, 300])


In [63]:
import os
import json
import h5py
import numpy as np

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class COCODataset(Dataset):
    """
    COCO Dataset to be used in DataLoader for creating batches 
    during training.
    """
    def __init__(self, config, split='TRAIN', transform=None):
        self.config = config
        self.split = split
        self.transform = transform
        
        # Open files where images are stored in HDF5 data fromat, captions & their lengths
        if self.split == 'TRAIN':
            self.hdf5 = h5py.File(name=self.config.train_hdf5, mode='r')
            self.captions = self.read_json(self.config.train_captions)
        else:
            self.hdf5 = h5py.File(name=self.config.val_hdf5, mode='r')
            self.captions = self.read_json(self.config.val_captions)
            
        # Get image data
        self.images = self.hdf5['images']
                    
    def read_json(self, json_path):
        with open(json_path, 'r') as j:
            json_data = json.load(j)
        return json_data
        
    def __len__(self):
        return len(self.captions)
    
    def __getitem__(self, idx):
        img = torch.FloatTensor(self.images[idx])
        if self.transform is not None:
            img = self.transform(img)
         
        # There are 5 captions so randomly sample 1 caption
        cap_idx = np.random.randint(0, high=5)
        caption = torch.LongTensor(self.captions[idx][0][cap_idx])
        length = torch.LongTensor([self.captions[idx][1][cap_idx]])
        
        if self.split == 'TRAIN':
            return img, caption, length
        else:
            captions = torch.LongTensor(self.captions[idx][0])
            return img, caption, length, captions

class DataConfig(object):
    def __init__(self):
        # Word to index mapping
        self.word2idx_file = './WORD2IDX_COCO.json'
        
        # Training data 
        self.train_hdf5 = './TRAIN_IMAGES_COCO.hdf5'
        self.train_captions = './TRAIN_CAPTIONS_COCO.json'
        
        # Validation data
        self.val_hdf5 = './VAL_IMAGES_COCO.hdf5'
        self.val_captions = './VAL_CAPTIONS_COCO.json'

In [64]:
# Validation 
config = DataConfig()
coco = COCODataset(config, split='VAL')
loader = DataLoader(coco, batch_size=4, shuffle=True)

for i, (j, k, l, m) in enumerate(loader):
    print(j.shape)
    print(k.shape)
    print(l.shape)
    print(m.shape)
    if i == 0:
        break
        
# Train 
config = DataConfig()
coco = COCODataset(config)
loader = DataLoader(coco, batch_size=4, shuffle=True)

for i, (j, k, l) in enumerate(loader):
    print(j.shape)
    print(k.shape)
    print(l.shape)
    if i == 0:
        break

torch.Size([4, 3, 224, 224])
torch.Size([4, 18])
torch.Size([4, 1])
torch.Size([4, 5, 18])
torch.Size([4, 3, 224, 224])
torch.Size([4, 18])
torch.Size([4, 1])


In [65]:
# Train
encoder_size = 1280
decoder_size = 1024
attention_size = 512
embedding_size = 256
vocab_size = 10000

encoder = EncoderCNN(weight_file='./mobilenet_v2.pth.tar')
decoder = DecoderAttentionRNN(encoder_size, decoder_size, attention_size, embedding_size, vocab_size)

encoder = encoder.to('cuda:1')
decoder = decoder.to('cuda:1')

config = DataConfig()
coco = COCODataset(config)
loader = DataLoader(coco, batch_size=32, shuffle=True)

for i, (imgs, caps, lengths) in enumerate(loader):
    imgs = imgs.to('cuda:1')
    caps = caps.to('cuda:1')
    lengths = lengths.to('cuda:1')
    encoder_out = encoder(imgs)
    decoder_out = decoder(encoder_out, caps, lengths)
    if i == 0:
        print(i)
        break
        
pred_scores, sorted_captions, decode_lengths, alphas, sorted_idx = decoder_out

batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  torch.Size([32, 1024])
c (after rnn) shape:  torch.Size([32, 1024])
rnn run
step "0" done!
##################################################
batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  to

In [66]:
print('scores: ', pred_scores.shape)
print('sorted captions: ', sorted_captions.shape)
print('decode lengths: ', decode_lengths)
print('alphas: ', alphas.shape)
print('sorted idx: ', sorted_idx.shape)

scores:  torch.Size([32, 17, 10000])
sorted captions:  torch.Size([32, 18])
decode lengths:  [17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8]
alphas:  torch.Size([32, 17, 196])
sorted idx:  torch.Size([32])


In [67]:
from torch.nn.utils.rnn import pack_padded_sequence

# scores shape: [M, max(dl), V]
print(max(decode_lengths))
print('pred_scores shape: ', pred_scores.shape)
scores = pack_padded_sequence(pred_scores, decode_lengths, batch_first=True)
print('scores batch sizes: ', scores.batch_sizes)
print('scores data shape: ', scores.data.shape)  # size: [M, V] or [M, C-1]

17
pred_scores shape:  torch.Size([32, 17, 10000])
scores batch sizes:  tensor([32, 32, 32, 32, 32, 32, 32, 32, 31, 20, 17, 15,  5,  4,  3,  2,  1],
       grad_fn=<PackPaddedBackward>)
scores data shape:  torch.Size([354, 10000])


In [68]:
# Select all words after <START> till <END>
targets_ = sorted_captions[:, 1:]
print(targets_.shape)  # shape: [M, 17], (16 words + <START> + <END>) = 18

torch.Size([32, 17])


In [69]:
targets = pack_padded_sequence(targets_, decode_lengths, batch_first=True)
print('targets batch sizes: ', targets.batch_sizes)
print('targets batch sizes sum: ', torch.sum(targets.batch_sizes))
print('targets data shape: ', targets.data.shape) # size: [M]

targets batch sizes:  tensor([32, 32, 32, 32, 32, 32, 32, 32, 31, 20, 17, 15,  5,  4,  3,  2,  1])
targets batch sizes sum:  tensor(354)
targets data shape:  torch.Size([354])


**`CrossEntropyLoss`** combines **`nn.LogSoftmax`** and **`nn.NLLLoss`**
- **m** mini batch size
- **C** classes
- `input` has to be a Tensor of size: $[m, C]$ or $[m, C, d_1, d_2, ..., d_k]$ with $k \geq 2$
- It expects a class index $(0 \text{ to } C-1)$ as `target` for each value of a 1D tensor of size `[m]`

$$\text{loss}(x, class) = -\log\left(\frac{\exp(x[class])}{\sum_j \exp(x[j])}\right)
                       = -x[class] + \log\left(\sum_j \exp(x[j])\right)$$

In [70]:
# Compute loss
x_entropy = nn.CrossEntropyLoss()
loss = x_entropy(scores.data.to('cuda:1'), targets.data.to('cuda:1'))
print(loss)

tensor(9.2041, device='cuda:1', grad_fn=<NllLossBackward>)


In [71]:
alpha_c = 1.0
loss += (alpha_c * ((1.0 - alphas.sum(dim=1))**2).mean()).to('cuda:1')
print(loss)

tensor(10.0946, device='cuda:1', grad_fn=<ThAddBackward>)


### Accuracy

In [72]:
bs = targets.data.numel()
print(bs)

# Get indices of the k largest elements 
_, topk_idx = scores.data.topk(5, dim=1)
print(topk_idx.shape)

# Compute element wise equality
correct = torch.eq(topk_idx.to('cpu'), targets.data.view(-1, 1).to('cpu'))

# Total correct
tot_correct = torch.sum(correct)

acc = tot_correct.float().item()/bs
print(acc)

354
torch.Size([354, 5])
0.0


In [73]:
smx = nn.Softmax(dim=0)
a = torch.arange(start=-2.0, end=2.0, step=0.25)
print(a)

s = smx(a)
print(s)

print(a.topk(3))
print(s.topk(3))  # Softmaxed result the same indices so no need to do softmax!

tensor([-2.0000, -1.7500, -1.5000, -1.2500, -1.0000, -0.7500, -0.5000, -0.2500,
         0.0000,  0.2500,  0.5000,  0.7500,  1.0000,  1.2500,  1.5000,  1.7500])
tensor([0.0053, 0.0068, 0.0087, 0.0112, 0.0144, 0.0185, 0.0237, 0.0305, 0.0392,
        0.0503, 0.0646, 0.0829, 0.1064, 0.1367, 0.1755, 0.2253])
(tensor([1.7500, 1.5000, 1.2500]), tensor([15, 14, 13]))
(tensor([0.2253, 0.1755, 0.1367]), tensor([15, 14, 13]))


# BLEU Score

In [74]:
from nltk.translate.bleu_score import corpus_bleu

# Val
encoder_size = 1280
decoder_size = 1024
attention_size = 512
embedding_size = 256
vocab_size = 10000

encoder = EncoderCNN(weight_file='./mobilenet_v2.pth.tar')
decoder = DecoderAttentionRNN(encoder_size, decoder_size, attention_size, embedding_size, vocab_size)

config = DataConfig()
coco = COCODataset(config, split='VAL')
loader = DataLoader(coco, batch_size=4, shuffle=True)

for i, (imgs, caps, lengths, captions) in enumerate(loader):
    encoder_out = encoder(imgs)
    decoder_out = decoder(encoder_out, caps, lengths)
    if i == 0:
        print(i)
        break
        
pred_scores, sorted_captions, decode_lengths, alphas, sorted_idx = decoder_out

batch_size_t:  4
encoder_out[:batch_size_t] shape:  torch.Size([4, 196, 1280])
h[:batch_size_t] shape:  torch.Size([4, 1024])
attn_weighted_encoding shape:  torch.Size([4, 1280])
aplha shape:  torch.Size([4, 196])
beta_t shape:  torch.Size([4, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([4, 256])
context_vector shape:  torch.Size([4, 1280])
concat_input shape:  torch.Size([4, 1536])
h (after rnn) shape:  torch.Size([4, 1024])
c (after rnn) shape:  torch.Size([4, 1024])
rnn run
step "0" done!
##################################################
batch_size_t:  4
encoder_out[:batch_size_t] shape:  torch.Size([4, 196, 1280])
h[:batch_size_t] shape:  torch.Size([4, 1024])
attn_weighted_encoding shape:  torch.Size([4, 1280])
aplha shape:  torch.Size([4, 196])
beta_t shape:  torch.Size([4, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([4, 256])
context_vector shape:  torch.Size([4, 1280])
concat_input shape:  torch.Size([4, 1536])
h (after rnn) shape:  torch.Size([4, 1024])


In [75]:
# Read word to index
with open('./data/WORD2IDX_COCO_5_WordCountThresh.json', 'r') as j:
    w2i = json.load(j)

In [76]:
# Prepare y_true for BLEU
references = []

# Sort captions based on sorted indices from decoder
captions = captions[sorted_idx]
remove_idx = [w2i['<START>'], w2i['<PAD>']]

temp_references = []
for c in range(captions.size(0)):
    img_caps = captions[c].tolist()
    # Remove indices corresponding to <START> and <PAD>
    img_caps = [[ix for ix in cap if ix not in remove_idx] for cap in img_caps]
    temp_references.append(img_caps)
    
len(temp_references)
references.extend(temp_references)
print(len(references))

4


In [77]:
scores_clone = pred_scores.clone()
print(scores_clone.shape)

hypotheses = []

# Get indixes of words with max score
_, preds = torch.max(scores_clone, dim=2)
print(preds.shape)

preds = preds.tolist()
print(len(preds))

temp_hypotheses = []
for i, pred in enumerate(preds):
    img_hyp = preds[i][:decode_lengths[i]]
    temp_hypotheses.append(img_hyp)
    
print(len(temp_hypotheses))
hypotheses.extend(temp_hypotheses)
print(len(hypotheses))

torch.Size([4, 13, 10000])
torch.Size([4, 13])
4
4
4


In [78]:
# Compute BLEU4 score
bleu = corpus_bleu(references, hypotheses)
print(bleu)

0


In [79]:
class AverageMeter(object):
    """
    Computes and stores the average and current value of some metric.
    
    Reference: https://github.com/pytorch/examples/blob/master/imagenet/main.py
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

L = AverageMeter()
l = []
for i in range(10):
    t = torch.randint(0, 9, size=(1, 4)).tolist()
    l.append(t[0])
    L.update(sum(t[0]), len(t[0]))
    
print(L.val, L.sum, L.avg, L.count)

s = 0
for i in l:
    s += sum(i)
    
print(s/10.0)

22.0 672.0 16.8 40
16.8


# Cross Entropy Loss using Mask

https://gist.github.com/williamFalcon/f27c7b90e34b4ba88ced042d9ef33edd

In [80]:
from torch.nn import functional as F

In [81]:
# Softmax activation
print(pred_scores.shape) # size: [M, dl, V]

torch.Size([4, 13, 10000])


In [82]:
log_softmax_scores = F.log_softmax(pred_scores, dim=2)
print(log_softmax_scores.shape)

torch.Size([4, 13, 10000])


In [83]:
sc = sorted_captions.view(-1)
lss = log_softmax_scores.view(-1, log_softmax_scores.size(2))

In [84]:
mask = (sc < 9490).float()
mask

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [85]:
num_tokens = int(torch.sum(mask).item())
print(num_tokens)

72


In [86]:
print('scores: ', pred_scores.shape)
print('sorted captions: ', sorted_captions.shape)
print('decode lengths: ', decode_lengths)
print('alphas: ', alphas.shape)
print('sorted idx: ', sorted_idx.shape)

scores:  torch.Size([4, 13, 10000])
sorted captions:  torch.Size([4, 18])
decode lengths:  [13, 12, 11, 10]
alphas:  torch.Size([4, 13, 196])
sorted idx:  torch.Size([4])


In [87]:
print(sorted_captions.shape)
y = sorted_captions[:, 1:].contiguous().view(-1)
print('y shape: ', y.shape)

y_hat = log_softmax_scores.view(-1, log_softmax_scores.size(2))
print('y_hat shape: ', y_hat.shape)

mask = (y < 9490).float()
print('mask shape: ', mask.shape)

num_tokens = int(torch.sum(mask).item())
print('num_tokens: ', num_tokens)

torch.Size([4, 18])
y shape:  torch.Size([68])
y_hat shape:  torch.Size([52, 10000])
mask shape:  torch.Size([68])
num_tokens:  68


In [88]:
s = torch.randint(0, 9, size=(4, 5, 8))
print(s.shape)
print(s[0, :, :])

torch.Size([4, 5, 8])
tensor([[5., 6., 5., 7., 3., 0., 1., 1.],
        [7., 0., 5., 8., 5., 2., 1., 1.],
        [1., 5., 8., 8., 0., 1., 2., 3.],
        [1., 2., 1., 4., 2., 8., 1., 2.],
        [8., 7., 2., 2., 6., 5., 7., 2.]])


In [89]:
smx = F.softmax(s, dim=2)
print(smx.shape)
print(smx[0, :, :])
print(smx)

torch.Size([4, 5, 8])
tensor([[0.0814, 0.2212, 0.0814, 0.6014, 0.0110, 0.0005, 0.0015, 0.0015],
        [0.2499, 0.0002, 0.0338, 0.6793, 0.0338, 0.0017, 0.0006, 0.0006],
        [0.0004, 0.0242, 0.4852, 0.4852, 0.0002, 0.0004, 0.0012, 0.0033],
        [0.0009, 0.0024, 0.0009, 0.0178, 0.0024, 0.9723, 0.0009, 0.0024],
        [0.5186, 0.1908, 0.0013, 0.0013, 0.0702, 0.0258, 0.1908, 0.0013]])
tensor([[[0.0814, 0.2212, 0.0814, 0.6014, 0.0110, 0.0005, 0.0015, 0.0015],
         [0.2499, 0.0002, 0.0338, 0.6793, 0.0338, 0.0017, 0.0006, 0.0006],
         [0.0004, 0.0242, 0.4852, 0.4852, 0.0002, 0.0004, 0.0012, 0.0033],
         [0.0009, 0.0024, 0.0009, 0.0178, 0.0024, 0.9723, 0.0009, 0.0024],
         [0.5186, 0.1908, 0.0013, 0.0013, 0.0702, 0.0258, 0.1908, 0.0013]],

        [[0.4639, 0.0628, 0.0231, 0.1706, 0.0628, 0.0231, 0.0231, 0.1706],
         [0.0110, 0.6032, 0.2219, 0.0110, 0.0300, 0.0110, 0.0816, 0.0300],
         [0.0491, 0.0066, 0.3628, 0.1335, 0.0491, 0.0181, 0.3628, 0.0181],
     

# GPU Check

In [90]:
import torch.optim as optim

# Train
encoder_size = 1280
decoder_size = 1024
attention_size = 512
embedding_size = 256
vocab_size = 10000

encoder = EncoderCNN(weight_file='./mobilenet_v2.pth.tar')
decoder = DecoderAttentionRNN(encoder_size, decoder_size, attention_size, embedding_size, vocab_size)

encoder = encoder.to('cuda:1')
decoder = decoder.to('cuda:1')

config = DataConfig()
coco = COCODataset(config)
loader = DataLoader(coco, batch_size=32, shuffle=True)

def get_optimizer(net, opt):
    params = []
    for key, value in dict(net.named_parameters()).items():
        if value.requires_grad:
            params += [{'params': [value], 'lr': opt['lr']}]
    optimizer = optim.Adam(params=params, weight_decay=opt['weight_decay'])
    return optimizer

optimizer = get_optimizer(decoder, opt={'lr': 0.001, 'weight_decay': 0.5})

criterion = nn.CrossEntropyLoss()

for i, (imgs, caps, lengths) in enumerate(loader):
    imgs = imgs.to('cuda:1')
    caps = caps.to('cuda:1')
    lengths = lengths.to('cuda:1')
    encoder_out = encoder(imgs)
    pred_scores, sorted_captions, decode_lengths, alphas, sorted_idx = decoder(encoder_out, caps, lengths)
    
    # Select all words after <START> till <END>
    target_caps = sorted_captions[:, 1:]
            
    # Pack padded sequences. Before computing Cross Entropy Loss (Log Softmax and Negative Log
    # Likelihood Loss) we do not want to take into account padded items in the predicted scores
    scores, _ = pack_padded_sequence(pred_scores, decode_lengths, batch_first=True)
    targets, _ = pack_padded_sequence(target_caps, decode_lengths, batch_first=True)
    
    scores = scores.data.to('cuda:1')
    targets = targets.data.to('cuda:1')
    
    loss = criterion(scores, targets)
    
    loss += (1.0 * ((1.0 - alphas.sum(dim=1))**2).mean()).to('cuda:1')
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print('loss: {} at batch: {}'.format(loss, i))
    print('-----' * 10)
    
    if i == 5:
        break

batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  torch.Size([32, 1024])
c (after rnn) shape:  torch.Size([32, 1024])
rnn run
step "0" done!
##################################################
batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  to

batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  torch.Size([32, 1024])
c (after rnn) shape:  torch.Size([32, 1024])
rnn run
step "0" done!
##################################################
batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  to

batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  torch.Size([32, 1024])
c (after rnn) shape:  torch.Size([32, 1024])
rnn run
step "1" done!
##################################################
batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  to

batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  torch.Size([32, 1024])
c (after rnn) shape:  torch.Size([32, 1024])
rnn run
step "0" done!
##################################################
batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  to

batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  torch.Size([32, 1024])
c (after rnn) shape:  torch.Size([32, 1024])
rnn run
step "0" done!
##################################################
batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  to

batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  torch.Size([32, 1024])
c (after rnn) shape:  torch.Size([32, 1024])
rnn run
step "4" done!
##################################################
batch_size_t:  32
encoder_out[:batch_size_t] shape:  torch.Size([32, 196, 1280])
h[:batch_size_t] shape:  torch.Size([32, 1024])
attn_weighted_encoding shape:  torch.Size([32, 1280])
aplha shape:  torch.Size([32, 196])
beta_t shape:  torch.Size([32, 1])
embeddings[:batch_size_t, t, :] shape:  torch.Size([32, 256])
context_vector shape:  torch.Size([32, 1280])
concat_input shape:  torch.Size([32, 1536])
h (after rnn) shape:  to