In [1]:
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, bidirectional = True):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.bidirectional = bidirectional

        self.lstm = nn.LSTM(input_size, hidden_size, bidirectional = bidirectional)
  
    def forward(self, inputs, hidden):
        output, hidden = self.lstm(inputs, hidden)
        return output, hidden

    def init_hidden(self,bs=1,num_l=1):
        return (torch.zeros(num_l*(1 + int(self.bidirectional)), bs, self.hidden_size),
          torch.zeros(num_l*(1 + int(self.bidirectional)), bs, self.hidden_size))

In [219]:
class AttentionDecoder(nn.Module):
  
    def __init__(self, hidden_size, output_size, vocab_size, input_size=30):
        super(AttentionDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.attn = nn.Linear(hidden_size + input_size, 1)
        self.lstm = nn.LSTM(hidden_size + vocab_size, output_size) #if we are using embedding hidden_size should be added with embedding of vocab size
        self.final = nn.Linear(output_size, vocab_size)

    def init_hidden(self,bs=1,num_l=1):
        return (torch.zeros(num_l, bs, self.hidden_size),
          torch.zeros(num_l, bs, self.hidden_size))
  
    
    def forward(self, decoder_hidden, encoder_outputs, input):
    
        weights = []
        print (decoder_hidden[0].shape)
        print ("encoder_outputs ", encoder_outputs.shape)
        print (input.shape)
        weights = self.attn(torch.cat((decoder_hidden[0].repeat(input.shape[0],1,1), input), dim = 2))
        print (weights.shape)

        normalized_weights = F.softmax(weights.squeeze()).unsqueeze(0)
        
        print ("normalized_weights ", normalized_weights.shape)
        print (encoder_outputs.view(1, -1, self.hidden_size).shape)
        
        attn_applied = torch.bmm(normalized_weights.unsqueeze(1),
                                 encoder_outputs)
        print ("attn_applied ", attn_applied.shape)
        print("input[0].shape", input[0].shape)
        input_lstm = torch.cat((attn_applied[0], input[0]), dim = 1) #if we are using embedding, use embedding of input here instead
        print (input_lstm.shape)
        output, hidden = self.lstm(input_lstm.unsqueeze(0), decoder_hidden)

        output = self.final(output[0])

        return output, hidden, normalized_weights

In [220]:
bidirectional = True
c = Encoder(input_size=10, hidden_size=20, bidirectional=bidirectional)
enc_hidden = c.init_hidden(num_l=1)
a, b = c.forward(torch.randn(2,1,10), enc_hidden)
print("a.shape ", a.shape)
print("b[0].shape ", b[0].shape)
print("b[1].shape", b[1].shape)

x = AttentionDecoder(hidden_size=20 * (1 + bidirectional), output_size=25, vocab_size=30)
y, z, w = x.forward(x.init_hidden(), torch.cat((a,a)), torch.zeros(2, 1, 30)) #Assuming <SOS> to be all zeros
print("y.shape ", y.shape)

print("z[0].shape ", z[0].shape)
print("z[1].shape ", z[1].shape)
print("w ", w)

a.shape  torch.Size([2, 1, 40])
b[0].shape  torch.Size([2, 1, 20])
b[1].shape torch.Size([2, 1, 20])
torch.Size([1, 1, 40])
encoder_outputs  torch.Size([4, 1, 40])
torch.Size([2, 1, 30])
torch.Size([2, 1, 1])
normalized_weights  torch.Size([1, 2])
torch.Size([1, 4, 40])


  normalized_weights = F.softmax(weights.squeeze()).unsqueeze(0)


RuntimeError: Expected tensor to have size 1 at dimension 0, but got size 4 for argument #2 'batch2' (while checking arguments for bmm)

In [246]:
model = AttnDecoderRNN(10,10)
test = torch.rand(10, 10).type(torch.LongTensor)
hidden = torch.zeros(1, 1, 10)
encoder_outputs = torch.zeros(10, 10)
# input_length = input_tensor.size(0)
# target_length = target_tensor.size(0)
model(test, hidden, encoder_outputs)[0].shape

RuntimeError: size mismatch, m1: [1 x 1010], m2: [20 x 20] at /opt/conda/conda-bld/pytorch_1587428094786/work/aten/src/TH/generic/THTensorMath.cpp:41

In [309]:
t = torch.rand(2,3,2,2)
nor = torch.norm(t,dim=-1)
sort = torch.sort(nor, dim=-1)


In [310]:
print (t, t.shape)
print (nor, nor.shape)
print ("sort[1] ", sort[1], sort[1].shape)
print ("sort[0] ", sort[0], sort[0].shape)

# torch.cat( [nor[i][sort[1][i]] for i in range(2)]).reshape(nor.shape)
sort = torch.cat([torch.cat([t[i][j] for j in sort[1][i]]).reshape(3,2) for i in [0, 1]]).reshape(t.shape)   #(sort[1][0])


tensor([[[[0.7027, 0.2775],
          [0.8796, 0.2359]],

         [[0.8802, 0.0744],
          [0.6797, 0.5256]],

         [[0.9282, 0.2064],
          [0.5283, 0.5215]]],


        [[[0.5255, 0.5019],
          [0.1919, 0.9171]],

         [[0.7988, 0.0710],
          [0.3204, 0.0604]],

         [[0.9466, 0.7941],
          [0.5817, 0.6739]]]]) torch.Size([2, 3, 2, 2])
tensor([[[0.7555, 0.9107],
         [0.8833, 0.8592],
         [0.9509, 0.7423]],

        [[0.7267, 0.9370],
         [0.8020, 0.3260],
         [1.2356, 0.8902]]]) torch.Size([2, 3, 2])
sort[1]  tensor([[[0, 1],
         [1, 0],
         [1, 0]],

        [[0, 1],
         [1, 0],
         [1, 0]]]) torch.Size([2, 3, 2])
sort[0]  tensor([[[0.7555, 0.9107],
         [0.8592, 0.8833],
         [0.7423, 0.9509]],

        [[0.7267, 0.9370],
         [0.3260, 0.8020],
         [0.8902, 1.2356]]]) torch.Size([2, 3, 2])


RuntimeError: shape '[3, 2]' is invalid for input of size 24

In [271]:
tt = torch.rand(2,3,2)

In [314]:
tt_n = torch.norm(tt,dim=-1)
tt_f = tt_n.flatten()

sort = torch.sort(tt_f, dim=-1)
print (tt_n)
tt_f[sort[1]].reshape(tt_n.shape)


tensor([[0.9648, 0.8062, 0.3164],
        [0.4375, 0.7637, 0.5865]])


tensor([[0.3164, 0.4375, 0.5865],
        [0.7637, 0.8062, 0.9648]])

In [320]:
ra = torch.rand(2,2,10)
r = torch.zeros(2,2,20)



In [333]:
t = torch.rand(2,2)
print(t)
mask = torch.eye(2).long()
print (mask)
print(t[torch.tensor([1,0])])
print (t.permute(1,0))

tensor([[0.7028, 0.8551],
        [0.8610, 0.0162]])
tensor([[1, 0],
        [0, 1]])
tensor([[0.8610, 0.0162],
        [0.7028, 0.8551]])
tensor([[0.7028, 0.8610],
        [0.8551, 0.0162]])


In [334]:

class AdditiveAttention(nn.Module):
    # Implementing the attention module of Bahdanau et al. 2015 where
    # score(h_j, s_(i-1)) = v . tanh(W_1 h_j + W_2 s_(i-1))
    def __init__(self, encoder_hidden_state_dim, decoder_hidden_state_dim, internal_dim=None):
        super(AdditiveAttention, self).__init__()

        if internal_dim is None:
            internal_dim = int((encoder_hidden_state_dim + decoder_hidden_state_dim) / 2)

        self.w1 = nn.Linear(encoder_hidden_state_dim, internal_dim, bias=False)
        self.w2 = nn.Linear(decoder_hidden_state_dim, internal_dim, bias=False)
        self.v = nn.Linear(internal_dim, 1, bias=False)

    def score(self, encoder_state, decoder_state):
        # encoder_state is of shape (batch, enc_dim)
        # decoder_state is of shape (batch, dec_dim)
        # return value should be of shape (batch, 1)
        return self.v(torch.tanh(self.w1(encoder_state) + self.w2(decoder_state)))

    def forward(self, encoder_states, decoder_state):
        # encoder_states is of shape (batch, num_enc_states, enc_dim)
        # decoder_state is of shape (batch, dec_dim)
        score_vec = torch.cat([self.score(encoder_states[:, i], decoder_state) for i in range(encoder_states.shape[1])],
                              dim=1)
        # score_vec is of shape (batch, num_enc_states)

        attention_probs = torch.unsqueeze(F.softmax(score_vec, dim=1), dim=2)
        # attention_probs is of shape (batch, num_enc_states, 1)

        final_context_vec = torch.sum(attention_probs * encoder_states, dim=1)
        # final_context_vec is of shape (batch, enc_dim)

        return final_context_vec, attention_probs


class TemporallyBatchedAdditiveAttention(AdditiveAttention):
    # Implementing the attention module of Bahdanau et al. 2015 where
    # score(h_j, s_(i-1)) = v . tanh(W_1 h_j + W_2 s_(i-1))
    def __init__(self, encoder_hidden_state_dim, decoder_hidden_state_dim, internal_dim=None):
        super(TemporallyBatchedAdditiveAttention, self).__init__(encoder_hidden_state_dim,
                                                                 decoder_hidden_state_dim,
                                                                 internal_dim)

    def score(self, encoder_state, decoder_state):
        # encoder_state is of shape (batch, num_enc_states, max_time, enc_dim)
        # decoder_state is of shape (batch, max_time, dec_dim)
        # return value should be of shape (batch, num_enc_states, max_time, 1)
        return self.v(torch.tanh(self.w1(encoder_state) + torch.unsqueeze(self.w2(decoder_state), dim=1)))

    def forward(self, encoder_states, decoder_state):
        # encoder_states is of shape (batch, num_enc_states, max_time, enc_dim)
        # decoder_state is of shape (batch, max_time, dec_dim)
        score_vec = self.score(encoder_states, decoder_state)
        # score_vec is of shape (batch, num_enc_states, max_time, 1)

        attention_probs = F.softmax(score_vec, dim=1)
        # attention_probs is of shape (batch, num_enc_states, max_time, 1)

        final_context_vec = torch.sum(attention_probs * encoder_states, dim=1)
        # final_context_vec is of shape (batch, max_time, enc_dim)

        return final_context_vec, torch.squeeze(torch.transpose(attention_probs, 1, 2), dim=3)

In [341]:
att = TemporallyBatchedAdditiveAttention(encoder_hidden_state_dim=5, decoder_hidden_state_dim=20, internal_dim=10)
inp = torch.rand(3,8,2)
encoder_states = torch.rand(3,8,5)
decoder_state = torch.rand(3,8,20)
att(encoder_states, decoder_state)[0].shape

torch.Size([3, 8, 5])