## RNN

In [9]:
import torch
import torch.nn as nn  
# from torch import optim  #optimizer
import torch.nn.functional as F  #relu, softmax, etc.
# import csv
# import random
# import re
# import os
# import unicodedata
# import codecs
# import itertools

### Defining the Encoder

In [7]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over a padded batch of word-index sequences.

    Args:
        hidden_size: Number of features in the GRU hidden state. The GRU
            input size is set to the same value, so ``embedding`` must
            produce vectors with ``hidden_size`` features.
        embedding: Module that maps word indexes to embedding vectors.
        n_layer: Number of stacked GRU layers (stored as ``self.n_layers``).
        dropout: Dropout probability applied between GRU layers; ignored
            when there is only one layer.
    """

    def __init__(self, hidden_size, embedding, n_layer=1, dropout=0):
        super(EncoderRNN, self).__init__()
        # The signature spells the argument ``n_layer``; keep that spelling for
        # callers and expose it under the ``n_layers`` attribute name.
        self.n_layers = n_layer
        self.hidden_size = hidden_size
        self.embedding = embedding
        # Initialize GRU; the input_size and hidden_size params are both set to
        # 'hidden_size' because our input is a word embedding with number of
        # features == hidden_size. nn.GRU only applies dropout between stacked
        # layers, so it is forced to 0 for a single-layer GRU.
        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            self.n_layers,
            dropout=(0 if self.n_layers == 1 else dropout),
            bidirectional=True,
        )

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch and return summed bidirectional outputs.

        Args:
            input_seq: Batch of input sentences as word indexes;
                shape=(max_length, batch_size).
            input_lengths: Sentence lengths corresponding to each sentence in
                the batch (decreasing order, as pack_padded_sequence expects
                by default).
            hidden: Optional initial hidden state, of shape
                (n_layers x num_directions, batch_size, hidden_size).

        Returns:
            Tuple ``(outputs, hidden)``:
            outputs: output features h_t from the last GRU layer for each
                timestep, with the two directions summed;
                shape=(max_length, batch_size, hidden_size).
            hidden: hidden state for the last timestep;
                shape=(n_layers x num_directions, batch_size, hidden_size).
        """
        # Convert word indexes to embeddings.
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for the RNN module.
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU.
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding.
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs: the first hidden_size features are the
        # forward direction, the remaining hidden_size the backward direction.
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return output and final hidden state.
        return outputs, hidden

In [5]:
# Slicing demo: build a random (5, 4, 3) tensor and keep only the first two
# entries along its last dimension.
a = torch.randn((5, 4, 3))
a[..., :2]

tensor([[[-1.5500, -0.8938],
         [-0.1461,  0.6787],
         [ 0.2025,  0.8576],
         [ 0.1137, -1.6217]],

        [[-0.9726, -0.5376],
         [-0.0859,  0.2042],
         [ 1.0472, -1.5976],
         [ 0.4279,  0.1457]],

        [[-1.0674, -2.4240],
         [-0.3903,  0.5151],
         [ 0.3786,  0.7431],
         [-0.1456,  0.0630]],

        [[ 0.5778,  0.3427],
         [-0.8794, -0.5316],
         [-0.2524, -0.3511],
         [-1.5978,  0.1027]],

        [[-0.3802, -0.7170],
         [-0.2591, -0.3310],
         [-1.8272, -1.1497],
         [-0.3459, -0.5028]]])

### Understanding Pack Padded Sequence

In [None]:
# One batch of 6 sequences, each padded to a maximum of 7 steps (batch_first layout).
a = torch.randn(6, 7)
# True length of each sequence, in decreasing order as pack_padded_sequence
# requires by default (enforce_sorted=True).
lengths = [7, 7, 6, 5, 4, 2]
# Use the fully-qualified name: the visible import cell only imports torch,
# torch.nn and torch.nn.functional, so a bare pack_padded_sequence is undefined.
targets = torch.nn.utils.rnn.pack_padded_sequence(a, lengths, batch_first=True)

In [None]:
# targets is the PackedSequence built above; index 0 is its flattened data tensor.
targets[0].shape

In [None]:
# Sum of the per-sequence lengths, for comparison with the packed data shape shown above.
sum(lengths)

In [None]:
# Show the padded batch followed by the two fields of the packed result
# (index 0 and index 1 of `targets`), one per line.
print(a, targets[0], targets[1], sep="\n")

### Designing the Attention Model

In [10]:
# luong attention layer
class Attn(torch.nn.Module):
    """Dot-product attention over encoder outputs.

    ``method`` and ``hidden_size`` are stored on the instance; the score
    computed by ``forward`` is always the dot product.
    """

    def __init__(self, method, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.method = method

    def dot_score(self, hidden, encoder_output):
        """Multiply ``hidden`` and ``encoder_output`` element-wise and sum over dim 2."""
        product = hidden * encoder_output
        return product.sum(dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalized attention weights.

        Args:
            hidden: current decoder state, of shape (1, batch_size, hidden_size).
            encoder_outputs: encoder outputs, of shape
                (max_length, batch_size, hidden_size).

        Returns:
            Attention weights of shape (batch_size, 1, max_length); the
            weights sum to 1 along the last dimension.
        """
        # Attention energies, one per encoder timestep: (max_length, batch_size).
        scores = self.dot_score(hidden, encoder_outputs)
        # Swap to (batch_size, max_length) so softmax runs over timesteps.
        scores = scores.t()
        weights = F.softmax(scores, dim=1)
        # Add a middle dimension: (batch_size, 1, max_length).
        return weights.unsqueeze(1)

For the decoder, we will manually feed our batch one time step at a time. This means that our embedded word tensor and GRU output will both have shape (1, batch_size, hidden_size). The steps are: (1) get the embedding of the current input word; (2) forward it through the unidirectional GRU; (3) calculate attention weights from the current GRU output; (4) multiply the attention weights by the encoder outputs to get a new "weighted sum" context vector; (5) concatenate the weighted context vector and the GRU output; (6) predict the next word; and (7) return the output and final hidden state.

In [11]:
# Random scores drawn uniformly from [0, 1): 5 rows of 7 entries.
a = torch.rand((5, 7))
a

tensor([[0.8365, 0.5777, 0.8596, 0.7298, 0.7634, 0.4625, 0.6062],
        [0.7194, 0.8295, 0.3138, 0.4103, 0.7714, 0.0983, 0.7104],
        [0.5396, 0.9785, 0.5689, 0.8912, 0.4184, 0.8996, 0.2706],
        [0.0667, 0.4212, 0.5936, 0.2655, 0.8596, 0.4887, 0.0099],
        [0.9633, 0.7603, 0.5566, 0.5069, 0.6149, 0.8789, 0.5651]])

In [16]:
# Normalize each row of `a` with a softmax along dim 1.
b = torch.softmax(a, dim=1)
b

tensor([[0.1638, 0.1264, 0.1676, 0.1472, 0.1522, 0.1127, 0.1301],
        [0.1640, 0.1830, 0.1093, 0.1204, 0.1727, 0.0881, 0.1625],
        [0.1237, 0.1918, 0.1274, 0.1758, 0.1096, 0.1773, 0.0945],
        [0.0998, 0.1423, 0.1691, 0.1218, 0.2206, 0.1522, 0.0943],
        [0.1848, 0.1508, 0.1230, 0.1171, 0.1304, 0.1698, 0.1241]])

In [17]:
# Print the first softmax row, then display the sum of its entries.
print(b[0])
torch.sum(b[0])

tensor([0.1638, 0.1264, 0.1676, 0.1472, 0.1522, 0.1127, 0.1301])


tensor(1.0000)

### Designing the Decoder

In [None]:
class LuongAttnDecoderRNN(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    The decoder is run one time step (one batch of words) at a time.

    Args:
        attn_model: Attention method name, forwarded to ``Attn``.
        embedding: Module that maps word indexes to embedding vectors with
            ``hidden_size`` features.
        hidden_size: Number of features in the GRU input and hidden state.
        output_size: Number of entries in the predicted output distribution.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability for the embedding, and between GRU
            layers when ``n_layers > 1``.
    """

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        # Zero-argument super() avoids naming the class, which is what broke
        # the original (it referenced a name that was not defined).
        super().__init__()
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers.
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        # Projects the concatenated [GRU output ; context] vector
        # (2 * hidden_size features) back down to hidden_size.
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        # Maps the combined vector to one score per output entry.
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode one time step.

        Args:
            input_step: One time step (one word) of the input sequence batch;
                shape=(1, batch_size).
            last_hidden: Final hidden layer of the GRU;
                shape=(n_layers x num_directions, batch_size, hidden_size).
            encoder_outputs: Encoder model's output;
                shape=(max_length, batch_size, hidden_size).

        Returns:
            Tuple ``(output, hidden)``:
            output: softmax-normalized scores over the output entries;
                shape=(batch_size, output_size).
            hidden: final hidden state of the GRU;
                shape=(n_layers x num_directions, batch_size, hidden_size).
        """
        # Get embedding of current input word.
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU.
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output:
        # (batch_size, 1, max_length).
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights by encoder outputs to get the new
        # "weighted sum" context vector: (batch_size, 1, hidden_size).
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output.
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word.
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state.
        return output, hidden


# Backward-compatible alias for the original (misspelled) class name.
LeongAttnDecoderRNN = LuongAttnDecoderRNN
        

### Creating the Loss Function