# Test for Backend

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import random, math

In [2]:
# All tensors and the model in this notebook are placed on the CPU.
device = torch.device('cpu')

## 1) Loading Vocab

In [3]:
# Per-language vocabularies, indexed below by language code.
# NOTE(review): torch.load unpickles arbitrary Python objects, so only load this file from a
# trusted source. Newer PyTorch releases may require weights_only=False here — confirm.
vocab_transform = torch.load('vocab_transform.pth')

## 2) Model

### 2.1) Encoder

In [4]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder that consumes length-aware (packed) batches.

    Args:
        input_dim: number of rows in the source embedding table (vocabulary size).
        emb_dim: width of each token embedding.
        hid_dim: hidden width of each GRU direction, and of the summary state
            handed to the decoder.
        dropout: dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, bidirectional=True)
        self.fc = nn.Linear(hid_dim * 2, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len):
        """Encode a padded batch of source sequences.

        Args:
            src: token indices, [src len, batch size].
            src_len: unpadded length of every sequence, [batch size].

        Returns:
            outputs: every hidden state of both directions,
                [src len, batch size, hid dim * 2]; padded positions are zeros.
            hidden: tanh-projected concatenation of the last forward and last
                backward state, [batch size, hid dim].
        """
        token_vectors = self.embedding(src)
        embedded = self.dropout(token_vectors)
        #embedded = [src len, batch size, emb dim]

        #pack_padded_sequence wants the lengths on the cpu
        lengths_cpu = src_len.to('cpu')
        packed_in = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths_cpu, enforce_sorted=False
        )

        #final_states come from the last non-padded element of each sequence
        packed_out, final_states = self.rnn(packed_in)
        #final_states = [n layers * num directions, batch size, hid dim]

        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_out)
        #outputs = [src len, batch size, hid dim * 2]

        #states are stacked [forward_1, backward_1, ...], so the last two rows are
        #the final forward and final backward state of the top layer
        last_forward = final_states[-2]
        last_backward = final_states[-1]
        both_directions = torch.cat((last_forward, last_backward), dim=1)

        #squash the two directions into the decoder's initial hidden state
        hidden = torch.tanh(self.fc(both_directions))
        #hidden = [batch size, hid dim]

        return outputs, hidden

### 2.2) Attention

In [5]:
class Attention(nn.Module):
    """Additive attention over the encoder outputs.

    Scores are ``v(tanh(W(hidden) + U(encoder_outputs)))`` and are normalised
    with a softmax over the source positions.

    Args:
        hid_dim: decoder hidden width; encoder outputs are expected to be
            ``hid_dim * 2`` wide.
    """

    def __init__(self, hid_dim):
        super().__init__()

        self.v = nn.Linear(hid_dim, 1, bias=False)
        self.W = nn.Linear(hid_dim, hid_dim)      #projects the decoder state
        self.U = nn.Linear(hid_dim * 2, hid_dim)  #projects the encoder outputs

    def forward(self, hidden, encoder_outputs, mask):
        """Return attention weights of shape [batch size, src len].

        Args:
            hidden: decoder state, [batch size, hid dim].
            encoder_outputs: [src len, batch size, hid dim * 2].
            mask: boolean tensor, True at positions that must receive
                (effectively) zero weight.
        """
        n_positions = encoder_outputs.shape[0]

        #one copy of the decoder state per source position
        tiled_hidden = hidden.unsqueeze(1).repeat(1, n_positions, 1)
        #tiled_hidden = [batch size, src len, hid dim]

        enc_batch_first = encoder_outputs.permute(1, 0, 2)
        #enc_batch_first = [batch size, src len, hid dim * 2]

        decoder_term = self.W(tiled_hidden)
        encoder_term = self.U(enc_batch_first)
        energy = torch.tanh(decoder_term + encoder_term)
        #energy = [batch size, src len, hid dim]

        scores = self.v(energy).squeeze(2)
        #scores = [batch size, src len]

        #out-of-place fill: masked positions get a very negative score
        scores = scores.masked_fill(mask, -1e10)

        return F.softmax(scores, dim=1)

### 2.3) Decoder

In [6]:
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        output_dim: target vocabulary size (embedding rows and logit width).
        emb_dim: width of each target token embedding.
        hid_dim: decoder hidden width; encoder outputs are ``hid_dim * 2`` wide.
        dropout: dropout probability applied to the embedded input token.
        attention: module called as ``attention(hidden, encoder_outputs, mask)``
            that returns weights of shape [batch size, src len].
    """

    def __init__(self, output_dim, emb_dim, hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.gru = nn.GRU((hid_dim * 2) + emb_dim, hid_dim)
        self.fc = nn.Linear((hid_dim * 2) + hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs, mask):
        """Run one decoding step.

        Args:
            input: previous target tokens, [batch size].
            hidden: previous decoder state, [batch size, hid dim].
            encoder_outputs: [src len, batch size, hid dim * 2].
            mask: [batch size, src len], forwarded to the attention module.

        Returns:
            prediction: logits over the target vocabulary, [batch size, output dim].
            hidden: new decoder state, [batch size, hid dim].
            a: attention weights used for this step, [batch size, src len].
        """
        step_tokens = input.unsqueeze(0)
        #step_tokens = [1, batch size]

        embedded = self.dropout(self.embedding(step_tokens))
        #embedded = [1, batch size, emb dim]

        a = self.attention(hidden, encoder_outputs, mask).unsqueeze(1)
        #a = [batch size, 1, src len]

        enc_batch_first = encoder_outputs.permute(1, 0, 2)
        #enc_batch_first = [batch size, src len, hid dim * 2]

        #attention-weighted sum of the encoder states, moved back to time-major
        weighted = torch.bmm(a, enc_batch_first).permute(1, 0, 2)
        #weighted = [1, batch size, hid dim * 2]

        rnn_input = torch.cat((embedded, weighted), dim=2)
        #rnn_input = [1, batch size, (hid dim * 2) + emb dim]

        output, hidden = self.gru(rnn_input, hidden.unsqueeze(0))
        #one time step, one layer, one direction, so:
        #output = [1, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        #and the two must hold the same values
        assert (output == hidden).all()

        features = torch.cat(
            (output.squeeze(0), weighted.squeeze(0), embedded.squeeze(0)),
            dim=1,
        )
        prediction = self.fc(features)
        #prediction = [batch size, output dim]

        return prediction, hidden.squeeze(0), a.squeeze(1)

### 2.4) Putting them together (become Seq2Seq!)

In [7]:
class Seq2SeqPackedAttention(nn.Module):
    """Encoder/decoder wrapper that decodes greedily, optionally with teacher forcing.

    Args:
        encoder: module called as ``encoder(src, src_len)`` returning
            ``(encoder_outputs, hidden)``.
        decoder: module called as ``decoder(input, hidden, encoder_outputs, mask)``
            returning ``(prediction, hidden, attention)``; must expose ``output_dim``.
        src_pad_idx: index of the padding token in the source vocabulary.
        device: device on which the output buffers are allocated.
        sos_idx: index of the start-of-sequence token fed to the decoder at the
            first step (defaults to 2, the value previously hard-coded).
    """

    def __init__(self, encoder, decoder, src_pad_idx, device, sos_idx=2):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.device = device
        self.sos_idx = sos_idx

    def create_mask(self, src):
        """Return a [batch size, src len] boolean mask that is True at padding positions."""
        #permute so it's the same shape as the attention scores
        mask = (src == self.src_pad_idx).permute(1, 0)
        return mask

    def forward(self, src, src_len, max_trg_len, teacher_forcing_ratio=0.5, trg=None):
        """Decode ``max_trg_len`` steps for every sequence in the batch.

        Args:
            src: source token indices, [src len, batch size].
            src_len: unpadded source lengths, [batch size].
            max_trg_len: number of rows in the returned buffers; row 0 is never
                written because decoding starts at step 1.
            teacher_forcing_ratio: probability of feeding the reference token
                instead of the model's own prediction at each step. Only has an
                effect when ``trg`` is given.
            trg: optional reference tokens, [trg len, batch size]. When omitted
                the model always feeds back its own prediction.

        Returns:
            outputs: logits, [max_trg_len, batch size, output dim].
            attentions: attention weights, [max_trg_len, batch size, src len].
        """
        batch_size     = src.shape[1]
        trg_len        = max_trg_len
        trg_vocab_size = self.decoder.output_dim

        #buffers for the decoder logits and attention weights, allocated in place on the device
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)
        attentions = torch.zeros(trg_len, batch_size, src.shape[0], device=self.device)

        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src, src_len)

        #first input to the decoder is one <sos> token per sequence in the batch,
        #created next to the source tokens so it works for any batch size and device
        input_ = torch.full((batch_size,), self.sos_idx, dtype=torch.long, device=src.device)

        mask = self.create_mask(src)
        #mask = [batch size, src len]

        for t in range(1, trg_len):

            output, hidden, attention = self.decoder(input_, hidden, encoder_outputs, mask)
            #output    = [batch size, output dim]
            #hidden    = [batch size, hid dim]
            #attention = [batch size, src len]

            outputs[t] = output
            attentions[t] = attention

            #draw first so the random stream is consumed once per step, as before;
            #teacher forcing additionally needs a reference token for this step
            teacher_force = (
                random.random() < teacher_forcing_ratio
                and trg is not None
                and t < trg.shape[0]
            )

            #highest-scoring token of this step
            top1 = output.argmax(1)

            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input_ = trg[t] if teacher_force else top1

        return outputs, attentions

In [8]:
# Language codes; used as keys into vocab_transform, token_transform and text_transform.
SRC_LANGUAGE = 'hi'
TRG_LANGUAGE = 'en'

In [9]:
def initialize_weights(m):
    """Re-initialise every parameter reachable from module ``m`` in place.

    Parameters whose name contains 'weight' are drawn from N(0, 0.01^2);
    all remaining parameters are set to zero.
    """
    for param_name, param in m.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)

In [10]:
# Indices of the special tokens <pad>, <sos> and <eos>.
# NOTE(review): presumably these must match the order of the specials used when the
# vocabulary was built — confirm against the training notebook.
PAD_IDX, SOS_IDX, EOS_IDX = 1, 2, 3

In [11]:
# Hyper-parameters.
# NOTE(review): presumably these must equal the values used to train the checkpoint that is
# loaded further below, otherwise load_state_dict will reject the shapes — confirm.
input_dim   = len(vocab_transform[SRC_LANGUAGE])  # source vocabulary size
output_dim  = len(vocab_transform[TRG_LANGUAGE])  # target vocabulary size
emb_dim     = 256  # embedding width, used by both encoder and decoder
hid_dim     = 512  # GRU hidden width, used by encoder, decoder and attention
dropout     = 0.5
SRC_PAD_IDX = PAD_IDX

attn = Attention(hid_dim)
enc  = Encoder(input_dim,  emb_dim,  hid_dim, dropout)
dec  = Decoder(output_dim, emb_dim,  hid_dim, dropout, attn)

model = Seq2SeqPackedAttention(enc, dec, SRC_PAD_IDX, device).to(device)
# Random re-initialisation of every parameter; as the last expression of the cell it also
# displays the module tree.
model.apply(initialize_weights)

Seq2SeqPackedAttention(
  (encoder): Encoder(
    (embedding): Embedding(1768, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (v): Linear(in_features=512, out_features=1, bias=False)
      (W): Linear(in_features=512, out_features=512, bias=True)
      (U): Linear(in_features=1024, out_features=512, bias=True)
    )
    (embedding): Embedding(1863, 256)
    (gru): GRU(1280, 512)
    (fc): Linear(in_features=1792, out_features=1863, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

## 3) Inference

In [12]:
import stanza

In [13]:
# Maps a language code to its tokenizer callable; filled in by the cells below.
token_transform = {}

In [14]:
# Hindi stanza pipeline restricted to the 'tokenize' processor.
# NOTE(review): download_method=None presumably means no model download is attempted, so the
# Hindi tokenizer model must already be installed locally — confirm against the stanza docs.
hindi_tokenizer = stanza.Pipeline('hi', processors='tokenize', download_method=None)

def tokenizeHindiSent(text):
    """Tokenize Hindi ``text`` with the module-level ``hindi_tokenizer`` pipeline.

    Args:
        text: raw string to tokenize.

    Returns:
        A flat list with the ``text`` of every token of every sentence found in
        the input, in document order. An input without any sentence yields an
        empty list.
    """
    doc = hindi_tokenizer(text)

    # Collect across ALL sentences: keeping only the tokens of the last sentence would
    # silently drop the beginning of any input the pipeline splits in two.
    return [token.text for sentence in doc.sentences for token in sentence.tokens]

2023-03-05 22:52:38 INFO: Loading these models for language: hi (Hindi):
| Processor | Package |
-----------------------
| tokenize  | hdtb    |

2023-03-05 22:52:38 INFO: Use device: gpu
2023-03-05 22:52:38 INFO: Loading: tokenize
2023-03-05 22:52:43 INFO: Done loading processors!


In [15]:
from torchtext.data.utils import get_tokenizer

# English side: spaCy tokenizer obtained through torchtext.
# NOTE(review): presumably requires the 'en_core_web_md' spaCy model to be installed — confirm.
token_transform[TRG_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_md')
# Hindi side: the stanza-based helper defined above.
token_transform[SRC_LANGUAGE] = tokenizeHindiSent

In [16]:
def sequential_transforms(*transforms):
    """Chain ``transforms`` into one callable that applies them left to right.

    The returned function feeds its argument to the first transform, the
    result to the second, and so on; with no transforms it returns its
    argument unchanged.
    """
    def func(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result

    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids):
    """Wrap ``token_ids`` with the start and end markers as a 1-D integer tensor.

    Args:
        token_ids: sequence of integer vocabulary indices (may be empty).

    Returns:
        int64 tensor ``[SOS_IDX, *token_ids, EOS_IDX]``.
    """
    # Force an integer dtype: torch.tensor([]) would otherwise be float32, so an empty
    # token list would produce a tensor that cannot be used as embedding indices.
    body = torch.tensor(token_ids, dtype=torch.long)
    return torch.cat((torch.tensor([SOS_IDX]),
                      body,
                      torch.tensor([EOS_IDX])))

# Per-language pipeline that turns a raw string into a 1-D tensor of vocabulary indices
# wrapped in the start/end markers. Keys are the language codes.
text_transform = {}
for ln in [SRC_LANGUAGE, TRG_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], # 1) string -> list of tokens
                                               vocab_transform[ln], # 2) tokens -> vocabulary indices
                                               tensor_transform) # 3) add SOS/EOS and build the tensor

In [17]:
# (Hindi source, English reference) sentence pairs used for the two tests below.
sample  = ('अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें', 'Give your application an accessibility workout')
sample2 = ('निचले पटल के लिए डिफोल्ट प्लग-इन खाका', 'The default plugin layout for the bottom panel')

### Sample Test

In [18]:
# Tokenize and numericalize the Hindi sentence and wrap it in the SOS/EOS indices.
src_text = text_transform[SRC_LANGUAGE](sample[0]).to(device)
src_text

tensor([   2,  443,  137,    7,  264, 1178,    8, 1165,  451,    3])

In [19]:
# Encode the English reference the same way; below only its length is used, to decide how
# many decoding steps to run.
trg_text = text_transform[TRG_LANGUAGE](sample[1]).to(device)
trg_text

tensor([   2,  879,  111,  297,   42,  288, 1330,    3])

In [20]:
src_text = src_text.reshape(-1, 1)  #[src len] -> [src len, batch size], batch size is 1

In [21]:
trg_text = trg_text.reshape(-1, 1)  #[trg len] -> [trg len, batch size], batch size is 1

In [22]:
# Sanity check: both tensors should now be [len, 1].
src_text.shape, trg_text.shape

(torch.Size([10, 1]), torch.Size([8, 1]))

In [23]:
# Source length for the single-sentence batch, as an int64 tensor of shape [1]
# (this is the src_len argument of the model).
text_length = torch.tensor([src_text.size(0)]).to(dtype=torch.int64)

In [24]:
path = './models/Seq2SeqPackedAttention.pt' # trained weights for the model built above

# Replace the randomly initialised parameters with the trained ones, mapped onto the CPU.
# NOTE(review): torch.load is pickle-based — only load trusted checkpoints; on PyTorch
# versions that support it, weights_only=True is presumably sufficient for a state dict.
model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))

# Switch the model to evaluation mode before decoding.
model.eval()
with torch.no_grad():
    # Decode as many steps as the reference has tokens. The ratio must stay 0 here:
    # forward() is not given a target sequence to teacher-force from.
    output, attentions = model(src_text, text_length, trg_text.shape[0], 0) #turn off teacher forcing

In [25]:
output.shape #[trg len, batch size, trg vocab size]

torch.Size([8, 1, 1863])

In [26]:
# Drop the batch axis (batch size is 1).
output = output.squeeze(1)

In [27]:
# Now [trg len, trg vocab size].
output.shape

torch.Size([8, 1863])

In [28]:
# Row 0 is never written by the decoding loop (it starts at step 1), so it is still all
# zeros and is dropped here.
# NOTE(review): this cell is not idempotent — re-running it drops another row.
output = output[1:]
output.shape #[trg len - 1, trg vocab size]

torch.Size([7, 1863])

In [29]:
output_max = output.argmax(1) #index of the highest-scoring vocabulary entry at each step

In [30]:
# Predicted target-vocabulary index for every decoding step.
output_max

tensor([879, 111, 297,  42,   0,  42,   0])

In [31]:
# Index -> token lookup for the target vocabulary.
mapping = vocab_transform[TRG_LANGUAGE].get_itos()

In [32]:
# Print the predicted token string of every decoding step, one per line.
for token in output_max:
    print(mapping[token.item()])

Give
your
application
an
<unk>
an
<unk>


### Sample2 Test

In [33]:
# Same steps as the first sample, now for sample2.
# NOTE(review): this section is a cell-by-cell copy of the first test; a small helper that
# takes the sentence pair would remove the duplication.
src_text = text_transform[SRC_LANGUAGE](sample2[0]).to(device)
src_text

tensor([  2,   0, 315,   9,  22, 400, 184, 528,   3])

In [34]:
# Encoded English reference; below only its length is used, as the number of decoding steps.
trg_text = text_transform[TRG_LANGUAGE](sample2[1]).to(device)
trg_text

tensor([  2,  43, 194,  59, 178,  14,   4, 422, 223,   3])

In [35]:
src_text = src_text.reshape(-1, 1)  #[src len] -> [src len, batch size], batch size is 1

In [36]:
trg_text = trg_text.reshape(-1, 1)  #[trg len] -> [trg len, batch size], batch size is 1

In [37]:
# Sanity check: both tensors should now be [len, 1].
src_text.shape, trg_text.shape

(torch.Size([9, 1]), torch.Size([10, 1]))

In [38]:
# Source length for the single-sentence batch, as an int64 tensor of shape [1].
text_length = torch.tensor([src_text.size(0)]).to(dtype=torch.int64)

In [39]:
# Relies on the earlier cell having loaded the checkpoint and called model.eval().
with torch.no_grad():
    # The ratio must stay 0: forward() is not given a target sequence to teacher-force from.
    output, attentions = model(src_text, text_length, trg_text.shape[0], 0) #turn off teacher forcing

In [40]:
output.shape #[trg len, batch size, trg vocab size]

torch.Size([10, 1, 1863])

In [41]:
# Drop the batch axis (batch size is 1).
output = output.squeeze(1)

In [42]:
# Now [trg len, trg vocab size].
output.shape

torch.Size([10, 1863])

In [43]:
# Row 0 is never written by the decoding loop (it starts at step 1), so it is dropped.
# NOTE(review): this cell is not idempotent — re-running it drops another row.
output = output[1:]
output.shape #[trg len - 1, trg vocab size]

torch.Size([9, 1863])

In [44]:
output_max = output.argmax(1) #index of the highest-scoring vocabulary entry at each step

In [45]:
# Predicted target-vocabulary index for every decoding step.
output_max

tensor([ 43, 202, 115,  14,   4,  59,   5,   3,   3])

In [46]:
# Index -> token lookup for the target vocabulary (same value as computed for the first sample).
mapping = vocab_transform[TRG_LANGUAGE].get_itos()

In [47]:
# Print the predicted token string of every decoding step, one per line.
# The loop does not stop at the first <eos>, so trailing <eos> tokens are printed too.
for token in output_max:
    print(mapping[token.item()])

The
compiler
flags
for
the
plugin
.
<eos>
<eos>


In [48]:
# NOTE(review): repeats the display of output_max from a few cells above — candidate for removal.
output_max

tensor([ 43, 202, 115,  14,   4,  59,   5,   3,   3])

In [49]:
# NOTE(review): scratch check that comparing a one-element tensor with a scalar is truthy;
# it is unrelated to the translation tests and looks like leftover debugging — candidate for removal.
if torch.Tensor([2]) == 2:
    print("ok")

ok
