In [1]:
%matplotlib inline

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
!wget https://download.pytorch.org/tutorial/data.zip

!unzip data.zip

--2021-07-24 16:57:00--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 99.86.37.74, 99.86.37.37, 99.86.37.116, ...
Connecting to download.pytorch.org (download.pytorch.org)|99.86.37.74|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip’


2021-07-24 16:57:01 (33.6 MB/s) - ‘data.zip’ saved [2882130/2882130]

Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt  

In [4]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [6]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [7]:

MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [8]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
['je suis heureux de vous avoir invites .', 'i m glad i invited you .']


In [9]:
pairs[0:5]

[['j ai ans .', 'i m .'],
 ['je vais bien .', 'i m ok .'],
 ['ca va .', 'i m ok .'],
 ['je suis gras .', 'i m fat .'],
 ['je suis gros .', 'i m fat .']]

In [10]:
sample = random.choice(pairs)
sample

['je suis loyal .', 'i m loyal .']

In [11]:
input_size = input_lang.n_words
hidden_size = 256

In [12]:
input_size

4345

In [13]:
for word in sample[0].split(' '):
  print(word)

je
suis
loyal
.


In order to work with embedding layer and the LSTM the inputs should be in the form of tensor, So we need to convert the sentences(words) to tensors.
First we'll split the sentences by whitespaces and convert each words into indices(using word2index[word])

In [14]:
input_sentence = sample[0]
target_sentence = sample[1]

input_indices = [input_lang.word2index[word] for word in input_sentence.split(' ')]
target_indices = [output_lang.word2index[word] for word in target_sentence.split(' ')]

input_indices, target_indices

([6, 11, 92, 5], [2, 3, 62, 4])

Add, EOS to the end of sentences to indicate the end of tensor

In [15]:
input_indices.append(EOS_token)
target_indices.append(EOS_token)

input_indices, target_indices

([6, 11, 92, 5, 1], [2, 3, 62, 4, 1])

Convert to tensor

In [16]:
input_tensor = torch.tensor(input_indices, dtype=torch.long, device= device)
output_tensor = torch.tensor(target_indices, dtype=torch.long, device= device)

Define Embedding layer and LSTM layer for encoder 

In [17]:
embedding = nn.Embedding(input_size, hidden_size).to(device)
lstm = nn.LSTM(hidden_size, hidden_size).to(device)

In [18]:
embedded_input = embedding(input_tensor[0]) #first word only
embedded_input.shape

torch.Size([256])


We are working with 1 sample, but we would be working for a batch. Let's fix that by converting our input_tensor into a fake batch

In [19]:
print(embedded_input.shape)
embedded_input = embedding(input_tensor[0].view(-1, 1))
print(embedded_input.shape)

torch.Size([256])
torch.Size([1, 1, 256])


Let's build our LSTM, initialize the hidden state and cell state with Zeros(Empty state)

In [20]:
(hidden,ct) = torch.zeros(1, 1, 256, device=device),torch.zeros(1, 1, 256, device=device)

embedded_input = embedding(input_tensor[0].view(-1, 1))
output, (hidden,ct) = lstm(embedded_input, (hidden,ct))

In [21]:
embedded_input.shape,hidden.shape,ct.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [22]:

output.shape, output[0, 0].shape

(torch.Size([1, 1, 256]), torch.Size([256]))

In [23]:
encoder_outputs = torch.zeros(MAX_LENGTH, 256, device=device)
encoder_outputs.shape

torch.Size([10, 256])

In [24]:
input_tensor.size(), input_tensor.size()[0]

(torch.Size([5]), 5)

Now we will define a empty tensor with size MAX_LENGTH to store the Encoder outputs.
Then we can get the encoder outputs for each of the word in the Sentence

In [25]:
encoder_outputs = torch.zeros(MAX_LENGTH, 256, device=device)

(encoder_hidden,encoder_ct) = torch.zeros(1, 1, 256, device=device),torch.zeros(1, 1, 256, device=device)

for i in range(input_tensor.size()[0]):  
  embedded_input = embedding(input_tensor[i].view(-1, 1))
  output, (encoder_hidden,encoder_ct) = lstm(embedded_input, (encoder_hidden,encoder_ct))
  encoder_outputs[i] += output[0,0]

In [26]:
input_sentence = sample[0]
target_sentence = sample[1]
print("Input Sentence:",sample[0])
print("Target Sentence:",sample[1])

input_indices = [input_lang.word2index[word] for word in input_sentence.split(' ')]
target_indices = [output_lang.word2index[word] for word in target_sentence.split(' ')]
print("Input indices:",input_indices)
print("Target indices:",input_indices)

input_indices.append(EOS_token)
target_indices.append(EOS_token)
print("After adding the <EOS> token")
print("Input indices:",input_indices)
print("Target indices:",input_indices)

input_tensor = torch.tensor(input_indices, dtype=torch.long, device= device)
output_tensor = torch.tensor(target_indices, dtype=torch.long, device= device)
print("Input tensor:",input_tensor)
print("Target tensor:",output_tensor)
print("\n\n")

print("Encoder part\n")
encoder_outputs = torch.zeros(MAX_LENGTH, 256, device=device)
(encoder_hidden,encoder_ct) = torch.zeros(1, 1, 256, device=device),torch.zeros(1, 1, 256, device=device)

for i in range(input_tensor.size()[0]):
  
  if (i<input_tensor.size()[0]-1):
    input_word = input_sentence.split(' ')[i] 
  else:
    input_word = "<EOS>"
  print('Step %d\nWord => %s\n'%(i,input_word))
  print("Input Tensor =>",input_tensor[i]) 
  embedded_input = embedding(input_tensor[i].view(-1, 1))
  embedded_input_cpu = embedded_input.cpu()
  # plt.figure(figsize = (16,16))
  # seaborn.heatmap(embedded_input_cpu.detach().numpy().reshape(-1, 16), fmt=".2f",annot=True, cmap="Blues").set(title=f"Embeddings for word => {input_word}")
  # plt.xticks([])
  # plt.yticks([])
  # plt.show()
  output, (encoder_hidden,encoder_ct) = lstm(embedded_input, (encoder_hidden,encoder_ct))
  output_cpu = output[0,0].cpu()
  # plt.figure(figsize = (16,16))
  # seaborn.heatmap(output_cpu.detach().numpy().reshape(-1, 16), fmt=".2f",annot=True, cmap="Blues").set(title=f"Encoder hidden state for word => {input_word}")
  # plt.xticks([])
  # plt.yticks([])
  # plt.show()
  encoder_outputs[i] += output[0,0]
  # print("\n\n")

Input Sentence: je suis loyal .
Target Sentence: i m loyal .
Input indices: [6, 11, 92, 5]
Target indices: [6, 11, 92, 5]
After adding the <EOS> token
Input indices: [6, 11, 92, 5, 1]
Target indices: [6, 11, 92, 5, 1]
Input tensor: tensor([ 6, 11, 92,  5,  1], device='cuda:0')
Target tensor: tensor([ 2,  3, 62,  4,  1], device='cuda:0')



Encoder part

Step 0
Word => je

Input Tensor => tensor(6, device='cuda:0')
Step 1
Word => suis

Input Tensor => tensor(11, device='cuda:0')
Step 2
Word => loyal

Input Tensor => tensor(92, device='cuda:0')
Step 3
Word => .

Input Tensor => tensor(5, device='cuda:0')
Step 4
Word => <EOS>

Input Tensor => tensor(1, device='cuda:0')



We completed the Encoder part now, Now we can start building the Attention Decoder

First input to the decoder will be SOS_token, later inputs would be the words it predicted (unless we implement teacher forcing).
Decoder/LSTM's hidden state will be initialized with the encoder's last hidden state.
We will use LSTM's hidden state and last prediction to generate attention weight using a FC layer.
This attention weight will be used to weigh the encoder_outputs using batch matric multiplication. This will give us a NEW view on how to look at encoder_states.
this attention applied encoder_states will then be concatenated with the input, and then sent a linear layer and then sent to the LSTM.
LSTM's output will be sent to a FC layer to predict one of the output_language words

In [27]:
# first input
decoder_input = torch.tensor([[SOS_token]], device=device)
(decoder_hidden,decoder_ct) = (encoder_hidden,encoder_ct)
decoded_words = []

In [28]:
output_size = output_lang.n_words

embedding = nn.Embedding(output_size, 256).to(device)

In [29]:
embedded = embedding(decoder_input)
embedded.shape

torch.Size([1, 1, 256])

In [30]:
attn_weight_layer = nn.Linear(256 * 2, 10).to(device)

In [31]:
embedded.shape, decoder_hidden.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [32]:
torch.cat((embedded, decoder_hidden), 1).shape

torch.Size([1, 2, 256])

In [33]:
torch.cat((embedded[0], decoder_hidden[0]), 1).shape

torch.Size([1, 512])

Now we will calaculate the attentions. We will calculating the attentions by conacatinating the embeddings and last decoder hidden state and giving as input to the fully connected layer.

In [34]:
attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights

tensor([[-0.8699,  0.8735, -0.6135, -0.1851,  0.3014, -0.1604,  0.1392, -0.6633,
         -0.3715,  0.3680]], device='cuda:0', grad_fn=<AddmmBackward>)

In [35]:
import torch.nn.functional as F

attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_weights

tensor([[0.1876, 0.0679, 0.0809, 0.0534, 0.0822, 0.2576, 0.0746, 0.0894, 0.0625,
         0.0439]], device='cuda:0', grad_fn=<SoftmaxBackward>)

In [36]:
attn_weights.shape, encoder_outputs.shape

(torch.Size([1, 10]), torch.Size([10, 256]))

In [37]:
attn_weights.unsqueeze(0).shape, encoder_outputs.unsqueeze(0).shape

(torch.Size([1, 1, 10]), torch.Size([1, 10, 256]))

In [38]:
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
attn_applied.shape

torch.Size([1, 1, 256])

In [39]:
input_to_lstm_layer = nn.Linear(256 * 2, 256).to(device)

embedded.shape, attn_applied.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [40]:
torch.cat((embedded, attn_applied), 1).shape, torch.cat((embedded[0], attn_applied[0]), 1).shape

(torch.Size([1, 2, 256]), torch.Size([1, 512]))

In [41]:
input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_lstm.shape

torch.Size([1, 256])

In [42]:
lstm = nn.LSTM(256, 256).to(device)

In [43]:
decoder_hidden.shape, input_to_lstm.shape

(torch.Size([1, 1, 256]), torch.Size([1, 256]))

In [44]:
input_to_lstm.unsqueeze(0).shape

torch.Size([1, 1, 256])

In [45]:
output, (decoder_hidden,decoder_ct) = lstm(input_to_lstm.unsqueeze(0), (decoder_hidden,decoder_ct))
output.shape, decoder_hidden.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [46]:
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)

In [47]:
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
output, output.shape

(tensor([[0.0004, 0.0003, 0.0004,  ..., 0.0004, 0.0004, 0.0004]],
        device='cuda:0', grad_fn=<SoftmaxBackward>), torch.Size([1, 2803]))

In [48]:
output.data.topk(1)

torch.return_types.topk(values=tensor([[0.0004]], device='cuda:0'), indices=tensor([[1437]], device='cuda:0'))

In [49]:
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()]

'prankster'

In [50]:
top_index.item()

1437

In [51]:
decoder_input = torch.tensor([[SOS_token]], device=device)
(decoder_hidden,decoder_ct) = (encoder_hidden,encoder_ct)
output_size = output_lang.n_words
embedding = nn.Embedding(output_size, 256).to(device)
embedded = embedding(decoder_input)
attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_lstm_layer = nn.Linear(256 * 2, 256).to(device)
input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
lstm = nn.LSTM(256, 256).to(device)
input_to_lstm = input_to_lstm.unsqueeze(0)
output, (decoder_hidden,decoder_ct) = lstm(input_to_lstm, (decoder_hidden,decoder_ct))
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()]

'cooking'

In [52]:
embedding = nn.Embedding(output_size, 256).to(device)
attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
input_to_lstm_layer = nn.Linear(256 * 2, 256).to(device)
lstm = nn.LSTM(256, 256).to(device)
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)


decoder_input = torch.tensor([[SOS_token]], device=device)
(decoder_hidden,decoder_ct) = (encoder_hidden,encoder_ct)
output_size = output_lang.n_words
embedded = embedding(decoder_input)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_lstm = input_to_lstm.unsqueeze(0)
output, (decoder_hidden,decoder_ct) = lstm(input_to_lstm, (decoder_hidden,decoder_ct))
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()], attn_weights

('politics',
 tensor([[0.1348, 0.0700, 0.0775, 0.0967, 0.0755, 0.0949, 0.0301, 0.1141, 0.1631,
          0.1432]], device='cuda:0', grad_fn=<SoftmaxBackward>))

In [53]:

decoder_input = torch.tensor([[top_index.item()]], device=device)
(decoder_hidden,decoder_ct) = (encoder_hidden,encoder_ct)
output_size = output_lang.n_words
embedded = embedding(decoder_input)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_lstm = input_to_lstm.unsqueeze(0)
output, (decoder_hidden,decoder_ct) = lstm(input_to_lstm, (decoder_hidden,decoder_ct))
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()], attn_weights

('boat',
 tensor([[0.1100, 0.0716, 0.1370, 0.1274, 0.0511, 0.1684, 0.0740, 0.1001, 0.0931,
          0.0674]], device='cuda:0', grad_fn=<SoftmaxBackward>))

In [54]:
decoder_input = torch.tensor([[target_indices[2]]], device=device)
(decoder_hidden,decoder_ct) = (encoder_hidden,encoder_ct)
output_size = output_lang.n_words
embedded = embedding(decoder_input)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_lstm = input_to_lstm.unsqueeze(0)
output, (decoder_hidden,decoder_ct) = lstm(input_to_lstm, (decoder_hidden,decoder_ct))
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()], attn_weights

('soccer',
 tensor([[0.0408, 0.0974, 0.0516, 0.0746, 0.0863, 0.1366, 0.0982, 0.0747, 0.1881,
          0.1517]], device='cuda:0', grad_fn=<SoftmaxBackward>))

In [55]:
target_indices

[2, 3, 62, 4, 1]

In [57]:

print("Attention Decoder part With full Teacher forcing\n")
for i in range(4):
  print('\nStep %d'%(i))
  print('Expected output(word) => %s '% target_sentence.split(" ")[i])
  print('Expected output(Index) => %d '% target_indices[i])
  decoder_input = torch.tensor([[target_indices[i]]], device=device)
  (decoder_hidden,decoder_ct) = (encoder_hidden,encoder_ct)
  output_size = output_lang.n_words
  embedded = embedding(decoder_input)
  attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
  attn_weights = F.softmax(attn_weights, dim = 1)
  attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
  attn_weights_cpu = attn_weights.cpu()
  input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
  input_to_lstm = input_to_lstm.unsqueeze(0)
  output, (decoder_hidden,decoder_ct) = lstm(input_to_lstm, (decoder_hidden,decoder_ct))
  output = F.relu(output)  
  output = F.softmax(output_word_layer(output[0]), dim = 1)
  top_value, top_index = output.data.topk(1)
  print('Predicted output(word) => %s '% output_lang.index2word[top_index.item()])
  print('Predicted output(Index) => %d '% top_index.item())  
  # plt.figure(figsize = (10,1))
  # seaborn.heatmap(attn_weights_cpu.detach().numpy().reshape(-1, 10), fmt=".2f",annot=True, cmap="Blues").set(title=f"Attention Weights")
  # plt.xticks([])
  # plt.yticks([])
  # plt.show()
  #print(attn_weights)

Attention Decoder part With full Teacher forcing


Step 0
Expected output(word) => i 
Expected output(Index) => 2 
Predicted output(word) => buy 
Predicted output(Index) => 1888 

Step 1
Expected output(word) => m 
Expected output(Index) => 3 
Predicted output(word) => better 
Predicted output(Index) => 89 

Step 2
Expected output(word) => loyal 
Expected output(Index) => 62 
Predicted output(word) => soccer 
Predicted output(Index) => 1294 

Step 3
Expected output(word) => . 
Expected output(Index) => 4 
Predicted output(word) => boat 
Predicted output(Index) => 1316 


In [58]:
target_indices, target_sentence, input_sentence

([2, 3, 62, 4, 1], 'i m loyal .', 'je suis loyal .')