## Import libraries

In [1]:
print("importing os")
import os
print("importing random")
import random

print("importing torch")
import torch
print("from torch.autograd import Variable")
from torch.autograd import Variable
print("importing torch.nn as nn")
import torch.nn as nn
print("from torch import optim")
from torch import optim
print("importing torch.nn.functional as F")
import torch.nn.functional as F
print("from torch.utils.data import Dataset, DataLoader")
from torch.utils.data import Dataset, DataLoader

# Constants
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
USE_CUDA = (str(DEVICE)=="cuda")
print(f"Using device: {DEVICE}")

CURR_DIR = globals()['_dh'][0]
DATA_DIR = os.path.join(CURR_DIR, "..", "data")

SOS_TOKEN = 0
EOS_TOKEN = 1

importing os
importing random
importing torch
from torch.autograd import Variable
importing torch.nn as nn
from torch import optim
importing torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
Using device: cuda


## Preparing dataset

In [2]:
class G2PDataset(Dataset) :
  def __init__(self, graphemes_list, phonemes_list) -> None :
    assert len(graphemes_list) == len(phonemes_list)
    self.graphemes_list = graphemes_list
    self.grapheme2index = {}
    self.grapheme2count = {}
    self.index2grapheme = {0: "<SOS>", 1: "<EOS>"}
    self.n_graphemes = 2
    for graphemes in graphemes_list :
      for grapheme in graphemes :
        self.add_grapheme(grapheme)

    self.phonemes_list = phonemes_list
    self.phoneme2index = {}
    self.phoneme2count = {}
    self.index2phoneme = {0: "<SOS>", 1: "<EOS>"}
    self.n_phonemes = 2
    for phonemes in phonemes_list :
      for phoneme in phonemes :
        self.add_phoneme(phoneme)

  def add_grapheme(self, grapheme) :
    if grapheme not in self.grapheme2index :
      self.grapheme2index[grapheme] = self.n_graphemes
      self.grapheme2count[grapheme] = 1
      self.index2grapheme[self.n_graphemes] = grapheme
      self.n_graphemes += 1
    else :
      self.grapheme2count[grapheme] += 1

  def add_phoneme(self, phoneme) :
    if phoneme not in self.phoneme2index :
      self.phoneme2index[phoneme] = self.n_phonemes
      self.phoneme2count[phoneme] = 1
      self.index2phoneme[self.n_phonemes] = phoneme
      self.n_phonemes += 1
    else :
      self.phoneme2count[phoneme] += 1

  def __len__(self) :
    return len(self.graphemes_list)

  def __getitem__(self, index) -> str:
    graphemes = self.graphemes_list[index]
    phonemes = self.phonemes_list[index]
    return ' '.join(graphemes), ' '.join(phonemes)

In [3]:
def prepare_dataset() :
  print("Reading entries ..")

  # Read the file and split into lines
  with open(os.path.join(DATA_DIR, "ma/train.csv"), encoding="utf-8") as f_csv :
    next(f_csv, None)

    # Split every row into pairs
    pairs = [[s.strip('\n') for s in row.split(',')] for row in f_csv]

    #TODO: Add train val test split logic
    #TODO: Add experimentation on different types of grapheme (i.e. letters, morphemes, subword, syllables, etc.)
    train_graphemes_list = [[*pair[0]] for pair in pairs] # Split grapheme as desired
    train_phonemes_list = [[*pair[1]] for pair in pairs] # Split phoneme as desired

  train_g2p_dataset = G2PDataset(train_graphemes_list, train_phonemes_list)
  #TODO: add valid and test split
  # valid_g2p_dataset = G2PDataset(valid_graphemes_list, valid_phonemes_list)
  # test_g2p_dataset = G2PDataset(test_graphemes_list, test_phonemes_list)

  max_length = -999
  for graphemes in train_g2p_dataset.graphemes_list :
    if max_length < len(graphemes) :
      max_length = len(graphemes)
  for phonemes in train_g2p_dataset.phonemes_list :
    if max_length < len(phonemes) :
      max_length = len(phonemes)
  # for graphemes in valid_g2p_dataset.graphemes_list :
  #   if max_length < len(graphemes) :
  #     max_length = len(graphemes)
  # for phonemes in valid_g2p_dataset.phonemes_list :
  #   if max_length < len(phonemes) :
  #     max_length = len(phonemes)
  # for graphemes in test_g2p_dataset.graphemes_list :
  #   if max_length < len(graphemes) :
  #     max_length = len(graphemes)
  # for phonemes in test_g2p_dataset.phonemes_list :
  #   if max_length < len(phonemes) :
  #     max_length = len(phonemes)

  return train_g2p_dataset, max_length

train_g2p_dataset, MAX_LENGTH = prepare_dataset()

BATCH_SIZE = 32
train_dataloader = DataLoader(train_g2p_dataset, batch_size=BATCH_SIZE, shuffle=True)
#TODO: Add train val test split
# valid_dataloader = DataLoader(valid_g2p_dataset, batch_size=BATCH_SIZE, shuffle=True)
# test_dataloader = DataLoader(test_g2p_dataset, batch_size=BATCH_SIZE, shuffle=True)

print((train_dataloader.dataset[0]))
# print(train_dataloader.dataset.phonemes_list)
for batch, (a,b) in enumerate(train_dataloader) :
  print(len(a), a)
  print(b)
  print()


Reading entries ..
('a b a', 'a b a')
32 ('b e r g a m b a r', 'a n g a n k a n', 'b e s e k', 'b e g i t u', 'b e r m a t a', 'b e r d e g u m', 'b a n c a n g', 'a d i r a t n a', 'b e r a k a d', 'b a d a n g', 'b e r u j a r', 'b e l u k u t', 'b a k i r', 'b e r k e s i n a m b u n g a n', 'a u t o b i o g r a f i', 'b e r d e g i l', 'a t e n s i', 'b e b e r', 'a j i', 'a n g g a i', 'b e s a l e n', 'a r o g a n s i', 'b e r g a b u n g', 'b e t', 'b i l a s', 'b e r d u r h a k a', 'b e r b o h o n g', 'a n a r k i s', 'b i n a r a g a', 'b e r o n g s o n g', 'b e r s i t e g a n g', 'a e r o b i k')
('b ə r g a m b a r', 'a ŋ a n k a n', 'b e s e ʔ', 'b ə g i t u', 'b ə r m a t a', 'b ə r d ə g u m', 'b a n t ʃ a ŋ', 'a d i r a t n a', 'b ə r a k a d', 'b a d a ŋ', 'b ə r u d ʒ a r', 'b ə l u k u t', 'b a k i r', 'b ə r k ə s i n a m b u ŋ a n', 'a u t o b i o g r a f i', 'b ə r d ə g i l', 'a t e n s i', 'b ə b ə r', 'a d ʒ i', 'a ŋ g a i', 'b ə s a l e n', 'a r o g a n s i

## Turning training data into Tensors/Variables

In [4]:
def indexes_from_pair(dataset, pair) :
  """
  pair: [graphemes, phonemes]
  """

  #TODO: Add experimentation on different types of grapheme (i.e. letters, morphemes, subword, syllables, etc.)
  graphemes_indexes = [dataset.grapheme2index[grapheme] for grapheme in pair[0].split(' ')] # Letter as grapheme

  phonemes_indexes = [dataset.phoneme2index[phoneme] for phoneme in pair[1].split(' ')] # Single-charater phoneme
  return graphemes_indexes, phonemes_indexes

def variables_from_pair(dataset, pair) :
  graphemes_indexes, phonemes_indexes = indexes_from_pair(dataset, pair)
  graphemes_indexes.append(EOS_TOKEN)
  phonemes_indexes.append(EOS_TOKEN)
  graphemes_var = Variable(torch.LongTensor(graphemes_indexes).view(-1, 1))
  phonemes_var = Variable(torch.LongTensor(phonemes_indexes).view(-1, 1))

  if USE_CUDA :
    graphemes_var = graphemes_var.cuda()
    phonemes_var = phonemes_var.cuda()
  return graphemes_var, phonemes_var

print(train_g2p_dataset, train_dataloader.dataset[0])
print(variables_from_pair(train_g2p_dataset, train_dataloader.dataset[2]))
print("grp", train_g2p_dataset.index2grapheme)
print("phn", train_g2p_dataset.index2phoneme)
print(len(train_dataloader))

<__main__.G2PDataset object at 0x7f9e583bf8b0> ('a b a', 'a b a')
(tensor([[2],
        [3],
        [4],
        [2],
        [5],
        [1]], device='cuda:0'), tensor([[2],
        [3],
        [4],
        [2],
        [5],
        [1]], device='cuda:0'))
grp {0: '<SOS>', 1: '<EOS>', 2: 'a', 3: 'b', 4: "'", 5: 'd', 6: 'i', 7: 'h', 8: 'k', 9: 'n', 10: 'm', 11: 'u', 12: 's', 13: 'g', 14: 'y', 15: 'r', 16: 't', 17: 'o', 18: 'e', 19: 'l', 20: 'c', 21: 'j', 22: 'v', 23: 'p', 24: 'f', 25: 'w', 26: 'z', 27: '-', 28: 'q'}
phn {0: '<SOS>', 1: '<EOS>', 2: 'a', 3: 'b', 4: 'ʔ', 5: 'd', 6: 'i', 7: 'h', 8: 'k', 9: 'n', 10: 'm', 11: 'u', 12: 's', 13: 'ŋ', 14: 'ɲ', 15: 'r', 16: 't', 17: 'o', 18: 'e', 19: 'l', 20: 'ʃ', 21: 'ə', 22: 'ʒ', 23: 'v', 24: 'p', 25: 'f', 26: 'j', 27: 'w', 28: 'g', 29: 'z'}
160


## Model definitions

### Encoder

In [5]:
class Encoder(nn.Module) :
  def __init__(self, input_size, hidden_size, n_layers=1) -> None :
    super(Encoder, self).__init__()

    self.input_size = input_size
    self.hidden_size = hidden_size
    self.n_layers = n_layers

    self.embedding = nn.Embedding(input_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size, n_layers)

  def forward(self, word_inputs, hidden) :
    seq_len = len(word_inputs)
    embedded = self.embedding(word_inputs).view(seq_len, 1, -1)
    output, hidden = self.gru(embedded, hidden)
    return output, hidden
  
  def init_hidden(self) :
    hidden = Variable(torch.zeros(self.n_layers, 1, self.hidden_size))
    if USE_CUDA :
      hidden = hidden.cuda()
    return hidden

### Attention

In [6]:
class Attn(nn.Module) :
  def __init__(self, method, hidden_size) -> None :
    super(Attn, self).__init__()

    self.method = method
    self.hidden_size = hidden_size

    if self.method == "general" :
      self.attn = nn.Linear(self.hidden_size, hidden_size)
    elif self.method == "concat" :
      self.attn = nn.Linear(self.hidden_size*2, hidden_size)
      self.other = nn.Parameter(torch.FloatTensor(hidden_size))
  
  def forward(self, hidden, encoder_outputs) :
    seq_len = len(encoder_outputs)

    # Create variable to store attention energies
    attn_energies = Variable(torch.zeros(seq_len)) # B x 1 x S
    if USE_CUDA :
      attn_energies = attn_energies.cuda()
    
    # Calculate energies for each encoder output
    for i in range(seq_len) :
      attn_energies[i] = self.score(hidden.squeeze(0), encoder_outputs[i].squeeze(0))
    
    # Normalize energiees to weights in range 0 to 1, resize to 1 x 1 x seq_len
    return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0)

  def score(self, hidden, encoder_output) :
    # print("hidden shape", hidden.shape, hidden)
    # print("encoder_output shape", encoder_output.shape, encoder_output)
    if self.method == "dot" :
      energy = hidden.dot(encoder_output)
    elif self.method == "general" :
      energy = self.attn(encoder_output)
      energy = hidden.dot(energy)
    elif self.method == "concat" :
      hidden = hidden.unsqueeze(0)
      encoder_output = encoder_output.unsqueeze(0)
      energy = self.attn(torch.cat((hidden, encoder_output), 1))
      energy = self.other.dot(energy.squeeze(0))
    return energy

### AttnDecoderRNN

In [7]:
class AttnDecoderRNN(nn.Module) :
  def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_proba=.1) -> None :
    super(AttnDecoderRNN, self).__init__()
    
    self.attn_model = attn_model
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.n_layers = n_layers
    self.dropout_proba = dropout_proba

    # Define layers
    self.embedding = nn.Embedding(output_size, hidden_size)
    self.gru = nn.GRU(hidden_size*2, hidden_size, n_layers, dropout=dropout_proba)
    self.out = nn.Linear(hidden_size*2, output_size)

    # Choose attention model
    if attn_model != "none" :
      self.attn = Attn(attn_model, hidden_size)
  
  def forward(self, word_input, last_context, last_hidden, encoder_outputs) :
    # Get the embedding of the current input word (last output word)
    word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N
    
    # Combine embedded input word and last context, run through RNN
    rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), dim=2)
    rnn_output, hidden = self.gru(rnn_input, last_hidden)

    # Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs
    attn_weights = self.attn(rnn_output.squeeze(0), encoder_outputs)
    context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N

    # Final output layer (next word prediction) usint the RNN hidden state and context vector
    rnn_output = rnn_output.squeeze(0)  # S=1 x B x N => B x N
    context = context.squeeze(1)        # B x S=1 x N => B x N
    output = F.log_softmax(self.out(torch.cat((rnn_output, context), dim=1)))

    # Return final output, hidden state, and attention weights (for visualization)
    return output, context, hidden, attn_weights

## Testing the models

In [8]:
encoder_test = Encoder(10, 10, 2)
decoder_test = AttnDecoderRNN("general", 10, 10, 2)
print(encoder_test)
print(decoder_test)

encoder_hidden = encoder_test.init_hidden()
word_input = Variable(torch.LongTensor([1, 2, 3]))
if USE_CUDA :
  encoder_test.cuda()
  word_input = word_input.cuda()
encoder_outputs, encoder_hidden = encoder_test(word_input, encoder_hidden)

word_inputs = Variable(torch.LongTensor([1, 2, 3]))
decoder_attns = torch.zeros(1, 3, 3)
decoder_hidden = encoder_hidden
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))

if USE_CUDA :
  decoder_test.cuda()
  word_inputs = word_inputs.cuda()
  decoder_context = decoder_context.cuda()

for i in range(3) :
  decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[i], decoder_context, decoder_hidden, encoder_outputs)
  print(decoder_output.size(), decoder_hidden.size(), decoder_attn.size())
  decoder_attns[0, i] = decoder_attn.squeeze(0).cpu().data

Encoder(
  (embedding): Embedding(10, 10)
  (gru): GRU(10, 10, num_layers=2)
)
AttnDecoderRNN(
  (embedding): Embedding(10, 10)
  (gru): GRU(20, 10, num_layers=2, dropout=0.1)
  (out): Linear(in_features=20, out_features=10, bias=True)
  (attn): Attn(
    (attn): Linear(in_features=10, out_features=10, bias=True)
  )
)
torch.Size([1, 10]) torch.Size([2, 1, 10]) torch.Size([1, 1, 3])
torch.Size([1, 10]) torch.Size([2, 1, 10]) torch.Size([1, 1, 3])
torch.Size([1, 10]) torch.Size([2, 1, 10]) torch.Size([1, 1, 3])


  return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0)
  output = F.log_softmax(self.out(torch.cat((rnn_output, context), dim=1)))


## Helper functions

In [9]:
import time
import math
import matplotlib.pyplot as plt
plt.switch_backend("agg")
import matplotlib.ticker as ticker
%matplotlib inline
import numpy as np

def as_minutes(seconds) :
  minutes = math.floor(seconds/60)
  seconds -= minutes*60
  return f"{minutes}m {seconds}s"

def time_since(since, percent) :
  now = time.time()
  seconds = now - since
  eta_seconds = seconds/(percent)
  remaining_seconds = eta_seconds - seconds
  return f"{as_minutes(seconds)} (- {as_minutes(remaining_seconds)})"

## Training script

In [10]:
teacher_forcing_ratio = .5

def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion) :
  # Zero gradients of both optimizers
  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()
  loss = 0 # Added onto for each word

  # Get size of input and target sentences
  input_length = input_variable.size()[0]
  target_length = target_variable.size()[0]

  # Run words through the encoder
  encoder_hidden = encoder.init_hidden()
  encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)

  # Prepare input and output variables
  decoder_input = Variable(torch.LongTensor([[SOS_TOKEN]]))
  decoder_context = Variable(torch.zeros(1, decoder.hidden_size))
  decoder_hidden = encoder_hidden # Use last hidden state from encoder to start decoder
  if USE_CUDA :
    decoder_input = decoder_input.cuda()
    decoder_context = decoder_context.cuda()
  
  # Choose whether to use teacher forcing or not
  use_teacher_forcing = random.random() < teacher_forcing_ratio
  if use_teacher_forcing :
    # Teacher forcing: Use the ground-truth target as the next input
    for di in range(target_length) :
      decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, decoder_hidden, encoder_outputs)
      loss += criterion(decoder_output, target_variable[di])
      decoder_input = target_variable[di] # Next target is the next input
  else :
    # Without teacher forcing: use network's own prediction as the next input
    for di in range(target_length) :
      decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, decoder_hidden, encoder_outputs)
      loss += criterion(decoder_output, target_variable[di])

      # Get most likely word index (highest value) from output
      topv, topi = decoder_output.data.topk(1)
      ni = topi[0][0]

      decoder_input = Variable(torch.LongTensor([[ni]])) # Chosen word is next input
      if USE_CUDA :
        decoder_input = decoder_input.cuda()
      
      # Stop at the end of sentence (not necessary when using known targets)
      if ni == EOS_TOKEN :
        break
  
  # Backpropagation
  loss.backward()
  encoder_optimizer.step()
  decoder_optimizer.step()

  return loss.item()/target_length

## Initialize the model

In [11]:
attn_model = "general"
hidden_size = 500
n_layers = 2
dropout_proba = .1

# Initialize models
encoder = Encoder(train_g2p_dataset.n_graphemes, hidden_size, n_layers)
decoder = AttnDecoderRNN(attn_model, hidden_size, train_g2p_dataset.n_phonemes, n_layers, dropout_proba=dropout_proba)

# Move models to GPU
if USE_CUDA :
  encoder.cuda()
  decoder.cuda()

# Initialize optimizers and criterion
learning_rate = .001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

## Training the model

In [12]:
# Training configurations
n_epochs = 10
plot_every = 1
print_every = 1

# Keep track of time elapsed and running averages
plot_losses = []
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

start = time.time()
for epoch in range(1, n_epochs+1) :
  print(f"Training for epoch {epoch} has started. Found {len(train_dataloader)} batch(es).")
  # Get all the training data batch
  for batch, (grps, phns) in enumerate(train_dataloader) :
    n_samples = len(grps)
    print(f"\tStart processing batch {batch+1} out of {len(train_dataloader)} batch(es). Found {n_samples} sample(s).")
    batch_loss = 0
    # Iterate through all samples in each batch
    for i in range(n_samples) :
      input_variable, target_variable = variables_from_pair(train_g2p_dataset, (grps[i], phns[i]))
      # Run the train function
      loss = train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
      batch_loss += loss
    batch_loss /= n_samples
    print_loss_total += batch_loss
    plot_loss_total += batch_loss
    print(f"\tBatch {batch+1} out of {len(train_dataloader)} finished. batch loss: {batch_loss}")
  print_loss_total /= len(train_dataloader)
  plot_loss_total /= len(train_dataloader)

  if epoch%print_every == 0 :
    print_loss_avg = print_loss_total/print_every
    print_loss_total = 0
    print(f"Epoch {epoch} finished in {time_since(start, epoch/n_epochs)} ({epoch} {epoch*100/n_epochs}%) {round(print_loss_avg, 4)}")

  if epoch%plot_every == 0 :
    plot_loss_avg = plot_loss_total/plot_every
    plot_losses.append(plot_loss_avg)
    plot_loss_total = 0

Training for epoch 2 has started. Found 160 batch(es).
	Start processing batch 1 out of 160 batch(es). Found 32 sample(s).


  return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0)
  output = F.log_softmax(self.out(torch.cat((rnn_output, context), dim=1)))


	Batch 1 out of 160 finished. batch loss: 2.4723350263411694
	Start processing batch 2 out of 160 batch(es). Found 32 sample(s).
	Batch 2 out of 160 finished. batch loss: 2.2944891576003417
	Start processing batch 3 out of 160 batch(es). Found 32 sample(s).
	Batch 3 out of 160 finished. batch loss: 1.9405427638680746
	Start processing batch 4 out of 160 batch(es). Found 32 sample(s).
	Batch 4 out of 160 finished. batch loss: 1.9731040976780316
	Start processing batch 5 out of 160 batch(es). Found 32 sample(s).
	Batch 5 out of 160 finished. batch loss: 1.8381945989703459
	Start processing batch 6 out of 160 batch(es). Found 32 sample(s).
	Batch 6 out of 160 finished. batch loss: 1.8166968332181586
	Start processing batch 7 out of 160 batch(es). Found 32 sample(s).
	Batch 7 out of 160 finished. batch loss: 1.6349641768522507
	Start processing batch 8 out of 160 batch(es). Found 32 sample(s).
	Batch 8 out of 160 finished. batch loss: 1.7568679828707443
	Start processing batch 9 out of 160

## Visualize training loss

In [None]:
def show_plot(points) :
  plt.figure()
  fig, ax = plt.subplots()
  # This locator puts ticks at regular intervals
  loc = ticker.MultipleLocator(base=.2)
  ax.yaxis.set_major_locator(loc)
  plt.title("Training loss")
  plt.
  plt.plot(points)

show_plot(plot_losses)

## Evaluate the model

In [None]:
def evaluate(sentence, max_length=MAX_LENGTH) :
  input_variable = variable_from_sentence(input_lang, sentence)
  input_length = input_variable.size()[0]
  print("input_length", input_length)

  # Run through encoder
  encoder_hidden = encoder.init_hidden()
  encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
  print("encoder_outputs", encoder_outputs.shape)
  print("encoder_hidden", encoder_hidden.shape)

  # Create starting vectors for decoder
  decoder_input = Variable(torch.LongTensor([[SOS_TOKEN]])) # SOS
  decoder_context = Variable(torch.zeros(1, decoder.hidden_size))
  if USE_CUDA :
    decoder_input = decoder_input.cuda()
    decoder_context = decoder_context.cuda()
  
  decoder_hidden = encoder_hidden
  decoded_words = []
  decoder_attentions = torch.zeros(max_length, max_length)
  
  # Run through decoder
  for di in range(max_length) :
    decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, decoder_hidden, encoder_outputs)
    decoder_attentions[di, :decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data

    # Choose top word from output
    topv, topi = decoder_output.data.topk(1)
    ni = topi[0][0]
    if ni.item() == EOS_TOKEN :
      decoded_words.append("<EOS>")
      break
    else :
      decoded_words.append(output_lang.index2word[ni.item()])
    
    # Next input is chosen word
    decoder_input = Variable(torch.LongTensor([[ni.item()]]))
    if USE_CUDA :
      decoder_input = decoder_input.cuda()
  print(decoder_attentions[:di+1, :len(encoder_outputs)])
  return decoded_words, decoder_attentions[:di+1, :len(encoder_outputs)]

def evaluate_randomly() :
  pair = random.choice(pairs)

  output_words, decoder_attns = evaluate(pair[0])
  output_sentence = ' '.join(output_words)

  print('>', pair[0])
  print('=', pair[1])
  print('<', output_sentence)
  print('')
  return pair, output_words, decoder_attns

pair, output_words, decoder_attns = evaluate_randomly()
plt.matshow(decoder_attns.numpy())