<a href="https://colab.research.google.com/github/benschlup/csck504assemblyfactory/blob/main/QA_Sequence_2_sequence_model_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Text Translation using a RNN

In this tutorial we will learn how to RNNs to build Sequence-2-Sequence (seq-2-seq) model for Neural machine Translation. We will build LSTM-based seq-2-seq model for translating German sentences into English sentences. 

Let us import the necessary libraries

1. Pytorch for using LSTM layer
2. Spacy for text processing

In [11]:
import torch
import tarfile
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import os
import pandas
import spacy
import urllib
from spacy.lang.en import English
from spacy.lang.de import German
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from tqdm import tqdm_notebook
import random
from collections import Counter
 

In [4]:
# Make sure the GPU is visible to our runtime
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [7]:
# Download data
urllib.request.urlretrieve("https://www.cs.cmu.edu/~ark/QA-data/data/Question_Answer_Dataset_v1.2.tar.gz", "Question_Answer_Dataset_v1.2.tar.gz")

('Question_Answer_Dataset_v1.2.tar.gz',
 <http.client.HTTPMessage at 0x7fa62b678350>)

In [10]:
# Extract files
file = tarfile.open('Question_Answer_Dataset_v1.2.tar.gz')
file.extractall('.')
file.close()

In [12]:
# Import questions and answers from all courses in Spring 2008, 2009 and 2010 respectively
qa_df = pd.DataFrame()
for course in ['S08', 'S09', 'S10']:
    print(f'Reading questions and answers from course {course}')
    course_qa_df = pd.read_csv( f'./Question_Answer_Dataset_v1.2/{course}/question_answer_pairs.txt', sep='\t', encoding='ISO-8859-1')
    course_qa_df['course'] = course
    qa_df = pd.concat([qa_df, course_qa_df])

Reading questions and answers from course S08
Reading questions and answers from course S09
Reading questions and answers from course S10


In [13]:
# Remove lines not having answers (or not even having questions, in some cases...):
qa_df = qa_df[qa_df['Answer'].notna()]

In [19]:
def prepare_data():
  # get english/french sentence pairs
  en = qa_df['Question'].to_list()
  de = qa_df['Answer'].to_list()
  
  # We'll be using the spaCy's English and German tokenizers
  spacy_en = English()
  spacy_de = German()
  
  en_words = Counter()
  de_words = Counter()
  en_inputs = []
  de_inputs = []
  
  # Tokenizing the English and German sentences and creating our word banks for both languages
  for i in range(len(en)):
      en_tokens = spacy_en(en[i])
      de_tokens = spacy_de(de[i])
      if len(en_tokens)==0 or len(de_tokens)==0:
          continue
      for token in en_tokens:
          en_words.update([token.text.lower()])
      en_inputs.append([token.text.lower() for token in en_tokens] + ['_EOS'])
      for token in de_tokens:
          de_words.update([token.text.lower()])
      de_inputs.append([token.text.lower() for token in de_tokens] + ['_EOS'])
    
  # Assigning an index to each word token, including the Start Of String(SOS), End Of String(EOS) and Unknown(UNK) tokens
  en_words = ['_SOS','_EOS','_UNK'] + sorted(en_words,key=en_words.get,reverse=True)
  en_w2i = {o:i for i,o in enumerate(en_words)}
  en_i2w = {i:o for i,o in enumerate(en_words)}
  de_words = ['_SOS','_EOS','_UNK'] + sorted(de_words,key=de_words.get,reverse=True)
  de_w2i = {o:i for i,o in enumerate(de_words)}
  de_i2w = {i:o for i,o in enumerate(de_words)}
  
  # Converting our English and German sentences to their token indexes
  for i in range(len(en_inputs)):
      en_sentence = en_inputs[i]
      de_sentence = de_inputs[i]
      en_inputs[i] = [en_w2i[word] for word in en_sentence]
      de_inputs[i] = [de_w2i[word] for word in de_sentence]
  
  return en_words, de_words, en_w2i, en_i2w, de_w2i, de_i2w, en_inputs, de_inputs


In [20]:
en_words, de_words, en_w2i, en_i2w, de_w2i, de_i2w, en_inputs, de_inputs = prepare_data()

en_inputs[0], de_inputs[0]

([10, 737, 93, 4, 1480, 68, 5, 4, 130, 134, 3, 1], [3, 1])

Let's write our Encoder Class

In [25]:
class EncoderLSTM(nn.Module):
  def __init__(self, vocab_len, input_dim, hidden_dim, n_layers=1, drop_prob=0):
    super(EncoderLSTM, self).__init__()

    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.n_layers = n_layers
 
    self.embedding = nn.Embedding(vocab_len, input_dim)
    self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, 
                        dropout=drop_prob, batch_first=True)
 
  def forward(self, inputs, encoder_state_vector, encoder_cell_vector):
    embedded = self.embedding(inputs)
    # Pass the embedded word vectors into LSTM and return all outputs
    output, hidden = self.lstm(embedded, (encoder_state_vector, encoder_cell_vector))
    return output, hidden
 
  def init_hidden(self, batch_size=1):
    return (torch.zeros(self.n_layers, batch_size, 
                        self.hidden_dim),
            torch.zeros(self.n_layers, batch_size, 
                        self.hidden_dim))


Let's write our Decoder class

In [26]:
class DecoderLSTM(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_vocab_len, n_layers=1, drop_prob=0.1):
    super(DecoderLSTM, self).__init__()
    self.hidden_dim = hidden_dim
    self.output_vocab_len = output_vocab_len
    self.n_layers = n_layers
    self.drop_prob = drop_prob
    self.input_dim = input_dim
 
    self.embedding = nn.Embedding(self.output_vocab_len, self.input_dim)
    self.dropout = nn.Dropout(self.drop_prob) 
    self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, batch_first=True)
    self.classifier = nn.Linear(self.hidden_dim, self.output_vocab_len)

  def forward(self, inputs, decoder_state_vector, decoder_context_vector):
    # Embed input words
    embedded = self.embedding(inputs).view(1, -1)
    embedded = self.dropout(embedded)
    embedded = embedded.unsqueeze(0)
    
    output, hidden = self.lstm(embedded, (decoder_state_vector, 
                                          decoder_context_vector))

    # Pass LSTM outputs through a Linear layer acting as a classifier
    output = F.log_softmax(self.classifier(output[0]), dim=1)

    return output, hidden



Let's train our model and save the trained model to the "model" directory.

In [27]:
input_dim = 100
hidden_dim = 256

encoder = EncoderLSTM(len(en_words), input_dim, hidden_dim)
decoder = DecoderLSTM(input_dim, hidden_dim, len(de_words))
 
lr = 0.001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=lr)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=lr)

EPOCHS = 10
teacher_forcing_prob = 0.5
encoder.train()
decoder.train()
tk0 = range(1,EPOCHS+1)
for epoch in tk0:
    avg_loss = 0.
    tk1 = enumerate(en_inputs)

    for i, sentence in tk1:

        loss = 0.

        #initialise encoder state vector and cell state vector
        h = encoder.init_hidden()
        encoder_state_vector = h[0]
        encoder_cell_vector = h[0]

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        inp = torch.tensor(sentence).unsqueeze(0)

        #print('inp: ', epoch, inp)
        if (i % 100) == 0:
          print('inp: ', i, epoch)
        encoder_outputs, h = encoder(inp, encoder_state_vector, encoder_cell_vector)
        
        #First decoder input will be the SOS token
        decoder_input = torch.tensor([en_w2i['_SOS']])
        #First decoder hidden state will be last encoder hidden state
        decoder_hidden = h

        output = []
        teacher_forcing = True if random.random() < teacher_forcing_prob else False

        for ii in range(len(de_inputs[i])):
          decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden[0], decoder_hidden[1])

          # Get the index value of the word with the highest score from the decoder output
          top_value, top_index = decoder_output.topk(1)
          if teacher_forcing:
            decoder_input = torch.tensor([de_inputs[i][ii]])
          else:
            decoder_input = torch.tensor([top_index.item()])
            
          output.append(top_index.item())
          # Calculate the loss of the prediction against the actual word
          loss += F.nll_loss(decoder_output.view(1,-1), torch.tensor([de_inputs[i][ii]]))

        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        avg_loss += loss.item()/len(en_inputs)
    print(avg_loss)

# Save model after every epoch (Optional)
torch.save({"encoder":encoder.state_dict(),"decoder":decoder.state_dict(),"e_optimizer":encoder_optimizer.state_dict(),"d_optimizer":decoder_optimizer},root_path+"model/model_enc_dec.pt")


inp:  0 1
inp:  100 1
inp:  200 1
inp:  300 1
inp:  400 1
inp:  500 1
inp:  600 1
inp:  700 1
inp:  800 1
inp:  900 1
inp:  1000 1
inp:  1100 1
inp:  1200 1
inp:  1300 1
inp:  1400 1
inp:  1500 1
inp:  1600 1
inp:  1700 1
inp:  1800 1
inp:  1900 1
inp:  2000 1
inp:  2100 1
inp:  2200 1
inp:  2300 1
inp:  2400 1
inp:  2500 1
inp:  2600 1
inp:  2700 1
inp:  2800 1
inp:  2900 1
inp:  3000 1
inp:  3100 1
inp:  3200 1
inp:  3300 1
inp:  3400 1
31.72295134307297
inp:  0 2
inp:  100 2
inp:  200 2
inp:  300 2
inp:  400 2
inp:  500 2
inp:  600 2
inp:  700 2
inp:  800 2
inp:  900 2
inp:  1000 2
inp:  1100 2
inp:  1200 2
inp:  1300 2
inp:  1400 2
inp:  1500 2
inp:  1600 2
inp:  1700 2
inp:  1800 2
inp:  1900 2
inp:  2000 2
inp:  2100 2
inp:  2200 2
inp:  2300 2
inp:  2400 2
inp:  2500 2
inp:  2600 2
inp:  2700 2
inp:  2800 2
inp:  2900 2
inp:  3000 2
inp:  3100 2
inp:  3200 2
inp:  3300 2
inp:  3400 2
26.391889648460893
inp:  0 3
inp:  100 3
inp:  200 3
inp:  300 3
inp:  400 3
inp:  500 3
inp:  6

NameError: ignored

Use the pretrained model to check translation for some random sentences in the corpus.

In [31]:
#checkpoint = torch.load(root_path+"model/model_enc_dec.pt")

#encoder.load_state_dict(checkpoint['encoder'])
#decoder.load_state_dict(checkpoint['decoder'])
#encoder_optimizer.load_state_dict(checkpoint['e_optimizer'])
#decoder_optimizer.load_state_dict(checkpoint['d_optimizer'])

encoder.eval()
decoder.eval()

# get some random numbers to choose random sentences
rand_integers = [random.randint(0, len(en_inputs)) for i in range(1, 20)]

for i in rand_integers:
  h = encoder.init_hidden()
  inp = torch.tensor(en_inputs[i]).unsqueeze(0)
  encoder_outputs, h = encoder(inp, h[0], h[1])
   
  decoder_input = torch.tensor([en_w2i['_SOS']])
  decoder_hidden = h
  output = []
  attentions = []
  while True:
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden[0], decoder_hidden[1])
    _, top_index = decoder_output.topk(1)
    decoder_input = torch.tensor([top_index.item()])
    # If the decoder output is the End Of Sentence token, stop decoding process
    if top_index.item() == de_w2i["_EOS"]:
      break
    output.append(top_index.item())
  
  print("English: "+ " ".join([en_i2w[x] for x in en_inputs[i]]))
  print("Predicted: " + " ".join([de_i2w[x] for x in output]))
  print("Actual: " + " ".join([de_i2w[x] for x in de_inputs[i]]))
  print()
   


English: what exception to the rules of vowel harmony do compound words have ? _EOS
Predicted: the
Actual: in compound words , the vowels need not harmonize between the constituent words of the compound . _EOS

English: do the different species of zebras interbreed ? _EOS
Predicted: no
Actual: no _EOS

English: what is the most common romanization standard for standard mandarin today ? _EOS
Predicted: the
Actual: hanyu pinyin _EOS

English: has swahili no diphthongs ? _EOS
Predicted: no
Actual: no _EOS

English: what is the name of a university ( or similar institution for imparting higher education ) in beijing ? _EOS
Predicted: the
Actual: tsinghua university _EOS

English: during his lifetime , did pollock enjoy considerable fame and notoriety ? _EOS
Predicted: yes
Actual: yes . _EOS

English: was it likely that the xylophone reached europe during the crusades ? _EOS
Predicted: yes
Actual: yes . _EOS

English: when was james monroe appointed to secretary of war ? _EOS
Predicted: in


In [30]:
len(en_inputs)

3422

In [34]:
en_inputs[25]

[46, 401, 10, 93, 44, 8, 3, 1]