Requirements

In [None]:
!pip install inltk

from __future__ import unicode_literals, print_function, division
from io import open
import inltk
from inltk.inltk import tokenize
from inltk.inltk import setup
import unicodedata
import string
import re
import pandas as pd
import random
import os
import csv
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from nltk.translate.bleu_score import sentence_bleu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from google.colab import drive
drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Reserved vocabulary indices: Lang pre-registers "SOS" at index 0 and "EOS" at index 1.
SOS_token = 0
EOS_token = 1
# Upper bound on sentence length (in space-separated tokens). It is used as the
# cutoff in filterPair and as the default size of the attention / encoder-output
# buffers in the model, training and evaluation code.
MAX_LENGTH = 200

#initialize Lang Class
class Lang:
    """Vocabulary for one language: word <-> index maps plus word frequencies."""

    def __init__(self):
        # Indices 0 and 1 are reserved for the SOS / EOS markers, so real
        # words are numbered from 2 upward.
        self.n_words = 2
        self.index2word = {0: "SOS", 1: "EOS"}
        self.word2index = {}
        self.word2count = {}

    def addSentence(self, sentence):
        """Register every single-space-delimited token of ``sentence``."""
        tokens = sentence.split(' ')
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``; on first sight also assign it the next free index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1

In [None]:
def process_data(source_path='/content/drive/My Drive/data/bible-uedin.hi-ne.ne',
                 target_path='/content/drive/My Drive/data/bible-uedin.hi-ne.hi'):
    """Load the line-aligned parallel corpus and build both vocabularies.

    Args:
        source_path: text file with one source sentence per line.
        target_path: text file with one target sentence per line; line ``i``
            is paired with line ``i`` of ``source_path``.

    Returns:
        ``(source, target, pairs)`` where ``source`` and ``target`` are ``Lang``
        vocabularies built from every (stripped) line of the respective file,
        and ``pairs`` is a list of ``[source_sentence, target_sentence]`` lists.

    Raises:
        IndexError: if the target file has fewer lines than the source file,
            i.e. some source sentence has no counterpart. Extra target lines
            are ignored for pairing (they still contribute to the vocabulary).
    """
    source = Lang()
    target = Lang()

    # Read with an explicit encoding (the files contain Devanagari text) and
    # let the context managers close the handles.
    with open(source_path, "r", encoding="utf-8") as sf:
        source_sentences = [line.strip() for line in sf]
    with open(target_path, "r", encoding="utf-8") as tf:
        target_sentences = [line.strip() for line in tf]

    for sent in source_sentences:
        source.addSentence(sent)
    for sent in target_sentences:
        target.addSentence(sent)

    if len(target_sentences) < len(source_sentences):
        raise IndexError(
            "parallel corpus is misaligned: %d source lines but only %d target lines"
            % (len(source_sentences), len(target_sentences)))

    # zip stops at the shorter (source) list, so surplus target lines are dropped.
    pairs_new = [[src, tgt] for src, tgt in zip(source_sentences, target_sentences)]

    return source, target, pairs_new

In [None]:
# source, target, pairs = process_data()   JUST A CHECK TO SEE IF THE SIZE OF BOTH DATASET IS SAME

In [None]:
def indexesFromSentence(lang, sentence):
    """Translate each single-space-delimited token of ``sentence`` into its
    index in ``lang.word2index``; an unknown token raises ``KeyError``."""
    lookup = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lookup[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word indices ending in EOS.

    The result is a ``torch.long`` tensor on ``device``, reshaped with
    ``view(-1, 1)`` so each row holds one token index.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a ``[source_sentence, target_sentence]`` pair into a tuple of
    index tensors, using ``input_lang`` for the first element and
    ``output_lang`` for the second."""
    source_sentence = pair[0]
    target_sentence = pair[1]
    return (tensorFromSentence(input_lang, source_sentence),
            tensorFromSentence(output_lang, target_sentence))

In [None]:
def filterPair(p):
    """Return True only when both sentences of pair ``p`` have fewer than
    ``MAX_LENGTH`` single-space-delimited tokens."""
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    """Keep, in order, only the pairs accepted by ``filterPair``."""
    return list(filter(filterPair, pairs))


Model

In [None]:
class EncoderRNN(nn.Module):
    """GRU encoder that is fed one token index per call."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals the GRU width, so embeddings feed the
        # GRU without any projection.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed ``input``, run one GRU call and return ``(output, hidden)``."""
        # view(1, 1, -1) flattens the embedding into a single 1x1xN step.
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero initial hidden state on ``device``."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(shape, device=device)


class AttnDecoderRNN(nn.Module):
    """GRU decoder with learned attention over a fixed-size buffer of
    ``max_length`` encoder outputs; produces one output step per call."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Attention scores come from [embedded token ; hidden state], one score
        # per slot of the encoder-output buffer.
        self.attn = nn.Linear(2 * hidden_size, max_length)
        # Merges [embedded token ; attention context] back to hidden_size.
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Returns ``(log_probs, hidden, attn_weights)``: the log-softmax over the
        output vocabulary, the updated GRU hidden state, and the softmax
        attention weights over the encoder-output slots.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        query = torch.cat((embedded[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(query), dim=1)
        # Weighted sum of the encoder outputs (batched matmul with batch of 1).
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        merged = torch.cat((embedded[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(merged).unsqueeze(0))
        output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state on ``device``."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(shape, device=device)


Training the model

In [None]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair.

    The source tensor is encoded token by token, then the target is decoded
    token by token while the per-step ``criterion`` losses are summed. With
    probability ``teacher_forcing_ratio`` the whole decode uses the reference
    token as the next input; otherwise it uses the decoder's own top
    prediction and stops early once that prediction is EOS.

    Returns:
        The summed loss divided by the target length (a Python float).
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Fixed-size buffer; rows beyond input_length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # Decided once per sentence, not once per step.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the reference token as the next input.
            decoder_input = target_tensor[di]
        else:
            # Free running: feed back the top prediction, detached from history.
            best_index = decoder_output.topk(1)[1]
            decoder_input = best_index.squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at ``since``.

    ``percent`` is the completed fraction of the job; the remaining time is
    extrapolated linearly from the elapsed time.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))


In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train ``encoder`` / ``decoder`` with SGD for ``n_iters`` single-pair steps.

    Pairs are drawn with replacement from ``train_pairs`` and converted to
    tensors up front. Every ``print_every`` steps a progress line is printed
    with elapsed / estimated remaining time, the step count, the percentage
    done and the mean loss since the previous report.

    ``plot_every`` is accepted for interface compatibility but is not used:
    no loss history is collected or plotted.
    """
    start = time.time()
    running_loss = 0  # Reset after every progress report

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(train_pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        running_loss += train(input_tensor, target_tensor, encoder,
                              decoder, encoder_optimizer, decoder_optimizer, criterion)

        if step % print_every != 0:
            continue

        mean_loss = running_loss / print_every
        running_loss = 0
        progress = step / n_iters
        print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                     step, progress * 100, mean_loss))

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate ``sentence`` with the given encoder / decoder.

    Both modules are switched to eval mode for the duration of the call, so
    dropout is disabled and the translation is deterministic. Their previous
    training / eval mode is restored before returning, even on error.

    Args:
        encoder: module exposing ``initHidden()`` and ``hidden_size`` and
            called as ``encoder(token, hidden)``.
        decoder: module called as ``decoder(token, hidden, encoder_outputs)``
            and returning ``(output, hidden, attention)``.
        sentence: source sentence; every single-space-delimited token must be
            present in ``input_lang`` (otherwise ``KeyError``).
        max_length: size of the encoder-output buffer and the maximum number
            of decoding steps.

    Returns:
        ``(decoded_words, attentions)``: the predicted words, ending with
        ``'<EOS>'`` when the decoder emitted EOS before ``max_length`` steps,
        and the attention rows for the steps that were actually decoded.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            # Fixed-size buffer; rows beyond input_length stay zero.
            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention
                topv, topi = decoder_output.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang.index2word[topi.item()])

                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        # Hand the modules back in the mode the caller had them in, so
        # training can continue unchanged after an evaluation pass.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)


def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` random validation pairs next to the model's translation.

    For each sample: ``>`` is the source sentence, ``=`` the reference
    translation and ``<`` the model output.
    """
    for _ in range(n):
        sample = random.choice(val_pairs)
        print('>', sample[0])
        print('=', sample[1])
        predicted_words, _attn = evaluate(encoder, decoder, sample[0])
        print('<', ' '.join(predicted_words))
        print('')

In [None]:
# Build both vocabularies from the full corpus, then drop pairs in which
# either sentence reaches MAX_LENGTH tokens (see filterPair).
input_lang, output_lang, pairs = process_data()
pairs = filterPairs(pairs)

# Sanity check: show one random [source, target] pair.
print(random.choice(pairs))

print(len(pairs))
x = len(pairs)

# 70% / 20% / remainder split into train / validation / test.
# NOTE(review): the split follows file order (pairs are never shuffled), so the
# three sets come from consecutive regions of the corpus — confirm this is intended.
trainSplit = int(x*0.7)
valSplit = trainSplit + int(x*0.2)

train_pairs = pairs[:trainSplit]
val_pairs = pairs[trainSplit:valSplit]
test_pairs = pairs[valSplit:]

print(len(train_pairs))
print(len(val_pairs))
print(len(test_pairs))

['अब म बिन्ती गरिरहेछु केही खाऊ। यो तिमीहरूलाई बाँच्नको लागि आवश्यक हो। तिमीहरू कसैको पनि टाउकोको एउटा केश पनि नष्ट हुँदैन।”', 'इसलिये तुम्हें समझाता हूं; कि कुछ खा लो, जिस से तुम्हारा बचाव हो; क्योंकि तुम में से किसी के सिर पर एक बाल भी न गिरेगा।']
30486
21340
6097
3049


In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot ``points`` as a line chart with y-axis ticks every 0.2.

    A single figure is created via ``plt.subplots()`` and the data is drawn on
    its axes explicitly, so no extra empty figure is left open.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)


In [None]:
# Width shared by the embeddings and the GRU state of both encoder and decoder.
hidden_size = 256

# Base model: vocabulary sizes come from the Lang objects built by process_data().
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)


# 30000 single-pair SGD steps, with a progress line every 1000 steps.
trainIters(encoder1, attn_decoder1, 30000, print_every=1000)

1m 31s (- 44m 16s) (1000 3%) 6.0689
3m 0s (- 42m 1s) (2000 6%) 5.6734
4m 28s (- 40m 14s) (3000 10%) 5.5950
5m 56s (- 38m 35s) (4000 13%) 5.5655
7m 23s (- 36m 56s) (5000 16%) 5.4634
8m 50s (- 35m 23s) (6000 20%) 5.3613
10m 20s (- 33m 57s) (7000 23%) 5.3017
11m 46s (- 32m 23s) (8000 26%) 5.1537
13m 10s (- 30m 43s) (9000 30%) 5.0703
14m 38s (- 29m 17s) (10000 33%) 5.2424
16m 7s (- 27m 51s) (11000 36%) 5.1525
17m 34s (- 26m 21s) (12000 40%) 5.2212
19m 5s (- 24m 57s) (13000 43%) 5.3283
20m 33s (- 23m 30s) (14000 46%) 5.2057
22m 2s (- 22m 2s) (15000 50%) 5.1365
23m 31s (- 20m 35s) (16000 53%) 5.1381
25m 2s (- 19m 8s) (17000 56%) 5.1524
26m 30s (- 17m 40s) (18000 60%) 5.0888
27m 59s (- 16m 12s) (19000 63%) 5.1202
29m 30s (- 14m 45s) (20000 66%) 5.0881
31m 1s (- 13m 17s) (21000 70%) 5.0339
32m 30s (- 11m 49s) (22000 73%) 4.9444
33m 59s (- 10m 20s) (23000 76%) 4.9257
35m 29s (- 8m 52s) (24000 80%) 4.9313
36m 59s (- 7m 23s) (25000 83%) 4.8959
38m 30s (- 5m 55s) (26000 86%) 4.9516
40m 2s (- 4m 26

In [None]:
# Print 10 (the default n) random validation pairs with the base model's translation.
evaluateRandomly(encoder1, attn_decoder1)

> भोलिपल्ट बिहान, येशू चाँडै उठनुभयो। उहाँले घर छोडनुभयो, विहान अंध्यारोमा नै येशू एकान्तमा जानूभयो अनि प्रार्थना गर्न लाग्नु भयो।
= और भोर को दिन निकलने से बहुत पहिले, वह उठकर निकला, और एक जंगली स्थान में गया और वहां प्रार्थना करने लगा।
< तब ने ने से के और और और और और और और और से और और के और <EOS>

> यो दृष्टान्तको अर्थ के हो भनेः बीऊ परमेश्वरको वचन हो।
= दृष्टान्त यह है; बीज तो परमेश्वर का वचन है।
< क्या क्या से का है? <EOS>

> परमप्रभु भन्नुहुन्छ, “तिमीहरुका पिता-पुर्खाहरु जस्तो नबन बितेका समयमा अगमवक्ताहरु तिनीहरुसित कुरा गर्थे। तिनीहरुले भने, ‘तिमीहरुले आफ्नो नराम्रो कु-कर्महरु छोडिदेऊ।’ तर तिमीहरुका पुर्खाहरुले मेरो एक शब्द पनि सुनेनन्।” परमप्रभुले यो कुरा भन्नुभयो।
= अपने पुरखाओं के समान न बनो, उन से तो अगले भविष्यद्वक्ता यह पुकार पुकारकर कहते थे कि सेनाओं का यहोवा यों कहता है, अपने बुरे मार्गों से, और अपने बुरे कामों से फिरो; परन्तु उन्हों ने न तो सुना, और न मेरी ओर ध्यान दिया, यहोवा की यही वाणी है।
< यहोवा यहोवा के कहा, यहोवा से यहोवा के के मैं के और के मैं के के मैं के और के 

In [None]:
class Seq2Seq(nn.Module):
    """Container pairing an ``EncoderRNN`` with an ``AttnDecoderRNN``.

    Registering both as sub-modules lets them be moved, frozen and iterated
    over as one model. ``forward`` runs a full encode / decode pass using the
    same calling conventions as the stand-alone training code in this file.
    """

    def __init__(self, encoder, decoder, device, MAX_LENGTH=MAX_LENGTH):
        super().__init__()

        # Registration order (encoder, then decoder) is the order in which
        # children() / named_children() yield them.
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        # Number of slots in the encoder-output buffer the decoder attends over.
        self.max_length = MAX_LENGTH

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Encode ``source`` and decode ``target.shape[0]`` steps.

        Args:
            source: tensor of source token indices, indexed along dim 0; at
                most ``MAX_LENGTH`` tokens.
            target: tensor of target token indices with at least two
                dimensions; dim 0 is the step and dim 1 the batch. The encoder
                and decoder reshape every step with ``view(1, 1, -1)``, so
                the batch dimension is expected to be 1.
            teacher_forcing_ratio: per-step probability of feeding the
                reference token, instead of the model's own top prediction,
                as the next decoder input.

        Returns:
            Tensor of shape ``(target_length, batch_size, vocab_size)`` with
            the decoder's log-probabilities per step. Steps after an early
            stop (EOS predicted without teacher forcing) are left as zeros.
        """
        input_length = source.size(0)  # number of tokens in the source sentence
        batch_size = target.shape[1]
        target_length = target.shape[0]
        vocab_size = self.decoder.output_size

        # Holds the predicted distribution for every target step.
        outputs = torch.zeros(target_length, batch_size, vocab_size, device=self.device)

        # Encode the source token by token, keeping each step's output for attention.
        encoder_hidden = torch.zeros(1, 1, self.encoder.hidden_size, device=self.device)
        encoder_outputs = torch.zeros(self.max_length, self.encoder.hidden_size,
                                      device=self.device)
        for i in range(input_length):
            encoder_output, encoder_hidden = self.encoder(source[i], encoder_hidden)
            encoder_outputs[i] = encoder_output[0, 0]

        # The final encoder state seeds the decoder; decoding starts from SOS.
        decoder_hidden = encoder_hidden
        decoder_input = torch.tensor([[SOS_token]], device=self.device)

        for t in range(target_length):
            decoder_output, decoder_hidden, _attn = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            outputs[t] = decoder_output

            teacher_force = random.random() < teacher_forcing_ratio
            if teacher_force:
                decoder_input = target[t]
            else:
                # Feed back the most likely token, detached from history.
                decoder_input = decoder_output.topk(1)[1].squeeze().detach()
                if decoder_input.item() == EOS_token:
                    break

        return outputs

In [None]:
# Wrap the trained base encoder / decoder in one module so they can be handled together.
model = Seq2Seq(encoder1, attn_decoder1, device).to(device)

In [None]:
# List the direct sub-modules of the wrapper by attribute name.
# named_children() yields (name, module) pairs, so `params` is bound to a module here.
for name, params in model.named_children():
  print(name)

encoder
decoder


In [None]:
# Freeze every parameter of the wrapped base model (encoder and decoder alike).
for param in model.parameters():
    param.requires_grad_(False)

# The wrapper registers its encoder first and its decoder second, so these are
# the same modules that list(model.children()) yields at positions 0 and 1.
trained_encoder = model.encoder
trained_decoder = model.decoder

In [None]:
# Fresh, randomly initialised encoder paired with the already-trained decoder.
# attn_decoder2 is the same object as trained_decoder (no copy is made).
# NOTE(review): presumably the decoder is still frozen at this point, so only
# encoder2 is updated during this run — confirm against the preceding cell.
encoder2 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder2 = trained_decoder


# 15000 single-pair SGD steps, with a progress line every 1000 steps.
trainIters(encoder2, attn_decoder2, 15000, print_every=1000)

1m 8s (- 16m 5s) (1000 6%) 5.1840
2m 17s (- 14m 56s) (2000 13%) 5.0214
3m 27s (- 13m 50s) (3000 20%) 5.0823
4m 36s (- 12m 40s) (4000 26%) 5.0752
5m 44s (- 11m 28s) (5000 33%) 5.0554
6m 51s (- 10m 17s) (6000 40%) 4.9794
7m 59s (- 9m 8s) (7000 46%) 5.0815
9m 8s (- 7m 59s) (8000 53%) 5.0330
10m 16s (- 6m 51s) (9000 60%) 5.0714
11m 26s (- 5m 43s) (10000 66%) 4.9687
12m 35s (- 4m 34s) (11000 73%) 5.0261
13m 45s (- 3m 26s) (12000 80%) 5.0319
14m 55s (- 2m 17s) (13000 86%) 4.9985
16m 4s (- 1m 8s) (14000 93%) 5.0323
17m 14s (- 0m 0s) (15000 100%) 5.0114


In [None]:
# Wrap the new encoder and the reused decoder, then make every parameter
# trainable again so the whole model is fine-tuned jointly.
new_model = Seq2Seq(encoder2, attn_decoder2, device).to(device)

for param in new_model.parameters():
    param.requires_grad_(True)

# Same modules that list(new_model.children()) yields at positions 0 and 1.
trained_encoder = new_model.encoder
trained_decoder = new_model.decoder


trainIters(trained_encoder, trained_decoder, 15000, print_every=1000)

1m 34s (- 22m 1s) (1000 6%) 5.0558
3m 3s (- 19m 50s) (2000 13%) 4.8134
4m 35s (- 18m 21s) (3000 20%) 4.8986
6m 6s (- 16m 46s) (4000 26%) 4.8820
7m 37s (- 15m 15s) (5000 33%) 4.8448
9m 9s (- 13m 44s) (6000 40%) 4.9377
10m 40s (- 12m 11s) (7000 46%) 4.8153
12m 10s (- 10m 39s) (8000 53%) 4.9169
13m 42s (- 9m 8s) (9000 60%) 4.8525
15m 17s (- 7m 38s) (10000 66%) 4.8857
16m 49s (- 6m 6s) (11000 73%) 4.7492
18m 19s (- 4m 34s) (12000 80%) 4.8747
19m 52s (- 3m 3s) (13000 86%) 4.8659
21m 24s (- 1m 31s) (14000 93%) 4.8442
22m 58s (- 0m 0s) (15000 100%) 4.8133


In [None]:
def BLEU_score(encoder, decoder, n = len(test_pairs)):
    """Average sentence-level BLEU over ``n`` randomly drawn test pairs.

    Pairs are drawn from ``test_pairs`` with replacement, so a pair can be
    scored more than once and some may not be scored at all. The default
    ``n`` is the size of ``test_pairs`` at the time this function is defined.

    The ``'<EOS>'`` marker that ``evaluate`` appends to its output is removed
    before scoring: it never occurs in the reference, so leaving it in would
    count as a wrong token and lengthen the hypothesis. A hypothesis that is
    empty after removal scores 0.

    Args:
        encoder: encoder passed through to ``evaluate``.
        decoder: decoder passed through to ``evaluate``.
        n: number of samples to score.

    Returns:
        The mean of the per-sentence scores, or ``0.0`` when ``n <= 0``.
    """
    if n <= 0:
        return 0.0

    score = 0
    for _ in range(n):
        pair = random.choice(test_pairs)
        reference = [pair[1].split(' ')]
        output_words, attentions = evaluate(encoder, decoder, pair[0])

        # Only the final element can be the end-of-sentence marker.
        if output_words and output_words[-1] == '<EOS>':
            candidate = output_words[:-1]
        else:
            candidate = list(output_words)

        if candidate:
            score = score + sentence_bleu(reference, candidate)

    avg_score = score/n

    return(avg_score)

In [None]:
# Score the base model (encoder1 + attn_decoder1) on random draws from test_pairs.
base_score = BLEU_score(encoder1,attn_decoder1)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [None]:
# Score the fine-tuned encoder / decoder on a separate set of random draws from test_pairs.
tuned_score = BLEU_score(trained_encoder,trained_decoder)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [None]:
# Summary of the split sizes and of both scores, scaled by 100 for display.
# NOTE(review): BLEU_score draws its test pairs at random on every call, so the
# two scores below were computed on different samples — keep that in mind when
# comparing them.
print("Number of Training Pairs : ", len(train_pairs))
print("Number of Validation Pairs : ", len(val_pairs))
print("Number of Test Pairs : ", len(test_pairs))
print("Base Model Score : ", base_score*100)
print("Tuned Model Score : ", tuned_score*100)

Number of Training Pairs :  21340
Number of Validation Pairs :  6097
Number of Test Pairs :  3049
Base Model Score :  36.34311005628223
Tuned Model Score :  37.338164343879335
