Requirements

In [None]:
!pip install inltk

from __future__ import unicode_literals, print_function, division
from io import open
import inltk
from inltk.inltk import tokenize
from inltk.inltk import setup
import unicodedata
import string
import re
import pandas as pd
import random
import os
import csv
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from nltk.translate.bleu_score import sentence_bleu
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Mount Google Drive (Colab only) so the corpus files under /content/drive are readable.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Reserved vocabulary indices: 0 marks start-of-sentence, 1 marks end-of-sentence
# (they match the entries pre-filled in Lang.index2word).
SOS_token = 0
EOS_token = 1
# Length cap: filterPair keeps sentences with fewer than this many space-separated
# words, and it sizes the encoder-output buffer and the attention layer.
MAX_LENGTH = 200

#initialize Lang Class
class Lang:
    """Vocabulary for one language: word <-> index maps plus word counts.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers, so the
    first real word receives index 2.
    """

    def __init__(self):
        # Forward map, frequency table and reverse map of the vocabulary.
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS are already counted

    def addSentence(self, sentence):
        """Register every token of `sentence`, splitting on single spaces."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count `word`, assigning it the next free index the first time it is seen."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [None]:
def process_data(source_path='/content/drive/My Drive/data/bible-uedin.hi-mr.hi',
                 target_path='/content/drive/My Drive/data/bible-uedin.hi-mr.mr'):
  """Load the line-aligned parallel corpus and build both vocabularies.

  Line i of `source_path` is paired with line i of `target_path`. Every line
  is stripped of surrounding whitespace and added to the vocabulary of its
  language (including any surplus target lines beyond the last source line).

  Args:
    source_path: Text file holding one source-language sentence per line.
    target_path: Text file holding one target-language sentence per line.

  Returns:
    A tuple `(source, target, pairs)` where `source` and `target` are `Lang`
    vocabularies and `pairs` is a list of `[source_sentence, target_sentence]`
    lists, one per source line.

  Raises:
    ValueError: If the target file has fewer lines than the source file, so
      some source sentences would have no translation.
  """

  def _read_sentences(path):
    # The corpus is Devanagari text: decode explicitly as UTF-8 rather than
    # relying on the platform default, and close the handle deterministically.
    with open(path, "r", encoding="utf-8") as handle:
      return [line.strip() for line in handle]

  source_sentences = _read_sentences(source_path)
  target_sentences = _read_sentences(target_path)

  source = Lang()
  for sent in source_sentences:
    source.addSentence(sent)

  target = Lang()
  for sent in target_sentences:
    target.addSentence(sent)

  if len(target_sentences) < len(source_sentences):
    raise ValueError(
        "Parallel corpus is misaligned: %d source lines but only %d target lines"
        % (len(source_sentences), len(target_sentences)))

  # zip stops at the last source line, so surplus target lines are ignored.
  pairs_new = [[src, tgt] for src, tgt in zip(source_sentences, target_sentences)]

  return source, target, pairs_new

In [None]:
# source, target, pairs = process_data()  # Sanity check (disabled): used to verify that both datasets contain the same number of sentences.

In [None]:
def indexesFromSentence(lang, sentence):
    """Map each single-space-separated token of `sentence` to its index in `lang`.

    Raises KeyError when a token is missing from `lang.word2index`.
    """
    vocab = lang.word2index
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(vocab[token])
    return token_ids


def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a column tensor of word indices ending with EOS.

    The result has shape (number_of_tokens + 1, 1), dtype long, and lives on
    the module-level `device`.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a `[source, target]` sentence pair into `(input_tensor, target_tensor)`.

    The source side is encoded with `input_lang`, the target side with
    `output_lang`.
    """
    source_sentence = pair[0]
    target_sentence = pair[1]
    return (
        tensorFromSentence(input_lang, source_sentence),
        tensorFromSentence(output_lang, target_sentence),
    )

In [None]:
def filterPair(p):
    """Return True when both sentences of pair `p` have fewer than MAX_LENGTH words."""
    # Words are counted by splitting on single spaces, as the vocabulary does.
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    """Keep only the pairs accepted by `filterPair`, preserving their order."""
    return list(filter(filterPair, pairs))


Model

In [None]:
class EncoderRNN(nn.Module):
    """Single-layer GRU encoder that consumes one token index per call."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals the GRU width, so embeddings feed the GRU directly.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed `input`, run one GRU step and return `(output, hidden)`."""
        # Reshape the embedding to (1, 1, hidden_size) before the GRU step.
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
        return torch.zeros(1, 1, self.hidden_size, device=device)


class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed-length buffer of encoder outputs.

    Each call decodes one token: attention weights over `max_length` encoder
    positions are computed from the current token embedding and the hidden
    state, then the weighted encoder context is combined with the embedding
    and fed to the GRU.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Layers are created in a fixed order because their weights are drawn
        # from the global random generator.
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(2 * hidden_size, max_length)
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Returns `(log_probs, hidden, attn_weights)`: log-probabilities over
        the output vocabulary, the updated hidden state, and the attention
        weights over the `max_length` encoder positions.
        """
        token_vec = self.dropout(self.embedding(input).view(1, 1, -1))

        # Score every encoder position from the token embedding and hidden state.
        attn_scores = self.attn(torch.cat((token_vec[0], hidden[0]), 1))
        attn_weights = F.softmax(attn_scores, dim=1)
        # Weighted sum of the encoder outputs (the attention context).
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        combined = torch.cat((token_vec[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))
        gru_output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
        return torch.zeros(1, 1, self.hidden_size, device=device)


Training the model

In [None]:
# Probability that a training step feeds the ground-truth tokens (rather than the
# decoder's own predictions) as the next decoder inputs.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair.

    The source sentence is encoded token by token, then the decoder is
    unrolled over the target. With probability `teacher_forcing_ratio` the
    whole step uses teacher forcing (ground-truth tokens as decoder inputs);
    otherwise the decoder consumes its own greedy predictions and stops early
    once it predicts EOS.

    Returns the summed loss divided by the target length.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Fixed-size buffer: rows beyond the sentence length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for pos in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[pos], encoder_hidden)
        encoder_outputs[pos] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # One draw decides the feeding strategy for the whole sentence.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for step in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[step])

        if use_teacher_forcing:
            # Teacher forcing: the ground-truth token is the next input.
            decoder_input = target_tensor[step]
            continue

        # Free running: feed back the greedy prediction, detached from the graph.
        decoder_input = decoder_output.topk(1)[1].squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return '<elapsed> (- <remaining>)' for a job started at `since`.

    `percent` is the completed fraction of the job; the remaining time is
    extrapolated linearly from the elapsed time.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))


In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train `encoder` and `decoder` for `n_iters` single-pair SGD steps.

    Training pairs are drawn with replacement from `train_pairs`. Every
    `print_every` steps the elapsed/remaining time, progress and the average
    loss since the previous report are printed. `plot_every` is accepted but
    currently unused (loss plotting is disabled).
    """
    start = time.time()
    running_loss = 0  # reset after every progress report

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # All pairs are sampled and converted to tensors before the first step.
    training_pairs = [tensorsFromPair(random.choice(train_pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        running_loss += train(input_tensor, target_tensor, encoder,
                              decoder, encoder_optimizer, decoder_optimizer, criterion)

        if step % print_every != 0:
            continue

        average_loss = running_loss / print_every
        running_loss = 0
        print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                     step, step / n_iters * 100, average_loss))

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate `sentence` and return `(decoded_words, attentions)`.

    Decoding stops at the first predicted EOS (appended as '<EOS>') or after
    `max_length` steps. `attentions` holds one row of attention weights per
    decoding step that was executed.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        for pos in range(input_tensor.size()[0]):
            encoder_output, encoder_hidden = encoder(input_tensor[pos],
                                                     encoder_hidden)
            encoder_outputs[pos] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data

            # Greedy choice: the single most likely token.
            best = decoder_output.data.topk(1)[1]
            token_id = best.item()
            if token_id == EOS_token:
                decoded_words.append('<EOS>')
                break

            decoded_words.append(output_lang.index2word[token_id])
            decoder_input = best.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]


def evaluateRandomly(encoder, decoder, n=10):
    """Print source ('>'), reference ('=') and prediction ('<') for `n` random validation pairs."""
    for _ in range(n):
        sample = random.choice(val_pairs)
        print('>', sample[0])
        print('=', sample[1])
        predicted_words = evaluate(encoder, decoder, sample[0])[0]
        print('<', ' '.join(predicted_words))
        print('')

In [None]:
# Build both vocabularies and the [source, target] sentence pairs, then drop
# every pair in which either sentence reaches MAX_LENGTH words.
input_lang, output_lang, pairs = process_data()
pairs = filterPairs(pairs)

# Spot-check one random pair.
print(random.choice(pairs))

print(len(pairs))
x = len(pairs)

# 70% train / 20% validation / remainder test, sliced by position in the list.
# NOTE(review): the pairs are not shuffled before slicing, so each split is a
# contiguous region of the corpus — confirm this is intended.
trainSplit = int(x*0.7)
valSplit = trainSplit + int(x*0.2)

train_pairs = pairs[:trainSplit]
val_pairs = pairs[trainSplit:valSplit]
test_pairs = pairs[valSplit:]

print(len(train_pairs))
print(len(val_pairs))
print(len(test_pairs))

['और जब से नित्य होमबलि उठाई जाएगी, और वह घिनौनी वस्तु जो उजाड़ करा देती है, स्थापित की जाएगी, तब से बारह सौ नब्बे दिन बीतेंगे।', '“‘नित्याची होमार्पणे करणे बंद होईल. ह्या वेळेपासून ती भयंकर नाश करणारी गोष्टी घेडेपर्यंतचा काळ हा 1290 दिवसांचा असेल.']
30381
21266
6076
3039


In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot a sequence of loss values with y-axis ticks every 0.2.

    Args:
        points: Sequence of numbers, plotted against their position.
    """
    # plt.subplots() already creates the figure; a preceding plt.figure()
    # would leave an extra, empty figure open on every call.
    fig, ax = plt.subplots()
    # This locator puts ticks at regular intervals.
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)


In [None]:
# Width shared by the embeddings and the GRU hidden states of both networks.
hidden_size = 256

# Vocabulary sizes come from the Lang objects returned by process_data().
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# Train for 30,000 single-pair steps, printing the average loss every 1,000 steps.
trainIters(encoder1, attn_decoder1, 30000, print_every=1000)

2m 43s (- 78m 53s) (1000 3%) 6.5134
5m 26s (- 76m 7s) (2000 6%) 6.5417
8m 5s (- 72m 49s) (3000 10%) 6.3418
10m 52s (- 70m 41s) (4000 13%) 6.4789
13m 36s (- 68m 1s) (5000 16%) 6.3213
16m 18s (- 65m 12s) (6000 20%) 6.2228
18m 59s (- 62m 25s) (7000 23%) 6.2667
21m 45s (- 59m 49s) (8000 26%) 6.3205
24m 33s (- 57m 18s) (9000 30%) 6.3930
27m 15s (- 54m 31s) (10000 33%) 6.1993
30m 4s (- 51m 56s) (11000 36%) 6.3785
32m 51s (- 49m 17s) (12000 40%) 6.3317
35m 40s (- 46m 39s) (13000 43%) 6.2055
38m 30s (- 44m 0s) (14000 46%) 6.2430
41m 22s (- 41m 22s) (15000 50%) 6.2570
44m 20s (- 38m 47s) (16000 53%) 6.2254
47m 11s (- 36m 5s) (17000 56%) 6.3019
50m 7s (- 33m 25s) (18000 60%) 6.3244
53m 0s (- 30m 41s) (19000 63%) 6.2093
55m 55s (- 27m 57s) (20000 66%) 6.2274
58m 53s (- 25m 14s) (21000 70%) 6.3411
61m 51s (- 22m 29s) (22000 73%) 6.2953
64m 46s (- 19m 42s) (23000 76%) 6.1514
67m 41s (- 16m 55s) (24000 80%) 6.2219
70m 37s (- 14m 7s) (25000 83%) 6.2149
73m 33s (- 11m 18s) (26000 86%) 6.1064
76m 33s (

In [None]:
# Qualitative check: print predictions for random validation pairs (default n=10).
evaluateRandomly(encoder1, attn_decoder1)

> नीकुदेमुस ने, (जो पहिले उसके पास आया था और उन में से एक था), उन से कहा।
= परंतु त्या घोळक्यात निकदेम हजार होता. यापूर्वी हा निकदेमच येशूला भेटायला आला होता निकदेम म्हणाला.
< मग तो सर्व व व तो व <EOS>

> इतने में जब हजारों की भीड़ लग गई, यहां तक कि एक दूसरे पर गिरे पड़ते थे, तो वह सब से पहिले अपने चेलों से कहने लगा, कि फरीसियों के कपटरूपी खमीर से चौकस रहना।
= आणि म्हणून हजारो लोकांचा समुदाय जमला होता. इतके लोक जमले होते की, ते एकमेकांना तुडवू लागले, तेव्हा येशू प्रथम आपल्या शिष्यांशी बोलला: “परुश्यांच्या खमिराविषयी जपा, म्हणजे जे ढोंग आहे त्याविषयी जपा.
< पण या सर्व सर्व सर्व या सर्व सर्व लोक या सर्व त्यांना ते ते <EOS>

> और मैं उन्हें अनन्त जीवन देता हूं, और वे कभी नाश नहीं होंगी, और कोई उन्हें मेरे हाथ से छीन न लेगा।
= मी माझ्या मेंढरांना अनंतकाळचे जीवन देतो. ती कधीच मरणार नाहीत. आणि त्यांना कोणीच माझ्या हातून हिरावून घेणार नाही.
< मी मी मी आहे. मी मी मी मी मी मी मी मी मी मी मी मी मी मी मी मी मी त्यांना मी मी मी <EOS>

> यीशु यरूशलेम को जाते हुए बारह चेलों को एकान्त में ले गया, और 

In [None]:
class Seq2Seq(nn.Module):
    """Container that pairs an EncoderRNN with an AttnDecoderRNN.

    Both sub-modules process one token at a time with batch size 1, so
    `forward` expects tensors shaped like the output of `tensorFromSentence`:
    `(sequence_length, 1)`.

    Args:
        encoder: Module called as `encoder(token, hidden) -> (output, hidden)`
            and exposing `hidden_size`.
        decoder: Module called as
            `decoder(token, hidden, encoder_outputs) -> (log_probs, hidden, attn)`
            and exposing `output_size`.
        device: Device on which the work tensors are allocated.
        MAX_LENGTH: Number of rows in the encoder-output buffer; it must match
            the `max_length` the decoder's attention layer was built with.
    """

    def __init__(self, encoder, decoder, device, MAX_LENGTH=MAX_LENGTH):
        super().__init__()

        # Registration order matters: children() yields the encoder first and
        # the decoder second, and callers index that list positionally.
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.max_length = MAX_LENGTH

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Encode `source` and decode step by step against `target`.

        Args:
            source: Long tensor of shape (input_length, 1) holding source indices.
            target: Long tensor of shape (target_length, 1) holding target indices.
            teacher_forcing_ratio: Per-step probability of feeding the
                ground-truth token instead of the decoder's own prediction.

        Returns:
            Tensor of shape (target_length, batch_size, vocab_size) with the
            decoder's log-probabilities per step. Rows after an early stop
            (a free-running step that predicted EOS) are left as zeros.
        """
        input_length = source.size(0)  # number of tokens in the source sentence
        target_length = target.shape[0]
        batch_size = target.shape[1]
        vocab_size = self.decoder.output_size

        # Holds the predicted distribution for every target position.
        outputs = torch.zeros(target_length, batch_size, vocab_size, device=self.device)

        # Encode the source one token at a time, threading the hidden state
        # through and keeping each output for the decoder's attention.
        encoder_hidden = torch.zeros(1, 1, self.encoder.hidden_size, device=self.device)
        encoder_outputs = torch.zeros(self.max_length, self.encoder.hidden_size,
                                      device=self.device)
        for i in range(input_length):
            encoder_output, encoder_hidden = self.encoder(source[i], encoder_hidden)
            encoder_outputs[i] = encoder_output[0, 0]

        # The encoder's final hidden state seeds the decoder.
        decoder_hidden = encoder_hidden

        # Decoding starts from the start-of-sentence token.
        decoder_input = torch.tensor([[SOS_token]], device=self.device)

        for t in range(target_length):
            decoder_output, decoder_hidden, _ = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            outputs[t] = decoder_output

            if random.random() < teacher_forcing_ratio:
                # Teacher forcing: the ground-truth token is the next input.
                decoder_input = target[t]
                continue

            # Free running: feed back the most likely token, detached from the graph.
            decoder_input = decoder_output.topk(1)[1].squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

        return outputs

In [None]:
# Wrap the trained encoder and decoder in one module so their parameters can be handled together.
model = Seq2Seq(encoder1, attn_decoder1, device).to(device)

In [None]:
# List the registered sub-modules of the wrapper; named_children() yields
# (name, module) pairs, so `params` here is the sub-module itself.
for name, params in model.named_children():
  print(name)

encoder
decoder


In [None]:
# Freeze every parameter of the wrapped encoder and decoder.
for param in model.parameters():    
    param.requires_grad = False

# children() yields sub-modules in registration order: encoder first, decoder second.
# These are the same objects that were passed to Seq2Seq, not copies.
trained_encoder = list(model.children())[0]
trained_decoder = list(model.children())[1] 

In [None]:
import copy

# Fresh, untrained encoder to be fitted against the frozen decoder.
encoder2 = EncoderRNN(input_lang.n_words, hidden_size).to(device)

# Work on an independent copy of the decoder. Plain assignment would make
# attn_decoder2 the very same object as attn_decoder1, so the later joint
# fine-tuning pass would overwrite the base model's decoder weights before the
# base BLEU score is computed. deepcopy keeps the weights and their
# requires_grad=False state.
attn_decoder2 = copy.deepcopy(trained_decoder)

trainIters(encoder2, attn_decoder2, 15000, print_every=1000)

1m 41s (- 23m 46s) (1000 6%) 6.3019
3m 18s (- 21m 30s) (2000 13%) 6.2118
4m 56s (- 19m 45s) (3000 20%) 6.2099
6m 35s (- 18m 7s) (4000 26%) 6.2319
8m 15s (- 16m 31s) (5000 33%) 6.2526
9m 55s (- 14m 53s) (6000 40%) 6.2512
11m 34s (- 13m 13s) (7000 46%) 6.1437
13m 14s (- 11m 35s) (8000 53%) 6.1928
14m 55s (- 9m 56s) (9000 60%) 6.2518
16m 33s (- 8m 16s) (10000 66%) 6.1545
18m 15s (- 6m 38s) (11000 73%) 6.2133
19m 53s (- 4m 58s) (12000 80%) 6.1837
21m 33s (- 3m 19s) (13000 86%) 6.2059
23m 13s (- 1m 39s) (14000 93%) 5.9653
24m 52s (- 0m 0s) (15000 100%) 6.1422


In [None]:
# Wrap the new encoder and the decoder, then unfreeze all of their parameters
# for joint fine-tuning.
new_model = Seq2Seq(encoder2, attn_decoder2, device).to(device)

for param in new_model.parameters():    
    param.requires_grad = True

# Rebind trained_encoder / trained_decoder to the sub-modules of new_model
# (encoder first, decoder second).
# NOTE(review): if attn_decoder2 is the same object as attn_decoder1, the
# training below also changes attn_decoder1 — confirm before comparing
# against the base model.
trained_encoder = list(new_model.children())[0]
trained_decoder = list(new_model.children())[1] 

trainIters(trained_encoder, trained_decoder, 15000, print_every=1000)

3m 0s (- 42m 3s) (1000 6%) 6.1017
5m 58s (- 38m 52s) (2000 13%) 6.0436
8m 55s (- 35m 42s) (3000 20%) 6.0422
11m 51s (- 32m 36s) (4000 26%) 6.1170
14m 50s (- 29m 40s) (5000 33%) 5.9608
17m 40s (- 26m 31s) (6000 40%) 6.0040
20m 36s (- 23m 33s) (7000 46%) 6.0285
23m 34s (- 20m 37s) (8000 53%) 6.0565
26m 31s (- 17m 40s) (9000 60%) 6.0269
29m 29s (- 14m 44s) (10000 66%) 5.9615
32m 27s (- 11m 48s) (11000 73%) 5.9305
35m 23s (- 8m 50s) (12000 80%) 5.9502
38m 19s (- 5m 53s) (13000 86%) 5.9157
41m 13s (- 2m 56s) (14000 93%) 5.9355
44m 6s (- 0m 0s) (15000 100%) 5.8693


In [None]:
def BLEU_score(encoder, decoder, n=None):
  """Average sentence-level BLEU of a model over the first `n` test pairs.

  Each test pair is scored at most once and in a fixed order, so two models
  evaluated with the same `n` are scored on exactly the same sentences.

  Args:
    encoder: Encoder passed to `evaluate`.
    decoder: Decoder passed to `evaluate`.
    n: Number of test pairs to score. `None` (the default) scores all of
      `test_pairs` as they are at call time; larger values are capped at
      `len(test_pairs)`.

  Returns:
    The mean `sentence_bleu` score as a float, or 0.0 when no pairs are scored.

  NOTE(review): no smoothing function is applied, so sentences without
  higher-order n-gram overlap trigger NLTK warnings — confirm whether
  smoothing is wanted.
  """
  if n is None:
    n = len(test_pairs)
  eval_pairs = test_pairs[:max(0, min(n, len(test_pairs)))]
  if not eval_pairs:
    return 0.0

  score = 0
  for pair in eval_pairs:
    reference = [pair[1].split(' ')]
    output_words, _ = evaluate(encoder, decoder, pair[0])
    # The '<EOS>' marker is decoding bookkeeping, not part of the translation;
    # scoring it would count as a token that never matches the reference.
    candidate = [word for word in output_words if word != '<EOS>']
    # An empty translation scores 0 and is not passed to sentence_bleu.
    if candidate:
      score = score + sentence_bleu(reference, candidate)

  return score / len(eval_pairs)

In [None]:
# BLEU of the initial model pair (encoder1 + attn_decoder1).
base_score = BLEU_score(encoder1,attn_decoder1)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [None]:
# BLEU of the pair currently bound to trained_encoder / trained_decoder.
tuned_score = BLEU_score(trained_encoder,trained_decoder)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [None]:
# Summary: split sizes, and both BLEU scores scaled by 100.
print("Number of Training Pairs : ", len(train_pairs))
print("Number of Validation Pairs : ", len(val_pairs))
print("Number of Test Pairs : ", len(test_pairs))
print("Base Model Score : ", base_score*100)
print("Tuned Model Score : ", tuned_score*100)

Number of Training Pairs :  21266
Number of Validation Pairs :  6076
Number of Test Pairs :  3039
Base Model Score :  22.28613835060829
Tuned Model Score :  23.780408813884655
