Requirements

In [None]:
!pip install inltk

from __future__ import unicode_literals, print_function, division
from io import open
import inltk
from inltk.inltk import tokenize
from inltk.inltk import setup
import unicodedata
import string
import re
import pandas as pd
import random
import os
import csv
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from nltk.translate.bleu_score import sentence_bleu
# Run every tensor and model on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Colab-only: mount Google Drive so the corpus files under /content/drive can be opened.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Reserved vocabulary indices; they match the 0: "SOS" and 1: "EOS" entries
# that Lang seeds its index2word table with.
SOS_token = 0
EOS_token = 1
# Sentence-length cap, in single-space-separated tokens. filterPair keeps only
# pairs strictly shorter than this, and it is the default width of the
# attention layer and of the encoder-output buffers.
MAX_LENGTH = 200

#initialize Lang Class
class Lang:
    """Vocabulary of one language: word <-> index tables plus word frequencies."""

    def __init__(self):
        # Indices 0 and 1 are taken by the SOS / EOS markers, so the first
        # real word is assigned index 2.
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every token of `sentence`, splitting on single spaces."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count `word`; a word seen for the first time gets the next free index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [None]:
def process_data(source_path='/content/drive/My Drive/data/bible-uedin.hi-mr.mr',
                 target_path='/content/drive/My Drive/data/bible-uedin.hi-mr.hi'):
    """Load a line-aligned parallel corpus and build both vocabularies.

    Args:
        source_path: text file with one source sentence per line.
        target_path: text file with one target sentence per line; line i is
            paired with line i of `source_path`.

    Returns:
        (source, target, pairs): the source `Lang`, the target `Lang`, and a
        list of `[source_sentence, target_sentence]` lists, one per source line.

    Raises:
        ValueError: if the target file has fewer lines than the source file,
            so that some source sentences would have no counterpart.
    """
    # Read both files inside `with` blocks so the handles are always closed,
    # and decode explicitly as UTF-8 instead of relying on the platform default.
    with open(source_path, "r", encoding="utf-8") as sf:
        source_sentences = [line.strip() for line in sf]
    with open(target_path, "r", encoding="utf-8") as tf:
        target_sentences = [line.strip() for line in tf]

    source = Lang()
    target = Lang()
    for sent in source_sentences:
        source.addSentence(sent)
    # Every target line contributes to the vocabulary, including any lines
    # beyond the end of the source file.
    for sent in target_sentences:
        target.addSentence(sent)

    if len(target_sentences) < len(source_sentences):
        raise ValueError(
            "target corpus has %d lines but source corpus has %d; "
            "the files must be line-aligned"
            % (len(target_sentences), len(source_sentences)))

    # zip stops at the end of the source list, so surplus target lines are ignored.
    pairs_new = [[src, tgt] for src, tgt in zip(source_sentences, target_sentences)]

    return source, target, pairs_new

In [None]:
# source, target, pairs = process_data()   # Sanity check: confirm that both corpus files contain the same number of sentences.

In [None]:
def indexesFromSentence(lang, sentence):
    """Translate `sentence` into vocabulary indices.

    The sentence is split on single spaces and every token is looked up in
    `lang.word2index`; a token missing from the table raises KeyError.
    """
    lookup = lang.word2index
    indices = []
    for token in sentence.split(' '):
        indices.append(lookup[token])
    return indices


def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a column tensor of token indices ending in EOS.

    Returns a torch.long tensor on `device`, reshaped to (-1, 1), i.e. one
    row per token plus one final row holding EOS_token.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Encode a (source, target) sentence pair as a tuple of index tensors.

    `pair[0]` is encoded with the global `input_lang` vocabulary and
    `pair[1]` with the global `output_lang` vocabulary.
    """
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

In [None]:
def filterPair(p):
    """Return True when both sentences of `p` have fewer than MAX_LENGTH tokens.

    Tokens are counted by splitting on single spaces; the second sentence is
    only inspected when the first one passes.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    """Return a new list holding, in order, the pairs accepted by filterPair."""
    return list(filter(filterPair, pairs))


Model

In [None]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one embedded token per call.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        hidden_size: embedding width, also used as the GRU hidden width.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed `input`, reshape it to (1, 1, -1) and advance the GRU one step.

        Returns the `(output, hidden)` tuple produced by the GRU.
        """
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
        return torch.zeros((1, 1, self.hidden_size), device=device)


class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed-length buffer of encoder outputs.

    Args:
        hidden_size: embedding width and GRU hidden width.
        output_size: target vocabulary size (embedding rows and output logits).
        dropout_p: dropout probability applied to the embedded input token.
        max_length: number of attention weights produced per step; the
            `encoder_outputs` passed to `forward` must have this many rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Attention scores are computed from [embedded token ; previous hidden].
        self.attn = nn.Linear(hidden_size * 2, max_length)
        # Merges [embedded token ; attention context] back to hidden_size.
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Returns `(log_probs, hidden, attn_weights)`: log-softmax scores over
        the output vocabulary, the updated GRU hidden state, and the softmax
        attention weights over the rows of `encoder_outputs`.
        """
        token = self.dropout(self.embedding(input).view(1, 1, -1))

        scores = self.attn(torch.cat((token[0], hidden[0]), 1))
        attn_weights = F.softmax(scores, dim=1)
        # Weighted sum of the encoder outputs (batched matmul with batch of 1).
        context = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        merged = torch.cat((token[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(merged).unsqueeze(0))
        gru_output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
        return torch.zeros((1, 1, self.hidden_size), device=device)


Training the model

In [None]:
# Probability that a training example is decoded with teacher forcing.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair.

    Args:
        input_tensor: source token indices; its first dimension indexes tokens.
        target_tensor: target token indices; its first dimension indexes tokens.
        encoder: network called as `encoder(token, hidden)`, exposing
            `initHidden()` and `hidden_size`.
        decoder: network called as `decoder(token, hidden, encoder_outputs)`
            and returning `(output, hidden, attention)`.
        encoder_optimizer: optimizer stepping the encoder parameters.
        decoder_optimizer: optimizer stepping the decoder parameters.
        criterion: loss applied to each decoder output and its target token.
        max_length: number of rows of the encoder-output buffer; the source
            sentence must not contain more tokens than this.

    Returns:
        The loss averaged over the decoding steps that were actually executed.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Rows past input_length stay zero and act as padding for the attention.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # Decided once per example: feed the ground truth (teacher forcing) or the
    # decoder's own prediction as the next input.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    steps = 0  # number of decoder steps that contributed to `loss`
    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])
        steps += 1

        if use_teacher_forcing:
            decoder_input = target_tensor[di]
        else:
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    # Normalise by the steps actually taken: free-running decoding can stop
    # early on EOS, and dividing by target_length would then under-report
    # the per-token loss.
    return loss.item() / steps

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s'.

    Both parts are rendered with %d, so fractional seconds are truncated.
    """
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Report elapsed time and the estimated time remaining.

    Args:
        since: start timestamp, as returned by time.time().
        percent: fraction of the work completed so far; must be non-zero.

    Returns:
        '<elapsed> (- <remaining>)', both parts formatted by asMinutes.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration linearly from the completed fraction.
    remaining = elapsed / percent - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))


In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train on `n_iters` randomly drawn pairs from the global `train_pairs`.

    Args:
        encoder: encoder network to optimise.
        decoder: decoder network to optimise.
        n_iters: number of single-pair training steps.
        print_every: print the running average loss every this many steps.
        plot_every: accepted for interface compatibility; loss plotting is
            currently disabled, so this value is not used.
        learning_rate: learning rate of both SGD optimizers.
    """
    started_at = time.time()
    running_loss = 0  # reset after every progress report

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # All examples are sampled (with replacement) and tensorised up front.
    training_pairs = [tensorsFromPair(random.choice(train_pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        running_loss += train(input_tensor, target_tensor, encoder,
                              decoder, encoder_optimizer, decoder_optimizer, criterion)

        if step % print_every != 0:
            continue

        average_loss = running_loss / print_every
        running_loss = 0
        print('%s (%d %d%%) %.4f' % (timeSince(started_at, step / n_iters),
                                     step, step / n_iters * 100, average_loss))

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate `sentence` with the given encoder and decoder.

    Both networks are switched to eval mode for the duration of the call, so
    that dropout is disabled and the translation is deterministic; their
    previous train/eval mode is restored before returning.

    Args:
        encoder: encoder network (see EncoderRNN).
        decoder: attention decoder network (see AttnDecoderRNN).
        sentence: source sentence; every token must be in `input_lang`.
        max_length: maximum number of decoded tokens and number of rows of
            the encoder-output buffer.

    Returns:
        (decoded_words, attentions): the predicted words, ending with
        '<EOS>' when the decoder emitted EOS within `max_length` steps, and
        the attention weights of the executed decoding steps, one row each.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            # Rows past input_length stay zero and act as padding.
            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                # Greedy decoding: keep only the most probable token.
                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang.index2word[topi.item()])

                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        # Hand the networks back in the mode the caller had them in.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)


def evaluateRandomly(encoder, decoder, n=10):
    """Print `n` random validation examples next to the model's translation.

    For each pair drawn from the global `val_pairs`, prints the source
    ('>'), the reference ('=') and the model output ('<'), then a blank line.
    """
    for _ in range(n):
        sample = random.choice(val_pairs)
        print('>', sample[0])
        print('=', sample[1])
        predicted_words, _attn = evaluate(encoder, decoder, sample[0])
        print('<', ' '.join(predicted_words))
        print('')

In [None]:
# Load the corpus, build both vocabularies, and drop pairs in which either
# sentence reaches MAX_LENGTH tokens.
input_lang, output_lang, pairs = process_data()
pairs = filterPairs(pairs)

# Show one random pair as a quick visual check of the data.
print(random.choice(pairs))

print(len(pairs))
x = len(pairs)

# 70 % train / 20 % validation / remainder test. The int() truncation means
# the test slice absorbs the rounding remainder.
# NOTE(review): the slices follow file order (no shuffle), so the three sets
# come from different regions of the corpus -- confirm this is intended.
trainSplit = int(x*0.7)
valSplit = trainSplit + int(x*0.2)

train_pairs = pairs[:trainSplit]
val_pairs = pairs[trainSplit:valSplit]
test_pairs = pairs[valSplit:]

print(len(train_pairs))
print(len(val_pairs))
print(len(test_pairs))

['“मग मी मागे फिरुन डोंगर उतरुन आलो. डोंगरावर आग धगधगत होती. माझ्या हातात आज्ञापटाच्या दोन पाट्या होत्या.', 'तब मैं उलटे पैर पर्वत से नीचे उतर चला, और मेरे दोनों हाथों में वाचा की दोनों पटियाएं थीं।']
30381
21266
6076
3039


In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot `points` as a line chart with y-axis ticks every 0.2.

    Only one figure is created per call: `plt.subplots()` already allocates
    the figure, so a preceding `plt.figure()` would just leave an extra
    empty figure open.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)


In [None]:
# Width shared by the embeddings and the GRU hidden states of both networks.
hidden_size = 256

# Vocabulary sizes come from the Lang objects returned by process_data().
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)


# Train the baseline model for 30000 single-pair steps, reporting the average
# loss every 1000 steps.
trainIters(encoder1, attn_decoder1, 30000, print_every=1000)

In [None]:
# Print 10 random validation pairs with the baseline model's translations.
evaluateRandomly(encoder1, attn_decoder1)

> आणि तेथे त्याला चमत्कार करता आला नाही. फक्त त्याने काही रोगी लोकांच्या डोक्यावर हात ठेवून त्यांना बरे केले. त्यांच्या अविश्वासमुळे त्याला आश्चर्य वाटले.
= और वह वहां कोई सामर्थ का काम न कर सका, केवल थोड़े बीमारों पर हाथ रखकर उन्हें चंगा किया।।
< और ने ने अपने के को और और और और और और और और और और और और और और और और और और और और और और के और <EOS>

> त्या प्रदेशातील 2 3 लोक जखमी होऊन मरतील. पण 1 3 वाचतील
= यहोवा की यह भी वाणी है, कि इस देश के सारे निवासियों की दो तिहाई मार डाली जाएगाी और बची हुई तिहाई उस में बनी रहेगी।
< और ने के से के और और और और और और और और और और और और और और <EOS>

> पाणी आणून साठवून ठेव. का? कारण शत्रुसैन्य शहराला घेराव घालणार आहे. ते कोणालाही अन्न-पाणी आत नेऊ देणार नाहीत. तुझ्या बचावाच्या जागा भक्कम कर. विटा तयार करण्यासाठी माती मिळव. चुना कालब! विटा बनविण्याचे साचे मिळव!
= घिर जाने के दिनों के लिये पानी भर ले, और गढ़ों को अधिक दृढ़ कर; कीचड ले आकर गारा लताड़, और भट्ठे को सजा!
< और ने अपने के कि यहोवा को और और और और और और और और और और और और न ने <EOS>

> नंतर येशूची आई 

In [None]:
class Seq2Seq(nn.Module):
    """Container that pairs an encoder with an attention decoder.

    The sub-modules are registered in the order encoder, decoder, so
    `list(model.children())` returns them in that order.

    Args:
        encoder: network called as `encoder(token, hidden)` and exposing
            `hidden_size` (see EncoderRNN).
        decoder: network called as `decoder(token, hidden, encoder_outputs)`,
            returning `(output, hidden, attention)` and exposing
            `output_size` (see AttnDecoderRNN).
        device: device on which the buffers created in `forward` are allocated.
        MAX_LENGTH: number of rows of the encoder-output buffer; it has to
            match the attention width the decoder was built with.
    """

    def __init__(self, encoder, decoder, device, MAX_LENGTH=MAX_LENGTH):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.max_length = MAX_LENGTH

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Encode `source` and decode step by step against `target`.

        The wrapped networks reshape every token to (1, 1, -1), so only a
        batch size of 1 is supported.

        Args:
            source: source token indices; the first dimension indexes tokens.
            target: target token indices of shape (target_length, batch_size).
            teacher_forcing_ratio: per-step probability of feeding the
                ground-truth token instead of the decoder's own prediction.

        Returns:
            Tensor of shape (target_length, batch_size, vocab_size) holding
            the decoder output of every executed step; steps skipped after
            an early EOS stay zero.
        """
        input_length = source.size(0)
        target_length = target.shape[0]
        batch_size = target.shape[1]
        vocab_size = self.decoder.output_size

        outputs = torch.zeros(target_length, batch_size, vocab_size, device=self.device)

        # Encode the source token by token, keeping every output for attention.
        encoder_hidden = torch.zeros(1, 1, self.encoder.hidden_size, device=self.device)
        encoder_outputs = torch.zeros(self.max_length, self.encoder.hidden_size,
                                      device=self.device)
        for i in range(input_length):
            encoder_output, encoder_hidden = self.encoder(source[i], encoder_hidden)
            encoder_outputs[i] = encoder_output[0, 0]

        # The decoder starts from the encoder's final hidden state and SOS.
        decoder_hidden = encoder_hidden
        decoder_input = torch.tensor([[SOS_token]], device=self.device)

        for t in range(target_length):
            decoder_output, decoder_hidden, _ = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            outputs[t] = decoder_output

            teacher_force = random.random() < teacher_forcing_ratio
            topv, topi = decoder_output.topk(1)
            # Feed back either the ground truth or the most probable token.
            decoder_input = target[t] if teacher_force else topi.squeeze().detach()
            if not teacher_force and decoder_input.item() == EOS_token:
                break

        return outputs

In [None]:
# Wrap the trained baseline encoder and decoder in one module so that their
# sub-modules and parameters can be handled together. The wrapper holds the
# same objects; nothing is copied.
model = Seq2Seq(encoder1, attn_decoder1, device).to(device)

In [None]:
# List the wrapper's direct sub-modules. named_children() yields
# (name, module) tuples, so `params` is bound to the sub-module itself.
for name, params in model.named_children():
  print(name)

encoder
decoder


In [None]:
# Freeze every parameter reachable from the wrapper, i.e. the parameters of
# encoder1 and attn_decoder1 themselves (the wrapper shares those objects).
for param in model.parameters():    
    param.requires_grad = False

# Children are registered in the order encoder, decoder.
trained_encoder = list(model.children())[0]
trained_decoder = list(model.children())[1] 

In [None]:
import copy

# Stage 1 of the transfer experiment: train a freshly initialised encoder
# against the frozen, already-trained decoder.
encoder2 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
# Work on a deep copy: assigning `trained_decoder` directly would alias
# attn_decoder1, so any later fine-tuning of attn_decoder2 would also modify
# the decoder that the base-model score is computed with.
attn_decoder2 = copy.deepcopy(trained_decoder)


trainIters(encoder2, attn_decoder2, 15000, print_every=1000)

1m 5s (- 15m 19s) (1000 6%) 5.4285
2m 8s (- 13m 52s) (2000 13%) 5.3133
3m 11s (- 12m 46s) (3000 20%) 5.2293
4m 16s (- 11m 45s) (4000 26%) 5.2555
5m 20s (- 10m 41s) (5000 33%) 5.3273
6m 24s (- 9m 37s) (6000 40%) 5.3144
7m 27s (- 8m 31s) (7000 46%) 5.1741
8m 31s (- 7m 27s) (8000 53%) 5.2159
9m 36s (- 6m 24s) (9000 60%) 5.2930
10m 40s (- 5m 20s) (10000 66%) 5.2572
11m 47s (- 4m 17s) (11000 73%) 5.3084
12m 51s (- 3m 12s) (12000 80%) 5.2149
13m 55s (- 2m 8s) (13000 86%) 5.2710
15m 1s (- 1m 4s) (14000 93%) 5.2597
16m 6s (- 0m 0s) (15000 100%) 5.3495


In [None]:
# Stage 2: wrap the new encoder with the decoder and fine-tune both.
new_model = Seq2Seq(encoder2, attn_decoder2, device).to(device)

# Unfreeze every parameter of both sub-modules.
# NOTE(review): if attn_decoder2 is the same object as attn_decoder1 (assigned
# by reference rather than copied), this fine-tuning also changes the decoder
# later used for the base-model score -- verify.
for param in new_model.parameters():    
    param.requires_grad = True

# Rebind the names to the sub-modules of the new wrapper (encoder2, attn_decoder2).
trained_encoder = list(new_model.children())[0]
trained_decoder = list(new_model.children())[1] 


trainIters(trained_encoder, trained_decoder, 15000, print_every=1000)

1m 20s (- 18m 48s) (1000 6%) 5.1133
2m 40s (- 17m 25s) (2000 13%) 5.0558
4m 4s (- 16m 17s) (3000 20%) 5.1302
5m 26s (- 14m 57s) (4000 26%) 5.1365
6m 45s (- 13m 31s) (5000 33%) 5.0183
8m 7s (- 12m 11s) (6000 40%) 5.0134
9m 27s (- 10m 48s) (7000 46%) 4.9788
10m 46s (- 9m 25s) (8000 53%) 4.9721
12m 8s (- 8m 5s) (9000 60%) 5.0726
13m 29s (- 6m 44s) (10000 66%) 4.9477
14m 50s (- 5m 23s) (11000 73%) 4.9929
16m 11s (- 4m 2s) (12000 80%) 4.9391
17m 32s (- 2m 41s) (13000 86%) 4.9778
18m 53s (- 1m 20s) (14000 93%) 4.9518
20m 15s (- 0m 0s) (15000 100%) 4.9490


In [None]:
def BLEU_score(encoder, decoder, n = len(test_pairs)):
    """Average sentence-level BLEU of the model over `n` test pairs.

    Args:
        encoder: encoder network passed to `evaluate`.
        decoder: decoder network passed to `evaluate`.
        n: number of pairs to score. Up to `len(test_pairs)` the pairs are
            drawn without replacement, so the default scores every test pair
            exactly once; a larger `n` falls back to drawing with replacement.

    Returns:
        The mean `sentence_bleu` score, or 0.0 when `n` is not positive.
    """
    if n <= 0:
        return 0.0

    if n <= len(test_pairs):
        sample = random.sample(test_pairs, n)
    else:
        sample = [random.choice(test_pairs) for _ in range(n)]

    score = 0
    for pair in sample:
        reference = [pair[1].split(' ')]
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        # '<EOS>' is only an end marker added by evaluate(); it never occurs
        # in the reference and must not be scored as a translated word.
        if output_words and output_words[-1] == '<EOS>':
            candidate = output_words[:-1]
        else:
            candidate = output_words
        # NOTE(review): no smoothing function is passed, so sentences without
        # higher-order n-gram overlap trigger NLTK's warning -- consider one.
        score = score + sentence_bleu(reference, candidate)

    return score / n

In [None]:
# BLEU of the baseline pair (encoder1 + attn_decoder1) on the test set.
base_score = BLEU_score(encoder1,attn_decoder1)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [None]:
# BLEU of the pair most recently bound to trained_encoder / trained_decoder.
tuned_score = BLEU_score(trained_encoder,trained_decoder)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [None]:
# Summary: split sizes and both BLEU scores, scaled from [0, 1] to percent.
print("Number of Training Pairs : ", len(train_pairs))
print("Number of Validation Pairs : ", len(val_pairs))
print("Number of Test Pairs : ", len(test_pairs))
print("Base Model Score : ", base_score*100)
print("Tuned Model Score : ", tuned_score*100)

Number of Training Pairs :  21266
Number of Validation Pairs :  6076
Number of Test Pairs :  3039
Base Model Score :  34.93287319217059
Tuned Model Score :  41.92891653117014
