## ENCODER DECODER NETWORK WITH ATTENTION AND TEACHER FORCING

**References:**

Tutorials given in the competition document: [Competition Link](https://docs.google.com/document/d/1p74wG-bECCgbpyq5x_x2QJrf5RSf9FnMLGSAiyUkHLo/edit)

PyTorch NMT Tutorial : [Pytorch NMT](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html)

GitHub page, used to understand batch processing in PyTorch: [Github Pengyuchen](https://github.com/pengyuchen/PyTorch-Batch-Seq2seq)

Also referred to a few Stack Overflow links for regex examples and for fixing some bugs.

# **FUNCTIONS**

In [None]:
from google.colab import  drive
drive.mount('/drive')

Mounted at /drive


## LIBRARIES

In [None]:
# Root folder under the '/drive' mount point that holds the project files.
location = r"/drive/My Drive/Files/"
# Indic NLP library source tree and its resources folder, both under `location`.
INDIC_NLP_LIB_HOME = location + "indic_nlp_library"
INDIC_NLP_RESOURCES = location + "indic_nlp_resources"

In [None]:
import sys
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader
loader.load()

In [None]:
!pip install Morfessor
import re
import string
import spacy
import tqdm.notebook as tq
nlpen = spacy.load("en_core_web_sm")
import random
import pickle
import pandas as pd
from indicnlp.tokenize import sentence_tokenize
from indicnlp.tokenize import indic_tokenize
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
from indicnlp.transliterate.unicode_transliterate import ItransTransliterator
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

Collecting Morfessor
  Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


## TEXT PROCESSING

In [None]:
english_nums = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
hindi_nums =   ['०', '१', '२', '३', '४', '५', '६', '७', '८', '९']

def clean_string( instr ):
  """Lower-case *instr* and blank out brackets, ellipses, dashes, commas and double quotes.

  Every removed character (or run of two/three dots) becomes a space, and
  runs of spaces are then collapsed to a single space. The result is not
  stripped, so a leading or trailing space may remain.
  """
  # Single characters that are each turned into one space.
  to_space = str.maketrans({ch: ' ' for ch in '[]{}()-,"'})
  cleaned = instr.lower().translate(to_space)
  # Longest run first so that "..." is consumed before "..".
  for dots in ('...', '..'):
    cleaned = cleaned.replace(dots, ' ')
  return re.sub(' +', ' ', cleaned)
  
def preprocess_hindi( instr ):
  """Normalize and clean one Hindi sentence and return the stripped result.

  Steps, in order: Indic normalization for "hi" with remove_nuktas=True,
  clean_string(), a space appended after every digit listed in hindi_nums,
  then two ItransTransliterator.from_itrans(..., 'hi') passes with space
  collapsing between them, and a final strip().
  """
  hi_normalizer = IndicNormalizerFactory().get_normalizer("hi", remove_nuktas=True)
  text = clean_string(hi_normalizer.normalize(instr))

  # Detach each Hindi digit from whatever follows it.
  for digit in hindi_nums:
    text = text.replace(digit, digit + ' ')

  text = ItransTransliterator.from_itrans(text, 'hi')
  text = re.sub(' +', ' ', text)
  # NOTE(review): from_itrans is applied a second time here — confirm the repeat is needed.
  text = ItransTransliterator.from_itrans(text, 'hi')
  return text.strip()

def preprocess_english( instr ):
  """Clean one English sentence and expand a fixed set of contractions.

  Runs clean_string(), converts the typographic apostrophe to "'", rewrites
  the contraction suffixes listed below, appends a space after every digit in
  english_nums, collapses runs of spaces and strips the result.
  """
  text = clean_string(instr).replace("’", "'")

  # Applied in exactly this order; each suffix is replaced by its expansion.
  expansions = (
    ("n't", " not"),
    ("'re", " are"),
    ("'ve", " have"),
    ("'s",  " is"),
    ("'ll", " will"),
    ("'m",  " am"),
  )
  for suffix, expanded in expansions:
    text = text.replace(suffix, expanded)

  # Detach each digit from whatever follows it.
  for digit in english_nums:
    text = text.replace(digit, digit + ' ')

  return re.sub(' +', ' ', text).strip()

def get_hindi_tokens(sentence):
  """Tokenize *sentence* with indic_tokenize.trivial_tokenize and return its result."""
  return indic_tokenize.trivial_tokenize(sentence)

def get_english_tokens(sentence):
  """Return the text of every token the spaCy pipeline `nlpen` yields for *sentence*."""
  return [tok.text for tok in nlpen(sentence)]

In [None]:
def process_pairs(df, load_from_file = 0, location = ''):
  """Build, or reload, the cleaned sentence pairs and their token lists.

  When load_from_file is 0, every row of df is read from its 'hindi' and
  'english' columns, preprocessed and tokenized, and both result lists are
  pickled to location + 'pairs.pickle' and location + 'pairs_tokens.pickle'.
  For any other value the two pickles are read back and df is not used.

  Returns (pairs, pairs_tokens): per row, pairs holds
  [hindi_sentence, english_sentence] and pairs_tokens holds
  [hindi_tokens, english_tokens].

  NOTE: pickle.load can execute arbitrary code; only load files you created.
  """
  pairs_path  = location + r'pairs.pickle'
  tokens_path = location + r'pairs_tokens.pickle'

  # Cached path: nothing is recomputed, the stored lists are returned as-is.
  if load_from_file != 0:
    with open(pairs_path, 'rb') as handle:
      pairs = pickle.load(handle)
    with open(tokens_path, 'rb') as handle:
      pairs_tokens = pickle.load(handle)
    return pairs, pairs_tokens

  pairs, pairs_tokens = [], []
  for row in tq.tqdm( df.index ):
    hindi_sentence = preprocess_hindi( df['hindi'][row] )
    hindi_tokens   = get_hindi_tokens( hindi_sentence )

    english_sentence = preprocess_english( df['english'][row] )
    english_tokens   = get_english_tokens( english_sentence )

    pairs.append( [hindi_sentence, english_sentence] )
    pairs_tokens.append( [hindi_tokens, english_tokens] )

  with open(pairs_path, 'wb') as handle:
    pickle.dump(pairs, handle, protocol=pickle.HIGHEST_PROTOCOL)
  with open(tokens_path, 'wb') as handle:
    pickle.dump(pairs_tokens, handle, protocol=pickle.HIGHEST_PROTOCOL)

  return pairs, pairs_tokens

## LANGUAGE

In [None]:
START_TOKEN = 0
END_TOKEN = 1
PAD_TOKEN = 2

class Language:
  """Vocabulary of one language: word <-> index maps plus word frequencies.

  Indices 0, 1 and 2 are reserved for START_TOKEN, END_TOKEN and PAD_TOKEN;
  real words are numbered from 3 upward in first-seen order.
  """
  def __init__(self, name):
    """Create an empty vocabulary called *name* holding only the special tokens."""
    self.name = name
    self.word2index = {}
    self.word2count = {}
    self.index2word = {}
    self.num_words = 3
    # Register every special token in BOTH directions so the two maps stay
    # exact inverses of each other.
    specials = (
      ('START_TOKEN', START_TOKEN),
      ('END_TOKEN', END_TOKEN),
      ('PAD_TOKEN', PAD_TOKEN),
    )
    for token_name, token_index in specials:
      self.word2index[token_name] = token_index
      self.index2word[token_index] = token_name

  def addWord(self, word):
    """Count one occurrence of *word*, assigning it the next free index if new."""
    if word not in self.word2index:
      self.word2index[word] = self.num_words
      self.index2word[self.num_words] = word
      self.num_words = self.num_words + 1
    # .get() because the special tokens are indexed but start without a count.
    self.word2count[word] = self.word2count.get(word, 0) + 1

  def addSentence(self, sentence_tokens):
      """Add every token of *sentence_tokens* to the vocabulary."""
      for word in sentence_tokens:
        self.addWord(word)

In [None]:
def generate_language( pairs_tokens ):
    """Build the 'hindi' and 'english' vocabularies from tokenized sentence pairs.

    Element 0 of every pair feeds the Hindi vocabulary and element 1 the
    English one. Returns (hindi, english).
    """
    hindi, english = Language('hindi'), Language('english')
    for token_pair in tq.tqdm( pairs_tokens ):
      hindi.addSentence(token_pair[0])
      english.addSentence(token_pair[1])
    return hindi, english

PROCESS TEXT TO TENSOR

In [None]:
def get_filitered_data(max_length, pairs, pairs_tokens):
  """Keep only the pairs whose two token lists are both shorter than max_length.

  pairs and pairs_tokens are parallel lists; the same positions are kept in
  both. Returns (filtered_pairs, filtered_pairs_tokens).
  """
  kept = [
    i for i, token_pair in enumerate(pairs_tokens)
    if len(token_pair[0]) < max_length and len(token_pair[1]) < max_length
  ]
  return [pairs[i] for i in kept], [pairs_tokens[i] for i in kept]

In [None]:
def indexesFromSentence(lang, tokens, max_length):
  """Encode *tokens* as a list of exactly max_length vocabulary indices.

  Layout: START_TOKEN, one index per token, END_TOKEN, then PAD_TOKEN up to
  max_length. Sequences that are too long are cut so END_TOKEN still fits.

  A token missing from lang.word2index is replaced by the index of a randomly
  chosen real word. The draw is limited to [PAD_TOKEN + 1, lang.num_words - 1]
  so it can never be a special token or an index outside the vocabulary
  (random.randint is inclusive at both ends, which previously allowed both).
  If the vocabulary holds no real words, PAD_TOKEN is used instead.
  """
  first_word_index = PAD_TOKEN + 1  # indices below this are the special tokens
  indexes = [START_TOKEN]
  for word in tokens:
    index = lang.word2index.get(word)
    if index is None:
      if lang.num_words > first_word_index:
        index = random.randrange(first_word_index, lang.num_words)
      else:
        index = PAD_TOKEN
    indexes.append(index)
  # Leave room for the closing END_TOKEN.
  indexes = indexes[0:max_length-1]
  indexes.append(END_TOKEN)
  indexes.extend( [PAD_TOKEN]*( max_length - len(indexes)))
  return indexes

def tensorFromSentence(lang, sentence, max_length):
  """Encode the token sequence *sentence* as a torch.long tensor on `device`.

  The indices come from indexesFromSentence(lang, sentence, max_length).
  """
  return torch.tensor(
    indexesFromSentence(lang, sentence, max_length),
    dtype=torch.long,
    device=device,
  )

def tensorsFromPair(pairs, input_lang, output_lang, max_length):
  """Encode every (source_tokens, target_tokens) pair as an (input, target) tensor tuple.

  Element 0 of each pair is encoded with input_lang and element 1 with
  output_lang, both via tensorFromSentence with the same max_length.
  """
  return [
    (
      tensorFromSentence(input_lang, pair[0], max_length),
      tensorFromSentence(output_lang, pair[1], max_length),
    )
    for pair in pairs
  ]

## NEURAL MACHINE TRANSLATOR


LIBRARIES

In [None]:
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

!pip install -U nltk
import nltk
import sys
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 7.3MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.5-cp37-none-any.whl size=1434673 sha256=64b04de7675d442c12e664cd2a218ad4ab6c67d8987f71c6e27adf6d7eac1939
  Stored in directory: /root/.cache/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f86023639a526d8bc8d384306
Successfully built nltk
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.5


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


###### ENCODER and DECODER

In [None]:
class Encoder(nn.Module):
    """Embedding layer followed by a GRU, advanced one time step per call."""

    def __init__(self, input_size, hidden_size):
        """input_size is the embedding-table size; hidden_size is both the embedding and GRU width."""
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden, batch_size=1):
        """Embed the indices in *input* for one time step and run the GRU.

        The embeddings are reshaped to (1, batch_size, hidden_size) before the
        GRU call. Returns the GRU's (output, hidden) pair.
        """
        step = self.embedding(input)
        step = step.view(1, batch_size, self.hidden_size)
        return self.gru(step, hidden)

    def initHidden(self, batch_size=1):
        """Return an all-zero hidden state of shape (1, batch_size, hidden_size) on `device`."""
        shape = (1, batch_size, self.hidden_size)
        return torch.zeros(shape, device=device)

In [None]:
class Decoder(nn.Module):
    """Embedding -> GRU -> linear -> log-softmax, advanced one time step per call.

    max_length is stored on the instance but not used by this class.
    """

    def __init__(self, hidden_size, output_size, max_length):
        """output_size is the embedding-table size and the width of the prediction layer."""
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)

        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, batch_size=1):
        """Predict log-probabilities over the output vocabulary for one time step.

        Returns (preds, hidden), where preds has one row per batch element.
        """
        embed = self.embedding(input).view(1, batch_size, self.hidden_size)
        gru_out, hidden = self.gru(embed, hidden)
        # Drop the length-1 time dimension before normalising over the vocabulary.
        logits = self.out(gru_out)[0]
        return self.softmax(logits), hidden


    def initHidden(self, batch_size=1):
        """Return an all-zero hidden state of shape (1, batch_size, hidden_size) on `device`."""
        shape = (1, batch_size, self.hidden_size)
        return torch.zeros(shape, device=device)

##### SEQUENCE 2 SEQUENCE + Teacher Forcing

In [None]:
class seq2seq(nn.Module):
    """GRU encoder-decoder translator trained with optional teacher forcing.

    The encoder reads the source sequence one time step at a time; its final
    hidden state initialises the decoder, which emits one target index per
    step. No attention is applied: only the final encoder state is used.
    """

    def __init__(self, input_size, output_size, hidden_size, max_length):
        """Build the encoder and decoder and move both to `device`.

        input_size and output_size are the embedding-table sizes of the source
        and target side; max_length bounds the number of decoding steps in
        predict_sentence().
        """
        super(seq2seq, self).__init__()
        self.input_size  = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.max_length = max_length

        self.encoder = Encoder(input_size, hidden_size).to(device)
        self.decoder = Decoder(hidden_size, output_size, max_length).to(device)

    def _encode(self, steps, batch_size):
        """Feed *steps* (one entry per time step) through the encoder; return the final hidden state."""
        hidden = self.encoder.initHidden(batch_size=batch_size)
        for step in steps:
            # The per-step outputs are not needed; only the hidden state is carried on.
            _, hidden = self.encoder(step, hidden, batch_size=batch_size)
        return hidden

    def train_model(self, train_data, num_epoch, learning_rate=0.01, teacher_forcing_ratio=0.5):
        """Train encoder and decoder with Adam and NLLLoss.

        train_data yields (input_batch, target_batch) index tensors with the
        batch as the first dimension. For each batch, teacher forcing (feeding
        the ground-truth token as the next decoder input) is used with
        probability teacher_forcing_ratio; otherwise the decoder's own top
        prediction is fed back.

        Returns a list with one entry per epoch: the batch loss (summed over
        time steps) averaged over the batches of that epoch.

        NOTE: padded positions are included in the loss (NLLLoss has no
        ignore_index here), and the first target of every sequence is
        START_TOKEN because indexesFromSentence() prepends it.
        """
        encoder_optim = optim.Adam(self.encoder.parameters(), lr=learning_rate)
        decoder_optim = optim.Adam(self.decoder.parameters(), lr=learning_rate)
        lossfn = nn.NLLLoss()

        history = []
        for epoch in range(num_epoch):
            print('\n Epoch  : ', epoch)

            total_loss = 0

            for batch_in, batch_out in tq.tqdm(train_data):
                batch_size = batch_in.size()[0]
                # Switch to time-major layout so iteration yields one time step at a time.
                source = batch_in.transpose(0, 1)
                target = batch_out.transpose(0, 1)

                encoder_optim.zero_grad()
                decoder_optim.zero_grad()

                decoder_hidden = self._encode(source, batch_size)
                decoder_input = torch.tensor([START_TOKEN] * batch_size, device=device)

                # One draw per batch decides the feeding strategy for the whole sequence.
                use_teacher_forcing = random.random() < teacher_forcing_ratio

                loss = 0
                for step_target in target:
                    decoder_output, decoder_hidden = self.decoder(
                        decoder_input, decoder_hidden, batch_size=batch_size)
                    loss = loss + lossfn(decoder_output, step_target)
                    if use_teacher_forcing:
                        decoder_input = step_target
                    else:
                        # Feed back the model's own best guess, detached from the graph.
                        _, topindex = decoder_output.detach().topk(1)
                        decoder_input = topindex.squeeze(1)

                total_loss = total_loss + loss.item()

                loss.backward()
                encoder_optim.step()
                decoder_optim.step()

            # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
            history.append(total_loss / max(len(train_data), 1))
        return history

    def predict_sentence(self, sentence, input_lang, output_lang):
        """Greedily translate one sentence and return the predicted words.

        sentence is either a token list, which is encoded with
        tensorFromSentence(), or an index tensor that has already been encoded
        that way, which is used as-is instead of being encoded a second time.

        Decoding stops after max_length steps or at END_TOKEN, in which case
        'END_TOKEN' is the last element of the returned list.
        """
        with torch.no_grad():
            if isinstance(sentence, torch.Tensor):
                source = sentence.to(device).reshape(-1)
            else:
                source = tensorFromSentence(input_lang, sentence, self.max_length)
            source = source[:self.max_length]

            dec_hidden = self._encode(source, 1)
            dec_input = torch.tensor([[START_TOKEN]], device=device)
            dec_words = []

            for _ in range(self.max_length):
                dec_output, dec_hidden = self.decoder(dec_input, dec_hidden, batch_size=1)
                _, maxindex = dec_output.detach().topk(1)
                dec_input = maxindex.squeeze().detach()

                predicted = maxindex.item()
                if predicted == END_TOKEN:
                    dec_words.append('END_TOKEN')
                    break
                dec_words.append(output_lang.index2word[predicted])

        return dec_words

PERFORMANCE EVALUATION

In [None]:
def make_sentence(tokens):
  """Join *tokens* into one space-separated string.

  'START_TOKEN', 'END_TOKEN' and 'PAD_TOKEN' are skipped, and a single space
  standing between two digits is removed so that digits split apart during
  preprocessing are glued back together ("1 2" -> "12").
  """
  special = ('START_TOKEN', 'END_TOKEN', 'PAD_TOKEN')
  text = ' '.join(tok for tok in tokens if tok not in special)
  return re.sub(r'(?<=\d) (?=\d)', '', text)


def get_bleu_score(model, pairs, inlang, outlang):
  """Print and return the mean sentence-level BLEU and METEOR of *model* on *pairs*.

  Each pair is (source_tokens, target_tokens). The source is translated with
  model.predict_sentence(); the translation is the hypothesis and the target
  side is the reference for both metrics.

  Returns (bleu_result, meteor_result); both are 0.0 when pairs is empty.

  NOTE(review): single_meteor_score is given plain strings, as in the
  original call — confirm this matches the installed nltk version.
  """
  total_num = len(pairs)
  total_bleu_scores = 0
  total_meteor_scores = 0

  for pair in tq.tqdm( pairs ):
    hypothesis = make_sentence( model.predict_sentence(pair[0], inlang, outlang) )
    reference  = make_sentence( pair[1] )
    # Both nltk scorers take the reference(s) first and the hypothesis second.
    total_bleu_scores   += sentence_bleu([reference.split(" ")], hypothesis.split(" "))
    total_meteor_scores += single_meteor_score(reference, hypothesis)

  bleu_result = total_bleu_scores/total_num if total_num else 0.0
  meteor_result = total_meteor_scores/total_num if total_num else 0.0

  print()
  print("bleu score: ",bleu_result)
  print("meteor score: ",meteor_result)
  return bleu_result, meteor_result

# **EXECUTION**

READ AND PROCESS FILE

In [None]:
# Length every encoded sentence is cut or padded to (START and END tokens included).
MAX_LENGTH = 17

In [None]:
# Data folder, model checkpoint folder, and the training CSV (first column used as the index).
data_location = location + 'NMT/'
model_location = location + 'NMT/NMT_GRUTF/'
df = pd.read_csv(data_location+'train.csv',  index_col=0)

In [None]:
# Reload the preprocessed pairs from their pickles instead of recomputing them.
pairs, tokens = process_pairs(df, load_from_file=1, location = data_location + 'DataPairs/')
# 80/20 train/test split with a fixed seed; sentences and token lists are split together.
train_pairs, test_pairs, train_tokens, test_tokens = train_test_split( pairs, tokens, test_size = 0.2, shuffle = True, random_state = 200)
# Keep only training pairs short enough to leave room for START and END within MAX_LENGTH.
fil_train, fil_train_tokens = get_filitered_data( MAX_LENGTH - 2, train_pairs, train_tokens)

GENERATE LANGUAGE

In [None]:
# Vocabularies are built from all training token lists, not only the length-filtered ones.
hindi, english = generate_language(train_tokens)

HBox(children=(FloatProgress(value=0.0, max=81857.0), HTML(value='')))




GET TENSORS

In [None]:
# Encode the filtered training pairs as fixed-length index tensors and batch them.
train_tensors = tensorsFromPair(fil_train_tokens, hindi, english, MAX_LENGTH)
train_loader = torch.utils.data.DataLoader(train_tensors, batch_size=256, shuffle=True)

TRAIN MODEL

In [None]:
# Passed to seq2seq as the embedding and GRU width for both encoder and decoder.
hidden_size = 256

In [None]:
# Each embedding table gets one row more than its vocabulary has entries.
model2 = seq2seq(hindi.num_words + 1, english.num_words + 1, hidden_size, MAX_LENGTH)

In [None]:
# Train for 200 epochs; the returned per-epoch loss history is not stored in a variable.
model2.train_model(train_data= train_loader, num_epoch= 200)

SAVE / LOAD MODEL

In [None]:
# Save the weights (state_dict) and, separately, the whole model object.
torch.save(model2.state_dict(), model_location + 'gru_dict_200')
torch.save(model2, model_location + 'gru_200')

In [None]:
# Rebuild the same architecture and load the saved weights, mapping them to CPU while loading.
model = seq2seq(hindi.num_words + 1, english.num_words + 1, hidden_size, MAX_LENGTH)
model.load_state_dict( torch.load(model_location + 'gru_dict_200', map_location=torch.device('cpu')))
model.eval()

seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(40447, 256)
    (gru): GRU(256, 256)
  )
  (decoder): Decoder(
    (embedding): Embedding(29686, 256)
    (gru): GRU(256, 256)
    (out): Linear(in_features=256, out_features=29686, bias=True)
    (softmax): LogSoftmax(dim=1)
  )
)

EVALUATE PERFORMANCE

In [None]:
# Score the trained model2 (not the reloaded `model`) on the held-out test token pairs.
get_bleu_score(model2, test_tokens, hindi, english)

# USE MODEL FOR TRANSLATION

In [None]:
# Week-1 sentences to translate; the first CSV column is used as the index.
week1 = pd.read_csv(data_location+'Weekly Data/Week1/week1.csv', index_col=0)

In [None]:
# For every Hindi sentence keep both its token list and its encoded index tensor.
week1_processed = []
week1_tensors = []
for x in  week1['hindi']:
  t = get_hindi_tokens(preprocess_hindi(x))
  week1_processed.append(t)
  week1_tensors.append( tensorFromSentence(hindi, t, MAX_LENGTH))

In [None]:
translated_tokens = []
# predict_sentence() encodes its input itself, so it must receive the token
# lists. Handing it the pre-built tensors makes every tensor element look like
# an unknown word, which is then replaced by a random vocabulary index.
for tokens in tq.tqdm( week1_processed ):
  translated_tokens.append( model.predict_sentence( tokens, hindi, english) )

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [None]:
# Turn each predicted token list into a plain sentence string.
translated_texts = [make_sentence(t) for t in translated_tokens]

In [None]:
# Write one translated sentence per line.
with open(data_location + 'Weekly Data/Week1/grutf.txt', 'w') as f:
    for item in translated_texts:
        f.write("%s\n" % item)

In [None]:
#torch.save( tmodel.state_dict(), model_location + 'gru_dict_100')
#torch.save(model, location+ 'gru_enc_dec')

#tmodel = torch.load(model_location+ 'gru_100')
#tmodel.eval()

#tq.tqdm._instances.clear()