## ENCODER DECODER NETWORK WITH ATTENTION AND TEACHER FORCING

**References:**

Tutorials Given in Competition Document : [Competition Link](https://docs.google.com/document/d/1p74wG-bECCgbpyq5x_x2QJrf5RSf9FnMLGSAiyUkHLo/edit)

PyTorch NMT Tutorial : [Pytorch NMT](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html)

Github Page : To understand batch Processing in PyTorch [Github Pengyuchen](https://github.com/pengyuchen/PyTorch-Batch-Seq2seq)

Referred to a few Stack Overflow links for regex examples and for fixing some bugs.

FUNCTIONS

In [None]:
# Mount Google Drive at /drive; the data and model paths used later are built under it.
# This import only exists inside Google Colab.
from google.colab import  drive
drive.mount('/drive')

Mounted at /drive


### LIBRARIES

In [None]:
# Root folder on the mounted Drive; the other paths in this notebook are built from it.
location = r"/drive/My Drive/Files/"
# Folder added to sys.path so that `indicnlp` can be imported.
INDIC_NLP_LIB_HOME = location + "indic_nlp_library"
# Folder handed to indicnlp.common.set_resources_path().
INDIC_NLP_RESOURCES = location + "indic_nlp_resources"

In [None]:
import sys
# Make the Indic NLP library importable from its Drive folder.
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
from indicnlp import common
# Point the library at its resource files, then initialise it.
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader
loader.load()

In [None]:
!pip install Morfessor
import re
import string
import spacy
import tqdm.notebook as tq
nlpen = spacy.load("en_core_web_sm")
import random
import pickle
import pandas as pd
from indicnlp.tokenize import sentence_tokenize
from indicnlp.tokenize import indic_tokenize
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
from indicnlp.transliterate.unicode_transliterate import ItransTransliterator
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

Collecting Morfessor
  Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


### TEXT PROCESSING

In [None]:
# ASCII and Devanagari digit characters. preprocess_english / preprocess_hindi
# insert a space after each of these digits.
english_nums = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
hindi_nums =   ['०', '१', '२', '३', '४', '५', '६', '७', '८', '९']

def clean_string( instr ):
  """Lower-case ``instr`` and replace brackets, ellipses and light punctuation with spaces.

  Every occurrence of ``[ ] { } ( ) ... .. - , "`` becomes a single space,
  after which runs of spaces are collapsed to one space. Leading and
  trailing spaces are NOT removed.

  Returns the cleaned string.
  """
  # Applied in this order; '...' has to be handled before '..'.
  removable = (u'[', u']', u'{', u'}', u'(', u')', u'...', u'..', u'-', u',', u'"')
  cleaned = instr.lower()
  for fragment in removable:
    cleaned = cleaned.replace(fragment, ' ')
  return re.sub(' +',' ', cleaned)
  
def preprocess_hindi( instr ):
  """Normalise and clean one Hindi sentence before tokenisation.

  Steps, in order: Indic NLP normalisation for "hi" (remove_nuktas=True),
  the shared clean_string() pass, a space inserted after every Devanagari
  digit, ItransTransliterator.from_itrans, space collapsing, a second
  from_itrans pass, and a final strip().

  Returns the cleaned sentence as one string.
  """
  # NOTE(review): a new factory/normalizer is built on every call; presumably it
  # could be created once and reused — confirm before hoisting.
  factory    = IndicNormalizerFactory()
  normalizer = factory.get_normalizer("hi",remove_nuktas=True)
  instr      = normalizer.normalize(instr)

  instr      = clean_string( instr )
  #instr = instr.replace(u'॥', '')
  # Put a space after each Devanagari digit.
  for nums in hindi_nums:
    instr    = instr.replace(nums, nums + ' ')

  # NOTE(review): from_itrans runs twice, once before and once after collapsing
  # spaces. Presumably it converts leftover Latin-script text to Devanagari —
  # confirm that the second pass is needed.
  instr      = ItransTransliterator.from_itrans( instr , 'hi')  
  instr      = re.sub(' +',' ', instr)
  instr      = ItransTransliterator.from_itrans( instr , 'hi')
  instr      = instr.strip() #sentence_tokenize.sentence_split(instr, lang='hi')
  
  return instr

def preprocess_english( instr ):
  """Clean one English sentence and expand common contractions.

  Steps, in order: the shared clean_string() pass (lower-casing and
  punctuation removal), typographic apostrophe normalisation, contraction
  expansion, a space inserted after every ASCII digit, space collapsing and
  a final strip().

  Returns the cleaned sentence as one string.
  """
  instr = clean_string(instr)

  # Normalise the typographic apostrophe so the rules below match it too.
  instr = instr.replace("’", "'")

  # Irregular negatives must be expanded before the generic "n't" rule,
  # which would otherwise produce the non-words "wo not" / "ca not" / "sha not".
  instr = instr.replace("won't", "will not")
  instr = instr.replace("can't", "can not")
  instr = instr.replace("shan't", "shall not")

  instr = instr.replace("n\'t", " not")
  instr = instr.replace("'re" , " are")
  instr = instr.replace("'ve" , " have")
  # NOTE(review): this also rewrites possessives ("john's" -> "john is").
  instr = instr.replace("'s"  , " is")
  instr = instr.replace("'ll" , " will")
  instr = instr.replace("'m" , " am")

  # Put a space after each ASCII digit.
  for nums in english_nums:
    instr    = instr.replace(nums, nums + ' ')
  instr = re.sub(' +',' ', instr)
  instr = instr.strip()

  return instr

def get_hindi_tokens(sentence):
  """Tokenise a Hindi sentence by delegating to indic_tokenize.trivial_tokenize."""
  return indic_tokenize.trivial_tokenize(sentence)

def get_english_tokens(sentence):
  """Tokenise an English sentence with the module-level spaCy pipeline ``nlpen``.

  Returns the ``.text`` of every token produced by the pipeline, in order,
  as a list.
  """
  return [tok.text for tok in nlpen(sentence)]

In [None]:
def process_pairs(df, load_from_file = 0, location = ''):
  """Build, or reload from disk, the cleaned sentence pairs and their token lists.

  Args:
    df: frame with 'hindi' and 'english' columns; only read when
      ``load_from_file`` is 0.
    load_from_file: 0 to preprocess ``df`` and write the two pickle files;
      any other value to read the pickle files instead.
    location: path prefix that is prepended as-is to the pickle file names.

  Returns:
    ``(pairs, pairs_tokens)``: ``pairs[k]`` is ``[hindi_sentence,
    english_sentence]`` and ``pairs_tokens[k]`` is ``[hindi_tokens,
    english_tokens]``.
  """
  pairs_path  = location + r'pairs.pickle'
  tokens_path = location + r'pairs_tokens.pickle'

  # Cached branch: hand back whatever was pickled earlier.
  # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
  if load_from_file != 0:
    with open(pairs_path, 'rb') as handle:
      pairs = pickle.load(handle)
    with open(tokens_path, 'rb') as handle:
      pairs_tokens = pickle.load(handle)
    return pairs, pairs_tokens

  pairs, pairs_tokens = [], []
  for row in tq.tqdm( df.index ):
    hsent   = preprocess_hindi( df['hindi'][row] )
    htokens = get_hindi_tokens(hsent)

    esent   = preprocess_english( df['english'][row] )
    etokens = get_english_tokens(esent)

    pairs.append( [hsent, esent] )
    pairs_tokens.append( [htokens, etokens] )

  # Persist both lists so later runs can skip the preprocessing loop.
  for path, payload in ((pairs_path, pairs), (tokens_path, pairs_tokens)):
    with open(path, 'wb') as handle:
      pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

  return pairs, pairs_tokens

### LANGUAGE

In [None]:
# Reserved vocabulary indices shared by every Language instance.
START_TOKEN = 0
END_TOKEN = 1
PAD_TOKEN = 2

class Language:
  """Word <-> index vocabulary for one language.

  Attributes:
    name: label given at construction.
    word2index: maps a word (or reserved token name) to its integer index.
    index2word: inverse mapping of ``word2index``.
    word2count: number of times each word was passed to ``addWord``.
    num_words: next free index; starts at 3 because indices 0-2 are reserved
      for START_TOKEN, END_TOKEN and PAD_TOKEN.
  """
  def __init__(self, name):
    self.name = name
    self.word2index = {}
    self.word2count = {}
    self.index2word = {}
    self.num_words = 3
    # Register every reserved token in BOTH directions. Previously END_TOKEN
    # and PAD_TOKEN were written into index2word under their names, leaving
    # word2index incomplete and index2word with stray string keys.
    reserved = (('START_TOKEN', START_TOKEN),
                ('END_TOKEN', END_TOKEN),
                ('PAD_TOKEN', PAD_TOKEN))
    for token_name, token_index in reserved:
      self.word2index[token_name] = token_index
      self.index2word[token_index] = token_name

  def addWord(self, word):
    """Count ``word`` and assign it the next free index if it is new."""
    if word in self.word2index:
      # .get(): reserved token names are in word2index but start with no count.
      self.word2count[word] = self.word2count.get(word, 0) + 1
    else:
      self.word2count[word] = 1
      self.word2index[word] = self.num_words
      self.index2word[self.num_words] = word
      self.num_words = self.num_words + 1
  
  def addSentence(self, sentence_tokens):
      """Add every token of an already tokenised sentence to the vocabulary."""
      for word in sentence_tokens:
        self.addWord(word)

In [None]:
def generate_language( pairs_tokens ):
    """Build the Hindi and English vocabularies from tokenised sentence pairs.

    ``pairs_tokens[k][0]`` is fed to the Hindi vocabulary and
    ``pairs_tokens[k][1]`` to the English one.

    Returns ``(hindi, english)`` as two ``Language`` objects.
    """
    hindi, english = Language('hindi'), Language('english')
    for token_pair in tq.tqdm( pairs_tokens ):
        hindi.addSentence(token_pair[0])
        english.addSentence(token_pair[1])
    return hindi, english

PROCESS TEXT TO TENSOR

In [None]:
def get_filitered_data(max_length, pairs, pairs_tokens):
  """Keep only the pairs whose two token lists are both shorter than ``max_length``.

  ``pairs`` and ``pairs_tokens`` are parallel lists; the decision is made on
  ``pairs_tokens`` and the matching entry of ``pairs`` is kept alongside.

  Returns ``(filtered_pairs, filtered_pairs_tokens)``.
  """
  kept_pairs, kept_tokens = [], []
  for idx, token_pair in enumerate(pairs_tokens):
    too_long = len(token_pair[0]) >= max_length or len(token_pair[1]) >= max_length
    if too_long:
      continue
    kept_pairs.append( pairs[idx] )
    kept_tokens.append( token_pair )
  return kept_pairs, kept_tokens

In [None]:
def indexesFromSentence(lang, tokens, max_length):
  """Encode a token list as a fixed-length list of vocabulary indices.

  Layout: START_TOKEN, one index per token, END_TOKEN, then PAD_TOKEN
  padding. The token part is cut to ``max_length - 1`` entries before
  END_TOKEN is appended, so the result holds ``max_length`` entries.

  A token missing from ``lang.word2index`` is replaced by the index of a
  randomly chosen real word.

  Args:
    lang: ``Language`` providing ``word2index`` and ``num_words``.
    tokens: iterable of tokens.
    max_length: length of the returned list.

  Returns:
    list of ints.
  """
  # Real words occupy PAD_TOKEN + 1 .. num_words - 1. random.randint is
  # inclusive at BOTH ends, so the old randint(2, num_words) could return
  # PAD_TOKEN itself or num_words, an index no word has been assigned.
  first_word = PAD_TOKEN + 1
  last_word = lang.num_words - 1

  indexes = [START_TOKEN]
  for word in tokens:
    index = lang.word2index.get(word)
    if index is None:
      if last_word >= first_word:
        index = random.randint(first_word, last_word)
      else:
        # Vocabulary holds no real words; fall back to padding.
        index = PAD_TOKEN
    indexes.append(index)

  indexes = indexes[0:max_length-1]
  indexes.append(END_TOKEN)
  indexes.extend( [PAD_TOKEN]*( max_length - len(indexes)))
  return indexes

def tensorFromSentence(lang, sentence, max_length):
  """Encode a tokenised sentence as a ``torch.long`` tensor on the module-level ``device``.

  ``sentence`` is passed straight to indexesFromSentence as its token list,
  and the index list it returns becomes the tensor's contents.
  """
  indexes = indexesFromSentence(lang, sentence, max_length)
  return torch.tensor(indexes, dtype=torch.long, device=device)

def tensorsFromPair(pairs, input_lang, output_lang, max_length):
  """Encode every token pair as an ``(input_tensor, target_tensor)`` tuple.

  ``pair[0]`` is encoded with ``input_lang`` and ``pair[1]`` with
  ``output_lang``; both go through tensorFromSentence with ``max_length``.

  Returns a list of 2-tuples in the same order as ``pairs``.
  """
  return [
    (tensorFromSentence(input_lang, pair[0], max_length),
     tensorFromSentence(output_lang, pair[1], max_length))
    for pair in pairs
  ]

### NEURAL MACHINE TRANSLATOR


LIBRARIES

In [None]:
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

!pip install -U nltk
import nltk
import sys
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |▎                               | 10kB 19.6MB/s eta 0:00:01[K     |▌                               | 20kB 27.0MB/s eta 0:00:01[K     |▊                               | 30kB 23.3MB/s eta 0:00:01[K     |█                               | 40kB 20.2MB/s eta 0:00:01[K     |█▏                              | 51kB 13.9MB/s eta 0:00:01[K     |█▍                              | 61kB 13.4MB/s eta 0:00:01[K     |█▋                              | 71kB 13.5MB/s eta 0:00:01[K     |█▉                              | 81kB 14.2MB/s eta 0:00:01[K     |██                              | 92kB 14.0MB/s eta 0:00:01[K     |██▎                             | 102kB 13.8MB/s eta 0:00:01[K     |██▌                             | 112kB 13.8MB/s eta 0:00:01[K     |██▊                             | 122kB 13.8MB/s eta 0:00:01[K 

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


ENCODER and DECODER WITH ATTENTION

In [None]:
class Encoder(nn.Module):
    """Embedding layer followed by a 2-layer LSTM; exposes only the final LSTM state."""

    def __init__(self, input_size, embed_size, hidden_size):
        """
        Args:
            input_size: number of rows in the embedding table (source vocabulary size).
            embed_size: embedding dimension.
            hidden_size: LSTM hidden dimension.
        """
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size

        # Layers are created in the same order as before (embedding, then LSTM).
        self.embedding = nn.Embedding(input_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers = 2)

    def forward(self, input):
        """Run the source indices through the LSTM and return ``(hidden, cell)``.

        The per-step LSTM outputs are discarded.
        """
        _, final_state = self.lstm(self.embedding(input))
        hidden, cell = final_state
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Single-step decoder: embedding, 2-layer LSTM, linear projection to vocabulary scores.

    No attention layer is defined in this class.
    """

    def __init__(self, output_size, embed_size, hidden_size):
        """
        Args:
            output_size: target vocabulary size (embedding rows and projection width).
            embed_size: embedding dimension.
            hidden_size: LSTM hidden dimension.
        """
        super().__init__()
        self.output_size, self.hidden_size = output_size, hidden_size

        # Layers are created in the same order as before.
        self.embedding = nn.Embedding(output_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers = 2)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, target, hidden, cell):
        """Decode one time step.

        ``target`` gets a leading length-1 axis before the LSTM, and that
        axis is squeezed out again before the projection.

        Returns ``(preds, hidden, cell)``: the vocabulary scores for this step
        and the updated LSTM state.
        """
        step_embed = self.embedding(target.unsqueeze(0))
        step_out, (hidden, cell) = self.lstm(step_embed, (hidden, cell) )
        return self.out(step_out.squeeze(0)), hidden, cell

SEQUENCE 2 SEQUENCE

In [None]:
class seq2seq(nn.Module):
    """LSTM encoder-decoder trained with per-step random teacher forcing.

    The encoder's final (hidden, cell) state initialises the decoder, which
    is then unrolled for ``max_length - 1`` steps. No attention layer is
    defined in this class.
    """

    def __init__(self, input_size, output_size, embed_size, hidden_size, max_length):
        """
        Args:
            input_size: source vocabulary size.
            output_size: target vocabulary size.
            embed_size: embedding dimension for both encoder and decoder.
            hidden_size: LSTM hidden dimension for both encoder and decoder.
            max_length: number of decoding positions, including position 0.
        """
        super(seq2seq, self).__init__()
        self.max_length = max_length
        self.encoder = Encoder(input_size, embed_size, hidden_size).to(device)
        self.decoder = Decoder(output_size, embed_size, hidden_size).to(device)

    def forward(self, src, target , teacher_forcing = 0.5):
        """Decode ``max_length`` positions for a batch.

        Args:
            src: source indices, sequence-first (``src.shape[1]`` is the batch size).
            target: target indices indexed as ``target[step]``, or ``None`` at
                inference time, in which case teacher forcing is never applied.
            teacher_forcing: probability, per step, of feeding the ground-truth
                token instead of the model's own prediction.

        Returns:
            Tensor of shape ``(max_length, batch, target_vocab_size)``.
            Row 0 is never written and stays all zeros.
        """
        batch_size = src.shape[1]
        max_length = self.max_length
        target_vocab_size = self.decoder.output_size

        outputs = torch.zeros(max_length, batch_size, target_vocab_size, device=device)
        hidden, cell = self.encoder(src)

        # First decoder input is the first source row, which is START_TOKEN
        # in the encoding used by this notebook.
        dinput = src[0,:]
        for l in range(1, max_length):
            output, hidden, cell = self.decoder(dinput, hidden, cell)
            outputs[l] = output
            # Fixed: the `if` line was missing its colon (SyntaxError).
            # The `target is not None` guard keeps inference from indexing None.
            if target is not None and random.random() < teacher_forcing:
                dinput = target[l]
            else:
                dinput = output.argmax(1)
        return outputs

In [None]:
def train( model, opt, lossfn, train_loader, num_epochs):
    """Train ``model`` for ``num_epochs`` epochs and return the mean loss of each epoch.

    Args:
        model: called as ``model(x, target=y)``.
        opt: optimiser; ``zero_grad()`` and ``step()`` are called once per batch.
        lossfn: called as ``lossfn(flat_logits, flat_targets)``.
        train_loader: iterable of ``(x, y)`` batches; both are transposed on
            axes 0 and 1 before being passed to the model.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        list with one entry per epoch: summed batch loss / ``len(train_loader)``.
    """
    model.train()
    history = []
    for epoch in range(num_epochs):
        running_loss = 0
        for src_batch, tgt_batch in tq.tqdm( train_loader ):
            # Swap the first two axes of each batch before the forward pass.
            src_batch = src_batch.transpose(0, 1)
            tgt_batch = tgt_batch.transpose(0, 1)

            opt.zero_grad()
            logits = model(src_batch, target = tgt_batch)

            # Position 0 is left out of the loss; the rest is flattened.
            flat_targets = tgt_batch[1:].reshape(-1)
            flat_logits = logits[1:].reshape(-1, logits.shape[-1])

            loss = lossfn(flat_logits, flat_targets)
            loss.backward()
            opt.step()

            running_loss = running_loss + loss.item()

        mean_loss = running_loss / len(train_loader)
        print(' Epoch : ', epoch , '   loss  : ', mean_loss )
        history.append(mean_loss)
    return history

In [None]:
def make_sentence(tokens):
    """Join tokens into a sentence, dropping the reserved token names.

    'START_TOKEN', 'END_TOKEN' and 'PAD_TOKEN' are skipped, the remaining
    tokens are joined with single spaces, and a space between two digits is
    removed (so "1 2" becomes "12").

    Returns the sentence string ('' when nothing is left).
    """
    special = ('START_TOKEN', 'END_TOKEN', 'PAD_TOKEN')
    sentence = ' '.join(tok for tok in tokens if tok not in special)
    # Raw string and plain lookarounds: the old pattern put '+' on the
    # lookarounds (a no-op) and relied on the invalid escape '\d' in a
    # normal string literal.
    return re.sub(r'(?<=\d) (?=\d)', '', sentence)

def translate(model, sentence, input_lang, output_lang, max_length):
    """Greedy-decode one tokenised source sentence into a target-language string.

    Puts ``model`` in eval mode (and leaves it there), encodes ``sentence``
    with ``input_lang``, runs the model without teacher forcing and maps the
    highest-scoring index of each step back to a word until END_TOKEN is
    produced.

    Args:
        model: seq2seq model, called as ``model(src, target=None, teacher_forcing=0)``.
        sentence: list of source tokens.
        input_lang: source ``Language``.
        output_lang: target ``Language``; its ``index2word`` is used for decoding.
        max_length: encoded length passed to tensorFromSentence.

    Returns:
        The decoded sentence as built by make_sentence.
    """
    model.eval()
    with torch.no_grad():
        src = tensorFromSentence( input_lang, sentence, max_length= max_length)
        # Add a batch axis of size 1 after the sequence axis.
        src = src.unsqueeze(1)
        output = model(src, target=None, teacher_forcing = 0)
        # Row 0 of the model output is never written (all zeros), so its
        # "best" index is an arbitrary tie; decode from step 1 onwards.
        best_indices = output[1:].argmax(dim=-1).reshape(-1).tolist()

    dec_words = []
    for index in best_indices:
        if index == END_TOKEN:
            break
        # The model may be built with more output slots than the vocabulary
        # has entries; skip such indices instead of raising KeyError.
        word = output_lang.index2word.get(index)
        if word is not None:
            dec_words.append(word)

    return make_sentence( dec_words )


PERFORMANCE EVALUATION

In [None]:
def get_bleu_score(model, pairs, input_lang, output_lang, max_length):
  """Print the mean sentence-level BLEU and METEOR of ``model`` over ``pairs``.

  Args:
    model: model passed to translate().
    pairs: list of ``[source_tokens, target_tokens]``.
    input_lang: source ``Language``.
    output_lang: target ``Language``.
    max_length: encoded length passed to translate().

  Returns:
    None; the two averages are printed.
  """
  total_num = len(pairs)
  if total_num == 0:
    # Nothing to average; avoid a ZeroDivisionError below.
    print("no pairs to evaluate")
    return

  total_bleu_scores = 0
  total_meteor_scores = 0
  
  for i in tq.tqdm( range(total_num) ):
    output    = translate(model, pairs[i][0], input_lang, output_lang, max_length)
    original  = make_sentence(pairs[i][1])
    # Both NLTK metrics take the gold reference first and the model
    # hypothesis second; the two were previously passed the other way round.
    total_bleu_scores   += sentence_bleu([original.split(" ")], output.split(" "))
    # NOTE(review): newer NLTK releases reportedly expect pre-tokenised input
    # for METEOR — confirm against the installed version.
    total_meteor_scores += single_meteor_score(original, output)

  bleu_result = total_bleu_scores/total_num
  meteor_result = total_meteor_scores/total_num
  
  print()
  print("bleu score: ",bleu_result)
  print("meteor score: ",meteor_result)

# **EXECUTION**

### READ AND PROCESS FILE

In [None]:
# Encoded length of every sentence (START, tokens, END and padding included) and the
# number of decoding positions of the model.
MAX_LENGTH = 32

In [None]:
# Dataset and checkpoint folders, both built from the Drive root `location`.
data_location = location + 'NMT/'
model_location = location + 'NMT/NMT_LSTM/'
# The first CSV column is used as the DataFrame index.
df = pd.read_csv(data_location+'train.csv',  index_col=0)

NameError: ignored

In [None]:
# load_from_file=1 reads the cached pickles instead of re-running the preprocessing loop.
pairs, tokens = process_pairs(df, load_from_file=1, location = data_location + 'DataPairs/')
# 80/20 split; the fixed random_state keeps the split reproducible.
train_pairs, test_pairs, train_tokens, test_tokens = train_test_split( pairs, tokens, test_size = 0.2, shuffle = True, random_state = 200)
# NOTE(review): fil_train / fil_train_tokens are not referenced by the later cells visible
# here, which use the unfiltered train_tokens — confirm whether the filtered data was
# meant to be used for vocabulary building and training.
fil_train, fil_train_tokens = get_filitered_data( MAX_LENGTH - 2, train_pairs, train_tokens)

GENERATE LANGUAGE

In [None]:
# Both vocabularies are built from the training split only.
hindi, english = generate_language(train_tokens)

GET TENSORS

In [None]:
# Encode every token pair as a pair of index tensors of length MAX_LENGTH.
train_tensors = tensorsFromPair(train_tokens, hindi, english, MAX_LENGTH)
# NOTE(review): test_tensors is not referenced by the later cells visible here.
test_tensors = tensorsFromPair(test_tokens, hindi, english, MAX_LENGTH)
# Batches of 256 (input, target) pairs, reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(train_tensors, batch_size=256, shuffle=True)

### TRAIN MODEL

In [None]:
# Model hyper-parameters.
hidden_size = 512
# Vocabulary sizes are num_words + 1, i.e. one slot more than the indices Language assigns.
# NOTE(review): checkpoints saved with these sizes will presumably not load if they change.
input_vocab_size = hindi.num_words + 1
output_vocab_size = english.num_words + 1
embedding_dim = 300
epochs = 20

In [None]:
model2 = seq2seq(input_vocab_size, output_vocab_size , embedding_dim, hidden_size, MAX_LENGTH)
# Adam with its default hyper-parameters.
opt = optim.Adam( model2.parameters() )
# Positions whose target is PAD_TOKEN do not contribute to the loss.
lossfn = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)

In [None]:
# Runs `epochs` training epochs; the returned per-epoch loss list is shown as the cell output.
train( model2, opt, lossfn, train_loader, epochs)

HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  0    loss  :  2.732063106447458


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  1    loss  :  2.619626010209322


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  2    loss  :  2.5192801848053934


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  3    loss  :  2.4324987776577474


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  4    loss  :  2.3544906467199325


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  5    loss  :  2.2653040904551744


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  6    loss  :  2.181982069090009


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  7    loss  :  2.11001419685781


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  8    loss  :  2.0231910202652217


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  9    loss  :  1.9554239954799413


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  10    loss  :  1.8735475439578295


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  11    loss  :  1.817426684498787


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  12    loss  :  1.7542444348335267


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  13    loss  :  1.6943654499948024


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  14    loss  :  1.6200547203421594


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  15    loss  :  1.5699385333806277


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  16    loss  :  1.517520174011588


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  17    loss  :  1.4485948540270328


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  18    loss  :  1.3891402546316385


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


 Epoch :  19    loss  :  1.3445690371096135


[2.732063106447458,
 2.619626010209322,
 2.5192801848053934,
 2.4324987776577474,
 2.3544906467199325,
 2.2653040904551744,
 2.181982069090009,
 2.11001419685781,
 2.0231910202652217,
 1.9554239954799413,
 1.8735475439578295,
 1.817426684498787,
 1.7542444348335267,
 1.6943654499948024,
 1.6200547203421594,
 1.5699385333806277,
 1.517520174011588,
 1.4485948540270328,
 1.3891402546316385,
 1.3445690371096135]

In [None]:
# Recorded per-epoch training losses.
# NOTE(review): presumably pasted by hand from two consecutive 20-epoch runs — the values
# in losses_20_40 match the output of the training cell above; confirm the origin of
# losses_0_20.
losses_0_20 = [6.523064902424812,
 6.037573310732841, 
 5.599939221143723, 
 5.315892766416073, 
 5.09729093015194, 
 4.904697574675083, 
 4.731893748044968, 
 4.542162449657917, 
 4.382418308407068, 
 4.203161864727735, 
 4.039230632781982, 
 3.8850651726126673, 
 3.722262720763683, 
 3.574678003042936, 
 3.4505377903580667, 
 3.313774961978197, 
 3.1757298365235327, 
 3.0560265742242336, 
 2.944844899326563, 
 2.8258959256112575]

losses_20_40 = [2.732063106447458,
 2.619626010209322,
 2.5192801848053934,
 2.4324987776577474,
 2.3544906467199325,
 2.2653040904551744,
 2.181982069090009,
 2.11001419685781,
 2.0231910202652217,
 1.9554239954799413,
 1.8735475439578295,
 1.817426684498787,
 1.7542444348335267,
 1.6943654499948024,
 1.6200547203421594,
 1.5699385333806277,
 1.517520174011588,
 1.4485948540270328,
 1.3891402546316385,
 1.3445690371096135]

In [None]:
# Save the weights (state_dict) and, separately, the whole pickled module.
# NOTE(review): the hard-coded 20 presumably accounts for 20 epochs of an earlier run, so
# the file suffix is the cumulative epoch count — confirm before reusing this cell.
torch.save(model2.state_dict(), model_location + 'lstm2_dict_' + str(20+epochs) )
torch.save(model2, model_location + 'lstm2_' + str(20+epochs) )

LOAD MODEL

In [None]:
# Rebuild the architecture with the same sizes, then load the trained weights into it.
model = seq2seq(input_vocab_size, output_vocab_size , embedding_dim, hidden_size, MAX_LENGTH)
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
model.load_state_dict( torch.load(model_location + 'lstm2_dict_40', map_location=device))
model.eval()

seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(40447, 300)
    (lstm): LSTM(300, 512, num_layers=2)
  )
  (decoder): Decoder(
    (embedding): Embedding(29686, 300)
    (lstm): LSTM(300, 512, num_layers=2)
    (out): Linear(in_features=512, out_features=29686, bias=True)
  )
)

### USE MODEL

In [None]:
# Translate every held-out pair and print the mean sentence BLEU / METEOR.
get_bleu_score(model, test_tokens, hindi, english, MAX_LENGTH)

HBox(children=(FloatProgress(value=0.0, max=20465.0), HTML(value='')))

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()




bleu score:  0.022128485815101656
meteor score:  0.24826904310138603


### USE MODEL FOR TRANSLATION

In [None]:
# Weekly file whose 'hindi' column is translated below; the first CSV column is the index.
week = pd.read_csv(data_location+'Weekly Data/Week2/week2.csv', index_col=0)

In [None]:
# Clean and tokenise every Hindi sentence of the weekly file.
# Fixed: the loop read from `week1`, which is never defined; the frame is named `week`.
week_processed = [get_hindi_tokens(preprocess_hindi(sentence)) for sentence in week['hindi']]

In [None]:
# Spot-check: translate the first weekly sentence.
translate(model, week_processed[0], hindi, english, MAX_LENGTH)

'what is they saying on the table or what are you'

In [None]:
# Translate every preprocessed weekly sentence.
# Fixed: the loop length came from `week_tensors`, which is never defined.
translated_texts = []
for sentence_tokens in tq.tqdm( week_processed ):
  translated_texts.append( translate(model, sentence_tokens, hindi, english, MAX_LENGTH) )

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [None]:
# Write one translation per line. The encoding is pinned so that the output does not
# depend on the platform's default text encoding.
with open(data_location + 'Weekly Data/Week2/lstm.txt', 'w', encoding='utf-8') as f:
    for item in translated_texts:
        f.write("%s\n" % item)

In [None]:
#torch.save( tmodel.state_dict(), model_location + 'gru_dict_100')
#torch.save(model, location+ 'gru_enc_dec')

#tmodel = torch.load(model_location+ 'gru_100')
#tmodel.eval()

#tq.tqdm._instances.clear()