<a href="https://colab.research.google.com/github/amantayal44/Hindi-to-English-NMT/blob/main/phase3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PHASE 3 (NMT Hindi to English)

In phase 1, I tested various seq2seq models whose encoders were a biLSTM, LSTM, GRU or biGRU, each paired with a similar decoder. I found that a biGRU encoder with a GRU decoder gives the best scores on the test data. In phase 2, I tested the seq2seq model with a biGRU encoder and a GRU decoder using different vocabulary sizes and different initializations. The model with an 8k vocabulary performed better. In this phase, I will add an attention layer to my model as in the [paper](https://arxiv.org/pdf/1409.0473.pdf), use 
a 10k vocabulary size, and write my own vocabulary code instead of using the torchtext library.

## Setup and Installing Libraries

I mount Google Drive in Colab so that the training data can be loaded easily and the trained model can be stored there.

In [None]:
#ignore if file not on drive (change file address while creating dataset)
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor
INDIC_NLP_LIB_HOME=r"/content/indic_nlp_library"
INDIC_NLP_RESOURCES="/content/indic_nlp_resources"
!pip install nltk -U
!python3 -m spacy download en
!pip install revtok

Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 1271, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 1271 (delta 50), reused 54 (delta 25), pack-reused 1178[K
Receiving objects: 100% (1271/1271), 9.56 MiB | 13.82 MiB/s, done.
Resolving deltas: 100% (654/654), done.
Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126[K
Receiving objects: 100% (133/133), 149.77 MiB | 30.78 MiB/s, done.
Resolving deltas: 100% (51/51), done.
Collecting Morfessor
  Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6
Collecting nltk
[?25l  Downloadi

In [None]:
import sys
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader
loader.load()
from tqdm import tqdm
import nltk
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score
from indicnlp.tokenize import indic_tokenize 
import csv 
import re
import warnings
warnings.filterwarnings("ignore") #uncomment only if code is done

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## Preprocessing

In [None]:
# load the parallel corpus as [hindi, english] pairs (columns 1 and 2 of the csv)
#if not on drive, change "gdrive/MyDrive/train.csv" to "train.csv"
with open("gdrive/MyDrive/train.csv",encoding="utf-8") as f:
  csv_reader = csv.reader(f, delimiter=',')
  next(csv_reader, None) # the first row is the header, not data
  orignal_dataset = [[row[1],row[2]] for row in csv_reader]

In [None]:
len(orignal_dataset)

102322

In [None]:
#non hindi symbols (some symbols that i found in hindi sentences)
non_hindi_chr = ['♫', '#', '$', '%', '&', '£', '¥', '§', '©', 'Â', 'è', 'Ã', '€','[',']']

In [None]:
# function to clean data
def clean_data(dataset,max_length=20):
  """Filter out sentence pairs that are unsuitable for training.

  A pair (data[0] = hindi, data[1] = english) is kept only when
    * the english sentence contains ascii characters only,
    * neither sentence has more than max_length tokens (hindi is counted with
      indic_tokenize.trivial_tokenize, english by splitting on single spaces),
    * the hindi sentence contains none of the symbols in non_hindi_chr.

  The first five rejected pairs and the number of rejected pairs are printed.
  Returns the list of kept pairs (the same objects, in the original order).
  """
  kept = []
  shown = 0 #how many rejected pairs were printed so far
  for pair in dataset:
    hindi_len = len(indic_tokenize.trivial_tokenize(pair[0]))
    eng_len = len(pair[1].split(" "))
    hindi_is_clean = not any(symbol in pair[0] for symbol in non_hindi_chr)
    eng_is_ascii = re.search(r'[^\x00-\x7F]+',pair[1]) is None
    fits = max(hindi_len,eng_len) <= max_length
    if eng_is_ascii and fits and hindi_is_clean:
      kept.append(pair)
    elif shown<5:
      #printing some removed sentences
      if shown == 0: print("Some removed datasets")
      shown += 1
      print("{}. \"{}\" , \"{}\"".format(shown,pair[0],pair[1]))
  print("removed {} of {} datasets".format(len(dataset)-len(kept),len(dataset)))
  return kept

In [None]:
def capitalize_str(s):
  """Upper-case the first lowercase ascii letter of every sentence in s.

  A sentence starts at the beginning of the string and after each '?', '.' or
  '!'. Spaces are skipped while looking for that first letter; any other
  character (digit, upper-case letter, other symbol) cancels the pending
  capitalization. All characters are otherwise returned unchanged.
  """
  pending = True #True while the next lowercase letter should be capitalized
  pieces = []
  for ch in s:
    if ch == " ":
      pieces.append(ch)
      continue
    if "a" <= ch <= "z":
      pieces.append(ch.upper() if pending else ch)
      pending = False
    else:
      pending = ch in ('?','.','!')
      pieces.append(ch)
  return "".join(pieces)

In [None]:
#to preprocess english sentence
def preprocess_eng(sentence):
  """Normalize an english sentence before tokenization.

  Steps, in order:
    1. lower-case and strip the sentence,
    2. expand common contractions ("i'm" -> "i am", "n't" -> " not", ...),
    3. put a space on both sides of each of ? . ! ,
    4. collapse every run of spaces into one space and strip again.

  Note that step 4 uses the character class [" "], which contains the double
  quote as well as the space, so double-quote characters are removed from the
  sentence too. That behaviour is kept on purpose so the output is unchanged.

  Returns the normalized sentence as a string.
  """
  sentence = sentence.lower().strip() #lower case letters
  # removing shortforms (order matters: whole-word forms first, "n't" last)
  contractions = (
    (r"i'm","i am"),
    (r"let's","let us"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"n't"," not"),
  )
  for pattern,expansion in contractions:
    sentence = re.sub(pattern,expansion,sentence)

  sentence = re.sub(r"([?.!,])", r" \1 ", sentence) #creating space b/w punctuation
  sentence = re.sub(r'[" "]+', " ", sentence) # collapsing runs of spaces (also drops double quotes)
  return sentence.strip()

# some corresponding postprocess to increase score
def postprocess_eng(sentence,remove_unk=False):
  """Turn a space-separated, lower-case token string back into readable text.

  sentence   -- tokens joined by single spaces (e.g. model output)
  remove_unk -- when True, every "<unk>" token is dropped before formatting

  Steps: optionally drop "<unk>" tokens, capitalize the first character,
  upper-case every standalone "i", and remove the space in front of
  ? . ! , ' and : so punctuation attaches to the previous word.

  Returns the formatted sentence.
  """
  if remove_unk:
    # filtering tokens (instead of substituting " <unk> ") also removes
    # consecutive, leading and trailing <unk> tokens, and it must happen before
    # the punctuation step so that "<unk> ." does not become "<unk>."
    sentence = " ".join(tok for tok in sentence.split(" ") if tok != "<unk>")
  sentence = sentence.capitalize() #capitalize first
  # standalone 'i' -> 'I'; the lookarounds also catch an 'i' at the end of the
  # sentence and two 'i' tokens in a row, which a " i " pattern misses
  sentence = re.sub(r"(?<!\S)i(?!\S)","I",sentence)
  sentence = re.sub(r" ([?.!,'\:])",r"\1",sentence) #remove space b/w last word and punctuation
  return sentence


In [None]:
#preprocess data using above functions and check bleu score of preprocessed output
def data_preprocessing(dataset,max_length=20):
  """Preprocess and clean the corpus, then report how lossy the preprocessing is.

  dataset    -- iterable of [hindi, english] pairs
  max_length -- passed to clean_data as the maximum sentence length

  Every pair becomes [hindi, preprocess_eng(english), english]. After
  clean_data has filtered the pairs, the preprocessed english sentence is run
  through postprocess_eng and compared with the untouched english sentence;
  the mean sentence BLEU and METEOR of that round trip are printed.

  Returns the cleaned list of [hindi, preprocessed english, original english].
  """
  new_dataset = [[data[0],preprocess_eng(data[1]),data[1]] for data in dataset]
  new_dataset = clean_data(new_dataset,max_length)

  l = len(new_dataset)
  if l == 0:
    # nothing survived cleaning, so there is nothing to average over
    return new_dataset

  # comparing change in bleu score and meteor score
  total_bleu_score = 0
  total_meteor_score = 0
  for data in tqdm(new_dataset):
    restored = postprocess_eng(data[1]) #computed once, used by both metrics
    total_bleu_score += sentence_bleu([data[2].split(" ")], restored.split(" "))
    # NOTE(review): single_meteor_score is given plain strings here; check that the
    # installed nltk version accepts strings rather than token lists
    total_meteor_score += single_meteor_score(data[2],restored)

  print("\nbleu score {}".format(round(total_bleu_score/l,2)))
  print("meteor score {}".format(round(total_meteor_score/l,2)))

  return new_dataset

In [None]:
dataset = data_preprocessing(orignal_dataset,max_length=25)

Some removed datasets
1. "एल सालवाडोर मे, जिन दोनो पक्षों ने सिविल-युद्ध से वापसी ली, उन्होंने वही काम किये जो कैदियों की कश्मकश के निदान हैं।" , "in el salvador , both sides that withdrew from their civil war took moves that had been proven to mirror a prisoner's dilemma strategy ."
2. "पर मेरे लिए उसका यहुदी विरोधी होना उसके कार्यों को और भी प्रशंसनीय बनाता है क्योंकि उसके पास भी पक्षपात करने के वही कारण थे जो बाकी फौजियों के पास थे पर उसकी सच जानने और उसे बनाए रखने की प्रेरणा सबसे ऊपर थी" , "but personally , for me , the fact that picquart was anti-semitic actually makes his actions more admirable , because he had the same prejudices , the same reasons to be biased as his fellow officers , but his motivation to find the truth and uphold it trumped all of that ."
3. "तो स्मार्ट में, हमारे पास लक्ष्य के अलावा, मलेरिया टीका विकसित करने के, हम अफ्रीकी वैज्ञानिकों को भी प्रशिक्षण दे रहे हैं, क्योंकि अफ्रीका में बीमारी का बोझ काफी ज़्यादा है, और आपको उन लोगों की आवश्यकता है जो सीमाओं को आ

  0%|          | 0/89076 [00:00<?, ?it/s]

removed 13246 of 102322 datasets


100%|██████████| 89076/89076 [00:19<00:00, 4459.82it/s]


bleu score 0.58
meteor score 0.92





In [None]:
#creating train,val and test set of data with 0.8,0.1,0.1 split
from sklearn.model_selection import train_test_split
train_set,val_set = train_test_split(dataset,test_size=0.2,random_state=42)
val_set,test_set = train_test_split(val_set,test_size=0.5,random_state=42)
len(train_set),len(val_set),len(test_set)

(71260, 8908, 8908)

## Tokenization

spaCy is used to tokenize English and indic_tokenize to tokenize Hindi.

In [None]:
from collections import Counter
import torch
import random
import spacy
nlp = spacy.load("en")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
#class for creating vocab
class vocab:
  """Word <-> index mapping built from token counts.

  Indices 0-3 are reserved for '<pad>', '<unk>', '<eos>' and '<sos>'. They are
  followed by at most max_size words from counter, most frequent first (ties
  are broken alphabetically so the result is deterministic).

  counter  -- mapping word -> count (anything with .items(), e.g. a Counter)
  max_size -- maximum number of corpus words, excluding the 4 reserved tokens

  vocab[word] returns the index of word, or the index of '<unk>' when the word
  is not in the vocabulary. len(vocab) counts the reserved tokens too.
  """
  def __init__(self,counter,max_size):
    self.index2word = ['<pad>', '<unk>', '<eos>', '<sos>']
    self.unk_idx = 1
    self.max_size = max_size+4
    reserved = set(self.index2word)
    # most frequent first, alphabetical among equal counts
    words = sorted(counter.items(), key=lambda t: (-t[1], t[0]))
    for w,_ in words:
      if len(self.index2word) >= self.max_size:
        break
      # a corpus token that looks like a reserved token must not be added a second
      # time, otherwise word2index would point the reserved token at the new slot
      if w in reserved:
        continue
      self.index2word.append(w)
    self.word2index = {w: i for i,w in enumerate(self.index2word)}
  def __len__(self):
    return len(self.index2word)
  def __getitem__(self,word):
    return self.word2index.get(word,self.unk_idx)

In [None]:
#spacy for english tokenization and indic library for hindi
def eng_tokenizer(sentence):
  """Return the list of token strings spacy's tokenizer produces for sentence."""
  return [tok.text for tok in nlp.tokenizer(sentence)]

hindi_tokenizer = indic_tokenize.trivial_tokenize

In [None]:
##uncomment if using saved tokenized sentence
#spacy tokenization taking very long time (about 40-50 mins)
#so i tokenized all sentences once using spacy and saved as pickle
# import pickle
# with open("gdrive/MyDrive/cs779_model/final_eng_sen.pickle", 'rb') as handle:
#     eng_sen =  pickle.load(handle)


#tokenizing english sentences
#may take 40-50 mins
#eng_sen maps every preprocessed english sentence to its list of spacy tokens,
#so later steps can look the tokens up instead of running spacy again
#to show loop progress wrap dataset in tqdm(...) (it sometimes prints gibberish output)
eng_sen = {data[1]: eng_tokenizer(data[1]) for data in dataset}

In [None]:
#function to create vocab on given sentences
def get_vocab(dataset,eng_tokenizer,hindi_tokenizer,max_size_eng=5000,max_size_hindi=5000):
  """Count tokens in dataset and build the english and hindi vocabularies.

  dataset         -- iterable of rows with hindi text at [0] and english at [1]
  eng_tokenizer   -- callable used for english sentences missing from eng_sen
  hindi_tokenizer -- callable that splits a hindi sentence into tokens
  max_size_eng    -- maximum number of english words in the vocabulary
  max_size_hindi  -- maximum number of hindi words in the vocabulary

  Returns (eng_vocab, hindi_vocab, eng_counter, hindi_counter).
  """
  eng_counter = Counter()
  hindi_counter = Counter()
  for data in tqdm(dataset):
    # using saved spacy tokenize sentences; sentences that were never cached are
    # tokenized on the fly instead of failing with a KeyError
    eng_tokens = eng_sen.get(data[1])
    if eng_tokens is None:
      eng_tokens = eng_tokenizer(data[1])
    eng_counter.update(eng_tokens)
    hindi_counter.update(hindi_tokenizer(data[0]))

  eng_vocab = vocab(eng_counter,max_size_eng)
  hindi_vocab = vocab(hindi_counter,max_size_hindi)
  return eng_vocab,hindi_vocab,eng_counter,hindi_counter

In [None]:
#using vocab size of 10k
eng_vocab,hindi_vocab,eng_counter,hindi_counter = get_vocab(train_set,eng_tokenizer,hindi_tokenizer,10000,10000)

100%|██████████| 71260/71260 [00:01<00:00, 48567.13it/s]


In [None]:
#function to tokenize data using vocab
def tokenize(dataset,eng_tokenizer,hindi_tokenizer,eng_vocab,hindi_vocab):
  """Convert sentence pairs into pairs of index tensors.

  dataset         -- iterable of rows with hindi text at [0] and english at [1]
  eng_tokenizer   -- callable used for english sentences missing from eng_sen
  hindi_tokenizer -- callable that splits a hindi sentence into tokens
  eng_vocab       -- vocab used to index english tokens
  hindi_vocab     -- vocab used to index hindi tokens

  Each sentence is wrapped as <sos> tokens... <eos> before indexing.
  Returns a list of [hindi_tensor, english_tensor] with dtype torch.long.
  """
  tokenized_data = []
  for data in tqdm(dataset):
    # using saved spacy tokenize sentences; sentences that were never cached are
    # tokenized on the fly instead of failing with a KeyError
    eng_tokens = eng_sen.get(data[1])
    if eng_tokens is None:
      eng_tokens = eng_tokenizer(data[1])
    eng_ids = [eng_vocab['<sos>']]+[eng_vocab[t] for t in eng_tokens]+[eng_vocab['<eos>']]
    hindi_ids = [hindi_vocab['<sos>']]+[hindi_vocab[t] for t in hindi_tokenizer(data[0])]+[hindi_vocab['<eos>']]
    eng_data = torch.tensor(eng_ids, dtype=torch.long)
    hindi_data = torch.tensor(hindi_ids, dtype=torch.long)
    tokenized_data.append([hindi_data,eng_data])
  return tokenized_data

In [None]:
tokenized_train_data= tokenize(train_set,eng_tokenizer,hindi_tokenizer,eng_vocab,hindi_vocab)
tokenized_val_data= tokenize(val_set,eng_tokenizer,hindi_tokenizer,eng_vocab,hindi_vocab)
tokenized_test_data = tokenize(test_set,eng_tokenizer,hindi_tokenizer,eng_vocab,hindi_vocab)

100%|██████████| 71260/71260 [00:03<00:00, 22023.42it/s]
100%|██████████| 8908/8908 [00:00<00:00, 24021.94it/s]
100%|██████████| 8908/8908 [00:00<00:00, 23252.04it/s]


Creating batches of data and padding them using the PyTorch DataLoader.

In [None]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

BATCH_SIZE = 128
pad_hindi = hindi_vocab['<pad>']
pad_eng = eng_vocab['<pad>']

In [None]:
def get_data(data):
  """Collate function: turn a list of (hindi, english) tensors into two padded batches.

  Every sequence is padded to the longest one of its language in the batch,
  using pad_hindi / pad_eng as fill value. pad_sequence is called with its
  default layout, so the returned tensors are [max_len, batch_size].

  Returns (hindi_batch, eng_batch).
  """
  hindi_batch = [hindi for hindi,_ in data]
  eng_batch = [eng for _,eng in data]
  return (pad_sequence(hindi_batch,padding_value=pad_hindi),
          pad_sequence(eng_batch,padding_value=pad_eng))

In [None]:
# only the training batches are reshuffled every epoch
train_data = DataLoader(tokenized_train_data, batch_size=BATCH_SIZE,shuffle=True,collate_fn=get_data)
# validation and test data keep a fixed order: the reported loss is an average of
# per-batch losses, so shuffling would make it vary slightly from run to run
val_data = DataLoader(tokenized_val_data, batch_size=BATCH_SIZE,shuffle=False,collate_fn=get_data)
test_data = DataLoader(tokenized_test_data, batch_size=BATCH_SIZE,shuffle=False,collate_fn=get_data)

## Model

Seq2seq model with a biGRU encoder, a GRU decoder and an attention layer.

In [None]:
from torch import nn
from torch.nn import LSTM,GRU,Linear,Embedding
import torch.optim as optim
import time

In [None]:
e_vocab_size = len(hindi_vocab) #encoder vocab size
d_vocab_size = len(eng_vocab) #decoder vocab size

In [None]:
class biEncoder(nn.Module):
  """Bidirectional GRU encoder.

  vocab_size -- number of source tokens
  emb_size   -- embedding dimension
  hid_size   -- hidden size of each GRU direction
  dropout    -- dropout probability applied to the embeddings
  out        -- size of the returned hidden state (the decoder's hidden size)

  forward(input) takes token indices of shape [len, batch_size] and returns
    outputs -- [len, batch_size, 2*hid_size], the GRU output at every position
    h       -- [1, batch_size, out], a linear projection of the concatenated
               final hidden states of the two directions
  """
  def __init__(self,vocab_size,emb_size=512,hid_size=512,dropout=0.5,out=512):
    super().__init__()
    self.vocab_size = vocab_size
    self.hid_size = hid_size
    self.embedding = nn.Embedding(vocab_size,emb_size)
    self.dropout = nn.Dropout(dropout)
    self.rnn = nn.GRU(emb_size,hid_size,bidirectional=True) #bidirectional
    self.out = nn.Linear(2*hid_size,out)

  def forward(self,input):
    token_vectors = self.dropout(self.embedding(input))
    outputs,final_states = self.rnn(token_vectors)
    #final_states holds one final hidden state per direction; join them feature-wise
    merged = torch.cat((final_states[0], final_states[1]), dim = 1)
    return outputs,self.out(merged).unsqueeze(0)
    
#Attention Layer
class attention(nn.Module):
  """Additive attention over the encoder outputs.

  hid_size_e -- hidden size of one encoder direction (outputs are 2*hid_size_e wide)
  hid_size_d -- decoder hidden size
  attn_size  -- size of the intermediate attention layer

  forward(output, h)
    output -- encoder outputs, [len, batch_size, 2*hid_size_e]
    h      -- decoder hidden state, [1, batch_size, hid_size_d]
  returns
    context_vector -- [1, batch_size, 2*hid_size_e], weighted sum of output
    attn_weights   -- [len, batch_size, 1], softmax over the len dimension
  """
  def __init__(self,hid_size_e=512,hid_size_d=512,attn_size=256):
    super().__init__()
    self.hid_size_e = hid_size_e
    self.hid_size_d = hid_size_d
    self.attn_size = attn_size
    self.ff_1 = nn.Linear(hid_size_e*2,attn_size) #enc_outputs
    self.ff_2 = nn.Linear(hid_size_d,attn_size) #hidden
    self.ff_3 = nn.Linear(attn_size,1) #final score

  def forward(self,output,h):
    #the projected hidden state broadcasts over the len dimension of output
    energy = torch.tanh(self.ff_1(output)+self.ff_2(h))
    attn_weights = torch.softmax(self.ff_3(energy),dim=0)
    #weighting every position and summing them up gives the context vector
    context_vector = (attn_weights*output).sum(dim=0,keepdim=True)
    return context_vector,attn_weights

class biDecoder(nn.Module):
  """GRU decoder with attention that predicts one target token per call.

  vocab_size -- number of target tokens
  emb_size   -- embedding dimension
  hid_size_e -- hidden size of one encoder direction
  hid_size_d -- decoder hidden size
  attn_size  -- size of the intermediate attention layer
  dropout    -- dropout probability applied to the embeddings

  forward(input, enc_outputs, h)
    input       -- previous target token ids, one per batch element
    enc_outputs -- encoder outputs, [len, batch_size, 2*hid_size_e]
    h           -- current decoder hidden state, [1, batch_size, hid_size_d]
  returns
    output       -- [batch_size, vocab_size] scores for the next token
    h            -- updated hidden state
    attn_weights -- attention weights returned by the attention layer
  """
  def __init__(self,vocab_size,emb_size=256,hid_size_e=512,hid_size_d=512,attn_size=256,dropout=0.5):
    super().__init__()
    self.vocab_size = vocab_size
    self.hid_size_e = hid_size_e
    self.hid_size = hid_size_d
    self.embedding = nn.Embedding(vocab_size,emb_size)
    self.dropout = nn.Dropout(dropout)
    self.attention = attention(hid_size_e,hid_size_d,attn_size) #attention layer
    self.rnn = nn.GRU(emb_size+2*hid_size_e,hid_size_d) #unidirectional
    #output: [hidden,embedding,context_vector] -> vocab
    self.out = nn.Linear(hid_size_d+emb_size+2*hid_size_e,vocab_size)

  def forward(self,input,enc_outputs,h):
    #a single decoding step, so the token ids become a sequence of length 1
    embedded = self.dropout(self.embedding(input.view(1,-1)))
    context_vector,attn_weights = self.attention(enc_outputs,h) #applying attention
    rnn_output,h = self.rnn(torch.cat([embedded,context_vector],dim=2),h)
    #using context_vector,embedding and rnn output for predicting next word (act as some kind of residual connection)
    features = torch.cat([embedded,context_vector,rnn_output],dim=2).squeeze(0)
    return self.out(features),h,attn_weights



class biseq2seq(nn.Module):
  """Encoder-decoder model: biEncoder + attention biDecoder.

  device       -- device on which the output tensor is created
  e_vocab_size -- encoder (source) vocabulary size
  d_vocab_size -- decoder (target) vocabulary size
  the remaining arguments are passed on to biEncoder and biDecoder

  forward(input, target, forcing)
    input   -- source token ids, [src_len, batch_size]
    target  -- target token ids, [len, batch_size], first row is <sos>
    forcing -- probability of feeding the true target token (teacher forcing)
               instead of the model's own prediction at each step
  returns a [len, batch_size, d_vocab_size] tensor of scores; row 0 is left at zero.
  """
  def __init__(self,device,e_vocab_size,d_vocab_size,emb_size=256,hid_size_e=512,hid_size_d=512,attn_size=256,dropout=0.5):
    super().__init__()
    self.d_vocab_size = d_vocab_size
    self.e_vocab_size = e_vocab_size
    self.encoder = biEncoder(e_vocab_size,emb_size,hid_size_e,dropout,out=hid_size_d)
    self.decoder = biDecoder(d_vocab_size,emb_size,hid_size_e,hid_size_d,attn_size,dropout)
    self.device = device

  def forward(self,input,target,forcing = 0.5):
    target_len,batch_size = target.shape[0],target.shape[1]
    #output of dim [len,batch_size,vocab_size]
    output = torch.zeros(target_len,batch_size,self.d_vocab_size).to(self.device)
    enc_outputs,h = self.encoder(input)

    decoder_input = target[0] #first token of target, which is <sos>
    for step in range(1,target_len):
      scores,h,_ = self.decoder(decoder_input,enc_outputs,h)
      output[step] = scores
      #teacher forcing: feed either the true next token or the model's prediction
      use_target = random.random() < forcing
      decoder_input = target[step] if use_target else scores.argmax(1)

    return output

In [None]:
#using norm clipping to avoid exploding gradient in GRU
#one optimizer step per mini-batch
def train(model,dataset,optimizer,loss_fn,clip=1):
  """Train model for one pass over dataset and return the mean batch loss.

  model     -- model called as model(input, target)
  dataset   -- iterable of (input, target) batches that supports len()
  optimizer -- optimizer over the model's parameters
  loss_fn   -- loss taking ([N, vocab_size] scores, [N] targets)
  clip      -- maximum gradient norm, or None to disable clipping
  """
  model.train()
  running_loss = 0
  for src,tgt in dataset:
    optimizer.zero_grad()
    src,tgt = src.to(device),tgt.to(device)
    scores = model(src,tgt)
    #row 0 belongs to <sos> (never predicted), so it is dropped from both tensors
    #scores: [len,batch_size,vocab_size] -> [(len-1)*batch_size,vocab_size]
    #tgt:    [len,batch_size]            -> [(len-1)*batch_size]
    flat_scores = scores[1:].view(-1,scores.shape[-1])
    batch_loss = loss_fn(flat_scores,tgt[1:].view(-1))
    batch_loss.backward()
    if clip is not None:
      #norm clipping to avoid exploding gradients
      torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step() #updating the parameters
    running_loss += batch_loss.item()

  return running_loss/len(dataset)

def evaluate(model,dataset,loss_fn):
  """Return the mean batch loss of model on dataset without updating it.

  The model is put in eval mode (no dropout), gradients are not tracked and
  teacher forcing is switched off, so the decoder is always fed its own
  previous prediction.
  """
  model.eval()
  running_loss = 0
  with torch.no_grad():
    for src,tgt in dataset:
      src,tgt = src.to(device),tgt.to(device)
      scores = model(src,tgt,0) #forcing = 0
      #same flattening as in training: the <sos> row is dropped
      flat_scores = scores[1:].view(-1,scores.shape[-1])
      running_loss += loss_fn(flat_scores,tgt[1:].view(-1)).item()

  return running_loss/len(dataset)

In [None]:
#if validation loss increase by more than stop it terminates training
def fit(model,train_data,val_data,optimizer,loss_fn,name="model",EPOCHS=10,clip=1,stop=0.1,test_data=None):
    """Train model with early stopping and checkpointing.

    model      -- model to train
    train_data -- batches passed to train()
    val_data   -- batches passed to evaluate() after every epoch
    optimizer  -- optimizer over the model's parameters
    loss_fn    -- loss function
    name       -- checkpoint prefix: name+'_best.pt' holds the weights with the
                  lowest validation loss, name+'.pt' the weights after the last epoch
    EPOCHS     -- maximum number of epochs
    clip       -- gradient-norm clipping value passed to train()
    stop       -- training stops once the validation loss exceeds the best
                  validation loss by more than this fraction
    test_data  -- optional batches; when given, the last and the best weights
                  are both evaluated on it

    Returns a list with one dict per epoch (epoch, train_loss, val_loss, time).
    When test_data is given and a best checkpoint was saved, the model is left
    holding the best weights, not the last ones.
    """
    history = [] #used for futher analysis and graph
    min_val = float("inf") #lowest validation loss seen so far
    best_saved = False #whether this run has written name+'_best.pt'
    for epoch in range(EPOCHS):
        start = time.time()
        train_loss = train(model, train_data, optimizer,loss_fn,clip)
        val_loss = evaluate(model, val_data,loss_fn)
        end = time.time()
        print("train loss: {:.3f} val loss: {:.3f}".format(train_loss,val_loss))
        t = end - start
        print("time taken by {} epoch {} min {} s".format(epoch+1,int(t/60),int(t%60)))
        history.append({
            "epoch":epoch,
            "train_loss": train_loss,
            "val_loss": val_loss,
            "time": t,
        })
        if val_loss<min_val:
            min_val = val_loss
            #saving best model
            torch.save(model.state_dict(), name+'_best.pt')
            best_saved = True
        # stop training if val_loss increase more than by stop
        if val_loss > min_val*(1+stop):
            break
    #saving model on last epoch
    torch.save(model.state_dict(), name+'.pt')
    #evaluating both best and last model on test set
    if test_data is not None:
        test_loss = evaluate(model,test_data,loss_fn)
        if best_saved:
            model.load_state_dict(torch.load(name+'_best.pt'))
            test_loss_best = evaluate(model,test_data,loss_fn)
            print("test loss: {:.3f} test loss on best val: {:.3f}".format(test_loss,test_loss_best))
        else:
            # no checkpoint was written in this run (no epochs, or the loss was never
            # comparable), so a stale file from an earlier run must not be loaded
            print("test loss: {:.3f}".format(test_loss))

    return history

In [None]:
#using embedding size = 512 and hidden dimension size of both encoder and decoder = 512
#attention hidden dim = 256 (default value)
model = biseq2seq(device,e_vocab_size,d_vocab_size,emb_size=512,hid_size_e=512,hid_size_d=512).to(device)

In [None]:
#counting no. of parameters in model
def parameters_count(model):
    """Return the total number of elements in the trainable parameters of model."""
    total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            total += parameter.numel()
    return total

In [None]:
print(parameters_count(model)) #37.9M

37961749


In [None]:
#using cross entropy loss with ignoring output for padded values
loss_fn = nn.CrossEntropyLoss(ignore_index = pad_eng)
#using adam optimizer (as dicussed in class)
optimizer = optim.Adam(model.parameters())

In [None]:
def initialize_xavier_normal(model):
  """Re-initialize the weight matrices of model with Xavier normal values.

  Every parameter whose name contains "weight" and that has at least two
  dimensions is overwritten in place. Biases are left untouched, and so are
  1-D weights, for which xavier_normal_ would raise because it cannot compute
  fan-in and fan-out.
  """
  for name,parameter in model.named_parameters():
    if "weight" in name and parameter.dim() >= 2:
      nn.init.xavier_normal_(parameter.data)

In [None]:
#initializing models
#using xavier normal initializer
model.apply(initialize_xavier_normal)

biseq2seq(
  (encoder): biEncoder(
    (embedding): Embedding(10004, 512)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): GRU(512, 512, bidirectional=True)
    (out): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): biDecoder(
    (embedding): Embedding(10004, 512)
    (dropout): Dropout(p=0.5, inplace=False)
    (attention): attention(
      (ff_1): Linear(in_features=1024, out_features=256, bias=True)
      (ff_2): Linear(in_features=512, out_features=256, bias=True)
      (ff_3): Linear(in_features=256, out_features=1, bias=True)
    )
    (rnn): GRU(1536, 512)
    (out): Linear(in_features=2048, out_features=10004, bias=True)
  )
)

## Training

In [None]:
#takes around 33-34 mins
#using 10 epochs
history = fit(model,train_data,val_data,optimizer,loss_fn,name="attention",test_data=test_data,EPOCHS=10)

train loss: 4.762 val loss: 4.431
time taken by 1 epoch 3 min 18 s
train loss: 3.717 val loss: 3.944
time taken by 2 epoch 3 min 22 s
train loss: 3.098 val loss: 3.769
time taken by 3 epoch 3 min 22 s
train loss: 2.663 val loss: 3.757
time taken by 4 epoch 3 min 23 s
train loss: 2.346 val loss: 3.794
time taken by 5 epoch 3 min 23 s
train loss: 2.135 val loss: 3.849
time taken by 6 epoch 3 min 22 s
train loss: 1.976 val loss: 3.920
time taken by 7 epoch 3 min 22 s
train loss: 1.836 val loss: 4.018
time taken by 8 epoch 3 min 22 s
train loss: 1.719 val loss: 4.096
time taken by 9 epoch 3 min 22 s
train loss: 1.614 val loss: 4.139
time taken by 10 epoch 3 min 22 s
test loss: 4.173 test loss on best val: 3.772


## Evaluating

In [None]:
#output by model for hindi sentence
def inference(model,sentence,eng_vocab,hindi_vocab,max_len=40):
  """Greedily translate one encoded hindi sentence.

  model       -- trained model with .encoder and .decoder
  sentence    -- 1-D tensor of hindi token ids, [len]
  eng_vocab   -- english vocab (provides the <sos> and <eos> ids)
  hindi_vocab -- hindi vocab (not used inside this function)
  max_len     -- maximum number of decoding steps

  Returns (token_ids, attentions): the predicted english token ids without
  <sos> and <eos>, and one flattened attention-weight tensor per decoding step.
  """
  model.eval() #in eval mode
  #dim from [len] to [len,1] (as not in batches)
  source = sentence.unsqueeze(1).to(device)
  sos_id,eos_id = eng_vocab['<sos>'],eng_vocab['<eos>']
  decoded = [sos_id]
  attentions = []
  with torch.no_grad():
    enc_outputs,h = model.encoder(source)
    for _ in range(max_len):
      #the previously produced token is the next decoder input
      previous = torch.tensor([decoded[-1]],dtype=torch.long).to(device)
      scores,h,attn = model.decoder(previous,enc_outputs,h)
      attentions.append(attn.view(-1))
      best = scores.argmax(1).item() #token with the highest score
      if best == eos_id: #end of sentence
        break
      decoded.append(best)
  return decoded[1:],attentions #ignoring <sos> token

In [None]:
#calculates bleu and meteor score on test data 
def score_data(test,model,inference,eng_vocab,hindi_vocab,hindi_tokenizer,remove_unk=False):
    """Translate every row of test and print the mean sentence BLEU / METEOR.

    test            -- sequence of rows [hindi, preprocessed english, original english]
    model           -- trained model passed to inference
    inference       -- callable (model, tensor, eng_vocab, hindi_vocab) -> (ids, attentions)
    eng_vocab       -- english vocab
    hindi_vocab     -- hindi vocab
    hindi_tokenizer -- callable that splits a hindi sentence into tokens
    remove_unk      -- passed to postprocess_eng for the post-processed scores

    Two pairs of scores are printed:
      1) raw model output against the preprocessed target (row[1]),
      2) post-processed model output against the original target (row[2]).
    Nothing is returned.
    """
    l = len(test)
    if l == 0:
        # nothing to average over; avoids a division by zero below
        print("no sentences to score")
        return

    total_bleu_score_p = 0
    total_meteor_score_p = 0
    total_bleu_score = 0
    total_meteor_score = 0
    for i in tqdm(range(l)):
        data = test[i]
        tokenized = torch.tensor([hindi_vocab['<sos>']]+[hindi_vocab[t] for t in hindi_tokenizer(data[0])]+[hindi_vocab['<eos>']], dtype=torch.long)
        output,attn = inference(model,tokenized,eng_vocab,hindi_vocab)
        output = " ".join([eng_vocab.index2word[t] for t in output])
        #without postprocessing on output after preprocessing
        total_bleu_score += sentence_bleu([data[1].split(" ")], output.split(" "))
        total_meteor_score += single_meteor_score(data[1],output)
        #with postprocessing on output from actual data
        processed = postprocess_eng(output,remove_unk) #computed once, used by both metrics
        total_bleu_score_p += sentence_bleu([data[2].split(" ")], processed.split(" "))
        total_meteor_score_p += single_meteor_score(data[2],processed)

    print("\nbleu score {:.4f}, bleu score with on actual {:.4f}".format(total_bleu_score/l,total_bleu_score_p/l))
    print("meteor score {:.4f}, meteor score with on actual {:.4f}".format(total_meteor_score/l,total_meteor_score_p/l))

In [None]:
score_data(test_set,model,inference,eng_vocab,hindi_vocab,hindi_tokenizer)
#we have 2 scores of bleu and meteor
#1) when output is compared with preprocessed target 
#2) when output is postprocessed and compared with actual target 

100%|██████████| 8908/8908 [01:53<00:00, 78.50it/s]


bleu score 0.0819, bleu score with on actual 0.0264
meteor score 0.4284, meteor score with on actual 0.2861





In [None]:
#sentences in final test file (column 2 of the csv)
with open("hindistatements.csv",encoding="utf-8") as f:
  csv_reader = csv.reader(f, delimiter=',')
  next(csv_reader, None) # the first row is the header, not data
  sample = [row[2] for row in csv_reader]

In [None]:
def final_result(model,inference,sample,hindi_tokenizer,hindi_vocab,eng_vocab):
  """Translate every hindi sentence in sample and return the english strings.

  Each sentence is tokenized, wrapped in <sos>/<eos>, indexed with hindi_vocab,
  decoded with inference, mapped back to words with eng_vocab and finally
  formatted with postprocess_eng. The result has one string per input sentence.
  """
  result = []
  for s in sample:
    ids = [hindi_vocab['<sos>']]+[hindi_vocab[t] for t in hindi_tokenizer(s)]+[hindi_vocab['<eos>']]
    hindi_s = torch.tensor(ids, dtype=torch.long)
    predicted,_ = inference(model,hindi_s,eng_vocab,hindi_vocab)
    words = [eng_vocab.index2word[t] for t in predicted]
    result.append(postprocess_eng(" ".join(words)))
  return result

In [None]:
#final outcome
result = final_result(model,inference,sample,hindi_tokenizer,hindi_vocab,eng_vocab)

In [None]:
#checking output of random sentence in sample
#the index is drawn from the actual number of sentences instead of a fixed 0-4999 range
r = random.randrange(len(sample))
sample[r], result[r]

('और मुझे आश्चर्य हुआ, ये कॉमिक्स लेक्चर एक हिट थे।',
 'And I wonder,,, I was a a of a.')

In [None]:
#creating answer.txt (one translated sentence per line)
#the with-block closes the file even if a write fails; the encoding is fixed so
#the file does not depend on the platform default
with open("answer.txt", "w", encoding="utf-8") as f:
  for s in result:
    f.write(s+"\n")

In [None]:
#saving model on drive (ignore)
!cp attention_best.pt gdrive/MyDrive/cs779_model/