<a href="https://colab.research.google.com/github/abhishekshakya/seq-2-seq-for-neural-machine-translation-english-to-hindi-/blob/master/machineTranslation(attention).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: mount Google Drive so the corpus stored there is reachable.
from google.colab import drive
drive.mount('/content/gdrive')

# Copy the parallel corpus from Drive into the working directory as data.csv.
!cp '/content/gdrive/My Drive/parallelCorpus/data.csv' 'data.csv'

In [None]:
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm

In [None]:
# Load the parallel corpus that the first cell copied to data.csv and preview it.
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [None]:
# Drop the provenance column; only the sentence pair is used downstream.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: re-running it no longer raises KeyError once 'source' is gone.
df = df.drop(columns=['source'], errors='ignore')

In [None]:
# Keep only the first 10,000 sentence pairs for training.
# .copy() makes `df` an independent frame, so the column assignments in the
# following cells write to `df` itself rather than to a slice of the full corpus
# (this is what pandas' SettingWithCopyWarning is about).
df = df.iloc[:10000,:].copy()
df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [None]:
def cleanerEng(x):
  '''
  Normalise one English sentence.

  The value is converted to str, lower-cased, every run of characters other
  than a-z / 0-9 is replaced by a single space, and the result is cut to at
  most 150 characters.
  '''
  text = re.sub(r'[^a-z0-9]+', ' ', str(x).lower())
  # slicing is a no-op for strings that are already short enough
  return text[:150]

def cleanerHindi(x):
  '''
  Normalise one Hindi sentence.

  The value is converted to str, every run of the listed punctuation / symbol
  characters is replaced by a single space, and the result is cut to at most
  150 characters. Case and all other characters are left untouched.
  '''
  text = re.sub(r'[-.।|,?;:<>&$₹]+', ' ', str(x))
  # slicing is a no-op for strings that are already short enough
  return text[:150]

In [None]:
# Clean both sentence columns, then split on whitespace so every cell holds a list of word tokens.
# NOTE: this cell is not safe to re-run. Once the cells hold lists, the cleaners
# receive str(list), and cleanerHindi does not strip brackets or quotes, so they
# end up inside the Hindi tokens.
df.iloc[:,0] = df['english_sentence'].apply(func=cleanerEng)
df.iloc[:,1] = df['hindi_sentence'].apply(func= cleanerHindi)
df.iloc[:,0] = df['english_sentence'].apply(func= lambda x : (str(x).split()))
df.iloc[:,1] = df['hindi_sentence'].apply(func= lambda x : (str(x).split()))

In [None]:
def addTokens(x,start=False):
  '''
  Return a NEW token list with '<END>' appended and, when `start` is true,
  '<START>' prepended.

  x     : list of word tokens for one sentence
  start : also prepend '<START>' (used for the Hindi / decoder side)

  The input list is left unmodified. The previous version appended to and
  inserted into `x` itself, so the caller's list changed as a side effect.
  '''
  tokens = list(x)
  tokens.append('<END>')
  if start:
    tokens.insert(0,'<START>')
  return tokens

In [None]:
# English (encoder input) only gets a trailing <END>; Hindi (decoder target) gets <START> and <END>.
df.iloc[:,0] = df['english_sentence'].apply(func= addTokens,start=False)
df.iloc[:,1] = df['hindi_sentence'].apply(func= addTokens,start=True)

In [None]:
df.iloc[79,1]

['<START>',
 'जो',
 'वह',
 'सीखता',
 'है',
 'या',
 'पूर्व',
 'क्रमादेशित',
 'होता',
 'है',
 '<END>']

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random

In [None]:
data = df.values

In [None]:
data[:,1].shape

(10000,)

In [None]:
class vocab:
  '''
  Word <-> index vocabulary built from tokenised sentences, plus the padded
  index tensor of every sentence.

  data  : iterable of token lists (it is iterated twice)
  token : True  -> reserved entries <PAD>=0, <START>=1, <END>=2
          False -> reserved entries <PAD>=0, <END>=1

  Attributes
    word2idx / idx2word : the two lookup tables
    idx                 : highest index assigned so far
    x                   : one float tensor of indices per sentence, zero-padded
                          to (longest sentence + 1) entries
    vocab_size          : idx + 1
  '''

  def __init__(self,data,token=True):
    self.data = data
    if not token:
      self.word2idx = {'<PAD>':0, '<END>':1}
    else:
      self.word2idx = {'<START>':1, '<END>':2, '<PAD>':0}

    # inverse table and running counter are derived from the reserved entries
    self.idx2word = {index: word for word, index in self.word2idx.items()}
    self.idx = len(self.word2idx) - 1

    self.x = []
    self.create()
    self.vocab_size = self.idx + 1

  def create(self):
    '''
    First pass: register every unseen word and find the longest sentence.
    Second pass: convert each sentence to indices and right-pad with 0 (<PAD>).
    '''
    longest = 0
    for tokens in self.data:
      longest = max(longest, len(tokens))
      for tok in tokens:
        if tok not in self.word2idx:
          self.idx += 1
          self.word2idx[tok] = self.idx
          self.idx2word[self.idx] = tok

    # every sentence is padded to one more than the longest length,
    # so each tensor ends with at least one <PAD>
    padded_len = longest + 1
    for tokens in self.data:
      indices = [self.word2idx[tok] for tok in tokens]
      indices.extend([0] * (padded_len - len(tokens)))
      self.x.append(torch.Tensor(indices))

    

In [None]:
# Source vocabulary has no <START> entry (token=False); the target vocabulary reserves <START> and <END>.
English_vocab = vocab(data[:,0],token=False)
Hindi_vocab = vocab(data[:,1],token=True)

In [None]:
# Sanity check: map one padded Hindi index tensor back to words (int() because the indices are stored as floats).
for idx in Hindi_vocab.x[2]:
  print(Hindi_vocab.idx2word[int(idx)],end=' ')

<START> यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है <END> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

In [None]:
class parallelData(Dataset):
  '''
  Dataset of aligned (source, target) sentence tensors.

  x, y : optional sequences holding one entry per sentence. When omitted they
         default to the notebook-level English_vocab.x and Hindi_vocab.x, which
         is what the original zero-argument constructor always used.

  Raises ValueError when the two sides do not contain the same number of sentences.
  '''

  def __init__(self, x=None, y=None):
    # the globals are only looked up when the caller passes nothing
    self.x = English_vocab.x if x is None else x
    self.y = Hindi_vocab.x if y is None else y
    if len(self.x) != len(self.y):
      raise ValueError(
        f'source and target sizes differ: {len(self.x)} != {len(self.y)}')

  def __getitem__(self,i):
    '''Return the i-th (source, target) pair.'''
    return self.x[i], self.y[i]

  def __len__(self):
    '''Number of sentence pairs.'''
    return len(self.x)


In [None]:
dataset = parallelData()

In [None]:
# Sanity check: every source tensor must have the same length as the first one,
# and likewise for targets. Any mismatch is printed; no output means all lengths agree.
a = dataset[0][0].shape[0]
b = dataset[0][1].shape[0]
for i in range(len(dataset)):
  if a != dataset[i][0].shape[0] or b != dataset[i][1].shape[0]:
    print(a,dataset[i][0].shape[0],b,dataset[i][1].shape[0])

In [None]:
Hindi_vocab.x[90].shape

torch.Size([42])

In [None]:
torch.cuda.device_count()

1

In [None]:
#Model preparation

class encoder(nn.Module):
  '''
  Source-side encoder: an embedding layer feeding a batch-first LSTM.
  '''

  def __init__(self, input_size, embedding_size, hidden_size, layers):
    '''
    input_size     : size of the source vocabulary
    embedding_size : embedding dimension
    hidden_size    : LSTM hidden state size
    layers         : number of stacked LSTM layers
    '''
    super().__init__()
    self.embed = nn.Embedding(input_size, embedding_size)  # -> (*, embedding_size)
    self.lstm = nn.LSTM(
      input_size=embedding_size,
      hidden_size=hidden_size,
      num_layers=layers,
      batch_first=True,
    )

  def forward(self,x):
    '''
    x : [batch_size, sentence] tensor of token indices; one whole sentence is the sequence

    returns (output, hidden_state, cell_state) where output is
    [batch_size, seq_len, hidden_size] and the two states are the LSTM's final states
    '''
    embedded = self.embed(x)             # [batch_size, sentence, embed_size]
    output, states = self.lstm(embedded)
    hidden_state, cell_state = states
    return output, hidden_state, cell_state

##_---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class decoder(nn.Module):
  '''
  Plain (attention-free) decoder that produces one target token per call.
  '''

  def __init__(self,input_size, embedding_size, hidden_size, layers):
    '''
    Same configuration as the encoder, except input_size is the size of the
    Hindi vocabulary. The final linear layer maps back to that vocabulary,
    hence out_features = input_size.
    '''
    super().__init__()
    self.embed = nn.Embedding(input_size, embedding_size)  # -> (*, embedding_size)
    self.lstm = nn.LSTM(
      input_size=embedding_size,
      hidden_size=hidden_size,
      num_layers=layers,
      batch_first=True,
    )
    self.fc = nn.Linear(hidden_size, input_size)

  def forward(self,x,hidden_state, cell_state):
    '''
    Decoding is done word by word so the caller controls what is fed next,
    therefore the sequence length seen by the LSTM is always 1.

    x : [batch_size] token indices, reshaped here to [batch_size, 1]
    returns (scores [batch, hindi_vocab_size], hidden_state, cell_state)
    '''
    step = self.embed(x.reshape(-1, 1))  # [batch, 1, embed_dim]

    # output => [batch, 1, hidden_size], states => [layers, batch, hidden_size]
    output, (hidden_state, cell_state) = self.lstm(step, (hidden_state, cell_state))

    # project to the vocabulary and drop the length-1 sequence dimension
    scores = self.fc(output).squeeze(dim=1)
    return scores, hidden_state, cell_state


##_---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

class AttnDecoder(nn.Module):
  '''
  Single-step decoder with attention over the encoder outputs.

  For every encoder position a score is computed by a linear layer applied to
  the concatenation [decoder hidden state ; encoder output]. The scores are
  soft-maxed over the source positions and used to build a context vector,
  which is concatenated with the embedded input token and fed to the LSTM.
  '''
  def __init__(self, input_size, embedding_size, hidden_size, layers):
    '''
    input_size     : size of the target (Hindi) vocabulary
    embedding_size : embedding dimension
    hidden_size    : hidden size, must equal the encoder's hidden size
    layers         : number of stacked LSTM layers
    '''
    super().__init__()

    self.embed = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size) # output size = (*,embedding_size)
    # the LSTM input is [context ; embedding], hence hidden_size + embedding_size
    self.lstm = nn.LSTM(input_size=hidden_size + embedding_size, hidden_size= hidden_size, num_layers=layers, batch_first = True)
    self.fc = nn.Linear(in_features=hidden_size, out_features=input_size) # scores over the hindi vocab, therefore out_features=input_size

    # encoder_states => [batch, seq_len, hidden_size]; the decoder hidden state is
    # broadcast to the same shape, so the energy layer sees hidden_size*2 features
    self.energy =  nn.Linear(hidden_size*2, 1) #out [batch, seq_len, 1]
    self.softmax = nn.Softmax(dim=1) # normalise over source positions (dim=1)


  def forward(self, x, hidden_state, cell_state, encoder_states):
    '''
    x              : [batch] current input token indices
    hidden_state   : [layers, batch, hidden_size]
    cell_state     : [layers, batch, hidden_size]
    encoder_states : [batch, seq_len, hidden_size]

    returns (scores [batch, vocab], hidden_state, cell_state)
    '''
    seq_len = encoder_states.shape[1]

    # Only the top layer's hidden state is used as the attention query.
    # Repeating the full hidden_state (as before) yields seq_len*layers rows and
    # breaks the concatenation below whenever layers > 1; for layers == 1 the
    # result is identical.
    h_new = hidden_state[-1:].repeat(seq_len, 1, 1) # [seq_len, batch, hidden_size]
    h_new = h_new.permute(1,0,2) # [batch, seq_len, hidden_size]

    energy = self.energy(torch.cat((h_new, encoder_states), dim=2)) # [batch, seq_len, 1]
    att_weights = self.softmax(energy)
    att_weights = att_weights.permute(0,2,1) # [batch, 1, seq_len]

    context = torch.bmm(att_weights, encoder_states) # [batch, 1, hidden_size]

    x = x.reshape(-1,1) # [batch, 1]
    x = self.embed(x) # [batch, 1, embed_dim]

    input_new = torch.cat((context, x), dim=2) # [batch, 1, hidden_size+embed_dim]

    # output => [batch, 1, hidden_size], states => [layers, batch, hidden_size]
    output, (hidden_state, cell_state) = self.lstm(input_new, (hidden_state, cell_state))
    output = self.fc(output) # [batch, 1, hindi_vocab_size]

    output = output.squeeze(dim=1) # [batch, hindi_vocab_size]
    return output, hidden_state, cell_state

##_---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

class seq2seq(nn.Module):
  '''
  Encoder-decoder wrapper without attention: the source is encoded once and the
  target is decoded one token at a time with teacher forcing.
  '''
  def __init__(self, encoder, decoder):
    '''
    encoder : module returning (outputs, hidden, cell)
    decoder : module called as decoder(x, hidden, cell); it must expose an `fc`
              linear layer whose out_features is the target vocabulary size
    '''
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder


  def forward(self, input, target, teaching_force=0.5):
    '''
    input  = batch of english sentences [batch, sentence(padded)]
    target = batch of hindi sentences   [batch, sentence(padded)]
    teaching_force = probability, at each step, of feeding the ground-truth
                     token instead of the decoder's own guess

    returns scores of shape [seq_len, batch_size, vocab_size]; position 0 is
    never predicted and stays all zeros
    '''
    batch_size = input.shape[0]
    seq_len = target.shape[1]
    # Take the vocabulary size from the decoder's projection layer and the device
    # from the input, instead of the notebook globals Hindi_vocab / device, so the
    # module works with any vocabulary and wherever the batch lives.
    vocab_size = self.decoder.fc.out_features

    output = torch.zeros((seq_len, batch_size, vocab_size), device=input.device)

    _, hidden, cell = self.encoder(input)
    target = target.permute(1,0) # shape [seq, batch]
    x = target[0] # <START> token

    for i in range(1, seq_len):
      out, hidden, cell = self.decoder(x, hidden, cell) #out shape = [batch, vocab_size]
      output[i] = out
      decoder_guess = out.argmax(1) # highest-scoring word per sentence, shape [batch]

      # teacher forcing: sometimes feed the reference token, otherwise the guess
      if random.random() < teaching_force:
        x = target[i]
      else:
        x =  decoder_guess

    return output  #shape[seq_len, batch_size, vocab_size]

##_---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

class Attnseq2seq(nn.Module):
  '''
  Encoder-decoder wrapper with attention: the decoder additionally receives all
  encoder outputs at every step.
  '''
  def __init__(self, encoder, att_decoder):
    '''
    encoder     : module returning (encoder_states, hidden, cell)
    att_decoder : module called as decoder(x, hidden, cell, encoder_states); it
                  must expose an `fc` linear layer whose out_features is the
                  target vocabulary size
    '''
    super().__init__()
    self.encoder = encoder
    self.decoder = att_decoder


  def forward(self, input, target, teaching_force=0.5):
    '''
    input  = batch of english sentences [batch, sentence(padded)]
    target = batch of hindi sentences   [batch, sentence(padded)]
    teaching_force = probability, at each step, of feeding the ground-truth
                     token instead of the decoder's own guess

    returns scores of shape [seq_len, batch_size, vocab_size]; position 0 is
    never predicted and stays all zeros
    '''
    batch_size = input.shape[0]
    seq_len = target.shape[1]
    # Take the vocabulary size from the decoder's projection layer and the device
    # from the input, instead of the notebook globals Hindi_vocab / device, so the
    # module works with any vocabulary and wherever the batch lives.
    vocab_size = self.decoder.fc.out_features

    output = torch.zeros((seq_len, batch_size, vocab_size), device=input.device)

    encoder_states, hidden, cell = self.encoder(input)
    target = target.permute(1,0) # shape [seq, batch]
    x = target[0] # <START> token

    for i in range(1, seq_len):
      out, hidden, cell = self.decoder(x, hidden, cell, encoder_states) #out shape = [batch, vocab_size]
      output[i] = out
      decoder_guess = out.argmax(1) # highest-scoring word per sentence, shape [batch]

      # teacher forcing: sometimes feed the reference token, otherwise the guess
      if random.random() < teaching_force:
        x = target[i]
      else:
        x =  decoder_guess

    return output  #shape[seq_len, batch_size, vocab_size]



In [None]:
##training
# Optimisation settings
epochs = 150
learning_rate = 0.001
batch_size = 100

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Model dimensions shared by the encoder and the decoder
embedding_size = 300
hidden_size = 512
layers = 1


In [None]:
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Pull a single batch to confirm the padded source / target shapes, then show the dataset size.
it = iter(loader)
x,y = next(it)
print(x.shape,y.shape)
len(dataset)


torch.Size([100, 37]) torch.Size([100, 42])


10000

In [None]:
# Build the encoder and the attention decoder; the commented lines are the attention-free alternative.
ENC = encoder(English_vocab.vocab_size, embedding_size, hidden_size, layers).to(device)
# DE = decoder(Hindi_vocab.vocab_size, embedding_size, hidden_size, layers).to(device)
DE = AttnDecoder(Hindi_vocab.vocab_size, embedding_size, hidden_size, layers).to(device)
# model = seq2seq(ENC,DE).to(device)
model = Attnseq2seq(ENC,DE).to(device)
optimizer = optim.Adam(model.parameters(),lr=learning_rate)
# ignore_index=0 keeps <PAD> positions (index 0 in Hindi_vocab) out of the loss
criterion = nn.CrossEntropyLoss(ignore_index=0)


In [None]:
# Training loop: one full pass over `loader` per epoch.
for epoch in tqdm(range(epochs)):
  for id,(x,y) in (enumerate(tqdm(loader))):
    # the vocab stores indices as float tensors; cast to integer indices for the embeddings
    x = x.long().to(device)
    y = y.long().to(device)#[batch,seq]

    output = model(x,y)# [seq, batch, vocab]
    # drop time step 0 (the model never fills output[0]) and flatten to [(seq-1)*batch, vocab]
    output = output[1:].reshape(-1,output.shape[2])
    y = y.permute(1,0)#[seq, batch]
    # drop the first target position as well so scores and labels line up
    y = y[1:].reshape(-1)

    optimizer.zero_grad()
    loss = criterion(output,y)

    loss.backward()
    optimizer.step()

    # if id%20 == 0:
  # NOTE: this prints the loss of the LAST batch of the epoch, not an epoch average
  print(f'[{epoch+1}/{epochs}] loss=>{loss.item()}')
  
  

HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[1/150] loss=>7.219956874847412


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[2/150] loss=>6.945805072784424


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[3/150] loss=>6.906153678894043


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[4/150] loss=>6.710653305053711


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[5/150] loss=>6.642881393432617


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[6/150] loss=>6.639989852905273


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[7/150] loss=>6.591505527496338


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[8/150] loss=>6.414831161499023


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[9/150] loss=>6.172200679779053


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[10/150] loss=>6.095067977905273


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[11/150] loss=>5.805426120758057


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[12/150] loss=>5.811115264892578


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[13/150] loss=>5.84146785736084


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[14/150] loss=>5.076922416687012


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[15/150] loss=>5.290768146514893


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[16/150] loss=>5.014581680297852


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[17/150] loss=>4.725817680358887


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[18/150] loss=>4.664719104766846


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[19/150] loss=>4.374694347381592


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[20/150] loss=>4.1253485679626465


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[21/150] loss=>4.238427639007568


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[22/150] loss=>3.5190670490264893


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[23/150] loss=>3.7405552864074707


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[24/150] loss=>3.6567420959472656


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[25/150] loss=>3.728693962097168


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[26/150] loss=>3.2338063716888428


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[27/150] loss=>2.985640287399292


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[28/150] loss=>2.603079080581665


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[29/150] loss=>2.726790428161621


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[30/150] loss=>2.4975404739379883


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[31/150] loss=>2.3765878677368164


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[32/150] loss=>2.237377882003784


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[33/150] loss=>2.2672135829925537


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[34/150] loss=>2.311246633529663


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[35/150] loss=>2.005934476852417


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[36/150] loss=>1.8455369472503662


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[37/150] loss=>1.4201940298080444


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[38/150] loss=>1.2974857091903687


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[39/150] loss=>1.327499270439148


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[40/150] loss=>1.1150522232055664


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[41/150] loss=>1.0998011827468872


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[42/150] loss=>1.1561071872711182


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[43/150] loss=>1.2024919986724854


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[44/150] loss=>0.9487064480781555


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[45/150] loss=>0.708673357963562


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[46/150] loss=>0.6499477028846741


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[47/150] loss=>0.7037720680236816


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[48/150] loss=>0.5231795310974121


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[49/150] loss=>0.4562450647354126


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[50/150] loss=>0.3563171327114105


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[51/150] loss=>0.3106549084186554


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[52/150] loss=>0.3922344446182251


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[53/150] loss=>0.23249848186969757


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[54/150] loss=>0.28711721301078796


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[55/150] loss=>0.193973571062088


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[56/150] loss=>0.18202446401119232


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[57/150] loss=>0.15029479563236237


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[58/150] loss=>0.12328855693340302


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[59/150] loss=>0.09879656881093979


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[60/150] loss=>0.11472749710083008


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[61/150] loss=>0.09231844544410706


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[62/150] loss=>0.06588022410869598


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[63/150] loss=>0.08088932186365128


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[64/150] loss=>0.06047912687063217


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[65/150] loss=>0.05632929503917694


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[66/150] loss=>0.04562302306294441


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[67/150] loss=>0.03641127422451973


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[68/150] loss=>0.0377872958779335


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[69/150] loss=>0.03760962188243866


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[70/150] loss=>0.033919718116521835


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[71/150] loss=>0.03884424641728401


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[72/150] loss=>0.06997515261173248


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[73/150] loss=>0.10517440736293793


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[74/150] loss=>0.28486010432243347


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[75/150] loss=>0.11782024800777435


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[76/150] loss=>0.10384387522935867


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[77/150] loss=>0.03579751029610634


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[78/150] loss=>0.02683587372303009


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[79/150] loss=>0.029701128602027893


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[80/150] loss=>0.016037071123719215


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[81/150] loss=>0.015191339887678623


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[82/150] loss=>0.012153083458542824


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[83/150] loss=>0.012464821338653564


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[84/150] loss=>0.012478882446885109


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[85/150] loss=>0.009912715293467045


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[86/150] loss=>0.011369057931005955


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[87/150] loss=>0.01057661883533001


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[88/150] loss=>0.014293470419943333


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[89/150] loss=>0.008612852543592453


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[90/150] loss=>0.008738246746361256


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[91/150] loss=>0.009907190687954426


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[92/150] loss=>0.011818335391581059


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[93/150] loss=>0.009414184838533401


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[94/150] loss=>0.0059803808107972145


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[95/150] loss=>0.006038492079824209


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[96/150] loss=>0.010332596488296986


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[97/150] loss=>0.005956618115305901


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[98/150] loss=>0.006656108424067497


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[99/150] loss=>0.006212169770151377


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[100/150] loss=>0.007929553277790546


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


[101/150] loss=>0.011319557204842567


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

KeyboardInterrupt: ignored

In [None]:
# Persist the trained weights (state_dict only) and download the file from the Colab VM.
torch.save(model.state_dict(),'model.pt')
from google.colab import files
files.download('model.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def prediction(x):
    '''
    Print the English sentence encoded in `x`, then translate it.

    x : 1-D tensor of English_vocab indices; printing stops at the first 0 (<PAD>)

    returns the translation as a list of Hindi words, looked up in
    Hindi_vocab.idx2word from the indices produced by translate()
    '''
    for token_id in x:
      if token_id == 0:
        break
      print(English_vocab.idx2word[int(token_id)],end=' ')

    print()

    # translate() expects a [1, sentence] tensor of integer indices on `device`
    batch = x.long().reshape(1,-1).to(device)
    return [Hindi_vocab.idx2word[word_id] for word_id in translate(batch)]
    

    


In [None]:
def translate(input, max_len=100):
  '''
  Greedy-decode one source sentence with the notebook-level `model`.

  input   : [1, sentence(padded)] tensor of source indices, on the model's device.
            Only a batch of one sentence is supported: the first element of each
            prediction is the one recorded and tested for <END>.
  max_len : maximum number of tokens to generate. Without this bound the loop
            never terminates when the model fails to emit <END>.

  returns the list of generated Hindi_vocab indices; it ends with the <END>
  index unless max_len was reached first
  '''
  # look the special indices up instead of hard-coding 1 and 2
  start_idx = Hindi_vocab.word2idx['<START>']
  end_idx = Hindi_vocab.word2idx['<END>']

  guess = []
  with torch.no_grad():
    encoder_states, hidden, cell = model.encoder(input)
    x = torch.full((1,), start_idx, dtype=torch.long, device=input.device) # <START> token
    for _ in range(max_len):
      out, hidden, cell = model.decoder(x, hidden, cell, encoder_states) #out shape = [batch, vocab_size]
      x = out.argmax(1) # highest-scoring word, shape [batch]
      token = int(x[0])
      guess.append(token)

      if token == end_idx:
        break

  return guess

In [None]:
prediction(dataset[50][0])

category information technology <END> 
श्रेणी सूचना प्रौद्योगिकी <END> 


In [None]:
prediction(dataset[100][0])

class united states of america <END> 
श्रेणी संयुक्त राज्य अमेरिका <END> 


In [None]:
prediction(dataset[71][0])

he often spoke to us of his own great longing to meet him <END> 
वे बहुधा उनसे मिलने की तीव्र इच्छा के बारे में भी हमें बताया करते <END> 


In [None]:
# random.randint(0,10000) includes 10000, which is out of range for the
# 10,000-item dataset; randrange(len(dataset)) always yields a valid index.
prediction(dataset[random.randrange(len(dataset))][0])

ruth glenn little explain the ramayana composed with lynn jessup acsbieen 1 928875 02 05 <END> 
रुथ ग्लेन लिटल की व्याख्या के साथ लिन जेसप रचित रामायण (आइएसबीएन 1 928875 02 05) <END> 


In [None]:
# Translate 15 random training sentences.
for i in range(15):
  # randrange(len(dataset)) stays inside the dataset; randint(0,10000) could return 10000 (IndexError)
  prediction(dataset[random.randrange(len(dataset))][0])

he got together a ramakrishna vivekananda group of young people <END> 
अब सुभाष ने रामकृष्ण विवेकानन्द युवजन सभा का गठन किया <END> 
we have already noted in fact that on account of its peculiar intellectual trend it regards the apprehension of ultimate reality to be the highest val <END> 
हमने पहले ही इस बात की ओर ध्यान दिया कि यर्थार्थ में चरम वास्तविकता की आशंका को वह उच्चतम मूल्य मानता है और व्यावहारिक मूल्यों को नीचा स्थान देता है <END> 
45 best period travelling in jodhpur is from october to march <END> 
45 अक्टूबर से मार्च जोधपुर शहर के भ्रमण का सर्वोत्तम समय है <END> 
on march 13 reckitt benckiser plc makers of dettol and many other consumer products decided to bid goodbye to the indian stock markets <END> 
ड़ेटॉल और कई अन्य उपभोक्ता सामग्रियों की निर्माता रेकिट बेंकाइजर प्ले ने 13 मार्च को फैसल किया कि अब वह भारतीय शेयर बाजार को अलविदा कह देगी <END> 
but after his demise his bother sardar vallabh bhai patel did not accept this will and filed a case in the court <END> 
मगर उनके 

In [None]:
dataset[3][0]

tensor([ 8., 30., 31., 32., 23., 33., 34., 35., 36., 37.,  4., 38., 39.,  1.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [None]:
def get(sent):
  '''
  Translate an already tokenised English sentence.

  sent : list of English word tokens

  Each word is mapped to its English_vocab index; words that are not in the
  vocabulary are replaced by the index of 'the'. The indices are handed to
  prediction() as a float tensor, and its result is returned unchanged.
  '''
  lookup = English_vocab.word2idx
  # the fallback entry is only looked up when a word is actually missing
  toks = [
    lookup[word] if lookup.get(word) is not None else lookup['the']
    for word in sent
  ]
  return prediction(torch.tensor(toks).float())

In [None]:
#bleu score calculation

In [None]:
# Held-out sentences for BLEU: re-read the corpus and take rows after the training slice.
tdf = pd.read_csv('data.csv')
tdf.drop(columns=['source'],inplace=True)
# NOTE(review): training used rows [0, 10000); this slice starts at 10001, so row 10000 is in neither set.
tdf = tdf.iloc[10001:11000,:]
tdf.head()

Unnamed: 0,english_sentence,hindi_sentence
10001,AMF vechers calculated three mass medias in 20...,मोबाइल फोन पर मीडिया के आगमन से अल्फा उपयोगकर्...
10002,Ask if any check has been made .,पता कीजिए कि क्या कोई जाँच हुई है .
10003,for eight billion people living in cities,ऐसे शहरों में रहने वाले ८ अरब लोगों को
10004,"He had an idea, a very clear idea.","उनका विचार था, एकदम साफ विचार,"
10005,Government should make documents related to Ne...,नेताजी से जुड़े दस्तावेज सार्वजनिक करे सरकार


In [None]:
# Same cleaning and whitespace tokenisation as the training data, applied to the held-out rows.
tdf.iloc[:,0] = tdf['english_sentence'].apply(func=cleanerEng)
tdf.iloc[:,1] = tdf['hindi_sentence'].apply(func= cleanerHindi)
tdf.iloc[:,0] = tdf['english_sentence'].apply(func= lambda x : (str(x).split()))
tdf.iloc[:,1] = tdf['hindi_sentence'].apply(func= lambda x : (str(x).split()))

In [None]:
tdf.head()

Unnamed: 0,english_sentence,hindi_sentence
10001,"[amf, vechers, calculated, three, mass, medias...","[मोबाइल, फोन, पर, मीडिया, के, आगमन, से, अल्फा,..."
10002,"[ask, if, any, check, has, been, made]","[पता, कीजिए, कि, क्या, कोई, जाँच, हुई, है]"
10003,"[for, eight, billion, people, living, in, cities]","[ऐसे, शहरों, में, रहने, वाले, ८, अरब, लोगों, को]"
10004,"[he, had, an, idea, a, very, clear, idea]","[उनका, विचार, था, एकदम, साफ, विचार]"
10005,"[government, should, make, documents, related,...","[नेताजी, से, जुड़े, दस्तावेज, सार्वजनिक, करे, ..."


In [None]:
# Only the English side gets <END>; the Hindi column is left without markers.
tdf.iloc[:,0] = tdf['english_sentence'].apply(func= addTokens,start=False)

In [None]:
tdf.head()

Unnamed: 0,english_sentence,hindi_sentence
10001,"[amf, vechers, calculated, three, mass, medias...","[मोबाइल, फोन, पर, मीडिया, के, आगमन, से, अल्फा,..."
10002,"[ask, if, any, check, has, been, made, <END>]","[पता, कीजिए, कि, क्या, कोई, जाँच, हुई, है]"
10003,"[for, eight, billion, people, living, in, citi...","[ऐसे, शहरों, में, रहने, वाले, ८, अरब, लोगों, को]"
10004,"[he, had, an, idea, a, very, clear, idea, <END>]","[उनका, विचार, था, एकदम, साफ, विचार]"
10005,"[government, should, make, documents, related,...","[नेताजी, से, जुड़े, दस्तावेज, सार्वजनिक, करे, ..."


In [None]:
# Column 0 = English tokens, column 1 = Hindi tokens.
tdata = tdf.values
# NOTE(review): this builds a separate vocabulary from the test sentences, so its indices differ
# from English_vocab. No later visible cell reads test_dataset; get() looks words up in English_vocab.
test_dataset = vocab(tdata[:,0],token=False)

999

really mean that they re what bad <END> 
जो कि इसका मतलब है क्या हुआ है <END> 


In [None]:
# Translate one held-out sentence; [:-1] drops the last token of the prediction and of the source.
res = get(tdata[2,0])[:-1]
print(res)
print(tdata[2,0][:-1])

for eight billion people living in cities <END> 
['कुछ', 'लोग', 'थे']
['for', 'eight', 'billion', 'people', 'living', 'in', 'cities']


In [None]:
# Collect references and hypotheses for corpus-level BLEU.
list_refrences = []
list_hypothesis = []
for i in tqdm(range(int(tdata.shape[0]))):
  # The reference is the Hindi sentence (column 1), not the English source
  # (column 0) that was used before. corpus_bleu expects, for each hypothesis,
  # a LIST of reference token lists, hence the extra brackets.
  list_refrences.append([tdata[i,1]])

  hypothesis = get(tdata[i,0])
  # strip the end marker only when it is actually present
  if hypothesis and hypothesis[-1] == '<END>':
    hypothesis = hypothesis[:-1]
  list_hypothesis.append(hypothesis)

In [None]:
import nltk
from nltk.translate.bleu_score import corpus_bleu

# BLEU-n uses uniform weights over the first n n-gram orders. The previous tuples
# were padded with zeros (so NLTK still evaluated the unused orders), had
# inconsistent lengths, and the BLEU-3 weights summed to 0.99.
for n in range(1, 5):
  weights = tuple(1.0 / n for _ in range(n))
  print(f"bleu-{n} : {corpus_bleu(list_refrences, list_hypothesis, weights=weights)}")

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


bleu-1 : 0.0006056935190793456
bleu-2 : 0.02461084149474263
bleu-3 : 0.08672483172180753
bleu-4 : 0.1568784290294323


In [None]:
# `bleu_score` was not assigned in any cell of this notebook, so this line fails on a
# fresh kernel. The stored output equals the BLEU-4 value printed above, so compute it here.
bleu_score = nltk.translate.bleu_score.corpus_bleu(list_refrences, list_hypothesis, weights=(0.25,0.25,0.25,0.25))
print(bleu_score)

0.1568784290294323
