<a href="https://colab.research.google.com/github/Devashish-Siwatch/marathi-neural-machine-translation/blob/main/tf_seq2seq_attn_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Before running this notebook, ensure that `data.csv` is available in the working directory.

In [1]:
!pip install torchtext==0.8.0

Collecting torchtext==0.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/23/23/8499af6d9c22b29b01f66a2c11d38ce71cd1cafa2655913c29818ed4a00f/torchtext-0.8.0-cp36-cp36m-manylinux1_x86_64.whl (6.9MB)
[K     |████████████████████████████████| 6.9MB 11.7MB/s 
Installing collected packages: torchtext
  Found existing installation: torchtext 0.3.1
    Uninstalling torchtext-0.3.1:
      Successfully uninstalled torchtext-0.3.1
Successfully installed torchtext-0.8.0


## Hyperparameter declaration

In [2]:
from argparse import Namespace

In [3]:
# Training hyperparameters, read elsewhere in the notebook as attributes (e.g. hype.LR).
hype = Namespace(
    LR = 0.0001,        # learning rate handed to the Adam optimizer
    BATCH_SIZE = 64,    # rows per batch in the tf.data pipeline
    NUM_EPOCHS = 100,   # NOTE(review): the visible training loop uses its own EPOCHS constant instead
    CLIP = 1,           # NOTE(review): presumably a gradient-clipping threshold; not referenced in the visible code
)

In [4]:
# Model-size hyperparameters shared by the encoder and the decoder.
model_hype = Namespace(
    EMBEDDING_SIZE = 128,   # width of the token embedding vectors
    GRU_UNITS = 512,        # number of units in the encoder and decoder GRUs
)

In [5]:
#example usage
hype.BATCH_SIZE

64

In [6]:
#to dict
vars(hype)

{'BATCH_SIZE': 64, 'CLIP': 1, 'LR': 0.0001, 'NUM_EPOCHS': 100}

## Data Processing

### Loading the data

In [8]:
import pandas as pd
import numpy as np

In [9]:
# Load the parallel corpus from data.csv (read without a header row), name the
# two columns by language, and show the last rows as a sanity check.
data = pd.read_csv('data.csv', header=None)
data.columns = ['english', 'marathi']
data.tail()

Unnamed: 0,english,marathi
40746,Just saying you don't like fish because of the...,हड्डींमुळे मासे आवडत नाही असं म्हणणं हे काय मा...
40747,The Japanese Parliament today officially elect...,आज जपानी संसदेने अधिकृतरित्या र्‍यौतारौ हाशिमो...
40748,Tom tried to sell his old VCR instead of throw...,टॉमने त्याचा जुना व्ही.सी.आर फेकून टाकण्याऐवजी...
40749,You can't view Flash content on an iPad. Howev...,आयपॅडवर फ्लॅश आशय बघता येत नाही. पण तुम्ही त्य...
40750,"In 1969, Roger Miller recorded a song called ""...","१९६९मध्ये रॉजर मिलरने ""यू डोन्ट वॉन्ट माय लव्ह..."


### Building tokenizers

In [10]:
import tensorflow as tf
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing import sequence

import re
import string

print(tf.__version__)

2.4.1


In [11]:
# Separating punctuation: "Hello, World!" --> "Hello ,  World ! "
def sep_punk(text):
  """Return `text` with every ASCII punctuation character padded by a space on each side."""
  # Each character of string.punctuation maps to " <char> "; all other
  # characters pass through unchanged.
  padded = {ord(mark): " " + mark + " " for mark in string.punctuation}
  return text.translate(padded)

# Adding <sos> and <eos> to the sentences
def add_init_token(sent_list):
  """Space out punctuation in each sentence and wrap it in start/end markers.

  Every sentence becomes '<sos> ' + sep_punk(sentence) + ' <eos>'.
  Returns a new list in the same order as `sent_list`.
  """
  return ['<sos> ' + sep_punk(sent) + ' <eos>' for sent in sent_list]

In [12]:
# Wrap both sides of the corpus in <sos>/<eos> markers (punctuation spaced out).
mar_list = add_init_token(list(data['marathi']))
eng_list = add_init_token(list(data['english']))

In [13]:
print(mar_list[100])
print(eng_list[100])

<sos> मी आहे !  <eos>
<sos> It ' s me !  <eos>


In [14]:
# <unk> --> <unk> x
# Building tokenizers from keras.preprocessing
def tokenize(sent_list):
  """Fit a Keras tokenizer on `sent_list` and encode the sentences as a padded id matrix.

  The tokenizer filters no characters (filters='') and registers '<unk>' as
  its out-of-vocabulary token. Sequences are padded at the end ('post').

  Returns:
    dict with 'Tensors' (the padded id sequences) and 'Tokenizer' (the fitted tokenizer).
  """
  fitted = text.Tokenizer(filters='', oov_token='<unk>')
  fitted.fit_on_texts(sent_list)

  padded_ids = sequence.pad_sequences(fitted.texts_to_sequences(sent_list), padding='post')

  return {'Tensors': padded_ids, 'Tokenizer': fitted}

In [15]:
marathi = tokenize(sent_list=mar_list)
english = tokenize(sent_list=eng_list)

In [16]:
mar_tokenizer = marathi['Tokenizer']
eng_tokenizer = english['Tokenizer']

print(f'The length of marathi vocab: {len(mar_tokenizer.word_index)}')
print(f'The length of english vocab: {len(eng_tokenizer.word_index)}')

The length of marathi vocab: 13842
The length of english vocab: 5716


In [17]:
mar_tensors = marathi['Tensors']
eng_tensors = english['Tensors']

print(f'Max length of sequence: {len(mar_tensors[0])}')
print(f'Max length of sequence: {len(eng_tensors[0])}')


Max length of sequence: 44
Max length of sequence: 50


In [18]:
print(mar_tensors[0])
print(eng_tensors[0])

[  2 707   4   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]
[ 2 49  4  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]


### Implementing TF Dataset

In [19]:
from tensorflow.data import Dataset

In [20]:
# Building TF Dataset --> Shuffling, Batching
BUFFER_SIZE = len(mar_tensors)   # shuffle buffer covers the whole corpus
BATCH_SIZE = hype.BATCH_SIZE

# Pair each Marathi row with its English row, reshuffle on every pass, and drop
# the final partial batch so that every batch has exactly BATCH_SIZE rows.
dataset = Dataset.from_tensor_slices((mar_tensors, eng_tensors))
dataset = dataset.shuffle(BUFFER_SIZE, reshuffle_each_iteration=True)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [21]:
ex_mar_batch, ex_eng_batch = next(iter(dataset))
print(ex_mar_batch.shape)
print(ex_eng_batch.shape)

(64, 44)
(64, 50)


## Modelling

In [22]:
from tensorflow import nn
from tensorflow.keras import layers, Model

### Encoder

In [23]:
class Encoder(Model):
  """GRU encoder: embeds source token ids and returns per-step outputs plus the final state."""

  def __init__(self, vocab_size, embedding_size, enc_units, batch_size):
    super().__init__()

    self.batch_size = batch_size
    self.enc_units = enc_units

    self.embedding = layers.Embedding(vocab_size, embedding_size)
    # 'glorot_uniform' is the Glorot (Xavier) uniform initialiser, used here
    # for the recurrent kernel.
    self.gru = layers.GRU(
        enc_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )

  def call(self, x, hidden):
    """Encode a batch of source sequences.

    Args:
      x: token ids, shape (batch_size, src_length).
      hidden: initial GRU state.

    Returns:
      (output, state): output has shape (batch_size, src_length, enc_units),
      state has shape (batch_size, enc_units).
    """
    # (batch_size, src_length) -> (batch_size, src_length, embedding_size)
    embedded = self.embedding(x)

    output, state = self.gru(embedded, initial_state=hidden)
    return output, state

  def initialize_hidden_state(self):
    """Return an all-zero initial state of shape (batch_size, enc_units)."""
    return tf.zeros(shape=(self.batch_size, self.enc_units))

In [24]:
# +1 on top of the tokenizer's vocabulary: id 0 is the padding value in the
# padded tensors and needs its own embedding row.
mar_vocab_size = len(mar_tokenizer.word_index) + 1
EMBEDDING_SIZE = model_hype.EMBEDDING_SIZE
GRU_UNITS = model_hype.GRU_UNITS

# Encoder over the Marathi (source) vocabulary.
encoder = Encoder(mar_vocab_size, EMBEDDING_SIZE, GRU_UNITS, BATCH_SIZE)

In [25]:
#Example input

ex_hidden = encoder.initialize_hidden_state()
sample_out, sample_hidden = encoder(ex_mar_batch, ex_hidden)

print(sample_out.shape)
print(sample_hidden.shape)

(64, 44, 512)
(64, 512)


### Attention

In [26]:
class Attention(layers.Layer):
  '''
      Custom made Bahdanau Attention: scores every encoder step against the
      query and returns the weighted sum of the encoder outputs.
  '''
  def __init__(self, units):
    super().__init__()
    self.W1 = layers.Dense(units)   # projects the query
    self.W2 = layers.Dense(units)   # projects the encoder outputs
    self.V = layers.Dense(1)        # reduces each step to a single score

  def call(self, q, val):
    """Return (context_vec, attention_w) for query `q` over values `val`.

    With q of shape (batch, hidden) and val of shape (batch, src_length, hidden),
    context_vec is (batch, hidden) and attention_w is (batch, src_length, 1).
    """
    # Insert a time axis so the query broadcasts across every encoder step.
    query = tf.expand_dims(q, 1)

    # score = V(tanh(W1(query) + W2(val)))
    projected = self.W1(query) + self.W2(val)
    score = self.V(nn.tanh(projected))

    # Normalise the scores over the source-time axis.
    attention_w = nn.softmax(score, axis=1)

    # Weighted sum of the encoder outputs.
    context_vec = tf.reduce_sum(attention_w * val, axis=1)

    return context_vec, attention_w

In [27]:
ATTN_SIZE = 10
attention = Attention(ATTN_SIZE)

In [28]:
#example code
attn_res ,  attn_w = attention(sample_hidden, sample_out)
print(attn_res.shape)
print(attn_w.shape)

(64, 512)
(64, 44, 1)


### Decoder

In [29]:
class Decoder(Model):
  """Single-step GRU decoder that attends over the encoder outputs."""

  def __init__(self, vocab_size, embedding_size, dec_units, batch_size):
    super().__init__()

    self.batch_size = batch_size
    self.dec_units = dec_units

    self.embedding = layers.Embedding(vocab_size, embedding_size)
    self.gru = layers.GRU(
        dec_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )

    # Projects each GRU output onto the target vocabulary.
    self.fc = layers.Dense(vocab_size)

    self.attention = Attention(dec_units)

  def call(self, x, hidden, enc_out):
    """Run one decoding step.

    Args:
      x: target token ids for this step; the callers in this notebook pass
        shape (batch_size, 1).
      hidden: previous decoder state, used as the attention query.
      enc_out: encoder outputs to attend over.

    Returns:
      (logits, state, attention_w): vocabulary scores with one row per
      (batch, step) position, the new GRU state, and the attention weights.
    """
    # Context vector summarising the encoder outputs for this step.
    context_vec, attention_w = self.attention(hidden, enc_out)

    embedded = self.embedding(x)

    # Prepend the context to the embedded token along the feature axis.
    context_step = tf.expand_dims(context_vec, 1)
    gru_in = tf.concat([context_step, embedded], axis=-1)

    # NOTE(review): the GRU is called without initial_state, so `hidden`
    # reaches this step only through the attention context above.
    output, state = self.gru(gru_in)

    # Collapse (batch, steps, units) to (batch * steps, units) for the projection.
    flat = tf.reshape(output, (-1, output.shape[2]))

    logits = self.fc(flat)

    return logits, state, attention_w

In [30]:
# +1 on top of the tokenizer's vocabulary: id 0 is the padding value in the
# padded tensors and needs its own slot.
eng_vocab_size = len(eng_tokenizer.word_index) + 1

# Decoder over the English (target) vocabulary.
decoder = Decoder(eng_vocab_size, EMBEDDING_SIZE, GRU_UNITS, BATCH_SIZE)

In [31]:
#example code

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),sample_hidden, sample_out)

print (sample_decoder_output.shape)

(64, 5717)


## Training

### Optimizers and Loss Functions

In [32]:
from tensorflow.keras import optimizers as optim
from tensorflow.keras import losses

In [33]:
optimizer = optim.Adam(learning_rate=hype.LR)
criteria = losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy between `real` ids and `pred` logits, with padding masked out.

  Positions where `real` is 0 contribute zero loss. The result is the mean
  over all positions, so masked positions still count in the denominator.
  """
  per_token = criteria(real, pred)

  is_token = tf.math.logical_not(tf.math.equal(real, 0))
  weights = tf.cast(is_token, dtype=per_token.dtype)

  return tf.reduce_mean(per_token * weights)

In [34]:
optimizer.learning_rate

<tf.Variable 'learning_rate:0' shape=() dtype=float32, numpy=1e-04>

### Azure Blob set-up and loading


In [35]:
!pip install azure-storage-blob

Collecting azure-storage-blob
[?25l  Downloading https://files.pythonhosted.org/packages/8e/00/6772472a99cd0a5e74e4e90f87947fa041b37981a3ff93d883cbc450518d/azure_storage_blob-12.7.1-py2.py3-none-any.whl (339kB)
[K     |████████████████████████████████| 348kB 22.3MB/s 
[?25hCollecting cryptography>=2.1.4
[?25l  Downloading https://files.pythonhosted.org/packages/a6/a1/49543f8ae3165c598e6c1393c54f9af8eaf7111f86e769ab4b897cdcf096/cryptography-3.4.4-cp36-abi3-manylinux2014_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 54.6MB/s 
[?25hCollecting msrest>=0.6.18
[?25l  Downloading https://files.pythonhosted.org/packages/e8/cc/6c96bfb3d3cf4c3bdedfa6b46503223f4c2a4fa388377697e0f8082a4fed/msrest-0.6.21-py2.py3-none-any.whl (85kB)
[K     |████████████████████████████████| 92kB 12.6MB/s 
[?25hCollecting azure-core<2.0.0,>=1.10.0
[?25l  Downloading https://files.pythonhosted.org/packages/f5/2a/ad5f6bb3fcbb5c59087183041c82fe74d536204a03f9a662db825e1b3be4/azure_core-1.11

In [36]:
import os, uuid
from azure.storage import blob
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
print(__version__)

12.7.1


In [37]:
# Create the local checkpoint directory; a no-op when it already exists.
os.makedirs("tf_checkpoint", exist_ok=True)

In [38]:
# The connection string embeds the storage account key, so it must not be
# stored in the notebook. It is read from the environment instead.
# NOTE(review): the key that used to be hard-coded here has been exposed in the
# notebook's history and should be rotated.
connect_str = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
if not connect_str:
  raise RuntimeError(
      "Set the AZURE_STORAGE_CONNECTION_STRING environment variable to the "
      "storage account connection string before running this cell."
  )

In [39]:
# Connect to the storage account and collect the names of the blobs in the "tf-ckpt" container.
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
container_client = blob_service_client.get_container_client("tf-ckpt")
blob_list = [item.name for item in container_client.list_blobs()]

In [40]:
# Download every listed blob into ./tf_checkpoint under its blob name.
for blob_name in blob_list:
  blob_client = blob_service_client.get_blob_client('tf-ckpt', blob_name)
  with open('./tf_checkpoint/'+blob_name, "wb") as local_copy:
    local_copy.write(blob_client.download_blob().readall())

### Checkpoint

In [41]:
# Local directory that holds the checkpoint files.
checkpoint_path = './tf_checkpoint'
# Track the optimizer and both models, plus an integer `epoch` variable
# (initialised to 1).
# NOTE(review): nothing in the visible notebook updates `epoch` — confirm
# whether it is meant to be incremented during training.
checkpoint = tf.train.Checkpoint(epoch=tf.Variable(1),
                                 optimizer = optimizer,
                                 encoder=encoder,
                                 decoder=decoder,
                                 )
# max_to_keep=1: only the most recent checkpoint is kept.
manager = tf.train.CheckpointManager(checkpoint, checkpoint_path, max_to_keep=1)


In [42]:
#  manager.save()

In [43]:
# manager.checkpoints

In [44]:
manager.restore_or_initialize()

'./tf_checkpoint/ckpt-17'

### Training Loop

In [47]:
import time

In [None]:
@tf.function
def train_step(src, trg, enc_hidden):
  """Run one teacher-forced optimisation step on a batch and return its loss.

  Args:
    src: batch of source token ids, fed to the encoder.
    trg: batch of target token ids; column t is the label for decoding step t.
    enc_hidden: initial hidden state for the encoder.

  Returns:
    The summed per-step loss divided by the target sequence length.
  """
  loss = 0


  with tf.GradientTape() as tape:
    enc_out, enc_hidden = encoder(src, enc_hidden)

    # The decoder starts from the encoder's final state.
    dec_hidden = enc_hidden

    # First decoder input: one <sos> id per row, shape (BATCH_SIZE, 1).
    dec_input = tf.expand_dims([eng_tokenizer.word_index['<sos>']]*BATCH_SIZE, 1)

    # Column 0 of trg is the <sos> marker itself, so labels start at t = 1.
    for t in range(1, trg.shape[1]):

      pred, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_out)

      loss += loss_function(trg[:,t], pred)

      # Teacher forcing: the ground-truth token, not the prediction, is the next input.
      dec_input = tf.expand_dims(trg[:,t],1)

  
  # Reported loss is normalised by the full target length (the loop above
  # runs for one step fewer than that).
  batch_loss = loss/ int(trg.shape[1])

  variables = encoder.trainable_variables + decoder.trainable_variables

  # Gradients are taken of the summed loss, not of the normalised batch_loss.
  # NOTE(review): hype.CLIP is declared but no gradient clipping is applied here.
  grads = tape.gradient(loss,variables)

  optimizer.apply_gradients(zip(grads, variables))

  return batch_loss

In [None]:
# NOTE(review): hype.NUM_EPOCHS is declared but this loop uses its own constant.
EPOCHS = 50
steps_per_epoch = len(mar_tensors)//BATCH_SIZE


for epoch in range(EPOCHS):
  start = time.time()

  # Every batch of the epoch is encoded starting from this all-zero state.
  enc_hidden = encoder.initialize_hidden_state()

  total_loss = 0

  for (batch, (src,trg)) in enumerate(dataset.take(steps_per_epoch)):

    batch_loss = train_step(src, trg, enc_hidden)

    total_loss += batch_loss

    if batch%100 == 0:
      # Epochs are reported 1-based here so the progress lines agree with the
      # per-epoch summary printed below.
      print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy():.4f}')

  # Save a checkpoint after every second epoch.
  if (epoch+1)%2 == 0:
    manager.save()

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 0 Batch 0 Loss0.0049
Epoch 0 Batch 100 Loss0.0050
Epoch 0 Batch 200 Loss0.0048
Epoch 0 Batch 300 Loss0.0062
Epoch 0 Batch 400 Loss0.0039
Epoch 0 Batch 500 Loss0.0018
Epoch 0 Batch 600 Loss0.0035
Epoch 1 Loss 0.0053
Time taken for 1 epoch 170.73022413253784 sec

Epoch 1 Batch 0 Loss0.0020
Epoch 1 Batch 100 Loss0.0029
Epoch 1 Batch 200 Loss0.0038
Epoch 1 Batch 300 Loss0.0036
Epoch 1 Batch 400 Loss0.0029
Epoch 1 Batch 500 Loss0.0013
Epoch 1 Batch 600 Loss0.0038
Epoch 2 Loss 0.0028
Time taken for 1 epoch 121.86319851875305 sec

Epoch 2 Batch 0 Loss0.0016
Epoch 2 Batch 100 Loss0.0017
Epoch 2 Batch 200 Loss0.0052
Epoch 2 Batch 300 Loss0.0023
Epoch 2 Batch 400 Loss0.0029
Epoch 2 Batch 500 Loss0.0044
Epoch 2 Batch 600 Loss0.0021
Epoch 3 Loss 0.0022
Time taken for 1 epoch 121.60438752174377 sec

Epoch 3 Batch 0 Loss0.0016
Epoch 3 Batch 100 Loss0.0019
Epoch 3 Batch 200 Loss0.0034
Epoch 3 Batch 300 Loss0.0014
Epoch 3 Batch 400 Loss0.0014
Epoch 3 Batch 500 Loss0.0008
Epoch 3 Batch 600 Loss0.

KeyboardInterrupt: ignored

In [None]:
manager.save()

'./tf_checkpoint/ckpt-17'

### Saving in blob

**Caution:** Only change this in case you wish to permanently change the model file. Do not change this otherwise.

In [None]:
blob_list = [blob.name for blob in container_client.list_blobs()]
blob_list

['checkpoint', 'ckpt-17.data-00000-of-00001', 'ckpt-17.index']

In [None]:
# clear the blob
for file_name in blob_list:
  container_client.delete_blob(blob=file_name)

In [None]:
# getting the file names
files = os.listdir('./tf_checkpoint')
files

['checkpoint', 'ckpt-17.data-00000-of-00001', 'ckpt-17.index']

In [None]:
# uploading the files
for file in files:
  blob_client = container_client.get_blob_client(file)
  # The file handle is deliberately not named `data`: that name holds the
  # corpus DataFrame, which the inference and BLEU cells still read.
  with open("./tf_checkpoint/" + file,"rb") as ckpt_file:
    blob_client.upload_blob(ckpt_file)

## Inference

In [48]:
def evaluate(sentence):
  """Greedily translate one Marathi sentence into English tokens.

  Args:
    sentence: raw Marathi text.

  Returns:
    (result, sentence): `result` is the list of predicted tokens, ending with
    '<eos>' when the model produced it within the length limit; `sentence` is
    the input after punctuation has been spaced out by sep_punk.
  """
  sentence = sep_punk(sentence)

  # Map each space-separated word to its id; words the tokenizer never saw
  # fall back to the <unk> id.
  # NOTE(review): training sequences were wrapped in <sos>/<eos> by
  # add_init_token, but the input here is not — confirm this is intended.
  unk_id = mar_tokenizer.word_index['<unk>']
  inputs = [mar_tokenizer.word_index.get(word, unk_id)
            for word in sentence.lower().split(' ') if word != '']

  # Pad to the source length used when the training tensors were built.
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=mar_tensors.shape[1],
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = []

  # Batch of one, starting from an all-zero encoder state.
  hidden = [tf.zeros((1, GRU_UNITS))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([eng_tokenizer.word_index['<sos>']], 0)

  # Decode at most as many steps as the padded target length.
  for _step in range(eng_tensors.shape[1]):
    predictions, dec_hidden, _ = decoder(dec_input,  dec_hidden, enc_out)

    predicted_id = tf.argmax(predictions[0]).numpy()

    # Id 0 is the padding slot and has no entry in index_word; report it as
    # '<unk>' instead of raising KeyError.
    word = eng_tokenizer.index_word.get(predicted_id, '<unk>')
    result.append(word)

    if word == '<eos>':
      break

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [49]:
start = time.time()
res, sentence  = evaluate(data['marathi'][40749])
end  = time.time()

print(end-start)
print(sentence)
print(data['english'][40749])
print(res)

0.28397536277770996
आयपॅडवर फ्लॅश आशय बघता येत नाही .  पण तुम्ही त्या वेब पानांचे यूआरएल स्वतःला ईमेल करून तोच आशय घरी पोहोचल्यावर आपल्या रोजच्या संगणकावर पाहू शकता . 
You can't view Flash content on an iPad. However, you can easily email yourself the URLs of these web pages and view that content on your regular computer when you get home.
['you', 'can', "'", 't', 'view', 'flash', 'content', 'on', 'an', 'ipad', '.', 'however', ',', 'you', 'can', 'easily', 'email', 'yourself', 'the', 'urls', 'of', 'these', 'web', 'pages', 'and', 'view', 'that', 'content', 'on', 'your', 'regular', 'computer', 'when', 'you', 'get', 'home', '.', '<eos>']


In [50]:
res, sentence = evaluate("तुझं नाव काय आहे?")

print(res)

['what', "'", 's', 'your', 'name', '?', '<eos>']


In [51]:
if 'मराठी' in mar_tokenizer.word_docs.keys():
  print("True")
else: print("False")

False


# BLEU Score

In [None]:
import torchtext
from torchtext.data.metrics import bleu_score

In [None]:
from tqdm.notebook import trange

In [None]:
trgs = []
preds = []

# NOTE(review): no train/test split is visible in this notebook, so these are
# the same sentences the model was trained on; the resulting score reflects
# fit to the training data.
for i in trange(len(data)):

  src = data['marathi'][i]
  trg = data['english'][i]

  # Reference tokens: punctuation spaced out, lower-cased, empty strings dropped.
  trg = [tok.lower() for tok in sep_punk(trg).split(" ") if tok!='']

  pred, _ = evaluate(src)

  # Strip the end marker only when it is actually present: a translation that
  # stopped at the length limit has no '<eos>', and a blind pred[:-1] would
  # drop a real word.
  if pred and pred[-1] == '<eos>':
    pred = pred[:-1]

  preds.append(pred)
  # bleu_score takes a list of reference translations per candidate.
  trgs.append([trg])


HBox(children=(FloatProgress(value=0.0, max=40751.0), HTML(value='')))




In [None]:
trgs

[[['go', '.']],
 [['run', '!']],
 [['run', '!']],
 [['run', '!']],
 [['run', '!']],
 [['who', '?']],
 [['wow', '!']],
 [['fire', '!']],
 [['fire', '!']],
 [['help', '!']],
 [['help', '!']],
 [['jump', '!']],
 [['jump', '!']],
 [['jump', '.']],
 [['jump', '.']],
 [['stop', '!']],
 [['stop', '!']],
 [['wait', '!']],
 [['wait', '!']],
 [['hello', '!']],
 [['hurry', '!']],
 [['hurry', '!']],
 [['hurry', '!']],
 [['i', 'won', '!']],
 [['i', 'won', '!']],
 [['get', 'up', '.']],
 [['got', 'it', '!']],
 [['got', 'it', '?']],
 [['got', 'it', '?']],
 [['got', 'it', '?']],
 [['got', 'it', '?']],
 [['he', 'ran', '.']],
 [['he', 'ran', '.']],
 [['he', 'ran', '.']],
 [['he', 'ran', '.']],
 [['i', 'fell', '.']],
 [['i', 'fell', '.']],
 [['i', 'fell', '.']],
 [['i', 'fell', '.']],
 [['i', 'know', '.']],
 [['i', 'know', '.']],
 [['i', 'know', '.']],
 [['i', 'lost', '.']],
 [['i', 'lost', '.']],
 [['i', 'work', '.']],
 [['i', 'work', '.']],
 [['i', "'", 'm', 'ok', '.']],
 [['listen', '.']],
 [['listen',

In [None]:
preds

[['go', '.'],
 ['jump', '!'],
 ['jump', '!'],
 ['jump', '!'],
 ['jump', '!'],
 ['who', "'", 's', 'up', '?'],
 ['never', 'open', '!'],
 ['come', '!'],
 ['go', '!'],
 ['help', '!'],
 ['help', 'me', '!'],
 ['jump', '!'],
 ['jump', '!'],
 ['jump', '.'],
 ['jump', '.'],
 ['come', '!'],
 ['come', '!'],
 ['come', '!'],
 ['come', '!'],
 ['it', "'", 's', 'popularity', '!'],
 ['come', 'back', '!'],
 ['come', 'back', '!'],
 ['come', 'back', '!'],
 ['i', 'won', '!'],
 ['i', 'won', '!'],
 ['get', 'up', '.'],
 ['come', 'on', '!'],
 ['who', 'did', 'it', 'go', '?'],
 ['who', 'did', 'you', 'get', '?'],
 ['did', 'you', 'see', '?'],
 ['did', 'you', 'see', 'it', '?'],
 ['he', 'ran', '.'],
 ['he', 'ran', '.'],
 ['he', 'ran', '.'],
 ['he', 'ran', '.'],
 ['i', 'fell', '.'],
 ['i', 'fell', '.'],
 ['i', 'fell', '.'],
 ['i', 'fell', '.'],
 ['i', 'know', '.'],
 ['i', 'know', '.'],
 ['there', 'knows', 'don', "'", 't', 'know', '.'],
 ['i', 'lost', '.'],
 ['i', 'lost', '.'],
 ['i', 'work', '.'],
 ['i', 'work', '.']

In [None]:
bleu_score(preds, trgs)

0.8057653903961182