<a href="https://colab.research.google.com/github/Bhavnicksm/marathi-neural-machine-translation/blob/main/tf_seq2seq_attn_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Before beginning this notebook, ensure that you have data.csv in available in the working directory

In [2]:
#!pip install torchtext==0.8.0

In [None]:
#!python -m spacy download en

## Hyperparameter declaration

In [3]:
from argparse import Namespace

In [20]:
hype = Namespace(
    LR = 0.01,
    BATCH_SIZE = 64,
    NUM_EPOCHS = 100,
    CLIP = 1,
)

In [29]:
model_hype = Namespace(
    EMBEDDING_SIZE = 256,
    GRU_UNITS = 1024,
    ATTN_SIZE = 10,
)

In [6]:
#example usage
hype.BATCH_SIZE

128

In [7]:
#to dict
vars(hype)

{'BATCH_SIZE': 128,
 'CLIP': 1,
 'DEVICE': None,
 'LR': 0.01,
 'NUM_EPOCHS': 100,
 'load_checkpoint': False,
 'save_checkpoint': True}

## Data Processing

### Loading the data

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [9]:
data = pd.read_csv('data.csv', header=None)
data.columns = ['english', 'marathi']
data.tail()

Unnamed: 0,english,marathi
40746,Just saying you don't like fish because of the...,हड्डींमुळे मासे आवडत नाही असं म्हणणं हे काय मा...
40747,The Japanese Parliament today officially elect...,आज जपानी संसदेने अधिकृतरित्या र्‍यौतारौ हाशिमो...
40748,Tom tried to sell his old VCR instead of throw...,टॉमने त्याचा जुना व्ही.सी.आर फेकून टाकण्याऐवजी...
40749,You can't view Flash content on an iPad. Howev...,आयपॅडवर फ्लॅश आशय बघता येत नाही. पण तुम्ही त्य...
40750,"In 1969, Roger Miller recorded a song called ""...","१९६९मध्ये रॉजर मिलरने ""यू डोन्ट वॉन्ट माय लव्ह..."


### Building tokenizers

In [10]:
import tensorflow as tf
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing import sequence

import re
import string

print(tf.__version__)

2.4.1


In [11]:
def sep_punk(text):
  for punk in string.punctuation:
    text = text.replace(punk," "+punk+" ")
  return text

def add_init_token(sent_list):
  new_sent_list = []
  for sent in sent_list:
    sent = sep_punk(sent)
    sent = '<sos> ' + sent + ' <eos>'
    new_sent_list.append(sent)

  return new_sent_list    

In [12]:
mar_list = list(data['marathi'])
eng_list = list(data['english'])

mar_list = add_init_token(mar_list)
eng_list = add_init_token(eng_list)

In [13]:
print(mar_list[100])
print(eng_list[100])

<sos> मी आहे !  <eos>
<sos> It ' s me !  <eos>


In [14]:
def tokenize(sent_list):
  tokenizer = text.Tokenizer(filters='')
  tokenizer.fit_on_texts(sent_list)

  tensor_list = tokenizer.texts_to_sequences(sent_list)
  tensor_list = sequence.pad_sequences(tensor_list, padding='post')
  
  return {'Tensors': tensor_list, 'Tokenizer': tokenizer} 

In [15]:
marathi = tokenize(sent_list=mar_list)
english = tokenize(sent_list=eng_list)

In [16]:
mar_tokenizer = marathi['Tokenizer']
eng_tokenizer = english['Tokenizer']

print(f'The length of marathi vocab: {len(mar_tokenizer.word_index)}')
print(f'The length of english vocab: {len(eng_tokenizer.word_index)}')

The length of marathi vocab: 13841
The length of english vocab: 5715


In [17]:
mar_tensors = marathi['Tensors']
eng_tensors = english['Tensors']

print(f'Max length of sequence: {len(mar_tensors[0])}')
print(f'Max length of sequence: {len(eng_tensors[0])}')


Max length of sequence: 44
Max length of sequence: 50


In [18]:
print(mar_tensors[0])
print(eng_tensors[0])

[  1 706   3   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]
[ 1 48  3  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]


### Implimenting TF Dataset

In [19]:
from tensorflow.data import Dataset

In [21]:
BUFFER_SIZE = len(mar_tensors)
BATCH_SIZE = hype.BATCH_SIZE

dataset = Dataset.from_tensor_slices((mar_tensors, eng_tensors)).shuffle(BUFFER_SIZE, reshuffle_each_iteration=True)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [22]:
ex_mar_batch, ex_eng_batch = next(iter(dataset))
print(ex_mar_batch.shape)
print(ex_eng_batch.shape)

(64, 44)
(64, 50)


## Modelling

In [23]:
from tensorflow import nn
from tensorflow.keras import layers, Model

### Encoder

In [24]:
class Encoder(Model):
  def __init__(self, vocab_size, embedding_size, enc_units, batch_size):
    
    super(Encoder, self).__init__()

    self.batch_size = batch_size
    self.enc_units = enc_units
    self.embedding = layers.Embedding(vocab_size, embedding_size)
    self.gru = layers.GRU(enc_units, return_sequences= True, return_state= True, recurrent_initializer='glorot_uniform')


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    return output,state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_size, self.enc_units))

In [26]:
mar_vocab_size = len(mar_tokenizer.word_index) + 1
EMBEDDING_SIZE = model_hype.EMBEDDING_SIZE
GRU_UNITS = model_hype.GRU_UNITS

encoder = Encoder(mar_vocab_size, EMBEDDING_SIZE, GRU_UNITS, BATCH_SIZE)

In [27]:
#Example input

ex_hidden = encoder.initialize_hidden_state()
sample_out, sample_hidden = encoder(ex_mar_batch, ex_hidden)

print(sample_out.shape)
print(sample_hidden.shape)

(64, 44, 1024)
(64, 1024)


### Attention

In [28]:
class Attention(layers.Layer):
  def __init__(self, units):

    super(Attention, self).__init__()
    self.W1 = layers.Dense(units)
    self.W2 = layers.Dense(units)
    self.V = layers.Dense(1)

  def call(self, q, val):
    #make q i.e. the hidden value into the same shape
    q_with_time_axis = tf.expand_dims(q, 1)

    score = self.V( nn.tanh( self.W1(q_with_time_axis) + self.W2(val) ) )

    attention_w = nn.softmax(score, axis=1)

    context_vec = attention_w * val
    context_vec = tf.reduce_sum(context_vec, axis=1)

    return context_vec, attention_w

In [30]:
ATTN_SIZE = model_hype.ATTN_SIZE
attention = Attention(ATTN_SIZE)

In [31]:
#example code
attn_res ,  attn_w = attention(sample_hidden, sample_out)
print(attn_res.shape)
print(attn_w.shape)

(64, 1024)
(64, 44, 1)


### Decoder

In [32]:
class Decoder(Model):

  def __init__(self, vocab_size, embedding_size, dec_units, batch_size):
    super(Decoder,self).__init__()

    self.batch_size = batch_size
    self.dec_units = dec_units
    
    self.embedding = layers.Embedding(vocab_size, embedding_size)
    self.gru = layers.GRU(dec_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')

    self.fc = layers.Dense(vocab_size)

    self.attention = Attention(dec_units)

  def call(self, x, hidden, enc_out):
    context_vec, attention_w = self.attention(hidden, enc_out)

    x = self.embedding(x)

    x = tf.concat([tf.expand_dims(context_vec, 1), x], axis=-1)

    output, state = self.gru(x)

    output = tf.reshape(output, (-1, output.shape[2]))

    x = self.fc(output)

    return x, state, attention_w

In [33]:
eng_vocab_size = len(eng_tokenizer.word_index) + 1

decoder = Decoder(eng_vocab_size, EMBEDDING_SIZE, GRU_UNITS, BATCH_SIZE)

In [34]:
#example code

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),sample_hidden, sample_out)

print (sample_decoder_output.shape)

(64, 5716)


## Training

### Optimizers and Loss Functions

In [35]:
from tensorflow.keras import optimizers as optim
from tensorflow.keras import losses

In [36]:
optimizer = optim.Adam(learning_rate=hype.LR)
criteria = losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss = criteria(real, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss = loss*mask

  return tf.reduce_mean(loss)

### Azure Blob set-up and loading


In [38]:
#pip install azure-storage-blob

In [39]:
import os, uuid
from azure.storage import blob
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
print(__version__)

12.7.1


In [41]:
os.mkdir("tf_checkpoint")

In [42]:
connect_str = "DefaultEndpointsProtocol=https;AccountName=tfmodel;AccountKey=PinzJZWJy/mFOWDgkBcCTPA9Fnfr7/qvaZSbjxQVH4YGrBt4MseqbKYjUGNKYX9PpBh+zgAk6uDrVpmvejBCiw==;EndpointSuffix=core.windows.net"

In [43]:
blob_service_client =  BlobServiceClient.from_connection_string(connect_str)
container_client = blob_service_client.get_container_client("tf-ckpt")
blob_list = [blob.name for blob in container_client.list_blobs()]

In [44]:
for file in blob_list:
  blob_client = blob_service_client.get_blob_client('tf-ckpt', file)
  with open('./tf_checkpoint/'+file, "wb") as f:
    f.write(blob_client.download_blob().readall())

### Checkpoint

In [45]:
checkpoint_path = './tf_checkpoint'
checkpoint = tf.train.Checkpoint(epoch=tf.Variable(1),
                                 optimizer = optimizer,
                                 encoder=encoder,
                                 decoder=decoder,
                                 )
manager = tf.train.CheckpointManager(checkpoint, checkpoint_path, max_to_keep=1)


In [46]:
#manager.save()

In [47]:
manager.checkpoints

['./tf_checkpoint/ckpt-9']

In [48]:
manager.restore_or_initialize()

'./tf_checkpoint/ckpt-9'

### Training Loop

In [None]:
import time

In [None]:
@tf.function
def train_step(src, trg, enc_hidden):
  loss = 0


  with tf.GradientTape() as tape:
    enc_out, enc_hidden = encoder(src, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([eng_tokenizer.word_index['<sos>']]*BATCH_SIZE, 1)

    for t in range(1, trg.shape[1]):

      pred, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_out)

      loss += loss_function(trg[:,t], pred)

      dec_input = tf.expand_dims(trg[:,t],1)

  
  batch_loss = loss/ int(trg.shape[1])

  variables = encoder.trainable_variables + decoder.trainable_variables

  grads = tape.gradient(loss,variables)

  optimizer.apply_gradients(zip(grads, variables))

  return batch_loss

In [None]:
EPOCHS = 10
steps_per_epoch = len(mar_tensors)//BATCH_SIZE


for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()

  total_loss = 0

  for (batch, (src,trg)) in enumerate(dataset.take(steps_per_epoch)):

    batch_loss = train_step(src, trg, enc_hidden)

    total_loss += batch_loss

    if batch%100 == 0:
      print(f'Epoch {epoch} Batch {batch} Loss{batch_loss.numpy():.4f}')

  
  if (epoch+1)%2 == 0:
    manager.save()

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 0 Batch 0 Loss1.3571
Epoch 0 Batch 100 Loss0.7339
Epoch 0 Batch 200 Loss0.6140
Epoch 0 Batch 300 Loss0.5658
Epoch 0 Batch 400 Loss0.5622
Epoch 0 Batch 500 Loss0.4740
Epoch 0 Batch 600 Loss0.4499
Epoch 1 Loss 0.5709
Time taken for 1 epoch 360.0094299316406 sec

Epoch 1 Batch 0 Loss0.4258
Epoch 1 Batch 100 Loss0.4397
Epoch 1 Batch 200 Loss0.3931
Epoch 1 Batch 300 Loss0.3750
Epoch 1 Batch 400 Loss0.3248
Epoch 1 Batch 500 Loss0.3080
Epoch 1 Batch 600 Loss0.2651
Epoch 2 Loss 0.3519
Time taken for 1 epoch 319.1427962779999 sec

Epoch 2 Batch 0 Loss0.2484
Epoch 2 Batch 100 Loss0.2567
Epoch 2 Batch 200 Loss0.2386
Epoch 2 Batch 300 Loss0.1931
Epoch 2 Batch 400 Loss0.2219
Epoch 2 Batch 500 Loss0.2131
Epoch 2 Batch 600 Loss0.1981
Epoch 3 Loss 0.2145
Time taken for 1 epoch 318.7302939891815 sec

Epoch 3 Batch 0 Loss0.1533
Epoch 3 Batch 100 Loss0.1521
Epoch 3 Batch 200 Loss0.1438
Epoch 3 Batch 300 Loss0.1662
Epoch 3 Batch 400 Loss0.1446
Epoch 3 Batch 500 Loss0.1048
Epoch 3 Batch 600 Loss0.142

In [None]:
manager.save()

'./tf_checkpoint/ckpt-9'

### Saving in blob

**Caution:** Only change this in case you wish to permanently change the model file. Do not change this otherwise.

In [49]:
# # clear the blob
# for file_name in blob_list:
#   container_client.delete_blob(blob=file_name)

In [50]:
# # getting the file names
# files = os.listdir('./tf_checkpoint')
# files

In [51]:
# # uploading the files
# for file in files:
#   blob_client = container_client.get_blob_client(file)
#   with open("./tf_checkpoint/" + file,"rb") as data:
#     blob_client.upload_blob(data)

## Inference

In [None]:
from matplotlib import ticker

In [52]:
def evaluate(sentence):
  attention_plot = np.zeros((eng_tensors.shape[1],mar_tensors.shape[1]))

  sentence = sep_punk(sentence)

  inputs = [mar_tokenizer.word_index[i] for i in sentence.lower().split(' ') if i!='']
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=mar_tensors.shape[1],
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = []

  hidden = [tf.zeros((1, GRU_UNITS))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([eng_tokenizer.word_index['<sos>']], 0)

  for t in range(eng_tensors.shape[1]):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()

    result.append(eng_tokenizer.index_word[predicted_id])

    if eng_tokenizer.index_word[predicted_id] == '<eos>':
      return result, sentence, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence, attention_plot

In [53]:
res, sentence , _ = evaluate(data['marathi'][20222])


print(sentence)
print(data['english'][20222])
print(res)

एक फरक आहे . 
There is one difference.
['there', 'is', 'one', 'difference', '.', '<eos>']


# BLEU Score

In [54]:
from torchtext.data.metrics import bleu_score

In [55]:
def calculate_bleu_score(data):
  
  trgs = []
  preds = []

  for i in range(len(data)):
    src = data['marathi'][i]
    trg = data['english'][i]

    trg = [tok for tok in sep_punk(trg).split(" ") if tok!='']

    pred, _, _ = evaluate(src)

    #preds.append(trg)
    preds.append(pred[:-1])
    trgs.append([trg])

  return bleu_score(preds, trgs)

In [56]:
bleu = calculate_bleu_score(data)
print(f"The BLEU score is {bleu*100:.2f}")

KeyboardInterrupt: ignored

In [57]:
trgs = []
preds = []

for i in range(len(data)):
  src = data['marathi'][i]
  trg = data['english'][i]

  trg = [tok for tok in sep_punk(trg).split(" ") if tok!='']

  pred, _, _ = evaluate(src)

  #preds.append(trg)
  preds.append(pred[:-1])
  trgs.append([trg])

KeyboardInterrupt: ignored

In [None]:
bleu_score(preds, trgs)

0.5000459551811218