<a href="https://colab.research.google.com/github/Bhavnicksm/marathi-neural-machine-translation/blob/main/tf_seq2seq_attn_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Before beginning this notebook, ensure that you have data.csv in available in the working directory

In [1]:
#!pip install torchtext==0.8.0

In [2]:
#!python -m spacy download en

## Hyperparameter declaration

In [3]:
from argparse import Namespace

In [4]:
hype = Namespace(
    LR = 0.01,
    BATCH_SIZE = 128,
    NUM_EPOCHS = 100,
    CLIP = 1,
    DEVICE = None,
    save_checkpoint =True,
    load_checkpoint = False
)

In [5]:
checkpoint = Namespace(
    epoch_num = 0,
    model_params = None,
    optim_params = None,
    scheduler_params = None,
    losses = [],
)

In [6]:
#example usage
hype.BATCH_SIZE

128

In [7]:
#to dict
vars(hype)

{'BATCH_SIZE': 128,
 'CLIP': 1,
 'DEVICE': None,
 'LR': 0.01,
 'NUM_EPOCHS': 100,
 'load_checkpoint': False,
 'save_checkpoint': True}

## Data Processing

### Loading the data

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [13]:
data = pd.read_csv('data.csv', header=None)
data.columns = ['english', 'marathi']
data.tail()

Unnamed: 0,english,marathi
40746,Just saying you don't like fish because of the...,हड्डींमुळे मासे आवडत नाही असं म्हणणं हे काय मा...
40747,The Japanese Parliament today officially elect...,आज जपानी संसदेने अधिकृतरित्या र्‍यौतारौ हाशिमो...
40748,Tom tried to sell his old VCR instead of throw...,टॉमने त्याचा जुना व्ही.सी.आर फेकून टाकण्याऐवजी...
40749,You can't view Flash content on an iPad. Howev...,आयपॅडवर फ्लॅश आशय बघता येत नाही. पण तुम्ही त्य...
40750,"In 1969, Roger Miller recorded a song called ""...","१९६९मध्ये रॉजर मिलरने ""यू डोन्ट वॉन्ट माय लव्ह..."


### Building tokenizers

In [14]:
import tensorflow as tf
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing import sequence

import re
import string

print(tf.__version__)

2.4.0


In [15]:
def sep_punk(text):
  for punk in string.punctuation:
    text = text.replace(punk," "+punk+" ")
  return text

def add_init_token(sent_list):
  new_sent_list = []
  for sent in sent_list:
    sent = sep_punk(sent)
    sent = '<sos> ' + sent + ' <eos>'
    new_sent_list.append(sent)

  return new_sent_list    

In [16]:
mar_list = list(data['marathi'])
eng_list = list(data['english'])

mar_list = add_init_token(mar_list)
eng_list = add_init_token(eng_list)

In [17]:
print(mar_list[100])
print(eng_list[100])

<sos> मी आहे !  <eos>
<sos> It ' s me !  <eos>


In [18]:
def tokenize(sent_list):
  tokenizer = text.Tokenizer(filters='')
  tokenizer.fit_on_texts(sent_list)

  tensor_list = tokenizer.texts_to_sequences(sent_list)
  tensor_list = sequence.pad_sequences(tensor_list, padding='post')
  
  return {'Tensors': tensor_list, 'Tokenizer': tokenizer} 

In [19]:
marathi = tokenize(sent_list=mar_list)
english = tokenize(sent_list=eng_list)

In [20]:
mar_tokenizer = marathi['Tokenizer']
eng_tokenizer = english['Tokenizer']

print(f'The length of marathi vocab: {len(mar_tokenizer.word_index)}')
print(f'The length of english vocab: {len(eng_tokenizer.word_index)}')

The length of marathi vocab: 13841
The length of english vocab: 5715


In [21]:
mar_tensors = marathi['Tensors']
eng_tensors = english['Tensors']

print(f'Max length of sequence: {len(mar_tensors[0])}')
print(f'Max length of sequence: {len(eng_tensors[0])}')


Max length of sequence: 44
Max length of sequence: 50


In [22]:
print(mar_tensors[0])
print(eng_tensors[0])

[  1 706   3   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]
[ 1 48  3  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]


### Implimenting TF Dataset

In [23]:
from tensorflow.data import Dataset

In [24]:
BUFFER_SIZE = len(mar_tensors)
BATCH_SIZE = 64

dataset = Dataset.from_tensor_slices((mar_tensors, eng_tensors)).shuffle(BUFFER_SIZE, reshuffle_each_iteration=True)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [25]:
ex_mar_batch, ex_eng_batch = next(iter(dataset))
print(ex_mar_batch.shape)
print(ex_eng_batch.shape)

(64, 44)
(64, 50)


## Modelling

In [26]:
from tensorflow import nn
from tensorflow.keras import layers, Model

### Encoder

In [27]:
class Encoder(Model):

  '''
      I know it may not seem like it, but I coded it myself. So please, stfu.
      I am super professional adding this docstring for you. Thank me later.
  '''
  def __init__(self, vocab_size, embedding_size, enc_units, batch_size):
    
    super(Encoder, self).__init__()

    self.batch_size = batch_size
    self.enc_units = enc_units
    self.embedding = layers.Embedding(vocab_size, embedding_size)
    self.gru = layers.GRU(enc_units, return_sequences= True, return_state= True, recurrent_initializer='glorot_uniform')


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    return output,state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_size, self.enc_units))

In [28]:
mar_vocab_size = len(mar_tokenizer.word_index) + 1
EMBEDDING_SIZE = 256
GRU_UNITS = 1024

encoder = Encoder(mar_vocab_size, EMBEDDING_SIZE, GRU_UNITS, BATCH_SIZE)

In [29]:
#Example input

ex_hidden = encoder.initialize_hidden_state()
sample_out, sample_hidden = encoder(ex_mar_batch, ex_hidden)

print(sample_out.shape)
print(sample_hidden.shape)

(64, 44, 1024)
(64, 1024)


### Attention

In [30]:
class Attention(layers.Layer):
  '''
  Yes, even this I coded myself. I did not just add the docstring later, bitch ass dickhead.
  BTW, FYI, this is Bahdanau Attention
  '''
  def __init__(self, units):

    super(Attention, self).__init__()
    self.W1 = layers.Dense(units)
    self.W2 = layers.Dense(units)
    self.V = layers.Dense(1)

  def call(self, q, val):
    #make q i.e. the hidden value into the same shape
    q_with_time_axis = tf.expand_dims(q, 1)

    score = self.V( nn.tanh( self.W1(q_with_time_axis) + self.W2(val) ) )

    attention_w = nn.softmax(score, axis=1)

    context_vec = attention_w * val
    context_vec = tf.reduce_sum(context_vec, axis=1)

    return context_vec, attention_w

In [31]:
attention = Attention(10)

In [32]:
#example code
attn_res ,  attn_w = attention(sample_hidden, sample_out)
print(attn_res.shape)
print(attn_w.shape)

(64, 1024)
(64, 44, 1)


### Decoder

In [33]:
class Decoder(Model):

  def __init__(self, vocab_size, embedding_size, dec_units, batch_size):
    super(Decoder,self).__init__()

    self.batch_size = batch_size
    self.dec_units = dec_units
    
    self.embedding = layers.Embedding(vocab_size, embedding_size)
    self.gru = layers.GRU(dec_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')

    self.fc = layers.Dense(vocab_size)

    self.attention = Attention(dec_units)

  def call(self, x, hidden, enc_out):
    context_vec, attention_w = self.attention(hidden, enc_out)

    x = self.embedding(x)

    x = tf.concat([tf.expand_dims(context_vec, 1), x], axis=-1)

    output, state = self.gru(x)

    output = tf.reshape(output, (-1, output.shape[2]))

    x = self.fc(output)

    return x, state, attention_w

In [34]:
eng_vocab_size = len(eng_tokenizer.word_index) + 1

decoder = Decoder(eng_vocab_size, EMBEDDING_SIZE, GRU_UNITS, BATCH_SIZE)

In [35]:
#example code

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),sample_hidden, sample_out)

print (sample_decoder_output.shape)

(64, 5716)


## Training

### Optimizers and Loss Functions

In [36]:
from tensorflow.keras import optimizers as optim
from tensorflow.keras import losses

In [37]:
optimizer = optim.Adam()
criteria = losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss = criteria(real, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss = loss*mask

  return tf.reduce_mean(loss)

### Checkpoint

In [38]:
checkpoint_path = './tf_checkpoint'
checkpoint = tf.train.Checkpoint(optimizer = optimizer,
                                 encoder=encoder,
                                 decoder=decoder
                                 )

### Training Loop

In [39]:
import time

In [42]:
@tf.function
def train_step(src, trg, enc_hidden):
  loss = 0


  with tf.GradientTape() as tape:
    enc_out, enc_hidden = encoder(src, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([eng_tokenizer.word_index['<sos>']]*BATCH_SIZE, 1)

    for t in range(1, trg.shape[1]):

      pred, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_out)

      loss += loss_function(trg[:,t], pred)

      dec_input = tf.expand_dims(trg[:,t],1)

  
  batch_loss = loss/ int(trg.shape[1])

  variables = encoder.trainable_variables + decoder.trainable_variables

  grads = tape.gradient(loss,variables)

  optimizer.apply_gradients(zip(grads, variables))

  return batch_loss

In [43]:
EPOCHS = 10
steps_per_epoch = len(mar_tensors)//BATCH_SIZE


for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()

  total_loss = 0

  for (batch, (src,trg)) in enumerate(dataset.take(steps_per_epoch)):

    batch_loss = train_step(src, trg, enc_hidden)

    total_loss += batch_loss

    if batch%100 == 0:
      print(f'Epoch {epoch} Batch {batch} Loss{batch_loss.numpy():.4f}')

  
  if (epoch+1)%2 == 0:
    checkpoint.save(file_prefix=checkpoint_path)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 0 Batch 0 Loss1.4381
Epoch 0 Batch 100 Loss0.6549
Epoch 0 Batch 200 Loss0.6018
Epoch 0 Batch 300 Loss0.6292
Epoch 0 Batch 400 Loss0.4723
Epoch 0 Batch 500 Loss0.5306
Epoch 0 Batch 600 Loss0.4074
Epoch 1 Loss 0.5714
Time taken for 1 epoch 359.6276400089264 sec

Epoch 1 Batch 0 Loss0.4249
Epoch 1 Batch 100 Loss0.3775
Epoch 1 Batch 200 Loss0.3672
Epoch 1 Batch 300 Loss0.3786
Epoch 1 Batch 400 Loss0.3537
Epoch 1 Batch 500 Loss0.3106
Epoch 1 Batch 600 Loss0.2496
Epoch 2 Loss 0.3403
Time taken for 1 epoch 322.93631196022034 sec

Epoch 2 Batch 0 Loss0.2315
Epoch 2 Batch 100 Loss0.1924
Epoch 2 Batch 200 Loss0.1953
Epoch 2 Batch 300 Loss0.2088
Epoch 2 Batch 400 Loss0.1613
Epoch 2 Batch 500 Loss0.1674
Epoch 2 Batch 600 Loss0.1937
Epoch 3 Loss 0.2036
Time taken for 1 epoch 322.043879032135 sec

Epoch 3 Batch 0 Loss0.1153
Epoch 3 Batch 100 Loss0.1301
Epoch 3 Batch 200 Loss0.0985
Epoch 3 Batch 300 Loss0.1385
Epoch 3 Batch 400 Loss0.1185
Epoch 3 Batch 500 Loss0.1319
Epoch 3 Batch 600 Loss0.084

In [102]:
from matplotlib import ticker

In [103]:
def evaluate(sentence):
  attention_plot = np.zeros((eng_tensors.shape[1],mar_tensors.shape[1]))

  sentence = sep_punk(sentence)

  inputs = [mar_tokenizer.word_index[i] for i in sentence.split(' ') if i!='']
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=mar_tensors.shape[1],
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, GRU_UNITS))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([eng_tokenizer.word_index['<sos>']], 0)

  for t in range(eng_tensors.shape[1]):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()

    result += eng_tokenizer.index_word[predicted_id] + ' '

    if eng_tokenizer.index_word[predicted_id] == '<eos>':
      return result, sentence, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence, attention_plot

In [104]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
  fig = plt.figure(figsize=(10,10))
  ax = fig.add_subplot(1, 1, 1)
  ax.matshow(attention, cmap='viridis')

  fontdict = {'fontsize': 14}

  ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
  ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

  plt.show()

In [105]:
def translate(sentence):
  result, sentence, attention_plot = evaluate(sentence)

  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

  attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
  plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [106]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_path))

<tensorflow.python.training.tracking.util.InitializationOnlyStatus at 0x7fc7eee368d0>

In [121]:
res, sentence , _ = evaluate(data['marathi'][20212])


print(sentence)
print(data['english'][20212])
print(res)

तिघांना अटक करण्यात आली . 
The three were arrested.
the three were arrested . <eos> 
