## Import the libraries

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import numpy as np
import tensorflow as tf
from collections import Counter

## Preprocess the data to feed into the TensorFlow model

In [None]:
# English side of the parallel corpus, stored as three shard files.
# Raw string literals keep the Windows backslashes literal; the previous
# non-raw literals relied on invalid escape sequences (`\D`, `\d`), which
# newer Python versions warn about.
file_path1 = r'F:\Datasets\data.en1'
file_path2 = r'F:\Datasets\data.en2'
file_path3 = r'F:\Datasets\data.en3'

# Read every shard the same way: one stripped string per line, in file order.
eng_shards = []
for path in (file_path1, file_path2, file_path3):
    with open(path, 'r', encoding='utf-8') as file:
        # Iterating the handle streams lines instead of materialising
        # the whole file with readlines() first.
        eng_shards.append([line.strip() for line in file])

eng_lines1, eng_lines2, eng_lines3 = eng_shards

# Full English corpus: the shards concatenated in order 1, 2, 3.
eng_lines = eng_lines1 + eng_lines2 + eng_lines3

In [None]:
# Tamil side of the parallel corpus, stored as three shard files.
# Raw string literals keep the Windows backslashes literal; the previous
# non-raw literals relied on invalid escape sequences (`\D`, `\d`), which
# newer Python versions warn about.
file_path4 = r'F:\Datasets\data.ta1'
file_path5 = r'F:\Datasets\data.ta2'
file_path6 = r'F:\Datasets\data.ta3'

# Read every shard the same way: one stripped string per line, in file order.
tam_shards = []
for path in (file_path4, file_path5, file_path6):
    with open(path, 'r', encoding='utf-8') as file:
        # Iterating the handle streams lines instead of materialising
        # the whole file with readlines() first.
        tam_shards.append([line.strip() for line in file])

tam_lines1, tam_lines2, tam_lines3 = tam_shards

# Full Tamil corpus: the shards concatenated in order 1, 2, 3.
tam_lines = tam_lines1 + tam_lines2 + tam_lines3

In [None]:
# The two corpora are paired by position, so they must have the same number
# of lines. Fail here with a clear message instead of letting a later
# boolean-mask indexing step raise an opaque shape error.
if len(eng_lines) != len(tam_lines):
    raise ValueError(
        f"Parallel corpora are misaligned: {len(eng_lines)} English lines "
        f"vs {len(tam_lines)} Tamil lines"
    )

# Source (English) and target (Tamil) sentences as NumPy string arrays.
context = np.array(eng_lines)
target = np.array(tam_lines)

In [None]:
# Shuffle buffer spans the whole corpus; batches hold 64 sentence pairs.
BUFFER_SIZE = len(context)
BATCH_SIZE = 64

# Random boolean mask: each pair lands in the training split with
# probability 0.8, otherwise in the validation split.
is_train = np.random.uniform(size=(len(target),)) < 0.8

# Training split: (English, Tamil) string pairs, shuffled then batched.
train_pairs = (context[is_train], target[is_train])
train_raw = tf.data.Dataset.from_tensor_slices(train_pairs)
train_raw = train_raw.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# Validation split: the complement of the training mask, same pipeline.
val_pairs = (context[~is_train], target[~is_train])
val_raw = tf.data.Dataset.from_tensor_slices(val_pairs)
val_raw = val_raw.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

In [None]:
# A Tamil sample string used to sanity-check the text pipeline below.
example_text = tf.constant('இயற்கை')

# Round-trip through bytes to confirm the tensor holds the text intact.
example_bytes = example_text.numpy()
print(example_bytes.decode())

In [None]:
def tf_lower_and_split_punct(text):
    """Standardize a string tensor for tokenization.

    Steps, in order: lowercase; drop every character that is not a space,
    a-z, one of ``. ? ! , ¿``, or in the range U+0B80-U+0BFF (the Tamil
    Unicode block); surround each kept punctuation mark with spaces; strip
    leading/trailing whitespace; wrap the result as ``[SOS] ... [EOS]``.

    Args:
        text: String tensor to clean.

    Returns:
        String tensor with the cleaned text between the start/end markers.
    """
    lowered = tf.strings.lower(text)
    # Whitelist filter: anything outside the allowed character set is removed.
    filtered = tf.strings.regex_replace(lowered, '[^ a-z.?!,¿\u0B80-\u0BFF]', '')
    # Pad punctuation so each mark becomes its own whitespace-separated token.
    spaced = tf.strings.regex_replace(filtered, r'([.?!,¿])', r' \1 ')
    trimmed = tf.strings.strip(spaced)
    return tf.strings.join(['[SOS]', trimmed, '[EOS]'], separator=' ')


# Show the standardized form of the sample text.
standardized_example = tf_lower_and_split_punct(example_text)
print(standardized_example.numpy().decode())

In [None]:
# Upper bound on the vocabulary size of each TextVectorization layer.
max_vocab_size = 50_000

In [None]:
# Vectorizer for the source (context) sentences; emits ragged token-id tensors.
context_text_processor = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_size,
    standardize=tf_lower_and_split_punct,
    ragged=True,
)

# Build the vocabulary from the context half of the training pairs only.
context_only = train_raw.map(lambda source_text, _target_text: source_text)
context_text_processor.adapt(context_only)

# Notebook inspection: first ten vocabulary entries and the vocabulary size.
context_text_processor.get_vocabulary()[:10]
context_text_processor.vocabulary_size()

In [None]:
# Vectorizer for the target sentences; emits ragged token-id tensors.
target_text_processor = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_size,
    standardize=tf_lower_and_split_punct,
    ragged=True,
)

# Build the vocabulary from the target half of the training pairs only.
target_only = train_raw.map(lambda _source_text, target_text: target_text)
target_text_processor.adapt(target_only)

# Notebook inspection: resulting vocabulary size.
target_text_processor.vocabulary_size()

In [None]:
def process_text(context, target):
    """Turn a batch of raw string pairs into model inputs and labels.

    Args:
        context: Batch of source strings.
        target: Batch of target strings.

    Returns:
        ``((context_ids, targ_in), targ_out)`` where ``context_ids`` is the
        dense token-id tensor of the source, ``targ_in`` is the target
        without its last token and ``targ_out`` is the target without its
        first token (i.e. ``targ_in`` shifted one step ahead).
    """
    context_ids = context_text_processor(context).to_tensor()
    target_ids = target_text_processor(target)
    # Slice while still ragged, then densify each view separately.
    decoder_input = target_ids[:, :-1].to_tensor()
    decoder_labels = target_ids[:, 1:].to_tensor()
    return (context_ids, decoder_input), decoder_labels


# Vectorize both splits, letting tf.data pick the parallelism.
train_ds = train_raw.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Pull one training batch and print the first example of each tensor.
# The names bound here are reused by the shape-check cells further down.
for batch_inputs, translation in train_ds.take(1):
    to_translate, sr_translation = batch_inputs
    print(to_translate[0, :].numpy())
    print()
    print(sr_translation[0, :].numpy())
    print(translation[0, :].numpy())

# Build the Encoder, Decoder and Attention Mechanism

In [None]:
# Size of the embedding tables and of the decoder's output layer.
VOCAB_SIZE = 50_000
# Width of the embeddings, LSTM layers and attention key dimension.
UNITS = 256

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Embeds source token ids and runs them through a bidirectional LSTM."""

    def __init__(self, vocab_size, units):
        """Create the embedding and recurrent layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            units: Embedding width and LSTM hidden size.
        """
        super().__init__()

        # Token id 0 is masked out (mask_zero=True).
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, units, mask_zero=True
        )

        # Forward and backward outputs are summed, so the output width
        # stays at `units`.
        sequence_lstm = tf.keras.layers.LSTM(units, return_sequences=True)
        self.rnn = tf.keras.layers.Bidirectional(sequence_lstm, merge_mode="sum")

    def call(self, context):
        """Return the per-token encoder outputs for `context` token ids."""
        embedded = self.embedding(context)
        return self.rnn(embedded)

In [None]:
# Smoke test: encode the sample batch and compare input/output shapes.
encoder = Encoder(vocab_size=VOCAB_SIZE, units=UNITS)
encoder_output = encoder(to_translate)

print(f'Tensor of sentences in english has shape: {to_translate.shape}\n')
print(f'Encoder output has shape: {encoder_output.shape}')

In [None]:
class CrossAttention(tf.keras.layers.Layer):
    """Single-head attention from target to context, with residual + layer norm."""

    def __init__(self, units):
        """Create the attention, normalization and residual-add layers.

        Args:
            units: Key dimension of the attention head.
        """
        super().__init__()

        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=1, key_dim=units)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

    def call(self, context, target):
        """Attend from `target` (query) over `context` (value).

        Returns:
            Layer-normalized sum of `target` and the attention output.
        """
        attended = self.mha(query=target, value=context)
        # Residual connection around the attention, then normalize.
        residual = self.add([target, attended])
        return self.layernorm(residual)

In [None]:
# Smoke test: attend from embedded target tokens over the encoder output.
attention_layer = CrossAttention(units=UNITS)

# Throwaway embedding, used only to give the target ids a feature dimension.
demo_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, output_dim=UNITS, mask_zero=True)
sr_translation_embed = demo_embedding(sr_translation)

attention_result = attention_layer(encoder_output, sr_translation_embed)

print(f'Tensor of contexts has shape: {encoder_output.shape}')
print(f'Tensor of translations has shape: {sr_translation_embed.shape}')
print(f'Tensor of attention scores has shape: {attention_result.shape}')

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Embedding -> LSTM -> cross-attention -> LSTM -> log-softmax over the vocabulary."""

    def __init__(self, vocab_size, units):
        """Create the decoder sub-layers.

        Args:
            vocab_size: Embedding table size and width of the output layer.
            units: Embedding width, LSTM hidden size and attention key size.
        """
        super().__init__()

        # Token id 0 is masked out (mask_zero=True).
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, units, mask_zero=True
        )

        # Also returns its final hidden/cell state, so decoding can resume
        # from it one step at a time.
        self.pre_attention_rnn = tf.keras.layers.LSTM(
            units, return_sequences=True, return_state=True
        )

        self.attention = CrossAttention(units)

        self.post_attention_rnn = tf.keras.layers.LSTM(units, return_sequences=True)

        # The output activation is log_softmax, so the values called
        # "logits" here are log-probabilities.
        self.output_layer = tf.keras.layers.Dense(
            vocab_size, activation=tf.nn.log_softmax
        )

    def call(self, context, target, state=None, return_state=False):
        """Score next tokens for `target` given the encoded `context`.

        Args:
            context: Encoder output to attend over.
            target: Target token ids fed to the decoder.
            state: Optional initial state for the pre-attention LSTM.
            return_state: When true, also return that LSTM's final state.

        Returns:
            The output-layer result, or ``(result, [hidden_state, cell_state])``
            when `return_state` is true.
        """
        embedded = self.embedding(target)
        rnn_out, hidden_state, cell_state = self.pre_attention_rnn(
            embedded, initial_state=state
        )
        attended = self.attention(context, rnn_out)
        logits = self.output_layer(self.post_attention_rnn(attended))

        if not return_state:
            return logits

        return logits, [hidden_state, cell_state]

In [None]:
# Smoke test: decode the sample batch against the encoder output.
decoder = Decoder(vocab_size=VOCAB_SIZE, units=UNITS)
logits = decoder(encoder_output, sr_translation)

print(f'Tensor of contexts has shape: {encoder_output.shape}')
print(f'Tensor of right-shifted translations has shape: {sr_translation.shape}')
print(f'Tensor of logits has shape: {logits.shape}')

In [None]:
class Translator(tf.keras.Model):
    """End-to-end model: an Encoder followed by a Decoder."""

    def __init__(self, vocab_size, units):
        """Build the encoder and decoder with the same vocabulary size and width."""
        super().__init__()

        self.encoder = Encoder(vocab_size, units)
        self.decoder = Decoder(vocab_size, units)

    def call(self, inputs):
        """Run the full model.

        Args:
            inputs: ``(context, target)`` pair of token-id tensors.

        Returns:
            The decoder output for `target` given the encoded `context`.
        """
        context, target = inputs
        return self.decoder(self.encoder(context), target)

In [None]:
# Smoke test: run the full model on the sample batch.
translator = Translator(vocab_size=VOCAB_SIZE, units=UNITS)
sample_inputs = (to_translate, sr_translation)
logits = translator(sample_inputs)

print(f'Tensor of sentences to translate has shape: {to_translate.shape}')
print(f'Tensor of right-shifted translations has shape: {sr_translation.shape}')
print(f'Tensor of logits has shape: {logits.shape}')

# Compile and Train the model

In [None]:
def masked_loss(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over non-padding tokens.

    Args:
        y_true: Integer label ids; id 0 marks positions to ignore.
        y_pred: Per-token model outputs.

    Returns:
        Sum of the per-token losses at non-zero labels divided by the number
        of non-zero labels.
    """
    per_token_loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none'
    )(y_true, y_pred)

    # 1.0 where the label is a real token, 0.0 where it is padding.
    keep = tf.cast(y_true != 0, per_token_loss.dtype)

    return tf.reduce_sum(per_token_loss * keep)/tf.reduce_sum(keep)

def masked_acc(y_true, y_pred):
    """Token accuracy computed over non-padding positions only.

    Args:
        y_true: Integer label ids; id 0 marks padding positions to ignore.
        y_pred: Per-token model scores; the arg-max over the last axis is
            taken as the predicted id.

    Returns:
        Number of correct predictions at non-padding positions divided by
        the number of non-padding positions.
    """
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)

    mask = tf.cast(y_true != 0, tf.float32)
    # Fix: apply the mask to the matches as well. Without it, padded
    # positions where the model also predicts id 0 were counted as hits
    # while being excluded from the denominator, inflating the metric
    # (it could exceed 1.0).
    match = tf.cast(y_true == y_pred, tf.float32) * mask

    return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [None]:
def compile_and_train(model, epochs=100, steps_per_epoch=100):
    """Compile `model` with Adam and the masked loss/metrics, then fit it.

    Training iterates over a repeated `train_ds` for `steps_per_epoch`
    batches per epoch and validates on 20 batches of `val_ds`. An
    EarlyStopping callback with patience 3 is attached.

    Args:
        model: Keras model to train.
        epochs: Maximum number of epochs.
        steps_per_epoch: Training batches per epoch.

    Returns:
        ``(model, history)`` - the trained model and the History object
        returned by ``fit``.
    """
    model.compile(
        loss=masked_loss,
        optimizer="adam",
        metrics=[masked_acc, masked_loss],
    )

    early_stopping = tf.keras.callbacks.EarlyStopping(patience=3)
    fit_options = dict(
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        validation_data=val_ds,
        validation_steps=20,
        callbacks=[early_stopping],
    )
    # The training set is repeated so every epoch can draw
    # `steps_per_epoch` batches.
    history = model.fit(train_ds.repeat(), **fit_options)

    return model, history

In [None]:
# Train the translator with the default epoch and step settings.
model, history = compile_and_train(model=translator)

# Some Utility Functions

In [None]:
# Both lookups share the target vectorizer's vocabulary, with "" as the
# mask token and "[UNK]" as the out-of-vocabulary token.
_target_vocabulary = target_text_processor.get_vocabulary()
_lookup_options = dict(
    vocabulary=_target_vocabulary,
    mask_token="",
    oov_token="[UNK]",
)

# Token string -> integer id.
word_to_id = tf.keras.layers.StringLookup(**_lookup_options)

# Integer id -> token string (inverse mapping).
id_to_word = tf.keras.layers.StringLookup(invert=True, **_lookup_options)

def tokens_to_text(tokens, id_to_word):
    """Convert token ids to a space-separated string tensor.

    Args:
        tokens: Integer token ids (tensor or nested list).
        id_to_word: Inverse lookup layer mapping ids to token strings.

    Returns:
        String tensor with the words joined by single spaces along the last
        axis.
    """
    words = id_to_word(tokens)
    # Fix: `" ".join(words)` raised TypeError because iterating a string
    # tensor yields tensors, not `str`. Callers also apply `tf.squeeze` and
    # `.numpy().decode()` to the result, so it must stay a tensor.
    text = tf.strings.reduce_join(words, axis=-1, separator=" ")
    return text

# Ids of the out-of-vocabulary, start-of-sequence and end-of-sequence tokens.
unk_id, sos_id, eos_id = (
    word_to_id(special_token) for special_token in ("[UNK]", "[SOS]", "[EOS]")
)

# Minimum Bayes Risk Decoding

In [None]:
def generate_next_token(decoder, context, next_token, done, state, temperature=0.0):
    """Run one decoding step and pick the next token.

    Args:
        decoder: Callable decoder returning ``(logits, state)`` when called
            with ``return_state=True``.
        context: Encoded source passed through to the decoder.
        next_token: Token tensor fed to the decoder for this step.
        done: Current termination flag; set to True when EOS is produced.
        state: Decoder state carried over from the previous step.
        temperature: 0.0 selects the arg-max token; any other value divides
            the logits by it and samples from the resulting distribution.

    Returns:
        ``(next_token, logit, state, done)`` - the chosen token reshaped to
        ``(1, 1)``, the (possibly temperature-scaled) logit of that token,
        the updated decoder state and the updated termination flag.
    """
    step_logits, state = decoder(context, next_token, state=state, return_state=True)

    # Only the scores of the last position matter for the next token.
    last_logits = step_logits[:, -1, :]

    if temperature != 0.0:
        last_logits = last_logits / temperature
        chosen = tf.random.categorical(last_logits, num_samples=1)
    else:
        chosen = tf.argmax(last_logits, axis=-1)

    # Drop the size-1 dimensions so the token can index the logit vector.
    flat_logits = tf.squeeze(last_logits)
    token_index = tf.squeeze(chosen)

    logit = flat_logits[token_index].numpy()

    next_token = tf.reshape(token_index, shape=(1, 1))

    if next_token == eos_id:
        done = True

    return next_token, logit, state, done

In [None]:
def translate(model, text, max_length=50, temperature=0.0):
    """Translate one sentence by decoding token by token.

    Args:
        model: Trained model exposing ``encoder`` and ``decoder`` attributes.
        text: Source sentence as a single string.
        max_length: Maximum number of decoding steps.
        temperature: Passed to ``generate_next_token``; 0.0 means greedy.

    Returns:
        ``(translation, logit, tokens)`` - the decoded string, the logit of
        the last token kept, and the generated ids as a ``(1, T)`` tensor.
        If EOS is produced on the very first step, returns an empty string,
        the logit of that step, and an empty ``(1, 0)`` id tensor.
    """
    tokens, logits = [], []

    text = tf.convert_to_tensor(text)[tf.newaxis]
    # Fix: the source-side vectorizer defined in this file is
    # `context_text_processor`; the previous name `english_vectorizer`
    # is not defined anywhere here and raised NameError.
    context = context_text_processor(text).to_tensor()
    context = model.encoder(context)

    next_token = tf.fill((1, 1), sos_id)

    # Decoding starts from an all-zero LSTM hidden/cell state.
    state = [tf.zeros((1, UNITS)), tf.zeros((1, UNITS))]

    done = False
    # Fallback score for the case where no decoding step runs at all.
    logit = 0.0

    for _ in range(max_length):
        next_token, logit, state, done = generate_next_token(
            decoder=model.decoder,
            context=context,
            next_token=next_token,
            done=done,
            state=state,
            temperature=temperature,
        )

        if done:
            # The EOS token itself is not part of the translation.
            break

        tokens.append(next_token)
        logits.append(logit)

    if not tokens:
        # Nothing was generated before EOS: `tf.concat([])` and
        # `logits[-1]` would both fail, so return an explicit empty result.
        return "", logit, tf.zeros((1, 0), dtype=next_token.dtype)

    tokens = tf.concat(tokens, axis=-1)

    translation = tf.squeeze(tokens_to_text(tokens, id_to_word))
    translation = translation.numpy().decode()

    return translation, logits[-1], tokens

In [None]:
def generate_samples(model, text, n_samples=4, temperature=0.6):
    """Draw several candidate translations of `text`.

    Args:
        model: Trained model passed through to ``translate``.
        text: Source sentence as a single string.
        n_samples: Number of candidates to generate.
        temperature: Sampling temperature passed to ``translate``.

    Returns:
        ``(samples, log_probs)`` - a list of token-id lists (one per
        candidate) and the matching list of scores returned by ``translate``.
    """
    samples, log_probs = [], []

    for _ in range(n_samples):
        _, logp, sample = translate(model, text, temperature=temperature)
        # Fix: flatten with reshape(-1) rather than np.squeeze. Squeezing a
        # one-token sample yields a 0-d array whose tolist() is a bare int,
        # which later breaks `set(candidate)` in the similarity function.
        samples.append(sample.numpy().reshape(-1).tolist())
        log_probs.append(logp)

    return samples, log_probs

In [None]:
def jaccard_similarity(candidate, reference):
    """Return the Jaccard similarity between two token sequences.

    The sequences are compared as sets, so order and repetition are ignored.

    Args:
        candidate: Iterable of hashable tokens.
        reference: Iterable of hashable tokens.

    Returns:
        ``|candidate ∩ reference| / |candidate ∪ reference|`` as a float in
        [0, 1]. Two empty sequences are treated as identical and score 1.0.
    """
    candidate_set = set(candidate)
    reference_set = set(reference)

    all_tokens = candidate_set | reference_set
    if not all_tokens:
        # Fix: both inputs empty used to raise ZeroDivisionError. Identical
        # sequences score 1.0 everywhere else, so do the same here.
        return 1.0

    common_tokens = candidate_set & reference_set

    return len(common_tokens) / len(all_tokens)

In [None]:
def weighted_avg_overlap(samples, log_probs, similarity_fn):
    """Score each sample by its weighted average similarity to the others.

    Every other sample contributes ``similarity_fn(candidate, sample)``
    weighted by ``exp(log_prob)`` of that other sample.

    Args:
        samples: List of candidate token sequences.
        log_probs: Log-probability score of each sample, aligned with
            `samples`.
        similarity_fn: Callable ``(candidate, sample) -> float``.

    Returns:
        Dict mapping each sample index to its score rounded to 3 decimals.
        A candidate with no other samples to compare against, or whose
        peers all have zero weight, scores 0.0.
    """
    # Convert each log-probability to a weight once, not once per pair.
    weights = [float(np.exp(logp)) for logp in log_probs]

    scores = {}

    for index_candidate, candidate in enumerate(samples):
        overlap, weight_sum = 0.0, 0.0

        for index_sample, (sample, weight) in enumerate(zip(samples, weights)):
            # A candidate is never compared with itself.
            if index_candidate == index_sample:
                continue

            weight_sum += weight
            overlap += weight * similarity_fn(candidate, sample)

        # Fix: with a single sample (or fully underflowed weights) the
        # denominator is zero and the division used to raise.
        score = 0.0 if weight_sum == 0.0 else overlap / weight_sum

        scores[index_candidate] = round(score, 3)

    return scores

In [None]:
def mbr_decode(model, text, n_samples=5, temperature=0.6, similarity_fn=jaccard_similarity):
    """Pick the candidate translation that agrees most with the others.

    Args:
        model: Trained model passed to ``generate_samples``.
        text: Source sentence as a single string.
        n_samples: Number of candidates to sample.
        temperature: Sampling temperature for the candidates.
        similarity_fn: Pairwise similarity used to score the candidates.

    Returns:
        ``(translation, decoded_translations)`` - the text of the
        highest-scoring candidate and the text of every candidate.
    """
    samples, log_probs = generate_samples(
        model, text, n_samples=n_samples, temperature=temperature
    )
    scores = weighted_avg_overlap(samples, log_probs, similarity_fn)

    decoded_translations = []
    for sample_tokens in samples:
        decoded = tokens_to_text(sample_tokens, id_to_word).numpy().decode('utf-8')
        decoded_translations.append(decoded)

    # Index of the best-scoring candidate; ties go to the earliest one.
    best_index = max(scores, key=scores.get)

    return decoded_translations[best_index], decoded_translations

# Test the model

In [None]:
# Try the trained model on a single sentence with 10 sampled candidates.
english_sentence = "I love languages"

translation, candidates = mbr_decode(
    model,
    english_sentence,
    n_samples=10,
    temperature=0.6,
)

print("Translation candidates:")
for c in candidates:
    print(c)

print(f"\nSelected translation: {translation}")