# Project 2 Deep Learning :  Machine Translation for two "artificial" languages

### Imports and Loading of Data

In [1]:
import tensorflow 
import pickle
import string
import re
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model

In [2]:
# Load the input and output data
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import optimizers
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization


### Generating the Language by combining input and output 

In [3]:
# Load the parallel corpora. The files are opened in `with` blocks so the
# handles are closed deterministically instead of being left to the GC.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('Train_input', 'rb') as train_input_file:
    train_input_texts = pickle.load(train_input_file)
with open('Train_output', 'rb') as train_output_file:
    train_output_texts = pickle.load(train_output_file)

# Getting the max length of input and output (len() of each sentence string,
# i.e. a character count, not a token count)
input_lengthMax = max(len(txt) for txt in train_input_texts)
output_lengthMax = max(len(txt) for txt in train_output_texts)

print(input_lengthMax)
print(output_lengthMax)

64
99


### For training and testing purposes, I split the dataset and set aside a small proportion of the data samples for my own testing and verification; below is the train set I used for training and testing. After this, an 80-20 split is done on this train set for training and validation.

In [4]:
# language = pickle.load(open('train_set', 'rb')) # Test Set of 500 samples

In [5]:
# Pair every source sentence with its target sentence wrapped in the
# "[start]" / "[end]" markers the decoder is trained on.
language = [
    (line_ip, "[start] " + line_op + " [end]")
    for line_ip, line_op in zip(train_input_texts, train_output_texts)
]

# Longest wrapped target, measured with len() on the string (characters).
max_line_op_length = max((len(pair[1]) for pair in language), default=0)

print(max_line_op_length)
print(len(language))
print(language[1])

113
112000
('a d a d b d a e b d a g c g a g c f c f ', '[start] b d b d a e e a d d f c g c f c f a g i j a g h k a d g l  [end]')


In [6]:
# Hold out the trailing 20% of the pairs for validation; the leading 80%
# are used for training. Shuffling before the split is left disabled:
# random.shuffle(language)
val_set_num = int(0.2 * len(language))
train_set_num = len(language) - val_set_num
train_set, val_set = language[:train_set_num], language[train_set_num:]
print(val_set_num)
print(train_set_num)

22400
89600


### Doing the Source and Target Vectorization of the input language and output language
#### Vocab Size and Sequence Length are chosen by analyzing the tokens and language and by experimenting with the different values in that range.

In [7]:
def custom_standardization(input_string):
    """Lowercase the text and delete punctuation, keeping "[" and "]".

    The square brackets are excluded from the stripped characters so that the
    "[start]" and "[end]" markers pass through standardization unchanged.

    Args:
        input_string: string tensor handed in by the TextVectorization layer.

    Returns:
        The lowercased tensor with every remaining `string.punctuation`
        character removed.
    """
    removable = "".join(ch for ch in string.punctuation if ch not in "[]")
    pattern = f"[{re.escape(removable)}]"
    return tf.strings.regex_replace(
        tf.strings.lower(input_string), pattern, "")

# Vocabulary cap shared by both vectorizers (chosen by analyzing the language).
vocab_size = 22
# Number of token positions every vectorized sequence is padded/truncated to.
# NOTE(review): the 113 printed earlier is len() of the target *string*
# (characters), not a token count -- confirm that 130 is the intended value.
sequence_length = 130

# Source side: uses the layer's default standardization.
source_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)
# Target side: one extra position, because format_dataset slices the target
# into otp[:, :-1] (decoder input) and otp[:, 1:] (labels); the custom
# standardization keeps the "[" and "]" of the start/end markers.
target_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
# Vocabularies are learned from the training split only.
train_inp_lang = [pair[0] for pair in train_set]
train_otp_lang = [pair[1] for pair in train_set]
source_vectorization.adapt(train_inp_lang)
target_vectorization.adapt(train_otp_lang)

### Preprocessing the data

In [8]:
batch_size = 64

def format_dataset(inp, otp):
    """Vectorize a batch of sentence pairs into model inputs and labels.

    Args:
        inp: batch of source-language strings.
        otp: batch of target-language strings (already wrapped in markers).

    Returns:
        A `(features, labels)` tuple. `features` maps "language_1" to the
        vectorized source and "language_2" to the vectorized target without
        its last position; `labels` is the vectorized target without its
        first position, i.e. the same sequence shifted one step ahead.
    """
    source_tokens = source_vectorization(inp)
    target_tokens = target_vectorization(otp)
    features = {
        "language_1": source_tokens,
        "language_2": target_tokens[:, :-1],
    }
    labels = target_tokens[:, 1:]
    return features, labels

def make_dataset(pairs):
    """Build a batched, vectorized tf.data pipeline from sentence pairs.

    Args:
        pairs: non-empty sequence of `(source_text, target_text)` tuples.

    Returns:
        A `tf.data.Dataset` yielding the `(features, labels)` batches produced
        by `format_dataset`.
    """
    inp_lang, otp_lang = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices(
        (list(inp_lang), list(otp_lang)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    # Cache the vectorized batches BEFORE shuffling: caching after shuffle()
    # stores the first epoch's order and replays that identical order on every
    # later epoch. prefetch() goes last so it overlaps with the consumer.
    return dataset.cache().shuffle(2048).prefetch(16)

train_dataset, val_dataset = make_dataset(train_set), make_dataset(val_set)

# Peek at a single training batch to confirm the tensor shapes.
for inputs, targets in train_dataset.take(1):
    language_1_shape = inputs['language_1'].shape
    language_2_shape = inputs['language_2'].shape
    print(f"inputs['language_1'].shape: {language_1_shape}")
    print(f"inputs['language_2'].shape: {language_2_shape}")
    print(f"targets.shape: {targets.shape}")

inputs['language_1'].shape: (64, 130)
inputs['language_2'].shape: (64, 130)
targets.shape: (64, 130)


### Transformer Encoder : Implementing Layer as a subclass using dense layers and relu activation function. 
#### Different experiments were done with the activation function, the MultiHead Attention layer and the dimensions, but this is the code which gave the best accuracy.

In [9]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class TransformerEncoder(layers.Layer):
    """Encoder block: self-attention followed by a two-layer dense projection.

    Each of the two sub-blocks is wrapped in a residual connection and a
    LayerNormalization.

    Args:
        embed_dim: size of the input/output vectors (also used as `key_dim`).
        dense_dim: width of the intermediate ReLU dense layer.
        num_heads: number of attention heads.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        # Multi-head self-attention
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # Feed-forward part: ReLU dense layer, then a linear dense layer that
        # maps back to embed_dim
        self.dense_proj = keras.Sequential([
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        # One normalization per residual connection
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Apply attention and the dense projection to `inputs`.

        Args:
            inputs: embedded sequence tensor.
            mask: optional mask; when given, an axis is inserted at position 1
                before it is passed to the attention layer.
        """
        attention_mask = None if mask is None else mask[:, tf.newaxis, :]
        attended = self.attention(
            inputs, inputs, attention_mask=attention_mask)
        normed = self.layernorm_1(inputs + attended)
        return self.layernorm_2(normed + self.dense_proj(normed))

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,  # input/output vector size
            "num_heads": self.num_heads,  # heads in the attention layer
            "dense_dim": self.dense_dim,  # width of the intermediate dense layer
        }

### Transformer Decoder : Implementing Layer as a subclass using dense layers and relu activation function. 
#### Different experiments were done with the activation function, the normalization layers and the dimensions, but this is the code which gave the best accuracy.

In [10]:
class TransformerDecoder(layers.Layer):
    """Decoder block: self-attention, attention over the encoder, dense projection.

    Each of the three sub-blocks is followed by a residual connection and a
    LayerNormalization.

    Args:
        embed_dim: size of the input/output vectors (also used as `key_dim`).
        dense_dim: width of the intermediate ReLU dense layer.
        num_heads: number of heads in both attention layers.
    """
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim # dimensionality of embedding
        self.dense_dim = dense_dim # hidden layer dimensionality in the feedforward network inside the decoder
        self.num_heads = num_heads # no of heads for use in the multi-head attention layers
        # attention_1: self-attention over the decoder inputs (see call)
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # attention_2: queries from the decoder, keys/values from the encoder
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # ReLU dense layer followed by a linear dense layer back to embed_dim
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        # Declare mask support so Keras hands the incoming mask to call()
        self.supports_masking = True

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_causal_attention_mask(self, inputs):
        """Build an int32 lower-triangular mask, repeated once per batch element.

        Entry (i, j) is 1 when i >= j and 0 otherwise. The result has shape
        (batch, seq, seq), where batch and seq are the first two dimensions
        of `inputs`.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # mult == [batch_size, 1, 1]: tile only along the batch axis
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)
    
    def call(self, inputs, encoder_outputs, mask=None):
        """Run the decoder block.

        Args:
            inputs: embedded decoder-side sequence.
            encoder_outputs: output of the encoder, used as key and value of
                the second attention layer.
            mask: optional mask for `inputs` (presumably the padding mask
                propagated by Keras from the embedding layer -- TODO confirm).
        """
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            # Element-wise minimum of the incoming mask and the causal mask
            padding_mask = tf.cast(
                mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            # No incoming mask: the second attention layer runs unmasked
            padding_mask = mask
        # Self-attention uses only the causal mask
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        # NOTE(review): padding_mask is derived from the decoder inputs but is
        # applied over encoder_outputs here; the shapes only line up when the
        # source and target sequence lengths are equal -- confirm this is intended.
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(
            attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

### Positional Embeddings to re-inject order information. Implementing positional embedding as a subclassed layer.
#### Tried various optimizations such as sinusoidal positional and absolute embeddings, but the best one was learned embeddings.

In [11]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class PositionalEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        sequence_length: number of rows in the position-embedding table.
        input_dim: number of rows in the token-embedding table.
        output_dim: size of each embedding vector.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        # One table indexed by token id, one indexed by position
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        """Embed `inputs` and add the embedding of each position index."""
        seq_len = tf.shape(inputs)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        return (self.token_embeddings(inputs)
                + self.position_embeddings(position_ids))

    def compute_mask(self, inputs, mask=None):
        """Mark as valid every position whose token id is not 0."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        return {
            **super().get_config(),
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        }

### Summing up the Model by calling the layers
#### 1. Various experiments were done with the dimension, num heads  but the best one is kept. 
#### 2. Different versions of optimizer were also tried and experiments with learning rate such as Decay Learning Rate were also tried.
#### 3. Experimented with the dropout layers and percentage to do regularization.
#### 4. Early Stopping was also tried with different values of patience.

In [None]:
# Hyper-parameters in use. A larger configuration that was also tried:
#   embed_dim = 512, dense_dim = 4096, num_heads = 16
embed_dim, dense_dim, num_heads = 256, 2048, 8

# Encoder branch: source token ids -> positional embedding -> encoder block
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="language_1")
source_embedded = PositionalEmbedding(
    sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(
    embed_dim, dense_dim, num_heads)(source_embedded)

# Decoder branch: same flow, with the decoder block also reading the encoder output
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="language_2")
target_embedded = PositionalEmbedding(
    sequence_length, vocab_size, embed_dim)(decoder_inputs)
decoded = TransformerDecoder(
    embed_dim, dense_dim, num_heads)(target_embedded, encoder_outputs)
# Dropout on the decoder output for regularization
regularized = layers.Dropout(0.5)(decoded)
# Softmax over the vocabulary at every output position
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(regularized)

# The model takes two inputs: the source sequence and the shifted target sequence
transformer = keras.Model(
    inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

In [None]:
# Call the method: a bare `transformer.summary` only displays the bound-method
# repr and never prints the layer table.
transformer.summary()

In [None]:
# Model saving: checkpoint to a single .h5 file, overwriting it only when the
# callback's monitored quantity improves (save_best_only=True).
callbacks = [
    keras.callbacks.ModelCheckpoint(
        "Rishabh_Bassi_532008692_Project2_Model.h5",
        save_best_only=True,
    ),
]

In [None]:
# from tensorflow.keras.optimizers.schedules import PolynomialDecay
#from tensorflow.keras.callbacks import EarlyStopping

# early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# initial_learning_rate = 0.001
# end_learning_rate = 0.0001
# epochs = 120
# decay_steps = len(train_dataset) * epochs
# lr_schedule = PolynomialDecay(
#     initial_learning_rate, decay_steps, end_learning_rate, power=0.5
# )

# optimizer = Adam(learning_rate=lr_schedule)

In [None]:
epochs = 45

# Default Adam optimizer; labels are integer token ids, hence the sparse loss.
transformer.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=Adam(),
    metrics=["accuracy"],
)
transformer.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs,
    callbacks=callbacks,
)

### Final Testing Code

### Code for Generating Predictions and Local Verification of Model Generated by Separating some percentage of Test Data

Below are two cells which you have to run to generate the predictions. In the first cell, replace the test input file name in the load statement so that your data is loaded into test_input.
Executing the second cell requires the classes defined above (Transformer encoder, decoder, etc.) and the vectorization layers. So before running it, kindly run the cells above from the start up to and including the Positional Embedding cell; the model-training cells can be skipped.

In [12]:
# Local verification alternative (comment out the load below and uncomment
# these two lines to predict on the held-out local test split instead):
# test_set = pickle.load(open('test_set', 'rb'))
# test_input = [sentence[0] for sentence in test_set] 

# Load the actual test input. Replace the file name with your own test file.
# The `with` block closes the file handle once the data is loaded.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('Test_input_1000', 'rb') as test_input_file:
    test_input = pickle.load(test_input_file)

In [13]:
def predictionGeneration(model,input_lines):
    """Greedily decode the translation of one source sentence.

    Starting from "[start]", the model is called once per step with the text
    generated so far; the highest-scoring token at the current position is
    appended, until "[end]" is produced or `sequence_length` steps have run.

    Args:
        model: the trained transformer, called with
            `[source_tokens, decoder_tokens]`.
        input_lines: a single source sentence (string).

    Returns:
        The generated text, beginning with "[start]" and ending with "[end]"
        when the model emitted it within the step limit.
    """
    index_to_token = dict(enumerate(target_vectorization.get_vocabulary()))
    source_tokens = source_vectorization([input_lines])
    translated_line = "[start]"
    for step in range(sequence_length):
        # Re-vectorize the partial output; the last position is dropped, as
        # is done for the decoder input during training.
        decoder_tokens = target_vectorization([translated_line])[:, :-1]
        probabilities = model([source_tokens, decoder_tokens])
        next_token = index_to_token[np.argmax(probabilities[0, step, :])]
        translated_line = f"{translated_line} {next_token}"
        if next_token == "[end]":
            break
    return translated_line
    
def modelTesting(model,test_data):
    """Translate every sentence in `test_data` and strip the marker tokens.

    The "[start]" and "[end]" markers are removed token-wise rather than with
    chained str.replace calls, so an immediately terminated prediction
    ("[start] [end]") yields an empty string instead of a stray "[end]".

    Args:
        model: the trained transformer passed on to `predictionGeneration`.
        test_data: iterable of source sentences (strings).

    Returns:
        List of predicted sentences, one per input, without marker tokens.
    """
    marker_tokens = {"[start]", "[end]"}
    predictions = []

    for data in test_data:
        raw_prediction = predictionGeneration(model, data)
        # split(" ") / " ".join round-trips the text exactly, so only the
        # marker tokens themselves are dropped.
        kept_tokens = [
            token for token in raw_prediction.split(" ")
            if token not in marker_tokens
        ]
        predictions.append(" ".join(kept_tokens))

    return predictions
    
# Reload the best checkpoint; the custom layers must be registered so Keras
# can rebuild them from the saved config.
model = keras.models.load_model("Rishabh_Bassi_532008692_Project2_Model.h5",
    custom_objects={"TransformerDecoder": TransformerDecoder,
                    "TransformerEncoder": TransformerEncoder,
                    "PositionalEmbedding": PositionalEmbedding})


prediction_set = modelTesting(model,test_input)

# Write inside a `with` block so the file is flushed and closed before any
# later cell reads it back.
with open('Rishabh_Bassi_532008692_Project2_Prediction', 'wb') as prediction_file:
    pickle.dump(prediction_set, prediction_file)

### This is the cell which you can execute to get the accuracy of model given that prediction file is provided. Before Running this cell, Prediction file needs to be provided.

In [None]:
# Actual Testing code placeholder 
import pickle

# Loading datasets (handles are closed as soon as each file is read).
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("test_output", "rb") as test_output_file:  # your filename needs to be here to compare predictions
    test_output = pickle.load(test_output_file)
with open("Rishabh_Bassi_532008692_Project2_Prediction", "rb") as prediction_file:
    test_prediction = pickle.load(prediction_file)

# A sentence counts as correct only on an exact match after stripping
# surrounding whitespace.
total_sentences = len(test_output)
total_correct_sentences = sum(pred_sentence.strip() == true_sentence.strip() for pred_sentence, true_sentence in zip(test_prediction, test_output))

test_acc = total_correct_sentences / total_sentences
your_score = round(test_acc * 1000) / 10
print(f"Your Test Accuracy: {test_acc:.4f}")
print(f"Your Score: {your_score}")

### Reference for Local Testing Code

In [14]:
#This code is for local testing
# Load the held-out local split here so the cell does not depend on a
# `test_set` variable that is only defined by commented-out code.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('test_set', 'rb') as test_set_file:
    test_set = pickle.load(test_set_file)
test_output = [sentence[1] for sentence in test_set]
print("Size of True Output vs Size of Preds:",len(test_output),len(prediction_set))

if len(test_output) != len(prediction_set):
    # The predictions were generated from a different input file, so a
    # sentence-by-sentence comparison would be meaningless.
    print("Skipping local accuracy: prediction_set was not generated from "
          "the local test_set inputs.")
else:
    total_sentences = len(test_output)

    # Exact-match accuracy after removing the marker tokens from the truth.
    correct_count = sum(
        true_sentence.replace('[start] ','').replace(' [end]','').strip()
        == predicted_sentence.strip()
        for true_sentence, predicted_sentence in zip(test_output, prediction_set)
    )

    test_acc = correct_count / total_sentences if total_sentences else 0.0
    your_score = round(test_acc * 1000) / 10
    print(f"Your Test Accuracy: {test_acc:.4f}")
    print(f"Your Score: {your_score}")

Size of True Output vs Size of Preds: 500 500
Your Test Accuracy: 0.9800
Your Score: 98.0
