In [2]:
import numpy as np
import unicodedata
import re
import time
import pickle

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from keras.utils.vis_utils import plot_model

In [3]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it up front.
# Looping over the device list (rather than indexing [0]) makes this a no-op on
# CPU-only machines, where the list is empty, and applies the same setting to
# every GPU that was found.
physical_devices = tf.config.list_physical_devices("GPU")
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [None]:
class InputEmbedding(layers.Layer) :
    '''
    Wraps a trainable keras Embedding that maps token ids to dense vectors.
    '''
    def __init__(self, embedding_dim, enc_max_length, input_lang_vocab_size) :
        '''
        parameters : embedding_dim - size of every embedding vector
                     enc_max_length - kept as self.max_length; not used by this layer itself
                     input_lang_vocab_size - number of entries in the embedding table
        '''
        super().__init__()
        self.embedding_dim = embedding_dim
        self.max_length = enc_max_length
        self.vocab_size = input_lang_vocab_size
        self.embedding_layer = layers.Embedding(
            input_dim=input_lang_vocab_size,
            output_dim=embedding_dim,
            trainable=True,
            name='input_embedding',
        )

    def call(self, input_tensor) :
        '''
        parameters : input_tensor - tensor of shape (batch_size, sequence_length)
        returns : tensor of shape (batch_size, sequence_length, embedding_dim)
        '''
        return self.embedding_layer(input_tensor)

In [None]:
class PositionalEmbedding(layers.Layer) :
    '''
    Adds fixed sinusoidal position information to a batch of embeddings.

    For position `pos` and embedding dimension `j` (with i = j // 2):
        even j : sin(pos / base ** (2 * i / embedding_dim))
        odd  j : cos(pos / base ** (2 * i / embedding_dim))
    i.e. sine and cosine alternate along the embedding axis, not along the
    position axis.
    '''
    def __init__(self, wavelength_base=10000.0) :
        '''
        parameters : wavelength_base - base of the geometric progression of wavelengths
        '''
        super(PositionalEmbedding, self).__init__()
        self.wavelength_base = wavelength_base

    def call(self, word_embeddings) :
        '''
        parameters : word_embeddings - tensor of shape (batch_size, sequence_length, embedding_dim);
                                       sequence_length and embedding_dim must be statically known
        returns : embeddings_with_position - tensor of shape (batch_size, sequence_length, embedding_dim)
        '''
        seq_len = word_embeddings.shape[1]
        embedding_dim = word_embeddings.shape[2]

        # (seq_len, 1) column of positions and (1, embedding_dim) row of dimension indices.
        positions = np.arange(seq_len, dtype=np.float64)[:, np.newaxis]
        dims = np.arange(embedding_dim)[np.newaxis, :]

        # Each sin/cos pair (dimensions 2i and 2i+1) shares one frequency.
        angle_rates = 1.0 / np.power(self.wavelength_base, (2 * (dims // 2)) / embedding_dim)
        angles = positions * angle_rates

        positional_embeddings = np.zeros((seq_len, embedding_dim))
        positional_embeddings[:, 0::2] = np.sin(angles[:, 0::2])
        positional_embeddings[:, 1::2] = np.cos(angles[:, 1::2])

        # A leading axis of size 1 broadcasts over the batch, so the batch size
        # never has to be known here; the cast keeps the sum in the input dtype.
        positional_embeddings = tf.cast(positional_embeddings[np.newaxis, :, :], word_embeddings.dtype)

        embeddings_with_position = word_embeddings + positional_embeddings
        return embeddings_with_position

In [None]:
class ScaledDotProductAttention(layers.Layer) :
    '''
    Computes softmax(Q K^T / sqrt(dim)) V, optionally with a look-ahead mask
    that stops a query position from attending to later key positions.
    '''
    def __init__(self, is_mask=False) :
        '''
        parameters : is_mask - when truthy, apply the look-ahead mask
        '''
        super(ScaledDotProductAttention, self).__init__()
        self.is_mask = is_mask

    def call(self, query, key, value) :
        '''
        parameters : query - tensor of shape (batch_size, num_heads, seq_len_q, dim)
                     key - tensor of shape (batch_size, num_heads, seq_len_k, dim)
                     value - tensor of shape (batch_size, num_heads, seq_len_v, dim)
                     **seq_len_k == seq_len_v
        returns : attention - tensor of shape (batch_size, num_heads, seq_len_q, dim)
        '''
        # Scale by the per-head depth (last axis of key); axis 1 is num_heads.
        depth = key.shape[-1]

        # (batch_size, num_heads, seq_len_q, seq_len_k)
        pre_attention = tf.linalg.matmul(query, key, transpose_b=True) / np.sqrt(depth)

        if self.is_mask :
            # Large negative values strictly above the diagonal, zeros elsewhere.
            # The mask is ADDED to the scores: allowed positions keep their score
            # and future positions get ~0 weight after the softmax.
            look_ahead_mask = np.triu(
                np.full((pre_attention.shape[-2], pre_attention.shape[-1]), -1e9, dtype=np.float32),
                k=1)
            pre_attention = pre_attention + tf.cast(look_ahead_mask, pre_attention.dtype)

        attention_weights = tf.nn.softmax(pre_attention, axis=-1)

        # (batch_size, num_heads, seq_len_q, dim)
        attention = tf.linalg.matmul(attention_weights, value)

        return attention

In [None]:
class MultiHeadAttention(layers.Layer) :
    '''
    Multi-head attention: projects query/key/value with Dense layers, splits the
    projections into num_heads heads, applies ScaledDotProductAttention per head,
    then merges the heads and applies a final Dense projection.
    '''
    def __init__(self, d_model, num_heads, is_mask=False) :
        '''
        parameters : d_model - width of the projections; must be divisible by num_heads
                     num_heads - number of attention heads
                     is_mask - forwarded to ScaledDotProductAttention
        '''
        super(MultiHeadAttention, self).__init__()

        assert d_model % num_heads == 0, "d_model should be divisible by num_heads."

        self.num_heads = num_heads
        self.d_model = d_model
        self.dim = self.d_model // self.num_heads
        self.is_mask = is_mask

        self.query_layer = layers.Dense(self.d_model)
        self.key_layer = layers.Dense(self.d_model)
        self.value_layer = layers.Dense(self.d_model)

        self.scaled_dot_product_attention = ScaledDotProductAttention(is_mask=self.is_mask)

        self.linear_layer = layers.Dense(self.d_model)


    def split_heads(self, input_tensor) :
        '''
        parameters : input_tensor - tensor of shape (batch_size, seq_len, d_model)
        returns : tensor of shape (batch_size, num_heads, seq_len, dim)
        '''
        # tf.shape gives the runtime batch size, so this also works when the
        # static batch dimension is unknown (None).
        batch_size = tf.shape(input_tensor)[0]
        input_tensor = tf.reshape(input_tensor, (batch_size, -1, self.num_heads, self.dim))
        return tf.transpose(input_tensor, [0,2,1,3])


    def call(self, query, key, value) :
        '''
        parameters : query - tensor of shape (batch_size, seq_len, d_model)
                     key - tensor of shape (batch_size, seq_len, d_model)
                     value - tensor of shape (batch_size, seq_len, d_model)
        returns : res - tensor of shape (batch_size, seq_len, d_model)
        '''

        query = self.split_heads(self.query_layer(query))
        key = self.split_heads(self.key_layer(key))
        value = self.split_heads(self.value_layer(value))

        attention = self.scaled_dot_product_attention(query, key, value)

        # Back to (batch_size, seq_len, num_heads, dim), then merge the heads.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (tf.shape(attention)[0], -1, self.d_model))

        res = self.linear_layer(concat_attention)
        return res

In [None]:
class AddandNorm(layers.Layer) :
    '''
    Residual connection followed by layer normalization.
    '''
    def __init__(self) :
        super().__init__()
        self.layer_norm = layers.LayerNormalization(epsilon=1e-6)

    def call(self, input_tensor, skip_connection) :
        '''
        parameters : input_tensor - tensor of shape (batch_size, seq_len, d_model)
                     skip_connection - tensor of shape (batch_size, seq_len, d_model)
        returns : layer-normalized sum of the two inputs, shape (batch_size, seq_len, d_model)
        '''
        return self.layer_norm(input_tensor + skip_connection)

In [None]:
class FeedForward(layers.Layer) :
    '''
    Two stacked Dense layers: a ReLU layer of width hidden_dim followed by a
    linear layer of width output_dim.
    '''
    def __init__(self, hidden_dim, output_dim) :
        super().__init__()
        self.layer_1 = layers.Dense(hidden_dim, activation='relu')
        self.layer_2 = layers.Dense(output_dim)

    def call(self, input_tensor) :
        '''
        parameters : input_tensor - tensor of shape (batch_size, seq_len, d_model)
        returns : tensor of shape (batch_size, seq_len, output_dim)
        '''
        hidden = self.layer_1(input_tensor)
        return self.layer_2(hidden)

In [19]:
def concat_role_vector(x, role_vector, turns) :
    '''
    Appends, to every turn of every meeting, the role vector selected by `turns`.

    parameters : x - array-like of shape (batch_size, num_turns, embedding_dim)
                 role_vector - array-like of shape (num_roles, role_vector_dim)
                 turns - per-meeting sequences of role indices; turns[i][t] is the
                         row of role_vector used for turn t of meeting i
    returns : numpy array of shape (batch_size, num_turns, embedding_dim + role_vector_dim)
    '''
    return np.array([
        np.concatenate(
            (meeting, np.array([role_vector[role_idx] for role_idx in turns[meeting_idx]])),
            axis=1,
        )
        for meeting_idx, meeting in enumerate(x)
    ])

# x = tf.random.uniform((1, 10, 512))
# role_vector = tf.random.uniform((2, 32))
# turns = [[0,1,1,0,1,0,1,0,1,0]]

# print(concat_role_vector(x, role_vector, turns).shape)         

In [None]:
class EncoderBlock(layers.Layer) :
    '''
    One encoder layer: self-attention and a feed-forward network, each followed
    by a residual connection with layer normalization.
    '''
    def __init__(self, d_model, num_heads, ff_hidden_dim=200) :
        '''
        parameters : d_model - model width used by attention and the block output
                     num_heads - number of attention heads
                     ff_hidden_dim - width of the feed-forward hidden layer
                                     (defaults to the previously hard-coded 200)
        '''
        super(EncoderBlock, self).__init__()
        self.multi_head_attention = MultiHeadAttention(d_model, num_heads)
        self.add_and_norm_1 = AddandNorm()
        self.feed_forward = FeedForward(ff_hidden_dim, d_model)
        self.add_and_norm_2 = AddandNorm()

    def call(self, input_tensor) :
        '''
        parameters : input_tensor - tensor of shape (batch_size, seq_len, d_model)
        returns : res - tensor of shape (batch_size, seq_len, d_model)
        '''
        # Self-attention: the same tensor serves as query, key and value.
        res = self.multi_head_attention(input_tensor, input_tensor, input_tensor)
        res_skip = self.add_and_norm_1(res, input_tensor)
        res = self.feed_forward(res_skip)
        res = self.add_and_norm_2(res, res_skip)
        return res

In [None]:
class DecoderBlock(layers.Layer) :
    '''
    One decoder layer with four sub-layers, each followed by a residual
    connection with layer normalization:
      1. masked self-attention over the decoder input
      2. attention over the sentence-level encoder output
      3. attention over the turn-level encoder output
      4. feed-forward network
    '''
    def __init__(self, d_model, num_heads, ff_hidden_dim=200) :
        '''
        parameters : d_model - model width used by attention and the block output
                     num_heads - number of attention heads
                     ff_hidden_dim - width of the feed-forward hidden layer
                                     (defaults to the previously hard-coded 200)
        '''
        super(DecoderBlock, self).__init__()
        self.masked_multi_head_attention = MultiHeadAttention(d_model, num_heads, is_mask=True)
        self.add_and_norm_1 = AddandNorm()

        self.multi_head_attention_1 = MultiHeadAttention(d_model, num_heads)
        self.add_and_norm_2 = AddandNorm()

        self.multi_head_attention_2 = MultiHeadAttention(d_model, num_heads)
        self.add_and_norm_3 = AddandNorm()

        self.feed_forward = FeedForward(ff_hidden_dim, d_model)
        self.add_and_norm_4 = AddandNorm()

    def call(self, input_tensor, sentence_level_encoder_output, turn_level_encoder_output) :
        '''
        parameters : input_tensor - decoder-side tensor of shape (batch_size, seq_len, d_model)
                     sentence_level_encoder_output - keys/values for the first cross-attention
                     turn_level_encoder_output - keys/values for the second cross-attention
        returns : res - tensor of shape (batch_size, seq_len, d_model)
        '''
        res = self.masked_multi_head_attention(input_tensor, input_tensor, input_tensor)
        res_skip = self.add_and_norm_1(res, input_tensor)

        # Queries come from the decoder; keys and values come from the encoder output.
        res = self.multi_head_attention_1(res_skip, sentence_level_encoder_output, sentence_level_encoder_output)
        res_skip = self.add_and_norm_2(res, res_skip)

        res = self.multi_head_attention_2(res_skip, turn_level_encoder_output, turn_level_encoder_output)
        res_skip = self.add_and_norm_3(res, res_skip)

        res = self.feed_forward(res_skip)
        res = self.add_and_norm_4(res, res_skip)
        return res

In [None]:
class Encoder(layers.Layer) :
    '''
    Stack of num_layers EncoderBlocks applied after positional embedding.

    Rank-2 inputs are first passed through an InputEmbedding, mirroring what
    Decoder does with its input; rank-3 inputs are used as they are.
    NOTE(review): treating rank-2 inputs as token ids is an assumption based on
    the otherwise unused enc_max_length / input_lang_vocab_size arguments -
    confirm against the intended data pipeline.
    '''
    def __init__(self, num_layers, d_model, num_heads, enc_max_length, input_lang_vocab_size) :
        '''
        parameters : num_layers - number of EncoderBlocks
                     d_model - model width
                     num_heads - attention heads per block
                     enc_max_length - forwarded to InputEmbedding
                     input_lang_vocab_size - forwarded to InputEmbedding
        '''
        super(Encoder, self).__init__()
        self.num_layers = num_layers
        self.embedding_layer = InputEmbedding(d_model, enc_max_length, input_lang_vocab_size)
        self.positional_embedding_layer = PositionalEmbedding()

        self.encoder_blocks = [EncoderBlock(d_model, num_heads) for _ in range(num_layers)]

    def call(self, input_tensor) :
        '''
        parameters : input_tensor - tensor of shape (batch_size, seq_len) or
                                    (batch_size, seq_len, d_model)
        returns : x - tensor of shape (batch_size, seq_len, d_model)
        '''
        # Start from the argument; the previous code read `x` before assigning it.
        x = input_tensor
        if len(x.shape) == 2 :
            x = self.embedding_layer(x)

        x = self.positional_embedding_layer(x)

        for encoder_block in self.encoder_blocks :
            x = encoder_block(x)

        return x

In [None]:
class Decoder(layers.Layer) :
    '''
    Embeds the target sequence, adds positional information and runs it through
    num_layers DecoderBlocks, each attending to both encoder outputs.
    '''
    def __init__(self, num_layers, d_model, num_heads, dec_max_length, target_lang_vocab_size) :
        super().__init__()
        self.num_layers = num_layers
        self.embedding_layer = InputEmbedding(d_model, dec_max_length, target_lang_vocab_size)
        self.positional_embedding_layer = PositionalEmbedding()

        self.decoder_blocks = [DecoderBlock(d_model, num_heads) for _ in range(num_layers)]

    def call(self, input_tensor, sentence_level_encoder_output, turn_level_encoder_output) :
        '''
        parameters : input_tensor - target-side tensor passed to the embedding layer
                     sentence_level_encoder_output - forwarded to every DecoderBlock
                     turn_level_encoder_output - forwarded to every DecoderBlock
        returns : output of the last DecoderBlock
        '''
        hidden = self.positional_embedding_layer(self.embedding_layer(input_tensor))

        for layer_idx in range(self.num_layers) :
            decoder_block = self.decoder_blocks[layer_idx]
            hidden = decoder_block(hidden, sentence_level_encoder_output, turn_level_encoder_output)

        return hidden

In [None]:
class MTNet(tf.keras.Model) :
    '''
    Two-level encoder / single decoder model.

    The sentence-level encoder output is (optionally) augmented with role
    vectors and fed to the turn-level encoder; the decoder attends to both
    encoder outputs and a final Dense layer produces target-vocabulary logits.
    '''
    def __init__(self, num_layers, d_model, num_heads, enc_max_length, dec_max_length, input_lang_vocab_size, 
                target_lang_vocab_size, role_vector_size) :
        '''
        parameters : num_layers, d_model, num_heads - forwarded to both Encoders and the Decoder
                     enc_max_length, input_lang_vocab_size - forwarded to both Encoders
                     dec_max_length, target_lang_vocab_size - forwarded to the Decoder
                     role_vector_size - output width of the role-vector Dense layer
        '''
        super(MTNet, self).__init__()
        self.sentence_level_encoder = Encoder(num_layers, d_model, num_heads, enc_max_length, input_lang_vocab_size)
        self.turn_level_encoder = Encoder(num_layers, d_model, num_heads, enc_max_length, input_lang_vocab_size)

        self.role_vector = layers.Dense(role_vector_size)

        self.decoder = Decoder(num_layers, d_model, num_heads, dec_max_length, target_lang_vocab_size)

        self.fully_connected_layer = layers.Dense(target_lang_vocab_size)

    def call(self, input_tensor, target_tensor, input_role_vector=None, turns=None) :
        '''
        parameters : input_tensor - source-side input for the sentence-level encoder
                     target_tensor - target-side input for the decoder
                     input_role_vector - optional input to the role-vector Dense layer
                     turns - per-example sequences of role indices; required whenever
                             input_role_vector is given (concat_role_vector needs it)
        returns : x - decoder output projected to target_lang_vocab_size
        raises : ValueError - if input_role_vector is given without turns
        '''
        x1 = self.sentence_level_encoder(input_tensor)

        if input_role_vector is not None :
            if turns is None :
                raise ValueError("turns must be provided together with input_role_vector.")
            role_vector = self.role_vector(input_role_vector)
            # NOTE(review): concat_role_vector returns a numpy array whose last axis
            # is wider than x1's (role dims are appended) - confirm the turn-level
            # encoder is meant to receive that width and that gradients to
            # self.role_vector are not required through this path.
            turn_level_input = concat_role_vector(x1, role_vector, turns)
        else :
            # Without role information the turn-level encoder sees x1 unchanged.
            turn_level_input = x1

        x2 = self.turn_level_encoder(turn_level_input)

        x = self.decoder(target_tensor, x1, x2)
        x = self.fully_connected_layer(x)

        return x

In [None]:
# Smoke test: build a small MTNet and push one random batch through it.
# enc_max_length, dec_max_length and the vocabulary sizes are expected to be
# defined by earlier cells of the notebook.
sample_mtnet = MTNet(
    num_layers=2, d_model=100, num_heads=10, enc_max_length=enc_max_length, dec_max_length=dec_max_length,
    input_lang_vocab_size=input_lang_vocab_size, target_lang_vocab_size=output_lang_vocab_size,
    role_vector_size=32)  # must be a keyword: a positional argument cannot follow keyword arguments

# Random token ids in [0, 200); assumes both vocabularies have at least 200 entries - TODO confirm.
temp_input = tf.random.uniform((64, enc_max_length), dtype=tf.int64, minval=0, maxval=200)
temp_target = tf.random.uniform((64, dec_max_length), dtype=tf.int64, minval=0, maxval=200)

# NOTE(review): no role vector is passed here; confirm this matches MTNet.call's signature.
fn_out = sample_mtnet(temp_input, temp_target)

fn_out.shape  # (batch_size, tar_seq_len, target_vocab_size)

In [None]:
# Adam optimizer used by train_step.
# NOTE(review): a constant learning rate of 0.1 is unusually high for Adam - confirm it is intended.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.1,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# Per-token cross-entropy on logits; reduction='none' keeps one loss value per
# position so that loss_function can apply its own mask before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none',
    from_logits=True,
)

In [None]:
def loss_function(real, pred):
    '''
    Mean of loss_object(real, pred) over the positions where `real` is non-zero.

    parameters : real - integer tensor of target ids; positions equal to 0 are ignored
                 pred - logits passed to loss_object
    returns : scalar tensor, sum of the kept per-position losses divided by their count
    '''
    per_position_loss = loss_object(real, pred)

    # 1.0 where the target is a real token, 0.0 where it equals 0.
    is_kept = tf.math.logical_not(tf.math.equal(real, 0))
    weights = tf.cast(is_kept, dtype=per_position_loss.dtype)

    masked_loss = per_position_loss * weights
    return tf.reduce_sum(masked_loss) / tf.reduce_sum(weights)

# Running means that train_step updates and the training loop reports.
train_loss = keras.metrics.Mean(name='train_loss')
train_accuracy = keras.metrics.Mean(name='train_accuracy')

def accuracy_function(real, pred):
    '''
    Fraction of positions with non-zero `real` whose argmax prediction equals `real`.

    parameters : real - integer tensor of target ids; positions equal to 0 are ignored
                 pred - tensor whose axis 2 holds the per-class scores
    returns : scalar float32 tensor
    '''
    is_kept = tf.math.logical_not(tf.math.equal(real, 0))
    is_correct = tf.equal(real, tf.argmax(pred, axis=2))

    # Count a hit only where the target is kept and the prediction matches.
    hits = tf.cast(tf.math.logical_and(is_kept, is_correct), dtype=tf.float32)
    kept = tf.cast(is_kept, dtype=tf.float32)
    return tf.reduce_sum(hits) / tf.reduce_sum(kept)

In [None]:
# Model instance used by train_step. The length and vocabulary-size variables
# are expected to be defined by earlier cells of the notebook.
mtnet = MTNet(
    num_layers=2,
    d_model=256,
    num_heads=8,
    enc_max_length=enc_max_length,
    dec_max_length=dec_max_length,
    input_lang_vocab_size=input_lang_vocab_size,
    target_lang_vocab_size=output_lang_vocab_size,
    role_vector_size=32,
)

In [None]:
EPOCHS = 20

# @tf.function trace-compiles train_step into a TF graph for faster execution.
# The input_signature below fixes both arguments (input and target) to int64
# tensors of shape (64, 30), so the function is traced for exactly that shape.
# NOTE(review): batches of any other shape (e.g. a smaller final batch) do not
# match this signature - confirm the dataset drops the remainder.
train_step_signature = [
    tf.TensorSpec(shape=(64, 30), dtype=tf.int64)
    for _ in range(2)
]

@tf.autograph.experimental.do_not_convert
@tf.function(input_signature=train_step_signature)
def train_step(input_tensor, target_tensor):
    '''
    Runs one optimization step on a batch and updates the running metrics.

    parameters : input_tensor - source batch passed to mtnet
                 target_tensor - target batch; all but its last column are fed to
                                 the model, all but its first column are the labels
    returns : None (updates mtnet's weights, train_loss and train_accuracy)
    '''
    # The model input drops the last token; the expected output drops the first,
    # so position t of the input is scored against token t + 1 of the target.
    decoder_input, expected_output = target_tensor[:, :-1], target_tensor[:, 1:]

    with tf.GradientTape() as grad_tape:
        predictions = mtnet(input_tensor, decoder_input)
        loss = loss_function(expected_output, predictions)

    weights = mtnet.trainable_variables
    optimizer.apply_gradients(zip(grad_tape.gradient(loss, weights), weights))

    train_loss(loss)
    train_accuracy(accuracy_function(expected_output, predictions))

In [None]:
# Training loop: `dataset` is expected to be defined by an earlier cell and to
# yield (input_tensor, target_tensor) batches.
for epoch in range(EPOCHS):
    start = time.time()

    # Restart the running averages so each epoch reports its own numbers.
    train_accuracy.reset_states()
    train_loss.reset_states()

    for batch, (input_tensor, target_tensor) in enumerate(dataset):
        train_step(input_tensor, target_tensor)

        # Progress report every 50 batches (including batch 0).
        if not batch % 50:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    # Checkpointing is currently disabled:
#     if (epoch + 1) % 5 == 0:
#         ckpt_save_path = ckpt_manager.save()
#         print(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')

    print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')