# ECE 595 Project - Unsupervised Multi-modal Neural Machine Translation

# Project is based on the paper below:

The model is composed of the following modules:
- 2 transformer encoders
- 2 transformer decoders
- 1 image encoder

This implementation will only look at the English to German translations.

In [150]:
import tensorflow as tf
import numpy as np
#import pandas as pd
import fasttext
import io
from tensorflow.keras.applications import ResNet152
from tensorflow.python.keras.preprocessing.text import Tokenizer

## Pre-Training and Preprocessing
The Multi30k and WMT News Crawl datasets were used in the paper. To prevent the model from seeing any paired sentence information, the Multi30k training and testing datasets were split in half, one for English, and the other for German. 

### Pre-training embeddings using fastText
Pre-training of the model was implemented following what was done in [reference 20 of paper]. The original paper used the first 10 million sentences from the 2007 - 2019 datasets for each language, but due to limited processing resources, this implementation only used the 2018 German and 2019 English datasets. The embeddings were generated using fasttext. The training was executed once and stored for later access.

Reference: https://github.com/facebookresearch/fastText/blob/master/python/README.md

In [67]:
# Pre-training the model using News Crawl 2018 and 2019 data
englishDataPath = './DataFiles/TRAIN_news.2019.en.shuffled.deduped'
germanDataPath  = './DataFiles/TRAIN_news.2018.de.shuffled.deduped'

# train model to get the attenuation inputs
# paper provided parameters: dimension = 512, window size = 5, negative samples = 10
#englishModel = fasttext.train_unsupervised(englishDataPath, model='skipgram', dim=512, ws=5, neg=10)
#englishModel.save_model("englishModel2019.bin")
#germanModel = fasttext.train_unsupervised(germanDataPath, model='skipgram', dim=512, ws=5, neg=10)
#germanModel.save_model("germanModel2018.bin")

englishModel = fasttext.load_model("./GeneratedFiles/englishModel2019.bin")
germanModel  = fasttext.load_model("./GeneratedFiles/germanModel2018.bin")



In [6]:
# create an embedding matrix that will be used to initialize the model
# will be storing all vectors associated with the traied words
enSize = len(englishModel.get_words())
deSize = len(germanModel.get_words())

enInitEmbeddings = englishModel.get_output_matrix()
deInitEmbeddings = germanModel.get_output_matrix()

print(len(enInitEmbeddings))

283843


In [103]:
# takes the input file and tokenizes the input sentences
def tokenizer(sentence):
    textTokenizer = Tokenizer()
    
    # Updates internal vocabulary based on the input.
    textTokenizer.fit_on_texts(sentence)
    
    # Transforms each sequence into a list of text.
    token = textTokenizer.texts_to_sequences(sentence)
    token = tf.keras.preprocessing.sequence.pad_sequences(token, padding='post')
    
    return token, textTokenizer

In [104]:
# takes in a text file and returns arrays with indexes into the tokens and token objects
def load_dataset(path):
    
    fileData = io.open(path, encoding='UTF-8').read().strip().split('\n')
    tokenInfo = []
    tokenArray = []
    
    token, textTokenizer = tokenizer(fileData)
    
#    token = tf.transpose(token)
#     tokenObjectArray = []
#     yvette = 0
    
#     for line in fileData:
#         yvette +=1
#         token, textTokenizer = tokenizer(line)
#         token = tf.transpose(token)
#         #tokenInfo.append([token, textTokenizer])
#         #tokenArray.append(token)
#         #tokenObjectArray.append(textTokenizer)
    
    return token, textTokenizer

In [133]:
enFilePath2007 = './DataFiles/en2007_reduced_DataCrawl.txt'
deFilePath2007 = './DataFiles/de2007_reduced_DataCrawl.txt'
#enSmall = './DataFiles/enSmall_2007_DataCrawl.txt'
#deSmall = './DataFiles/deSmall_2007_DataCrawl.txt'
enSmall = './DataFiles/test_2017_flickr.en'
deSmall = './DataFiles/test_2017_flickr.de'

# obtain the tokenized version of the input files
en2007_token, en2007_token_object = load_dataset(enSmall)
de2007_token, de2007_token_object = load_dataset(deSmall)

# print out stuff just to check valid inputs
print('EN Shape: ', en2007_token.shape)
print('DE Shape: ', de2007_token.shape)


EN Shape:  (1000, 41)
DE Shape:  (1000, 39)


## Model Implementation
The entire model is composed of the following transformer modules:
- English Encoder
- German Encoder
- Image Encoder
- English Decoder
- German Decoder

The transformer model taken from tensorflow was modified to account for the pretrained embeddings previously generated.

### ResNet-152 Encoder

In [153]:
imageModel = ResNet152()

In [159]:
img = tf.keras.preprocessing.image.load_img('./images/22439193117_4183a49ec2.jpg', target_size=(224, 224))
imgArray = tf.keras.preprocessing.image.img_to_array(img)
#imgBatch = np.expand_dim(imgArray, axis=0)
#imgPreProc = tf.keras.preprocessing.image.preprocess_input(image_batch, 
imgPreProc = preprocess_input(imgArray)

# x = preprocess(img)
y = imageModel.predict(imgPreProc)

NameError: name 'preprocess_input' is not defined

### Calculate Queries, Keys, and Values
The embeddings for each word are used to create the queries, keys, and values vectors that will be needed when calculating attention. These vectors are created by multiplying the embedding by three matricies that were trained during the training process. 

reference: https://medium.com/inside-machine-learning/what-is-a-transformer-d07dd1fbec04
Q - vector representation of one word in the sequence
K - vector representation of all words in the sequence
V - vector representation of all words in the sequence
Q is a matrix that contains the query (vector representation of one word in the sequence),
 K are all the keys (vector representations of all the words in the sequence) and V are the values, which are again the vector representations of all the words in the sequence. 
 
 $ Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{dk}}) V $
 
Transformer taken from Tensorflow help guides.
Referenced https://www.tensorflow.org/tutorials/text/transformer#top_of_page
 

In [137]:
# Populate parameters for the transformer
layers      = 4
dimension   = 512
dff         = 2018
numHeads    = 8
#enVocabSize = en2007_token.shape[1]
#deVocabSize = de2007_token.shape[1]
enVocabSize = 8500
deVocabSize = 8000

In [148]:
enTransformer = Transformer(
                        num_layers=layers, 
                        d_model=dimension, 
                        num_heads=numHeads, 
                        dff=dff, 
                        input_vocab_size=enVocabSize,
                        target_vocab_size=deVocabSize,
                        pe_input=10000, # tensorflow values used
                        pe_target=6000) # tensorflow values used
                        #initEmbeddings=enInitEmbeddings)

enOut, enWeights = enTransformer(en2007_token, de2007_token, training=False, 
                               enc_padding_mask=None, 
                               look_ahead_mask=None,
                               dec_padding_mask=None)

print(enOut.shape)  # (batch_size, tar_seq_len, target_vocab_size)
enTransformer.compile(optimizer=optimizer, loss=loss)

(1000, 39, 8000)


In [149]:
optimizer = tf.keras.optimizers.Adam() # Yvette - 'SGD' ...?
crossEntropyLoss = tf.keras.losses.SparseCategoricalCrossentropy()

# passing in the target language
def lossFcn(real, prediction):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    mask = tf.cast(mask, dtype=tf.int64)
    loss = crossEntropyLoss(real, prediction, sample_weight=mask)
    
    return loss

# YVETTE - what is the difference between tarLangIn and tarLangOut in our dataset...?
def trainStep(inLang, tarLangIn, tarLangOut):
    encoderPadMask, combinedPadMask, decoderPadMask = create_masks(inLang, tarLang)
    
    with tf.GradientTape() as tape: 
        prediction, predictionWeights = enTransformer(inLang, tarLang, training=True, 
                                                     enc_padding_mask=encoderPadMask, 
                                                     look_ahead_mask=combinedPadMask,
                                                     dec_padding_mask=decoderPadMask)
        
        loss = lossFcn(inLang, tarLangOut)
        
        gradients = tape.gradient(loss, enTransformer.trainable_variables)    
        optimizer.apply_gradients(zip(gradients, enTransformer.trainable_variables))

    return loss

def predict(sentence):
    # start with tokenizing the input to test
    textTokenizer = Tokenizer()
    inputSentence = texts_to_sequences(sentence)
    
    # pass through the transformer

## Tensorflow's Transformer Model
NOTE: FOLLOWING CODE IS NOT MINE!!! Came from the site below: 
https://www.tensorflow.org/tutorials/text/transformer

### Positional Encoding and Masks

In [69]:
def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates

def positional_encoding(position, d_model):
  angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)

  # apply sin to even indices in the array; 2i
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

  # apply cos to odd indices in the array; 2i+1
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

  pos_encoding = angle_rads[np.newaxis, ...]

  return tf.cast(pos_encoding, dtype=tf.float32)

In [141]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

  # add extra dimensions to add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask  # (seq_len, seq_len)

def create_masks(inp, tar):
  # Encoder padding mask
  enc_padding_mask = create_padding_mask(inp)

  # Used in the 2nd attention block in the decoder.
  # This padding mask is used to mask the encoder outputs.
  dec_padding_mask = create_padding_mask(inp)

  # Used in the 1st attention block in the decoder.
  # It is used to pad and mask future tokens in the input received by 
  # the decoder.
  look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
  dec_target_padding_mask = create_padding_mask(tar)
  combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

  return enc_padding_mask, combined_mask, dec_padding_mask

### Multi-Head Attenuation

In [126]:
def scaled_dot_product_attention(q, k, v, mask):
  """Calculate the attention weights.
  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type(padding or look ahead) 
  but it must be broadcastable for addition.

  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable 
          to (..., seq_len_q, seq_len_k). Defaults to None.

  Returns:
    output, attention_weights
  """

  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

  # scale matmul_qk
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # add the mask to the scaled tensor.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)  

  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads

    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]

    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)

    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

    # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = scaled_dot_product_attention(
        q, k, v, mask)

    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention, 
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

    return output, attention_weights

In [72]:
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  ])

### Encoder

In [75]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):

    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

    return out2

class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                            self.d_model)


    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):

    seq_len = tf.shape(x)[1]

    # adding embedding and position encoding.
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)

    return x  # (batch_size, input_seq_len, d_model)



### Decoder

In [76]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)


  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)

    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)

    attn2, attn_weights_block2 = self.mha2(
        enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

    ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

    return out3, attn_weights_block1, attn_weights_block2

class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):

    seq_len = tf.shape(x)[1]
    attention_weights = {}

    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                             look_ahead_mask, padding_mask)

      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

    # x.shape == (batch_size, target_seq_len, d_model)
    return x, attention_weights



### Transformer

In [143]:
class Transformer(tf.keras.Model):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               target_vocab_size, pe_input, pe_target, rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                           input_vocab_size, pe_input, rate)

    self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
                           target_vocab_size, pe_target, rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inp, tar, training, enc_padding_mask, 
           look_ahead_mask, dec_padding_mask):

    enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

    # dec_output.shape == (batch_size, tar_seq_len, d_model)
    dec_output, attention_weights = self.decoder(
        tar, enc_output, training, look_ahead_mask, dec_padding_mask)

    final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

    return final_output, attention_weights