In [48]:
# Mount Google Drive at /content/gdrive so the notebook can read and write
# files stored under MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [49]:
import random

# Seed Python's module-level RNG so shuffles done through `random` are repeatable.
random.seed(0)

def shuffle_list(list):
  """Return a shuffled copy of ``list``, leaving the input untouched.

  Args:
    list: The items to shuffle. Objects exposing ``.copy()`` (e.g. lists) are
      copied with it, exactly as before; any other iterable (tuple,
      generator, ...) is materialised into a new Python list. The parameter
      name shadows the builtin but is kept so existing callers still work.

  Returns:
    The copy, reordered in place by ``random.shuffle`` (so the order depends
    on the state of the module-level ``random`` generator).
  """
  copy_method = getattr(list, "copy", None)
  # Inputs without .copy() used to raise AttributeError; unpack them instead.
  shuffled = copy_method() if callable(copy_method) else [*list]
  random.shuffle(shuffled)

  return shuffled

In [50]:
from nltk import RegexpTokenizer

# Shared tokenizer. The pattern alternatives are, in order: word characters
# followed by an apostrophe, a run of word characters and hyphens, or a single
# character that is neither a word character nor whitespace.
toknizer = RegexpTokenizer(r'''\w+'|[\w-]+|[^\w\s]''')

def tokenize(sentence):
  """Split ``sentence`` into a list of token strings using ``toknizer``."""
  tokens = toknizer.tokenize(sentence)
  return tokens

In [51]:
def get_vocab(sequences):
  """Build a token -> integer id mapping from tokenized sequences.

  The ids 0, 1 and 2 are reserved for "<pad>", "<start>" and "<stop>". Every
  other token receives the next free id the first time it is encountered, so
  ids follow first-appearance order.

  Args:
    sequences: Iterable of token sequences.

  Returns:
    dict mapping each token to its id.
  """
  vocab = {"<pad>": 0, "<start>": 1, "<stop>": 2}

  for tokens in sequences:
    for token in tokens:
      # len(vocab) is evaluated before the insert, so it is the next free id;
      # tokens already present keep their existing id.
      vocab.setdefault(token, len(vocab))

  return vocab

In [52]:
def encode_with_vocab(sequences, vocab):
  """Replace every token in every sequence by its id from ``vocab``.

  Args:
    sequences: Iterable of token sequences.
    vocab: Mapping from token to id.

  Returns:
    A list with one list of ids per input sequence, in the same order.

  Raises:
    KeyError: If a token is missing from ``vocab``.
  """
  return [[vocab[token] for token in tokens] for tokens in sequences]

In [53]:
import tensorflow as tf
import numpy as np
import pandas as pd


def get_dataset(name):
  """Load a raw dataset and return it as a list of lines.

  Args:
    name: Dataset identifier. Only "fr-en" is supported.

  Returns:
    The file content split on "\n". If the file ends with a newline the last
    element is an empty string.

  Raises:
    SystemError: If ``name`` is not a known dataset (exception type kept for
      backward compatibility with existing callers).
  """
  if name != "fr-en":
    raise SystemError("Dataset not found")

  # The path is relative, so it is resolved against the current working
  # directory. The context manager closes the handle (it used to leak), and
  # the explicit encoding keeps accented characters intact regardless of the
  # platform default.
  with open("gdrive/MyDrive/Data/fr-en-train.txt", "r", encoding = "utf-8") as f:
    return f.read().split("\n")

def prepare_dataset(dataset, shuffle, lowercase, max_window_size):
  """Turn raw "en<TAB>fr<TAB>credits" lines into padded id sequences.

  The French field feeds the encoder and the English field feeds the decoder.

  Args:
    dataset: List of raw lines. Blank lines (such as the empty string left by
      splitting a newline-terminated file) are skipped.
    shuffle: If true, work on a shuffled copy produced by ``shuffle_list``.
    lowercase: If true, lowercase each line before tokenizing.
    max_window_size: Length every sequence is padded to with "<pad>".

  Returns:
    A 7-tuple ``(encoder_input, decoder_input, decoder_output, encoder_vocab,
    decoder_vocab, encoder_inverted_vocab, decoder_inverted_vocab)``:
      * encoder_input / decoder_input: lists of id lists built from
        "<start>" + tokens + "<stop>" followed by padding.
      * decoder_output: ids of tokens + "<stop>" followed by padding, with
        every id wrapped in a one-element list.
      * *_vocab: token -> id; *_inverted_vocab: id -> token.

  Raises:
    ValueError: If a non-blank line does not have exactly three tab-separated
      fields.
    SystemError: If any encoder or decoder input sequence is longer than
      ``max_window_size`` (the longest lengths are attached to the exception).
  """
  if shuffle:
    dataset = shuffle_list(dataset)

  source_sentences, target_sentences = [], []

  for line in dataset:
    # A trailing newline in the source file yields an empty last line, which
    # previously crashed the three-way unpacking below.
    if not line.strip():
      continue
    if lowercase:
      line = line.lower()
    en, fr, _credits = line.split("\t")

    source_sentences.append(tokenize(fr))
    target_sentences.append(tokenize(en))

  decoder_output = [tokens + ["<stop>"] for tokens in target_sentences]
  encoder_input = [["<start>"] + tokens + ["<stop>"] for tokens in source_sentences]
  decoder_input = [["<start>"] + tokens + ["<stop>"] for tokens in target_sentences]

  # default = 0 keeps an empty dataset from raising inside max().
  longest_source = max(map(len, encoder_input), default = 0)
  longest_target = max(map(len, decoder_input), default = 0)
  if longest_source > max_window_size or longest_target > max_window_size:
    raise SystemError("Maximum window size is too small", longest_source, longest_target)

  def pad(tokens):
    # Right-pad one sequence up to the window size.
    return tokens + ["<pad>"] * (max_window_size - len(tokens))

  encoder_input = [pad(tokens) for tokens in encoder_input]
  decoder_input = [pad(tokens) for tokens in decoder_input]
  decoder_output = [pad(tokens) for tokens in decoder_output]

  encoder_vocab = get_vocab(encoder_input)
  # decoder_input holds every token that occurs in decoder_output, so one
  # vocabulary covers both.
  decoder_vocab = get_vocab(decoder_input)

  encoder_inverted_vocab = { v: k for k, v in encoder_vocab.items() }
  decoder_inverted_vocab = { v: k for k, v in decoder_vocab.items() }

  encoder_input = encode_with_vocab(encoder_input, encoder_vocab)
  decoder_input = encode_with_vocab(decoder_input, decoder_vocab)
  decoder_output = encode_with_vocab(decoder_output, decoder_vocab)
  # Wrap every target id in its own list, adding a trailing axis of size 1.
  decoder_output = [[[token] for token in tokens] for tokens in decoder_output]

  return (encoder_input,
    decoder_input,
    decoder_output,
    encoder_vocab,
    decoder_vocab,
    encoder_inverted_vocab,
    decoder_inverted_vocab)

In [54]:
def mask_zero(x):
    """Return a float32 tensor holding 1.0 where ``x > 0`` and 0.0 elsewhere."""
    return tf.cast(tf.greater(x, 0), dtype = tf.float32)

class WordEmbedding(tf.keras.layers.Layer):
  """Token embedding whose output is zeroed wherever the input id is not > 0.

  Id 0 is the "<pad>" id assigned by ``get_vocab``, so padded positions come
  out as all-zero vectors.

  Args:
    vocab_size: Number of distinct token ids (rows of the embedding table).
    embedding_size: Size of each embedding vector.
  """

  def __init__(self, vocab_size, embedding_size, **kwargs):
    super().__init__(**kwargs)

    self.vocab_size = vocab_size
    self.embedding_size = embedding_size

  def get_config(self):
    """Return the base layer config extended with the constructor arguments."""
    config = super().get_config().copy()

    config.update({
      'vocab_size': self.vocab_size,
      'embedding_size': self.embedding_size
    })

    return config

  def build(self, input_shape):
    super().build(input_shape)

    self.word_embedding = tf.keras.layers.Embedding(
      self.vocab_size,
      self.embedding_size
    )

  def call(self, x):
    """Embed ``x`` and zero the vectors of positions whose id is not > 0.

    Args:
      x: Tensor of token ids; assumed to be (batch, seq_len), which is what
        the model inputs in this notebook provide.

    Returns:
      The embeddings multiplied by the 0/1 mask from ``mask_zero``.
    """
    word_embedding = self.word_embedding(x)

    # Plain tensor ops replace the Lambda/Reshape/Multiply layers that used to
    # be re-created on every call; the trailing axis of size 1 lets the mask
    # broadcast over the embedding dimension.
    masks = tf.expand_dims(mask_zero(x), axis = -1)

    return word_embedding * masks


In [55]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head scaled dot-product attention with an optional causal mask.

  Args:
    embedding_size: Width of the query/key/value/output projections. Must be
      divisible by ``nb_head``.
    nb_head: Number of attention heads.

  Raises:
    SystemError: If ``embedding_size`` is not divisible by ``nb_head``.
  """

  def __init__(self, embedding_size, nb_head, **kwargs):
    super().__init__(**kwargs)

    if not embedding_size % nb_head == 0:
      raise SystemError("Embedding_size should be divisible by number of heads")

    self.embedding_size = embedding_size
    self.nb_head = nb_head
    self.head_dim = embedding_size // nb_head

  def get_config(self):
    """Return the base layer config extended with the constructor arguments."""
    config = super().get_config().copy()

    config.update({
      'embedding_size': self.embedding_size,
      'nb_head': self.nb_head
    })

    return config

  def build(self, input_shape):
    super().build(input_shape)

    self.query_layer = tf.keras.layers.Dense(self.embedding_size)
    self.value_layer = tf.keras.layers.Dense(self.embedding_size)
    self.key_layer = tf.keras.layers.Dense(self.embedding_size)
    self.out_proj = tf.keras.layers.Dense(self.embedding_size)

  def call(self, x, mask = False):
    """Attend from the queries to the keys/values.

    Args:
      x: Tuple ``(Q_input, K_input, V_input)`` of rank-3 tensors laid out as
        (batch, seq_len, features).
      mask: If true, a lower-triangular mask stops each query position from
        attending to later key positions.

    Returns:
      Tensor of shape (batch, Q_seq_len, embedding_size).
    """
    Q_input, K_input, V_input = x

    Q = self.query_layer(Q_input)
    K = self.key_layer(K_input)
    V = self.value_layer(V_input)

    if self.nb_head > 1:
      batch_size = tf.shape(Q)[0]
      Q_seq_len = tf.shape(Q)[1]
      K_seq_len = tf.shape(K)[1]
      V_seq_len = tf.shape(V)[1]

      # Split the feature axis into heads ...
      Q = tf.reshape(Q, [batch_size, Q_seq_len, self.nb_head, self.head_dim])
      K = tf.reshape(K, [batch_size, K_seq_len, self.nb_head, self.head_dim])
      V = tf.reshape(V, [batch_size, V_seq_len, self.nb_head, self.head_dim])

      # ... move the head axis next to the batch axis ...
      Q = tf.transpose(Q, [0, 2, 1, 3])
      K = tf.transpose(K, [0, 2, 1, 3])
      V = tf.transpose(V, [0, 2, 1, 3])

      # ... and fold heads into the batch so one matmul covers every head.
      Q = tf.reshape(Q, [batch_size * self.nb_head, Q_seq_len, self.head_dim])
      K = tf.reshape(K, [batch_size * self.nb_head, K_seq_len, self.head_dim])
      V = tf.reshape(V, [batch_size * self.nb_head, V_seq_len, self.head_dim])

    dot_product = tf.matmul(Q, K, transpose_b = True)
    # Scale by sqrt(d_k), the per-head key size. The previous code divided by
    # sqrt(embedding_size), which over-shrinks the logits when nb_head > 1
    # (the two are identical for a single head).
    # NOTE: weights trained with the old scaling behave differently here.
    scaled_dot_product = dot_product / tf.math.sqrt(float(self.head_dim))

    if mask:
      # Lower-triangular ones: 1 where the key position <= the query position.
      diag_vals = tf.ones_like(scaled_dot_product[0, :, :])
      tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
      future_masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(scaled_dot_product)[0], 1, 1])
      padding_num = -float("Inf")
      paddings = tf.ones_like(future_masks) * padding_num

      # -inf logits become exactly 0 after the softmax.
      scaled_dot_product = tf.where(tf.equal(future_masks, 0), paddings, scaled_dot_product)

    softmax_product = tf.nn.softmax(scaled_dot_product, axis = -1)
    attention = tf.matmul(softmax_product, V)

    if self.nb_head > 1:
      # Undo the head folding: (batch * heads, seq, head_dim) -> (batch, seq, embedding_size).
      attention = tf.reshape(
        attention, [batch_size, self.nb_head, Q_seq_len, self.head_dim]
      )

      attention = tf.transpose(attention, [0, 2, 1, 3])

      attention = tf.reshape(
        attention, [batch_size, Q_seq_len, self.nb_head * self.head_dim]
      )

    out_attention = self.out_proj(attention)

    return out_attention

In [56]:
class PositionalEmbedding(tf.keras.layers.Layer):
  """Adds sinusoidal position encodings to its input.

  For position ``p`` and pair index ``i`` the encoding interleaves
  ``sin(p / 10000^(2i / d))`` and ``cos(p / 10000^(2i / d))``, where ``d`` is
  the size of the last input axis.
  """

  def __init__(self, **kwargs):
    super().__init__(**kwargs)

  def build(self, input_shape):
    super().build(input_shape)

  def call(self, x):
    """Return ``x`` plus the position encoding for each sequence position.

    Args:
      x: Rank-3 tensor read as (batch, seq_len, output_dim). The final
        reshape needs ``output_dim`` to be even.
    """
    dims = tf.shape(x)
    batch_size, seq_len, output_dim = dims[0], dims[1], dims[2]

    # Positions 0..seq_len-1 repeated for every batch row, as a float column.
    positions = tf.keras.backend.arange(0, seq_len)
    positions = tf.tile(tf.expand_dims(positions, axis = 0), [batch_size, 1])
    positions = tf.expand_dims(tf.keras.backend.cast(positions, tf.float32), -1)

    depth = tf.cast(output_dim, dtype = tf.float32)
    pair_index = tf.keras.backend.arange(0, output_dim // 2)
    evens = pair_index * 2
    odds = pair_index * 2 + 1

    def inverse_frequencies(exponents):
      # Row vector 1 / 10000^(exponent / d), one entry per sin/cos pair.
      scaled = tf.cast(exponents, dtype = tf.float32) / depth
      return tf.expand_dims(1.0 / tf.pow(10000.0, scaled), 0)

    # The cosine channel reuses the frequency of the even channel before it.
    sines = tf.sin(tf.keras.backend.dot(positions, inverse_frequencies(evens)))
    cosines = tf.cos(tf.keras.backend.dot(positions, inverse_frequencies(odds - 1)))

    # Stacking on a new last axis and flattening interleaves sin and cos.
    interleaved = tf.stack([sines, cosines], axis = -1)
    interleaved = tf.reshape(interleaved, [-1, seq_len, output_dim])

    return interleaved + x

In [57]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention then a position-wise feed-forward net.

  Each sub-layer is wrapped as ``LayerNorm(sublayer(input) + input)``.

  Args:
    embedding_size: Model width (input and output feature size).
    dense_layer_size: Hidden width of the feed-forward network.
    nb_head: Number of attention heads.
  """

  def __init__(self, embedding_size, dense_layer_size, nb_head, **kwargs):
    super().__init__(**kwargs)

    self.embedding_size = embedding_size
    self.dense_layer_size = dense_layer_size
    self.nb_head = nb_head

  def get_config(self):
    """Return the base layer config extended with the constructor arguments."""
    config = super().get_config().copy()

    config.update({
      'embedding_size': self.embedding_size,
      'dense_layer_size': self.dense_layer_size,
      'nb_head': self.nb_head
    })

    return config

  def build(self, input_shape):
    super().build(input_shape)

    self.attention = MultiHeadAttention(self.embedding_size, self.nb_head)
    self.norm_1 = tf.keras.layers.LayerNormalization()
    self.norm_2 = tf.keras.layers.LayerNormalization()
    # ReLU between the two projections: without a non-linearity the two Dense
    # layers collapse into a single affine map.
    self.dense_1 = tf.keras.layers.Dense(self.dense_layer_size, activation = "relu")
    self.dense_2 = tf.keras.layers.Dense(self.embedding_size)

  def call(self, x):
    """Run the block on ``x`` and return a tensor of the same shape."""
    attention = self.attention((x, x, x))
    post_attention = self.norm_1(attention + x)

    dense_out = self.dense_1(post_attention)
    dense_out = self.dense_2(dense_out)

    # The residual wraps the feed-forward sub-layer, so it must add that
    # sub-layer's own input (post_attention), not the block input x. This now
    # matches DecoderLayer.
    enc_output = self.norm_2(dense_out + post_attention)

    return enc_output

In [58]:
class Encoder(tf.keras.layers.Layer):
  """Stack of ``nb_encoder`` EncoderLayer blocks applied one after another.

  Args:
    nb_encoder: Number of EncoderLayer blocks.
    embedding_size: Model width passed to every block.
    dense_layer_size: Feed-forward hidden width passed to every block.
    nb_head: Number of attention heads passed to every block.
  """

  def __init__(self, nb_encoder, embedding_size, dense_layer_size, nb_head, **kwargs):
    super().__init__(**kwargs)

    self.nb_encoder = nb_encoder
    self.embedding_size = embedding_size
    self.dense_layer_size = dense_layer_size
    self.nb_head = nb_head
    # Filled in build().
    self.encoder_layers = []

  def get_config(self):
    """Return the base layer config extended with the constructor arguments."""
    return {
      **super().get_config(),
      'nb_encoder': self.nb_encoder,
      'embedding_size': self.embedding_size,
      'dense_layer_size': self.dense_layer_size,
      'nb_head': self.nb_head
    }

  def build(self, input_shape):
    super().build(input_shape)

    for _ in range(self.nb_encoder):
      block = EncoderLayer(self.embedding_size, self.dense_layer_size, self.nb_head)
      self.encoder_layers.append(block)

  def call(self, x):
    """Feed ``x`` through every block in order and return the final output."""
    hidden = x
    for block in self.encoder_layers:
      hidden = block(hidden)
    return hidden

In [59]:
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block.

  Runs causal self-attention, attention over the encoder output, and a
  position-wise feed-forward net. Each sub-layer is wrapped as
  ``LayerNorm(sublayer(input) + input)``.

  Args:
    embedding_size: Model width (input and output feature size).
    dense_layer_size: Hidden width of the feed-forward network.
    nb_head: Number of attention heads.
  """

  def __init__(self, embedding_size, dense_layer_size, nb_head, **kwargs):
    super().__init__(**kwargs)

    self.embedding_size = embedding_size
    self.dense_layer_size = dense_layer_size
    self.nb_head = nb_head

  def get_config(self):
    """Return the base layer config extended with the constructor arguments."""
    config = super().get_config().copy()

    config.update({
      'embedding_size': self.embedding_size,
      'dense_layer_size': self.dense_layer_size,
      'nb_head': self.nb_head
    })

    return config

  def build(self, input_shape):
    super().build(input_shape)

    self.attention_1 = MultiHeadAttention(self.embedding_size, self.nb_head)
    self.attention_2 = MultiHeadAttention(self.embedding_size, self.nb_head)
    self.norm_1 = tf.keras.layers.LayerNormalization()
    self.norm_2 = tf.keras.layers.LayerNormalization()
    self.norm_3 = tf.keras.layers.LayerNormalization()
    # ReLU between the two projections: without a non-linearity the two Dense
    # layers collapse into a single affine map.
    self.dense_1 = tf.keras.layers.Dense(self.dense_layer_size, activation = "relu")
    self.dense_2 = tf.keras.layers.Dense(self.embedding_size)

  def call(self, x):
    """Run the block.

    Args:
      x: Tuple ``(output_embedding, encoder_output)``: the decoder-side
        sequence and the encoder result it attends to.

    Returns:
      Tensor with the same shape as ``output_embedding``.
    """
    output_embedding, encoder_output = x

    # mask = True hides future positions from each query position.
    self_attention = self.attention_1((output_embedding, output_embedding, output_embedding), mask = True)
    post_self_attention = self.norm_1(self_attention + output_embedding)

    # Queries come from the decoder, keys and values from the encoder.
    decoder_attention = self.attention_2((post_self_attention, encoder_output, encoder_output))
    post_decoder_attention = self.norm_2(decoder_attention + post_self_attention)

    dense_out = self.dense_1(post_decoder_attention)
    dense_out = self.dense_2(dense_out)

    decoder_output = self.norm_3(dense_out + post_decoder_attention)

    return decoder_output

In [60]:
class Decoder(tf.keras.layers.Layer):
  """Stack of ``nb_decoder`` DecoderLayer blocks applied one after another.

  Args:
    nb_decoder: Number of DecoderLayer blocks.
    embedding_size: Model width passed to every block.
    dense_layer_size: Feed-forward hidden width passed to every block.
    nb_head: Number of attention heads passed to every block (default 1).
  """

  def __init__(self, nb_decoder, embedding_size, dense_layer_size, nb_head = 1, **kwargs):
    super().__init__(**kwargs)

    self.nb_decoder = nb_decoder
    self.embedding_size = embedding_size
    self.dense_layer_size = dense_layer_size
    # Filled in build().
    self.decoder_layers = []
    self.nb_head = nb_head

  def get_config(self):
    """Return the base layer config extended with the constructor arguments."""
    return {
      **super().get_config(),
      'nb_decoder': self.nb_decoder,
      'embedding_size': self.embedding_size,
      'dense_layer_size': self.dense_layer_size,
      'nb_head': self.nb_head
    }

  def build(self, input_shape):
    super().build(input_shape)

    for _ in range(self.nb_decoder):
      block = DecoderLayer(self.embedding_size, self.dense_layer_size, self.nb_head)
      self.decoder_layers.append(block)

  def call(self, x):
    """Run the stack.

    Args:
      x: Tuple ``(output_embedding, encoder_output)``. Every block receives
        the same ``encoder_output`` and the previous block's result.

    Returns:
      The output of the last block (``output_embedding`` itself when the
      stack is empty).
    """
    hidden, encoder_output = x

    for block in self.decoder_layers:
      hidden = block((hidden, encoder_output))

    return hidden

In [61]:
def get_model(
    EMBEDDING_SIZE = 64,
    DENSE_LAYER_SIZE = 128,
    ENCODER_VOCAB_SIZE = 12,
    DECODER_VOCAB_SIZE = 12,
    ENCODER_LAYERS = 1,
    DECODER_LAYERS = 1,
    NUMBER_HEADS = 1
  ):
  """Assemble the encoder-decoder Transformer as a Keras functional model.

  Args:
    EMBEDDING_SIZE: Model width shared by embeddings, encoder and decoder.
    DENSE_LAYER_SIZE: Hidden width of the feed-forward sub-layers.
    ENCODER_VOCAB_SIZE: Number of source token ids.
    DECODER_VOCAB_SIZE: Number of target token ids (also the output width).
    ENCODER_LAYERS: Number of encoder blocks.
    DECODER_LAYERS: Number of decoder blocks.
    NUMBER_HEADS: Attention heads per block.

  Returns:
    An uncompiled ``tf.keras.Model`` taking ``[encoder_ids, decoder_ids]``
    (both variable-length) and returning a softmax over the decoder
    vocabulary for each decoder position.
  """
  source_ids = tf.keras.Input(shape = (None,), name = "Encoder-Input")
  target_ids = tf.keras.Input(shape = (None,), name = "Decoder-Input")

  # Token embeddings (padding ids zeroed) for both sides ...
  source_word_layer = WordEmbedding(ENCODER_VOCAB_SIZE, EMBEDDING_SIZE, name = "Encoder-Word-Embedding")
  source_features = source_word_layer(source_ids)
  target_word_layer = WordEmbedding(DECODER_VOCAB_SIZE, EMBEDDING_SIZE, name = "Decoder-Word-Embedding")
  target_features = target_word_layer(target_ids)

  # ... then position information is added on top.
  source_features = PositionalEmbedding(name = "Encoder-Positional-Embedding")(source_features)
  target_features = PositionalEmbedding(name = "Decoder-Positional-Embedding")(target_features)

  encoder_stack = Encoder(ENCODER_LAYERS, EMBEDDING_SIZE, DENSE_LAYER_SIZE, NUMBER_HEADS, name = "Encoder")
  encoded = encoder_stack(source_features)
  decoder_stack = Decoder(DECODER_LAYERS, EMBEDDING_SIZE, DENSE_LAYER_SIZE, NUMBER_HEADS, name = "Decoder")
  decoded = decoder_stack((target_features, encoded))

  token_probabilities = tf.keras.layers.Dense(
    DECODER_VOCAB_SIZE, activation = "softmax", name = "Decoder-Output"
  )(decoded)

  return tf.keras.Model([source_ids, target_ids], token_probabilities, name = "Transformer-Model")

In [62]:
def make_translate(model, encoder_vocab, decoder_vocab, decoder_inverted_vocab, max_window_size = 10):
  """Build a greedy-decoding ``translate(sentence)`` function around ``model``.

  Args:
    model: Object whose ``predict([encoder_ids, decoder_ids])`` returns
      per-position scores over the decoder vocabulary.
    encoder_vocab: Source token -> id mapping.
    decoder_vocab: Target token -> id mapping; must contain "<stop>".
    decoder_inverted_vocab: Target id -> token mapping.
    max_window_size: Decoding stops after ``max_window_size + 2`` generated
      tokens even if "<stop>" has not been produced.

  Returns:
    A function that prints the original sentence and its translation.
  """
  # 1 is the "<start>" id assigned by get_vocab; it is used as a fallback.
  start_id = decoder_vocab.get('<start>', 1)
  stop_id = decoder_vocab['<stop>']

  def translate(sentence):
    """Translate ``sentence`` and print the result.

    Raises:
      KeyError: If the sentence contains a token missing from
        ``encoder_vocab``.
    """
    # prepare_dataset feeds the encoder "<start>" + tokens + "<stop>", so the
    # same framing is applied here (the "<start>" token used to be missing).
    # NOTE(review): the single trailing "<pad>" is kept from the original code.
    source_tokens = ['<start>'] + tokenize(sentence) + ['<stop>', '<pad>']
    tr_input = [encoder_vocab[token] for token in source_tokens]

    prediction = [[start_id]]
    i = 0

    # Compare ids by value: `is not` only worked through CPython's caching of
    # small integers.
    while int(prediction[0][-1]) != stop_id and i < max_window_size + 2:
      prediction_auto = model.predict([np.array([tr_input]), np.array(prediction)])
      # Position i is the last decoder position fed in; keep its best token.
      prediction[0].append(int(tf.argmax(prediction_auto[0][i], axis = -1).numpy()))
      i += 1

    # Drop the leading "<start>", and the trailing token only when it really
    # is "<stop>" (a length-capped decode used to lose its last word).
    generated = prediction[0][1:]
    if generated and generated[-1] == stop_id:
      generated = generated[:-1]

    print('Original: {}'.format(sentence))
    print('Traduction: {}'.format(' '.join(map(lambda x: decoder_inverted_vocab[x], generated))))

  return translate

In [63]:
# Training cell: load the fr-en data, build the Transformer, and fit it with
# checkpointing, TensorBoard logging and early stopping.
dataset = get_dataset("fr-en")

print("Dataset loaded. Length:", len(dataset), "lines")

# Only the first 100,000 lines are used for training.
train_dataset = dataset[0:100000]

print("Train data loaded. Length:", len(train_dataset), "lines")

# Every sequence is padded to max_window_size (200 here); prepare_dataset
# raises SystemError if any framed sentence is longer than that.
(encoder_input,
decoder_input,
decoder_output,
encoder_vocab,
decoder_vocab,
encoder_inverted_vocab,
decoder_inverted_vocab) = prepare_dataset(
  train_dataset,
  shuffle = False,
  lowercase = True,
  max_window_size = 200
)

# Vocabulary sizes come from the data; the other values are fixed hyperparameters.
transformer_model = get_model(
  EMBEDDING_SIZE = 64,
  ENCODER_VOCAB_SIZE = len(encoder_vocab),
  DECODER_VOCAB_SIZE = len(decoder_vocab),
  ENCODER_LAYERS = 2,
  DECODER_LAYERS = 2,
  NUMBER_HEADS = 4,
  DENSE_LAYER_SIZE = 128
)

# The targets are integer ids, hence the sparse cross-entropy loss.
transformer_model.compile(
  optimizer = "adam",
  loss = [
    "sparse_categorical_crossentropy"
  ],
  metrics = [
    "accuracy"
  ]
)

transformer_model.summary()

# Model inputs are [encoder ids, decoder ids]; y holds the target ids with the
# trailing size-1 axis added by prepare_dataset.
x = [np.array(encoder_input), np.array(decoder_input)]
y = np.array(decoder_output)

name = "transformer"
# The placeholders are filled by ModelCheckpoint with the epoch number and metrics.
checkpoint_filepath = "./gdrive/MyDrive/transformer_ep-{epoch:02d}_loss-{loss:.2f}_acc-{accuracy:.2f}.ckpt"

tensorboard_callback = tf.keras.callbacks.TensorBoard(
  log_dir = "logs/{}".format(name)
)

# Saves weights only, and only when val_accuracy improves.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
  filepath = checkpoint_filepath,
  monitor = "val_accuracy",
  mode = "max",
  save_weights_only = True,
  save_best_only = True,
  verbose = True
)

# Stops once val_accuracy has failed to improve by at least 0.001 for 2 epochs.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
  monitor = "val_accuracy",
  mode = "max",
  patience = 2,
  min_delta = 0.001,
  verbose = True
)

# validation_split holds out 10% of the samples; that is what produces the
# val_accuracy metric monitored by the two callbacks above.
transformer_model.fit(
  x,
  y,
  epochs = 15,
  batch_size = 32,
  validation_split = 0.1,
  callbacks=[
    model_checkpoint_callback,
    tensorboard_callback,
    early_stopping_callback
  ]
)

Dataset loaded. Length: 185583 lines
Train data loaded. Length: 100000 lines
Model: "Transformer-Model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Encoder-Input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 Decoder-Input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 Encoder-Word-Embedding (WordEm  (None, None, 64)    1110016     ['Encoder-Input[0][0]']          
 bedding)                                                                                         
                                                                                                  
 Deco

KeyboardInterrupt: ignored

In [None]:
# Inference cell: rebuild the vocabularies and the model, load saved weights
# and print a few sample translations.
# Relies on `dataset` having been loaded by an earlier cell.
train_dataset = dataset[0:100000]

print("Train data loaded. Length:", len(train_dataset), "lines")

# The vocabularies must match the ones the weights were trained with, so the
# same slice and lowercase setting are used here.
# NOTE(review): max_window_size is 20 here but 200 in the training cell above.
# It does not affect the vocabularies, but 20 raises SystemError if any framed
# sentence is longer than 20 tokens.
(encoder_input,
decoder_input,
decoder_output,
encoder_vocab,
decoder_vocab,
encoder_inverted_vocab,
decoder_inverted_vocab) = prepare_dataset(
  train_dataset,
  shuffle = False,
  lowercase = True,
  max_window_size = 20
)

# Same architecture arguments as the training cell.
transformer_model = get_model(
  EMBEDDING_SIZE = 64,
  ENCODER_VOCAB_SIZE = len(encoder_vocab),
  DECODER_VOCAB_SIZE = len(decoder_vocab),
  ENCODER_LAYERS = 2,
  DECODER_LAYERS = 2,
  NUMBER_HEADS = 4,
  DENSE_LAYER_SIZE = 128
)

transformer_model.summary()

# NOTE(review): this file name does not follow the checkpoint_filepath pattern
# used by the training cell ("transformer_ep-{epoch:02d}_loss-..._acc-....ckpt");
# confirm the file exists on Drive.
transformer_model.load_weights('./gdrive/MyDrive/transformer_ep-0.ckpt')

# make_translate is called with its default max_window_size of 10.
translate = make_translate(transformer_model, encoder_vocab, decoder_vocab, decoder_inverted_vocab)

# Sample sentences; a word missing from encoder_vocab raises KeyError in translate.
translate("c'est une belle journée .")
translate("j'aime manger du gâteau .")
translate("c'est une bonne chose .")
translate("il faut faire à manger pour nourrir les gens .")
translate("tom a acheté un nouveau vélo .")

In [None]:
import time
# Measure the wall-clock time of a single translate() call on a long input.
# Relies on `translate` having been created by the previous cell.
start_time = time.time()

translate("est a bon exemple est a bon exemple est a bon exemple est a bon exemple est a bon exemple. est a bon exemple est a bon exemple est a bon exemple est a bon exemple est a bon exemple.")

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
from tensorboard.plugins.hparams import api as hp
# Hyperparameter-search setup: a small 150-line sample, a 100/50 train/test
# split, and the declaration of the search space for the HParams dashboard.
dataset = get_dataset("fr-en")

train_dataset = dataset[0:150]

# shuffle = True reorders the 150 lines (through the seeded `random` module)
# before they are split below.
(encoder_input,
decoder_input,
decoder_output,
encoder_vocab,
decoder_vocab,
encoder_inverted_vocab,
decoder_inverted_vocab) = prepare_dataset(
  train_dataset,
  shuffle = True,
  lowercase = True,
  max_window_size = 20
)

# First 100 examples for training ...
x_train = [np.array(encoder_input[0:100]), np.array(decoder_input[0:100])]
y_train = np.array(decoder_output[0:100])

# ... and the remaining 50 for evaluation.
x_test = [np.array(encoder_input[100:150]), np.array(decoder_input[100:150])]
y_test = np.array(decoder_output[100:150])

# Search space: two candidate values per hyperparameter.
BATCH_SIZE = hp.HParam("batch_num", hp.Discrete([32, 16]))
DENSE_NUM = hp.HParam("dense_num", hp.Discrete([512, 256]))
HEAD_NUM = hp.HParam("head_num", hp.Discrete([8, 4]))
EMBED_NUM = hp.HParam("embed_num", hp.Discrete([512, 256]))
LAYER_NUM = hp.HParam("layer_num", hp.Discrete([6, 4]))

# Register the hyperparameters and the reported metric in the log directory.
with tf.summary.create_file_writer("logs/hparam_tuning").as_default():
  hp.hparams_config(
    hparams=[LAYER_NUM, HEAD_NUM, EMBED_NUM, DENSE_NUM, BATCH_SIZE],
    metrics=[
      hp.Metric("val_accuracy")
    ],
  )

def train_test_model(hparams):
  """Train a fresh model for one epoch and return its test accuracy.

  Args:
    hparams: Mapping keyed by the module-level HParam objects (EMBED_NUM,
      LAYER_NUM, HEAD_NUM, DENSE_NUM, BATCH_SIZE).

  Returns:
    The accuracy reported by ``evaluate`` on ``x_test`` / ``y_test``.
  """
  candidate = get_model(
    EMBEDDING_SIZE = hparams[EMBED_NUM],
    ENCODER_VOCAB_SIZE = len(encoder_vocab),
    DECODER_VOCAB_SIZE = len(decoder_vocab),
    # The same depth is used for the encoder and the decoder.
    ENCODER_LAYERS = hparams[LAYER_NUM],
    DECODER_LAYERS = hparams[LAYER_NUM],
    NUMBER_HEADS = hparams[HEAD_NUM],
    DENSE_LAYER_SIZE = hparams[DENSE_NUM]
  )

  candidate.compile(
    optimizer = "adam",
    loss = ["sparse_categorical_crossentropy"],
    metrics = ["accuracy"]
  )

  candidate.fit(x_train, y_train, epochs = 1, batch_size = hparams[BATCH_SIZE])

  # evaluate() returns [loss, accuracy] for the single metric compiled above.
  _loss, accuracy = candidate.evaluate(x_test, y_test)

  return accuracy

def run(run_dir, hparams):
  """Run one trial and log its hyperparameters and accuracy under ``run_dir``.

  Args:
    run_dir: Directory the summary writer logs into.
    hparams: Hyperparameter mapping passed on to ``train_test_model``.
  """
  writer = tf.summary.create_file_writer(run_dir)
  with writer.as_default():
    # Record the values used by this trial, then the resulting score.
    hp.hparams(hparams)
    tf.summary.scalar("val_accuracy", train_test_model(hparams), step = 1)

# Exhaustive grid search: one trial per combination of the five hyperparameter
# domains (two values each, so 32 trials), each logged to its own run directory.
session_num = 0

for batch_num in BATCH_SIZE.domain.values:
  for dense_num in DENSE_NUM.domain.values:
    for num_heads in HEAD_NUM.domain.values:
      for num_embed in EMBED_NUM.domain.values:
        # num_units is the number of encoder/decoder layers (LAYER_NUM).
        for num_units in LAYER_NUM.domain.values:
          hparams = {
              BATCH_SIZE: batch_num,
              DENSE_NUM: dense_num,
              HEAD_NUM: num_heads,
              EMBED_NUM: num_embed,
              LAYER_NUM: num_units
          }
          run_name = "run-%d" % session_num

          print("--- Starting trial: %s" % run_name)
          # Print the trial's values keyed by hyperparameter name.
          print({ h.name: hparams[h] for h in hparams })

          run("logs/hparam_tuning/" + run_name, hparams)

          session_num += 1
