# Transformer Pointer Generator Model for Question Answering

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow_datasets as tfds
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt
import re

In [0]:
from sklearn.model_selection import train_test_split
from importlib import reload

# Global hyper-parameters shared by the input pipeline and the model cells.

# Presumably the shuffle-buffer size; it is not referenced in the visible
# part of the notebook -- TODO confirm where it is used.
BUFFER_SIZE = 20000
# Examples per batch: default `batch_size` of batch_generator and the fixed
# leading dimension of the string inputs in `embed_input_sig`.
BATCH_SIZE = 64
# Width of the model's hidden representations (passed to every layer below).
d_model = 1024 #300 for GloVe embeddings
# The vocabulary-collection loop stops once the token set grows past this size.
MAX_VOCAB_SIZE = 2**14
# Default caps, in whitespace-separated words, used by batch_generator for the
# question, the context and the decoder (answer) sequence respectively.
MAX_Q_LEN = 30
MAX_C_LEN = 500
MAX_A_LEN = 8

### File Imports

In [0]:
# !git clone https://github.com/ahathaway821/w266-gutenberg-quiz
# !mv /content/w266-gutenberg-quiz /content/w266_gutenberg_quiz
#!git -C ./w266_gutenberg_quiz reset --hard
#!git -C ./w266_gutenberg_quiz pull origin master

# from google.colab import drive
# drive.mount('/content/drive')

In [0]:
# from google.colab import auth
# auth.authenticate_user()
# project_id ='ace-element-251203'
# !gcloud config set project {project_id}
# !gsutil cp gs://gutenberg-qa-train-data/train.data /content/mini_train.data
# !gsutil cp gs://gutenberg-qa-train-data/train.data /content/train.data
# !gsutil cp gs://gutenberg-qa-train-data/train.data /content/valid.data
# !gsutil cp gs://gutenberg-qa-train-data/train.data /content/test.data

In [0]:
#from w266_gutenberg_quiz.library.models.seq2seq_att import Seq2SeqAtt
#qa = Seq2SeqAtt()
#qa.load_glove_model('w266_gutenberg_quiz/embeddings')
#qa.load_model(model_dir_path='w266_gutenberg_quiz/experiments/models')

In [0]:
# import os

# GLOVE_DIR ='/content/w266_gutenberg_quiz/embeddings'
# #d_model = 300

# #vocabulary_set = set()
# embeddings_index = {}
# i = 0
# with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt')) as f:
#     for line in f:
#         word, coefs = line.split(maxsplit=1)
#         coefs = np.fromstring(coefs, 'f', sep=' ')
#         embeddings_index[word] = coefs
#         # if i < MAX_VOCAB_SIZE:
#         #   vocabulary_set.add(word)
#         i = i + 1
# print('Found %s word vectors.' % len(embeddings_index))

## Setup input pipeline

In [0]:
from w266_gutenberg_quiz.library.utility.squad_v3 import SquADDataSetV3

# Load the SQuAD v1.1 training file through the project's dataset wrapper.
data_set = SquADDataSetV3(data_path="w266_gutenberg_quiz/data/SQuAD/train-v1.1.json")

# Hold out 20% of the (question, context, answer) triples for validation.
# The fixed seed keeps the split identical between runs.
test_size=.2
random_state=42
(question_train, question_val,
 context_train, context_val,
 answer_train, answer_val) = train_test_split(
    data_set.questions,
    data_set.contexts,
    data_set.answers,
    test_size=test_size,
    random_state=random_state)

# Wrap both splits as tf.data datasets of aligned string triples.
train_examples = tf.data.Dataset.from_tensor_slices(
    (question_train, context_train, answer_train))
val_examples = tf.data.Dataset.from_tensor_slices(
    (question_val, context_val, answer_val))



In [0]:
# Collect a token set from the training triples. Collection stops as soon as
# the set has grown past MAX_VOCAB_SIZE (the check runs before each update, so
# the final set can slightly exceed the limit).
vocabulary_set = set()
tokenizer = tfds.features.text.Tokenizer()

v = 0  # number of training examples scanned so far
for question, context, answer in train_examples:
  some_tokens = tokenizer.tokenize((question + " " + context + " " + answer).numpy())
  v = v + 1
  if len(vocabulary_set) > MAX_VOCAB_SIZE:
    break
  vocabulary_set.update(some_tokens)

# `TokenTextEncoder.load_from_file` is a classmethod that RETURNS a new
# encoder; calling it on an instance and dropping the result (as before) left
# `tokenizer_all` backed by the freshly built set, whose iteration order -- and
# therefore token ids -- is not stable between Python sessions. Bind the loaded
# encoder so the ids match the saved vocabulary.
# NOTE(review): `vocabulary_set` is no longer used to build the encoder; it is
# still computed in case later cells read it.
tokenizer_all = tfds.features.text.TokenTextEncoder.load_from_file("/content/drive/My Drive/W266/FInalProject/transformer_v7_vocab/l_squad_elmo_4l_b64_lvocab_slr")

In [0]:
# Due to GPU memory constraints in Colab, need to set hard limits on lengths 
max_answer_char_length = 50
max_question_char_length = 200
max_context_char_length = 2500


def _strip_punctuation(text):
  """Remove every run of underscores / non-word, non-space characters."""
  return re.sub(r'([^\s\w]|_)+', '', text)


def _within_char_limits(question, context, answer):
  """True when all three raw strings are strictly below their length caps."""
  return (len(answer) < max_answer_char_length
          and len(question) < max_question_char_length
          and len(context) < max_context_char_length)


# Training split: drop over-long examples, count and drop empty answers, and
# strip punctuation from whatever is kept. Limits apply to the raw text.
filtered_question_train, filtered_context_train, filtered_answer_train = [], [], []
empty_a = 0
for raw_q, raw_c, raw_a in zip(question_train, context_train, answer_train):
  if not _within_char_limits(raw_q, raw_c, raw_a):
    continue
  if len(raw_a) == 0:
    empty_a += 1
    continue
  filtered_question_train.append(_strip_punctuation(raw_q))
  filtered_context_train.append(_strip_punctuation(raw_c))
  filtered_answer_train.append(_strip_punctuation(raw_a))

# Validation split: same rules, but empty answers are not counted.
filtered_question_val, filtered_context_val, filtered_answer_val = [], [], []
for raw_q, raw_c, raw_a in zip(question_val, context_val, answer_val):
  if not _within_char_limits(raw_q, raw_c, raw_a) or len(raw_a) == 0:
    continue
  filtered_question_val.append(_strip_punctuation(raw_q))
  filtered_context_val.append(_strip_punctuation(raw_c))
  filtered_answer_val.append(_strip_punctuation(raw_a))

In [0]:
# Convert text into encoded ids with the tokenizer
def encoder_text_to_ids(question_words, tokenizer):
  """Encode source words, giving each out-of-vocabulary word a temporary id.

  In-vocabulary words keep the first id returned by `tokenizer.encode`. A word
  that encodes to the UNK id is assigned `tokenizer.vocab_size + k`, where `k`
  is the position of that word in the returned OOV list (0 for the first
  distinct OOV word, 1 for the second, ...).

  The UNK id is taken to be `tokenizer.vocab_size - 1`; this assumes the
  tokenizer reserves its last id for unknown words -- TODO confirm for any
  tokenizer other than the one used in this notebook.

  Args:
    question_words: iterable of word strings.
    tokenizer: object exposing `encode(word) -> list[int]` and `vocab_size`.

  Returns:
    (ids, oovs): the extended-vocabulary ids and the distinct OOV words in
    order of first appearance.
  """
  # Use the tokenizer that was passed in (previously the global
  # `tokenizer_all` was consulted here, ignoring the argument).
  unk_id = tokenizer.vocab_size - 1
  ids = []
  oovs = []
  oov_position = {}  # word -> index in `oovs`; avoids repeated list scans
  for w in question_words:
    i = tokenizer.encode(w)[0]
    if i != unk_id:
      ids.append(i)
      continue
    if w not in oov_position:
      oov_position[w] = len(oovs)
      oovs.append(w)
    ids.append(tokenizer.vocab_size + oov_position[w])
  return ids, oovs

def answer_to_ids(answer_words, tokenizer, context_oovs):
  """Encode answer words, reusing the temporary ids of in-context OOV words.

  An answer word that encodes to the UNK id and also appears in
  `context_oovs` is mapped to `tokenizer.vocab_size + context_oovs.index(word)`;
  an unknown word that is not in the context keeps the UNK id. All other
  words keep the first id returned by `tokenizer.encode`.

  The UNK id is taken to be `tokenizer.vocab_size - 1` -- TODO confirm for any
  tokenizer other than the one used in this notebook.

  Args:
    answer_words: iterable of word strings.
    tokenizer: object exposing `encode(word) -> list[int]` and `vocab_size`.
    context_oovs: list of OOV words found in the context, in id order.

  Returns:
    List of ids, one per answer word.
  """
  # Use the tokenizer that was passed in (previously the global
  # `tokenizer_all` was consulted here, ignoring the argument).
  unk_id = tokenizer.vocab_size - 1
  ids = []
  for w in answer_words:
    i = tokenizer.encode(w)[0]
    if i == unk_id and w in context_oovs:
      # In-context OOV: point at its temporary extended-vocabulary id.
      i = tokenizer.vocab_size + context_oovs.index(w)
    ids.append(i)
  return ids

def get_dec_inp_targ_seqs(sequence, max_len, start_id, stop_id):
  """Build the decoder input and the loss target from a reference sequence.

  The decoder input is the sequence prefixed with `start_id`; the target is
  the sequence followed by `stop_id`. When the prefixed input would exceed
  `max_len`, both are cut to `max_len` items and the target gets no
  `stop_id`. The caller's list is never modified.

  Args:
    sequence: list of ids (integers)
    max_len: integer
    start_id: integer
    stop_id: integer
  Returns:
    inp: list of length <= max_len starting with start_id
    target: list of the same length as inp, ending with stop_id only when
      nothing was truncated
  """
  must_truncate = len(sequence) + 1 > max_len
  if must_truncate:
    inp = ([start_id] + sequence[:])[:max_len]
    target = sequence[:max_len]  # cut short: no end token
  else:
    inp = [start_id] + sequence[:]
    target = sequence[:] + [stop_id]
  assert len(inp) == len(target)
  return inp, target

def add_padding(x, max_length, padding_value=0):
  """Right-pad list `x` in place with `padding_value` up to `max_length`.

  Lists that are already `max_length` items or longer are left untouched.
  The same list object is returned.
  """
  missing = max_length - len(x)
  # A non-positive count multiplies to an empty list, so nothing is appended.
  x.extend([padding_value] * missing)
  return x

In [0]:
def example_generator(parsed_dataset, max_q_enc_len=40, max_c_enc_len=500, max_dec_len=10, training=False):
  """Yield one feature dict per (question, context, answer) row.

  Each row element must support `.decode()` (i.e. be a bytes string). Text is
  split on whitespace and truncated to `max_q_enc_len` / `max_c_enc_len`
  words for question / context and `max_dec_len - 2` words for the answer,
  leaving room for the start and stop ids.

  Args:
    parsed_dataset: iterable of (question, context, answer) byte strings.
    max_q_enc_len: maximum number of question words kept.
    max_c_enc_len: maximum number of context words kept.
    max_dec_len: maximum decoder sequence length including start/stop ids.
    training: accepted for interface compatibility; not used.

  Yields:
    dict with the encoder ids (plain and extended-vocabulary), the OOV word
    lists, the raw words, the decoder ids and the sequence lengths.
  """
  def first_ids(words):
    # One id per word: the first id the encoder produces for it.
    return [tokenizer_all.encode(word)[0] for word in words]

  for raw_question, raw_context, raw_answer in parsed_dataset:
    question_text = raw_question.decode()
    context_text = raw_context.decode()
    answer_text = raw_answer.decode()

    # Start/stop markers use the two ids directly after the vocabulary.
    # NOTE(review): encoder_text_to_ids also starts its temporary OOV ids at
    # vocab_size, so these overlap with the first two OOV ids -- confirm the
    # model accounts for that.
    start_decoding = tokenizer_all.vocab_size
    stop_decoding = tokenizer_all.vocab_size + 1

    question_words = question_text.split()[:max_q_enc_len]
    enc_q_input = first_ids(question_words)
    enc_q_input_extend_vocab, question_oovs = encoder_text_to_ids(question_words, tokenizer_all)

    context_words = context_text.split()[:max_c_enc_len]
    enc_c_input = first_ids(context_words)
    enc_c_input_extend_vocab, context_oovs = encoder_text_to_ids(context_words, tokenizer_all)

    answer_words = answer_text.split()[:max_dec_len - 2]
    answer_ids = first_ids(answer_words)
    # NOTE(review): the extended-vocabulary answer ids are computed but never
    # emitted; "target" below is the plain decoder input. Kept as-is.
    answer_ids_extend_vocab = answer_to_ids(answer_words, tokenizer_all, context_oovs)

    dec_input = [start_decoding] + answer_ids + [stop_decoding]

    yield {
      "enc_q_len": len(question_words),
      "enc_q_input" : enc_q_input,
      "enc_q_input_extend_vocab"  : enc_q_input_extend_vocab,
      "question_oovs" : question_oovs,
      "question_words" : question_words,
      "enc_c_len": len(context_words),
      "enc_c_input" : enc_c_input,
      "enc_c_input_extend_vocab"  : enc_c_input_extend_vocab,
      "context_words": context_words,
      "context_oovs" : context_oovs,
      "dec_input" : dec_input,
      "target" : dec_input,
      # Fixed length rather than len(dec_input) - 1.
      "dec_len" : max_dec_len - 1,
      "answer_words" : ["<s>"] + answer_words
    }


def batch_generator(generator, parsed_dataset, max_q_enc_len=MAX_Q_LEN, max_c_enc_len=MAX_C_LEN, max_dec_len=MAX_A_LEN, batch_size=BATCH_SIZE, training=True):
  """Turn an example generator into a padded, batched tf.data.Dataset.

  Args:
    generator: callable invoked by `from_generator` with
      (parsed_dataset, max_q_enc_len, max_c_enc_len, max_dec_len, training);
      it must yield dicts with the feature names listed below.
    parsed_dataset: rows handed to `generator`.
    max_q_enc_len: forwarded to `generator`.
    max_c_enc_len: forwarded to `generator`.
    max_dec_len: forwarded to `generator`; also fixes the padded length of the
      decoder features.
    batch_size: examples per batch; a final partial batch is dropped.
    training: forwarded to `generator`.

  Returns:
    Dataset whose elements are a 3-tuple of dicts:
    (question features, context features, decoder features).
  """
  # feature name -> (dtype, per-example shape, padded shape, padding value)
  feature_table = {
    "enc_q_len": (tf.int32, [], [], -1),
    "enc_q_input": (tf.int32, [None], [None], 0),
    "enc_q_input_extend_vocab": (tf.int32, [None], [None], 0),
    "question_oovs": (tf.string, [None], [None], b''),
    "question_words": (tf.string, [None], [None], b''),
    "enc_c_len": (tf.int32, [], [], -1),
    "enc_c_input": (tf.int32, [None], [None], 0),
    "enc_c_input_extend_vocab": (tf.int32, [None], [None], 0),
    "context_oovs": (tf.string, [None], [None], b''),
    "context_words": (tf.string, [None], [None], b''),
    "dec_input": (tf.int32, [None], [max_dec_len], 0),
    "target": (tf.int32, [None], [max_dec_len], 0),
    "dec_len": (tf.int32, [], [], -1),
    "answer_words": (tf.string, [None], [max_dec_len - 1], b''),
  }

  def column(position):
    # Slice one column of the table into a {feature name: value} dict.
    return {name: row[position] for name, row in feature_table.items()}

  dataset = tf.data.Dataset.from_generator(
      generator,
      args=[parsed_dataset, max_q_enc_len, max_c_enc_len, max_dec_len, training],
      output_types=column(0),
      output_shapes=column(1))

  dataset = dataset.padded_batch(
      batch_size,
      padded_shapes=column(2),
      padding_values=column(3),
      drop_remainder=True)

  def regroup(entry):
    # Split the flat feature dict into question / context / decoder groups and
    # record the padded OOV-list width of each batch.
    question_side = {
      "enc_q_input" : entry["enc_q_input"],
      "extended_enc_q_input" : entry["enc_q_input_extend_vocab"],
      "question_oovs" : entry["question_oovs"],
      "enc_q_len" : entry["enc_q_len"],
      "max_q_oov_len" : tf.shape(entry["question_oovs"])[1],
      "question_tokens": entry["question_words"],
    }
    context_side = {
      "enc_c_input" : entry["enc_c_input"],
      "extended_enc_c_input" : entry["enc_c_input_extend_vocab"],
      "context_oovs" : entry["context_oovs"],
      "enc_c_len" : entry["enc_c_len"],
      "max_c_oov_len" : tf.shape(entry["context_oovs"])[1],
      "context_tokens": entry["context_words"],
    }
    decoder_side = {
      "dec_input" : entry["dec_input"],
      "dec_target" : entry["target"],
      "dec_len" : entry["dec_len"],
      "answer_tokens": entry["answer_words"],
    }
    return (question_side, context_side, decoder_side)

  return dataset.map(regroup)

In [0]:
# Stack the filtered text lists column-wise so every row is one
# (question, context, answer) triple, then build the batched datasets.
train_rows = np.column_stack(
    (filtered_question_train, filtered_context_train, filtered_answer_train))
val_rows = np.column_stack(
    (filtered_question_val, filtered_context_val, filtered_answer_val))

train_dataset = batch_generator(example_generator, train_rows, training=True)
# The validation set relies on batch_generator's default `training` value.
val_dataset = batch_generator(example_generator, val_rows)

In [0]:
# Prefetch both datasets so upcoming batches are prepared while the current
# one is consumed; the buffer size is left to tf.data's autotuning.
AUTOTUNE = tf.data.experimental.AUTOTUNE
train_dataset = train_dataset.prefetch(AUTOTUNE)
val_dataset = val_dataset.prefetch(AUTOTUNE)

### Positional encoding

In [0]:
def get_angles(pos, i, d_model):
  """Return `pos / 10000 ** (2 * (i // 2) / d_model)`.

  `pos` and `i` are combined with ordinary NumPy broadcasting, so a column of
  positions and a row of dimension indices produce a full angle matrix.
  """
  exponent = (2 * (i//2)) / np.float32(d_model)
  inverse_wavelength = 1 / np.power(10000, exponent)
  return pos * inverse_wavelength

In [0]:
def positional_encoding(position, d_model):
  """Build the sinusoidal positional-encoding table.

  Returns a float32 tensor of shape (1, position, d_model): even feature
  columns hold the sine of the angle, odd columns the cosine.
  """
  positions = np.arange(position)[:, np.newaxis]
  feature_ids = np.arange(d_model)[np.newaxis, :]
  table = get_angles(positions, feature_ids, d_model)

  # Even columns (2i) take the sine, odd columns (2i+1) the cosine; the
  # table is overwritten in place.
  table[:, 0::2] = np.sin(table[:, 0::2])
  table[:, 1::2] = np.cos(table[:, 1::2])

  # Leading axis of size 1 so the table broadcasts over the batch.
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

## Masking

Mask all the pad tokens in the batch of sequences. This ensures that the model does not treat padding as input. The mask indicates where the pad value `0` is present: it outputs a `1` at those locations, and a `0` otherwise.

In [0]:
def create_padding_mask(seq):
  """Return a float mask that is 1.0 wherever `seq` equals the pad id 0.

  Two singleton axes are inserted after the first axis so the mask can be
  added to attention logits: (batch_size, 1, 1, seq_len).
  """
  is_padding = tf.math.equal(seq, 0)
  mask = tf.cast(is_padding, tf.float32)
  return mask[:, tf.newaxis, tf.newaxis, :]

# def create_padding_mask(seq):
#   seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  
#   # add extra dimensions to add the padding
#   # to the attention logits.
#   return seq[:, tf.newaxis, :]  # (batch_size, 1, seq_len)

The look-ahead mask is used to mask the future tokens in a sequence. In other words, the mask indicates which entries should not be used.

This means that to predict the third word, only the first and second word will be used. Similarly to predict the fourth word, only the first, second and the third word will be used and so on.

In [0]:
def create_look_ahead_mask(size):
  """Return a (size, size) mask with 1.0 strictly above the diagonal.

  Row r has ones at columns > r, i.e. at the positions that lie in the future
  of step r and must not be attended to.
  """
  all_ones = tf.ones((size, size))
  lower_triangle = tf.linalg.band_part(all_ones, -1, 0)  # diagonal and below
  return 1 - lower_triangle

In [0]:
def create_masks(q, c, a):
  """Build all attention masks for one batch.

  Args:
    q: question id batch.
    c: context id batch.
    a: decoder input id batch.

  Returns:
    (q_enc_padding_mask, c_enc_padding_mask, combined_mask,
     q_dec_padding_mask, c_dec_padding_mask). The encoder and decoder padding
    masks for a given input are computed the same way from that input.
  """
  # Padding masks for the encoder inputs; the decoder's cross-attention over
  # the encoder outputs uses masks with identical values.
  question_padding = create_padding_mask(q)
  context_padding = create_padding_mask(c)

  # Decoder self-attention: a position is hidden if it is padding OR lies in
  # the future of the query position.
  future_mask = create_look_ahead_mask(tf.shape(a)[1])
  target_padding = create_padding_mask(a)
  combined_mask = tf.maximum(target_padding, future_mask)

  return (question_padding, context_padding, combined_mask,
          question_padding, context_padding)

## Scaled dot product attention

In [0]:
def scaled_dot_product_attention_v1(q, k, v, mask):
  """Compute softmax(q k^T / sqrt(depth) + mask * -1e9) v.

  q, k and v must share their leading dimensions, and k and v must have the
  same sequence length. The mask, when given, must broadcast against the
  logits; entries equal to 1 are pushed to a large negative value before the
  softmax.

  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: float tensor broadcastable to (..., seq_len_q, seq_len_k), or None.

  Returns:
    output: (..., seq_len_q, depth_v)
    attention_weights: (..., seq_len_q, seq_len_k), normalised over the last
      axis.
  """
  key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
  logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

  if mask is not None:
    # Masked positions get a very negative logit, so their weight is ~0.
    logits = logits + mask * -1e9

  attention_weights = tf.nn.softmax(logits, axis=-1)
  return tf.matmul(attention_weights, v), attention_weights

class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head scaled dot-product attention with an output projection.

  `d_model` must be divisible by `num_heads`; every head works on
  `d_model // num_heads` features.
  """

  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    assert d_model % num_heads == 0

    self.num_heads = num_heads
    self.d_model = d_model
    self.depth = d_model // num_heads

    # Input projections for queries, keys and values.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # Projection applied to the concatenated head outputs.
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape (batch_size, seq_len, d_model) to (batch_size, num_heads, seq_len, depth)."""
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask, extra=False):
    """Attend from `q` over `k`/`v`.

    Args:
      v, k, q: value, key and query tensors (note the argument order).
      mask: forwarded to scaled_dot_product_attention_v1; may be None.
      extra: accepted but not used.

    Returns:
      output: (batch_size, seq_len_q, d_model)
      attention_weights: (batch_size, num_heads, seq_len_q, seq_len_k)
    """
    batch_size = tf.shape(q)[0]

    # Project, then move the head axis in front of the sequence axis.
    q_heads = self.split_heads(self.wq(q), batch_size)
    k_heads = self.split_heads(self.wk(k), batch_size)
    v_heads = self.split_heads(self.wv(v), batch_size)

    per_head_output, attention_weights = scaled_dot_product_attention_v1(
        q_heads, k_heads, v_heads, mask)

    # Back to (batch_size, seq_len_q, num_heads, depth), then merge the heads.
    per_head_output = tf.transpose(per_head_output, perm=[0, 2, 1, 3])
    merged = tf.reshape(per_head_output, (batch_size, -1, self.d_model))

    return self.dense(merged), attention_weights

As the softmax normalization is done on K, its values decide the amount of importance given to Q.

The output represents the multiplication of the attention weights and the V (value) vector. This ensures that the words you want to focus on are kept as-is and the irrelevant words are flushed out.

## Multi-head attention

In [0]:
def scaled_dot_product_attention_v2(Q, K, V,
                                 num_heads,
                                 mask,
                                 causality=False, dropout_rate=0.,
                                 training=True,
                                 scope="scaled_dot_product_attention"):
    '''Scaled dot-product attention over head-stacked 3d tensors.

    Q: Packed queries. 3d tensor. [N, T_q, d_k].
    K: Packed keys. 3d tensor. [N, T_k, d_k].
    V: Packed values. 3d tensor. [N, T_k, d_v].
    num_heads: number of equal chunks the leading axis is split into when
      the per-head logits are summed for the returned distribution.
    mask: accepted but not used.
    causality: If True, the logits are passed through
      `f_mask(..., type="future")`; `f_mask` is not defined in the visible
      part of this file.
    dropout_rate, training, scope: accepted but not used.

    Returns (context, attn_dists): the weighted sum [N, T_q, d_v] and the
    softmax of the logits summed over the `num_heads` chunks.
    '''
    depth = Q.get_shape().as_list()[-1]

    # Scaled dot-product logits, (N, T_q, T_k).
    logits = tf.matmul(Q, tf.transpose(K, [0, 2, 1])) / depth ** 0.5

    if causality:
      logits = f_mask(logits, type="future")

    # One distribution per original example: add the logits of all heads,
    # then normalise.
    summed_logits = tf.reduce_sum(tf.split(logits, num_heads, axis=0), axis=0)
    attn_dists = tf.nn.softmax(summed_logits)

    # Per-head weights and context vectors, (N, T_q, d_v).
    weights = tf.nn.softmax(logits)
    context = tf.matmul(weights, V)

    return context, attn_dists

class MultiHeadAttention_v2(tf.keras.layers.Layer):
  """Multi-head attention that also returns a head-summed attention distribution.

  Heads are formed by splitting the feature axis into `num_heads` chunks and
  stacking them on the batch axis. `d_model` must be divisible by
  `num_heads`.
  """

  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention_v2, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    
    assert d_model % self.num_heads == 0
    
    self.depth = d_model // self.num_heads
    
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    
    # NOTE(review): created but never applied in `call` (no output
    # projection); kept so existing attribute names stay available.
    self.dense = tf.keras.layers.Dense(d_model)
        
  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])
    
  def call(self, v, k, q, mask, causality=False):
    """Attend from `q` over `k`/`v`.

    Args:
      v, k, q: value, key and query tensors (note the argument order).
      mask: forwarded to scaled_dot_product_attention_v2, which ignores it.
      causality: forwarded to scaled_dot_product_attention_v2.

    Returns:
      outputs: (N, T_q, d_model), heads concatenated on the feature axis.
      attn_dists: attention distribution returned by
        scaled_dot_product_attention_v2.
    """
    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)

    # Use the head count given to the constructor (it was previously
    # hard-coded to 4 here, silently ignoring `num_heads`).
    num_heads = self.num_heads
    Q_ = tf.concat(tf.split(q, num_heads, axis=2), axis=0) # (h*N, T_q, d_model/h)
    K_ = tf.concat(tf.split(k, num_heads, axis=2), axis=0) # (h*N, T_k, d_model/h)
    V_ = tf.concat(tf.split(v, num_heads, axis=2), axis=0) # (h*N, T_k, d_model/h)

    # Attention
    outputs, attn_dists = scaled_dot_product_attention_v2(Q_, K_, V_, num_heads, mask, causality)

    # Restore shape
    outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) # (N, T_q, d_model)

    # No residual connection or normalisation here; callers apply them.
    return outputs, attn_dists

## Point wise feed forward network

Point wise feed forward network consists of two fully-connected layers with a ReLU activation in between.

In [0]:
def point_wise_feed_forward_network(d_model, dff):
  """Return a two-layer feed-forward block: Dense(dff, relu) then Dense(d_model)."""
  expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([expand, project])

## ELMo Embedding
Unfortunately, the 4 available training layers for ELMo are not currently supported in TF2.


In [0]:

# #import hub
import tensorflow_hub as hub

# Input signature for get_elmo_embeddings, in argument order: a
# (BATCH_SIZE, variable-length) string tensor of tokens followed by a 1-D
# int32 tensor of sequence lengths, for the question, the context and the
# target respectively.
embed_input_sig = [
    tf.TensorSpec(shape=(BATCH_SIZE, None), dtype=tf.string), # question_tokens
    tf.TensorSpec(shape=(None, ), dtype=tf.int32), # q_len
    tf.TensorSpec(shape=(BATCH_SIZE, None), dtype=tf.string), # context_tokens
    tf.TensorSpec(shape=(None, ), dtype=tf.int32), # c_len
    tf.TensorSpec(shape=(BATCH_SIZE, None), dtype=tf.string), # tar_tokens
    tf.TensorSpec(shape=(None, ), dtype=tf.int32), # tar_len
]

@tf.function(input_signature=embed_input_sig)
def get_elmo_embeddings(question_tokens, q_len, context_tokens, c_len, tar_tokens ,tar_len):
    """Embed question, context and target tokens with the shared ELMo layer.

    Each `*_tokens` argument is a batch of token strings and each `*_len`
    argument the matching sequence lengths, as fixed by `embed_input_sig`.

    Returns:
      (q_embed, c_embed, tar_embed): one embedding tensor per input, in the
      order question, context, target.
    """
    token_batches = (
        (question_tokens, q_len),
        (context_tokens, c_len),
        (tar_tokens, tar_len),
    )
    q_embed, c_embed, tar_embed = [
        elmo_embed_layer({"tokens": tokens, "sequence_len": lengths})
        for tokens, lengths in token_batches
    ]
    return q_embed, c_embed, tar_embed

In [0]:
# Frozen ELMo module from TF Hub, called through its "tokens" signature; the
# "elmo" entry of the module's outputs is what the layer returns.
elmo_embed_layer = hub.KerasLayer(
    'https://tfhub.dev/google/elmo/3',
    signature="tokens",
    trainable=False,
    output_key="elmo",
)

## Encoder and decoder

The transformer model follows the same general pattern as a standard [sequence to sequence with attention model](nmt_with_attention.ipynb). 

* The input sentence is passed through `N` encoder layers that generates an output for each word/token in the sequence.
* The decoder attends on the encoder's output and its own input (self-attention) to predict the next word. 

### Encoder layer

Each encoder layer consists of sublayers:

1.   Multi-head attention (with padding mask) 
2.    Point wise feed forward networks. 

Each of these sublayers has a residual connection around it followed by a layer normalization. Residual connections help in avoiding the vanishing gradient problem in deep networks.

The output of each sublayer is `LayerNorm(x + Sublayer(x))`. The normalization is done on the `d_model` (last) axis. There are N encoder layers in the transformer.

In [0]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention and a feed-forward network.

  Each sub-layer output goes through dropout, is added to the sub-layer input
  and is then layer-normalised.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Run the block.

    Args:
      x: input of shape (batch_size, input_seq_len, d_model).
      training: enables dropout when true.
      mask: attention mask passed to the self-attention; may be None.

    Returns:
      (output, attention_weights): the block output with the same shape as
      `x`, and the self-attention weights.
    """
    # Sub-layer 1: self-attention with residual connection.
    attended, att_weights = self.mha(x, x, x, mask)
    attended = self.dropout1(attended, training=training)
    after_attention = self.layernorm1(x + attended)

    # Sub-layer 2: position-wise feed-forward with residual connection.
    transformed = self.ffn(after_attention)
    transformed = self.dropout2(transformed, training=training)
    after_ffn = self.layernorm2(after_attention + transformed)

    return after_ffn, att_weights

In [0]:
# Smoke test: run one encoder layer on random input in inference mode with no
# mask and print the shapes of its two outputs.
sample_encoder_layer = EncoderLayer(d_model, 4, 2048)

sample_encoder_input = tf.random.uniform((64, 43, d_model))
sample_encoder_layer_output = sample_encoder_layer(sample_encoder_input, False, None)

# First the layer output (batch_size, input_seq_len, d_model), then the
# attention weights.
for sample_tensor in sample_encoder_layer_output:
  print(sample_tensor.shape)


### Decoder layer

Each decoder layer consists of sublayers:

1.   Masked multi-head attention (with look ahead mask and padding mask)
2.   Multi-head attention (with padding mask). V (value) and K (key) receive the *encoder output* as inputs. Q (query) receives the *output from the masked multi-head attention sublayer.*
3.   Point wise feed forward networks

Each of these sublayers has a residual connection around it followed by a layer normalization. The output of each sublayer is `LayerNorm(x + Sublayer(x))`. The normalization is done on the `d_model` (last) axis.

There are N decoder layers in the transformer.

As Q receives the output from decoder's first attention block, and K receives the encoder output, the attention weights represent the importance given to the decoder's input based on the encoder's output. In other words, the decoder predicts the next word by looking at the encoder output and self-attending to its own output. See the demonstration above in the scaled dot product attention section.

In [0]:
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block with separate attention over question and context.

  Sub-layers, each followed by dropout, a residual connection and its own
  layer normalisation:
    1. masked self-attention over the decoder input,
    2. attention over the encoded question,
    3. attention over the encoded context,
    4. position-wise feed-forward network.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)
    self.mha3 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)
 
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm4 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
    self.dropout4 = tf.keras.layers.Dropout(rate)
    
  def call(self, x, enc_output_q, enc_output_c, training, 
           look_ahead_mask, padding_mask_q, padding_mask_c):
    """Run the block.

    Args:
      x: decoder input, (batch_size, target_seq_len, d_model).
      enc_output_q: encoded question, (batch_size, q_seq_len, d_model).
      enc_output_c: encoded context, (batch_size, c_seq_len, d_model).
      training: enables dropout when true.
      look_ahead_mask: mask for the decoder self-attention; may be None.
      padding_mask_q: mask for attention over the question; may be None.
      padding_mask_c: mask for attention over the context; may be None.

    Returns:
      (output, question_attention_weights, context_attention_weights).
    """
    # 1. Masked self-attention.
    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask, True)  # (batch_size, target_seq_len, d_model)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)

    # 2. Attention over the encoded question.
    attn2, attn_weights_block2 = self.mha2(enc_output_q, enc_output_q, out1, padding_mask_q)  # (batch_size, target_seq_len, d_model)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

    # 3. Attention over the encoded context. This sub-layer now has its own
    # dropout/normalisation; it previously reused dropout2/layernorm2, which
    # tied its LayerNorm parameters to sub-layer 2 and left layernorm4 and
    # dropout4 unused.
    # NOTE(review): checkpoints trained with the old wiring store FFN
    # statistics in layernorm3 and nothing for layernorm4 -- retrain or remap
    # before restoring them.
    attn3, attn_weights_block3 = self.mha3(enc_output_c, enc_output_c, out2, padding_mask_c)  # (batch_size, target_seq_len, d_model)
    attn3 = self.dropout3(attn3, training=training)
    out3 = self.layernorm3(attn3 + out2)  # (batch_size, target_seq_len, d_model)

    # 4. Position-wise feed-forward network.
    ffn_output = self.ffn(out3)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.dropout4(ffn_output, training=training)
    out4 = self.layernorm4(ffn_output + out3)  # (batch_size, target_seq_len, d_model)

    return out4, attn_weights_block2, attn_weights_block3

In [0]:
# Smoke test: run one decoder layer on random input in inference mode with no
# masks.
sample_decoder_layer = DecoderLayer(d_model, 4, 2048)

# Both encoder arguments must be encoder OUTPUTS (index 0 of the sample
# encoder result). Index 1 holds the attention weights, which were previously
# passed as the context encoding by mistake.
sample_decoder_layer_output, _, _ = sample_decoder_layer(
    tf.random.uniform((64, 50, d_model)), sample_encoder_layer_output[0], sample_encoder_layer_output[0],
    False, None, None, None)

sample_decoder_layer_output.shape  # (batch_size, target_seq_len, d_model)

In [0]:
# Deprecated
class PointerDecoderLayer(tf.keras.layers.Layer):
  """Final decoder block that mixes generated and copied word distributions.

  Relies on two names that are not defined in the visible part of this file:
  `embeddingLayer` (its first weight is used as the output projection) and
  `calc_final_dist` (combines the vocabulary distribution with the attention
  distribution).
  """
  def __init__(self, d_model, num_heads, dff, vocab_size, rate=0.1):
    super(PointerDecoderLayer, self).__init__()
    self.vocab_size = vocab_size
    self.mha1 = MultiHeadAttention_v2(d_model, num_heads)
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.ffn = point_wise_feed_forward_network(d_model, dff)
    #self.dropout1 = tf.keras.layers.Dropout(rate)
    # Single sigmoid unit without bias; its output is passed to
    # calc_final_dist as `gens`.
    self.gen_layer = tf.keras.layers.Dense(1, activation=tf.sigmoid, trainable=True, use_bias=False)


  def call(self, x, before_dec, enc_input, enc_output_q, enc_output_c, training=False):
    """Return the output of calc_final_dist for the decoder states `x`.

    `training` is accepted but not used (the dropout calls are commented out).
    """
    # enc_output.shape == (batch_size, input_seq_len, d_model)
    # MultiHeadAttention_v2.call takes (v, k, q, mask, causality): values come
    # from the question encoding, keys from the context encoding, queries from
    # x; no mask, causality=True.
    # NOTE(review): causality=True makes scaled_dot_product_attention_v2 call
    # `f_mask`, which is not defined in the visible part of this file.
    attn1, attn_weights_block1 = self.mha1(enc_output_q, enc_output_c, x, None, True)  # (batch_size, target_seq_len, d_model)
    #attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)

    ffn_output = self.ffn(out1)  # (batch_size, target_seq_len, d_model)
    #ffn_output = self.dropout1(ffn_output, training=training)
    # NOTE(review): the same layernorm1 instance is applied a second time here.
    x = self.layernorm1(ffn_output + out1)  # (batch_size, target_seq_len, d_model)

    # Project onto the vocabulary with the transposed embedding matrix.
    weights = tf.transpose(embeddingLayer.weights[0]) # (d_model, vocab_size)
    logits = tf.einsum('ntd,dk->ntk', x, weights) # (N, T2, vocab_size)
    # Gate computed from the pre-decoder input, the decoder state and the
    # attention distribution, concatenated on the last axis.
    gens = self.gen_layer(tf.concat([before_dec, x, attn_weights_block1], axis=-1))
    # From here on `logits` holds probabilities, not raw scores.
    logits = tf.nn.softmax(logits)
    logits = calc_final_dist(enc_input, gens, logits, attn_weights_block1, self.vocab_size)

    return logits

### Encoder

The `Encoder` consists of:
1.   Input Embedding
2.   Positional Encoding
3.   N encoder layers

The input is put through an embedding which is summed with the positional encoding. The output of this summation is the input to the encoder layers. The output of the encoder is the input to the decoder.

In [0]:
class Similarity(tf.keras.layers.Layer):
    """Trainable similarity score for every (context word, query word) pair.

    For a context vector ``c`` and a query vector ``q`` the score is
    ``kernel . [c ; q ; c * q] + bias``.  The layer takes
    ``[context_vectors, query_vectors]`` and, per ``compute_output_shape``,
    returns a ``(batch, num_context_words, num_query_words)`` matrix.
    """

    def __init__(self, **kwargs):
        super(Similarity, self).__init__(**kwargs)

    def build(self, input_shape):
        # The feature size is read from the first (context) input shape; the
        # kernel covers the three concatenated parts [c ; q ; c * q].
        feature_dim = input_shape[0][-1]
        self.kernel = self.add_weight(name='similarity_weight',
                                      shape=(feature_dim * 3, 1),
                                      initializer='uniform',
                                      trainable=True)
        self.bias = self.add_weight(name='similarity_bias',
                                    shape=(),
                                    initializer='ones',
                                    trainable=True)
        super(Similarity, self).build(input_shape)

    def compute_similarity(self, repeated_context_vectors, repeated_query_vectors):
        """Score two already-tiled tensors of identical shape."""
        pairwise_product = repeated_context_vectors * repeated_query_vectors
        features = tf.keras.backend.concatenate(
            [repeated_context_vectors, repeated_query_vectors, pairwise_product],
            axis=-1)
        projected = tf.keras.backend.dot(features, self.kernel)
        scores = tf.squeeze(projected, axis=-1)
        return tf.keras.activations.linear(scores + self.bias)

    def call(self, inputs):
        context_vectors, query_vectors = inputs
        n_context = tf.shape(context_vectors)[1]
        n_query = tf.shape(query_vectors)[1]

        # Insert a query axis into the context (and vice versa) and tile so
        # both tensors line up pair by pair.
        tiled_context = tf.tile(tf.expand_dims(context_vectors, axis=2),
                                tf.stack([1, 1, n_query, 1]))
        tiled_query = tf.tile(tf.expand_dims(query_vectors, axis=1),
                              tf.stack([1, n_context, 1, 1]))
        return self.compute_similarity(tiled_context, tiled_query)

    def compute_output_shape(self, input_shape):
        context_shape, query_shape = input_shape[0], input_shape[1]
        return (context_shape[0], context_shape[1], query_shape[1])

    def get_config(self):
        return super().get_config()


class C2QAttention(tf.keras.layers.Layer):
    """Context-to-query attention.

    Takes ``[similarity_matrix, encoded_question]``, softmaxes the similarity
    matrix over its last axis and uses the result to form a weighted sum of
    the question vectors.  Per ``compute_output_shape`` the result has shape
    ``similarity_matrix.shape[:-1] + encoded_question.shape[-1:]``.
    """

    def __init__(self, **kwargs):
        super(C2QAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        super(C2QAttention, self).build(input_shape)

    def call(self, inputs):
        similarity_matrix, encoded_question = inputs
        softmax_layer = tf.keras.layers.Softmax(axis=-1)
        attention = softmax_layer(similarity_matrix)
        # Broadcast the attention weights against the question vectors, then
        # collapse the second-to-last axis.
        question_with_axis = tf.expand_dims(encoded_question, axis=1)
        weighted = tf.expand_dims(attention, axis=-1) * question_with_axis
        return tf.keras.backend.sum(weighted, -2)

    def compute_output_shape(self, input_shape):
        sim_shape, question_shape = input_shape
        return sim_shape[:-1] + question_shape[-1:]

    def get_config(self):
        return super().get_config()

class Q2CAttention(tf.keras.layers.Layer):
    """Query-to-context attention.

    Takes ``[similarity_matrix, encoded_context]``.  The maximum similarity
    along the last axis is softmaxed and used to build one weighted summary
    of the context, which is then repeated ``tf.shape(encoded_context)[1]``
    times along axis 1.
    """

    def __init__(self, **kwargs):
        super(Q2CAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        super(Q2CAttention, self).build(input_shape)

    def call(self, inputs):
        similarity_matrix, encoded_context = inputs
        strongest_match = tf.keras.backend.max(similarity_matrix, axis=-1)
        # Softmax() normalises over the last axis by default.
        context_attention = tf.keras.layers.Softmax()(strongest_match)
        summary = tf.keras.backend.sum(
            tf.expand_dims(context_attention, axis=-1) * encoded_context, -2)
        repeat_count = tf.shape(encoded_context)[1]
        return tf.tile(tf.expand_dims(summary, 1), [1, repeat_count, 1])

    def compute_output_shape(self, input_shape):
        sim_shape, context_shape = input_shape
        return sim_shape[:-1] + context_shape[-1:]

    def get_config(self):
        return super().get_config()


In [0]:
# Module-level token embedding layer; it is read as a global by PointerDecoderLayer
# and passed to the sample Transformer as `embedding_layer`.
# NOTE(review): the "+ 2" presumably reserves ids for the start/stop decoding
# tokens (predict() uses tokenizer_all.vocab_size as the start id) - confirm.
embeddingLayer = tf.keras.layers.Embedding(tokenizer_all.vocab_size + 2, d_model)

In [0]:
class Encoder_QA(tf.keras.layers.Layer):
  """Two independent EncoderLayer stacks: one for the question, one for the context.

  The layer expects inputs that are already embedded; `call` applies dropout
  and then `num_layers` encoder layers to each input.

  Args:
    num_layers: number of EncoderLayer instances in each stack.
    d_model, num_heads, dff, rate: forwarded to every EncoderLayer.
    vocab_size: accepted for interface compatibility; not used here.
    q_maximum_position_encoding, c_maximum_position_encoding: sizes passed to
      `positional_encoding`.
    num_q_model_layers, num_c_model_layers: stored on the instance only.

  NOTE(review): the positional encodings are built and stored but `call` never
  adds them to the inputs - confirm this is intentional (e.g. because the
  upstream embeddings are already contextual).
  """
  def __init__(self, num_layers, d_model, num_heads, dff, vocab_size,
               q_maximum_position_encoding, c_maximum_position_encoding, rate=0.1, num_q_model_layers=2, num_c_model_layers=2):
    super(Encoder_QA, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    self.num_q_model_layers = num_q_model_layers
    self.num_c_model_layers = num_c_model_layers

    # Kept as attributes for backward compatibility; not applied in call().
    self.q_pos_encoding = positional_encoding(q_maximum_position_encoding,
                                              self.d_model)
    self.c_pos_encoding = positional_encoding(c_maximum_position_encoding,
                                              self.d_model)

    self.q_enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                         for _ in range(num_layers)]
    self.c_enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                         for _ in range(num_layers)]

    # One Dropout layer shared by the question and the context inputs.
    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, q, c, training, q_mask, c_mask):
    """Encode the question `q` and the context `c`.

    Returns:
      (q, c, q_concat_att, c_concat_att): the encoded question and context,
      followed by the second output of the last question / context
      EncoderLayer.  The last two are None when the stacks are empty.
    """
    # Pre-initialised so an empty stack (num_layers == 0) returns None instead
    # of raising UnboundLocalError.
    q_concat_att = None
    c_concat_att = None

    q = self.dropout(q, training=training)
    for layer in self.q_enc_layers:
      q, q_concat_att = layer(q, training, q_mask)

    c = self.dropout(c, training=training)
    for layer in self.c_enc_layers:
      c, c_concat_att = layer(c, training, c_mask)

    return q, c, q_concat_att, c_concat_att  # q, c: (batch_size, input_seq_len, d_model)

In [0]:
# Smoke test: run Encoder_QA on random token ids embedded with embeddingLayer.
sample_encoder = Encoder_QA(num_layers=2, d_model=d_model, num_heads=4, 
                         dff=2048, vocab_size=8500,
                         q_maximum_position_encoding=10000, c_maximum_position_encoding=10000)
# Random "question" (64 x 62) and "context" (64 x 100) id batches.
enc_temp_input = tf.random.uniform((64, 62), dtype=tf.int32, minval=0, maxval=200)
enc_temp_input_2 = tf.random.uniform((64, 100), dtype=tf.int32, minval=0, maxval=200)

sample_encoder_output = sample_encoder(embeddingLayer(enc_temp_input), embeddingLayer(enc_temp_input_2), training=False, q_mask=None, c_mask=None)

print (sample_encoder_output[0].shape)  # encoded question: (batch_size, q_seq_len, d_model)
print (sample_encoder_output[1].shape)  # encoded context: (batch_size, c_seq_len, d_model)
# Index 3 is c_concat_att, the second output of the last context EncoderLayer;
# its shape depends on EncoderLayer, which is defined elsewhere.
print (sample_encoder_output[3].shape)

### Decoder

 The `Decoder` consists of:
1.   Output Embedding
2.   Positional Encoding
3.   N decoder layers

The target is put through an embedding which is summed with the positional encoding. The output of this summation is the input to the decoder layers. The output of the decoder is the input to the final linear layer.

In [0]:
class Decoder_QA(tf.keras.layers.Layer):
  """Decoder stack that also produces the pointer-generator gate `p_gens`.

  The layer expects an already-embedded target sequence.  It runs
  `num_layers` DecoderLayer instances against the question and context
  encodings, then derives a per-step generation probability from the decoder
  input, the decoder output and an attention-weighted context vector.

  Args:
    num_layers: number of DecoderLayer instances.
    d_model, num_heads, dff, rate: forwarded to every DecoderLayer.
    target_vocab_size: stored as `self.vocab_size`.
    maximum_position_encoding: size passed to `positional_encoding`.

  NOTE(review): `self.pos_encoding` is built but never applied in `call`, and
  the `enc_input` argument of `call` is unused - confirm both are intentional.
  """
  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Decoder_QA, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    self.vocab_size = target_vocab_size
    self.num_heads=num_heads
    self.depth = self.d_model // self.num_heads

    # Kept as an attribute for backward compatibility; not applied in call().
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

    self.dec_layers= [DecoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)

    # Scalar projections used for the generation gate:
    # p_gen = sigmoid(V(Wx(input) + Ws(state) + Wh(context))).
    self.Wh = tf.keras.layers.Dense(1)
    self.Ws = tf.keras.layers.Dense(1)
    self.Wx = tf.keras.layers.Dense(1)
    self.V = tf.keras.layers.Dense(1)

  def call(self, x, enc_output, enc_input, training,
           look_ahead_mask, q_padding_mask, c_padding_mask):
    """Decode `x` against the encoder outputs.

    Args:
      x: embedded target sequence.
      enc_output: sequence whose items 0 and 1 are the encoded question and
        the encoded context.
      enc_input: unused.
      training: forwarded to dropout and the decoder layers.
      look_ahead_mask, q_padding_mask, c_padding_mask: forwarded to every
        DecoderLayer.

    Returns:
      (x, attention_weights, p_gens): decoder output, a dict holding the two
      attention blocks of every layer, and the generation gate.
    """
    attention_weights = {}

    before_x = x

    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      # Each layer sees the question encoding and the context encoding.
      x, block1, block2 = self.dec_layers[i](x, enc_output[0], enc_output[1], training,
                                              look_ahead_mask, q_padding_mask, c_padding_mask)

      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

    # Context vectors: weight the per-head slices of the context encoding by
    # the last layer's block-2 attention.  Assumes block2 is
    # (batch_size, num_heads, target_seq_len, input_seq_len), as the original
    # shape comments state.
    enc_out_shape = tf.shape(enc_output[1])
    context = tf.reshape(enc_output[1], (enc_out_shape[0], enc_out_shape[1], self.num_heads, self.depth))  # (batch_size, input_seq_len, num_heads, depth)
    context = tf.transpose(context, [0, 2, 1, 3])  # (batch_size, num_heads, input_seq_len, depth)

    # A batched matmul computes the same sum over input positions as
    # broadcasting to (batch, heads, target_len, input_len, depth) and reducing,
    # without materialising that 5-D intermediate tensor.
    context = tf.matmul(block2, context)  # (batch_size, num_heads, target_seq_len, depth)
    context = tf.transpose(context, [0, 2, 1, 3])  # (batch_size, target_seq_len, num_heads, depth)
    context = tf.reshape(context, (tf.shape(context)[0], tf.shape(context)[1], self.d_model))  # (batch_size, target_seq_len, d_model)

    # Generation gate.
    a = self.Wx(before_x)
    b = self.Ws(x)
    c = self.Wh(context)
    p_gens = tf.sigmoid(self.V(a + b + c))

    return x, attention_weights, p_gens

In [0]:
# Smoke test: run Decoder_QA on a random target batch against the sample
# encoder output computed in the encoder smoke-test cell.
sample_decoder = Decoder_QA(num_layers=4, d_model=d_model, num_heads=4, 
                         dff=2048, target_vocab_size=tokenizer_all.vocab_size +2 ,
                         maximum_position_encoding=5000)

temp_input = tf.random.uniform((64, 26), dtype=tf.int32, minval=0, maxval=200)

# enc_input receives the embedded context ids; Decoder_QA.call does not read it.
output, attn, p_gens = sample_decoder(embeddingLayer(temp_input), 
                              enc_output=sample_encoder_output, 
                              enc_input=embeddingLayer(enc_temp_input_2), 
                              training=False,
                              look_ahead_mask=None, 
                              q_padding_mask=None,
                              c_padding_mask=None)

# Displayed as the cell result: decoder output shape and one attention-block shape.
output.shape, attn['decoder_layer2_block2'].shape

# Currently trying to align encoder input shape

In [0]:
def _calc_final_dist( _enc_batch_extend_vocab, vocab_dists, attn_dists, p_gens, batch_oov_len, vocab_size, batch_size):
  """Combine generation and copy distributions for the pointer-generator model.

  Args:
    _enc_batch_extend_vocab: (batch_size, attn_len) encoder token ids in the
      extended vocabulary; used as scatter targets for the copy probabilities.
    vocab_dists: list (one entry per decoder step) of (batch_size, vocab_size)
      vocabulary distributions.
    attn_dists: list (one entry per decoder step) of (batch_size, attn_len)
      attention distributions.
    p_gens: list (one entry per decoder step) of generation probabilities.
    batch_oov_len: number of extra (out-of-vocabulary) slots appended to the
      vocabulary for this batch.
    vocab_size: size of the fixed vocabulary.
    batch_size: number of examples in the batch.

  Returns:
    List (one entry per decoder step) of (batch_size, vocab_size + batch_oov_len)
    final distributions.  Entries for padded steps/examples are meaningless.
  """
  # Scale the two distributions by p_gen and (1 - p_gen) respectively.
  gated_vocab = [gate * dist for gate, dist in zip(p_gens, vocab_dists)]
  gated_attn = [(1-gate) * dist for gate, dist in zip(p_gens, attn_dists)]

  # Widen every vocabulary distribution with zero-probability OOV slots.
  extended_vsize = vocab_size + batch_oov_len
  oov_slots = tf.zeros((batch_size, batch_oov_len ))
  widened_vocab = [tf.concat(axis=1, values=[dist, oov_slots]) for dist in gated_vocab]

  # Build (row, token id) index pairs so tf.scatter_nd can add each attention
  # weight onto the vocabulary entry of the encoder token it points at.
  attn_len = tf.shape(_enc_batch_extend_vocab)[1]
  row_ids = tf.expand_dims(tf.range(0, limit=batch_size), 1)  # (batch_size, 1)
  row_ids = tf.tile(row_ids, [1, attn_len])                    # (batch_size, attn_len)
  scatter_indices = tf.stack( (row_ids, _enc_batch_extend_vocab), axis=2)  # (batch_size, attn_len, 2)
  target_shape = [batch_size, extended_vsize]

  # Project the copy distribution of every step and add it to the matching
  # widened vocabulary distribution.
  final_dists = []
  for vocab_part, copy_part in zip(widened_vocab, gated_attn):
    projected_copy = tf.scatter_nd(scatter_indices, copy_part, target_shape)
    final_dists.append(vocab_part + projected_copy)

  return final_dists

## Create the Transformer

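The Transformer consists of the encoder, the decoder, and a final linear layer. The decoder output is fed to the linear layer; its softmax output is then combined with the decoder's attention (copy) distribution to produce the final pointer-generator distribution, which is returned.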

In [0]:
class Transformer(tf.keras.Model):
  """Question-answering Transformer with a pointer-generator output.

  Wires together Encoder_QA, Decoder_QA and a final Dense layer, then mixes the
  softmaxed vocabulary distribution with the averaged attention of the last
  decoder layer through `_calc_final_dist`.
  """
  def __init__(self, num_layers, d_model, num_heads, dff, vocab_size,
                  pe_input_q, pe_input_c, pe_target, embedding_layer, rate=0.1):
    super(Transformer, self).__init__()
    self.vocab_size = vocab_size
    self.num_layers = num_layers
    self.num_heads = num_heads

    self.encoder = Encoder_QA(num_layers, d_model, num_heads, dff,
                              vocab_size, pe_input_q, pe_input_c, rate)
    self.decoder = Decoder_QA(num_layers, d_model, num_heads, dff,
                              vocab_size, pe_target, rate)

    self.final_layer = tf.keras.layers.Dense(vocab_size)
    # Only used in call() when no precomputed embeddings are supplied.
    self.embedding_layer = embedding_layer

  def call(self, q, c, c_ext, tar, training, q_enc_padding_mask, c_enc_padding_mask,
           look_ahead_mask, q_dec_padding_mask, c_dec_padding_mask,
           q_embed, c_embed, tar_embed,
           max_oov_len=0, batch_size=BATCH_SIZE):
    """Run the full model.

    When `q_embed` is not None the three `*_embed` tensors are used as the
    embedded question, context and target; otherwise `q`, `c` and `tar` are
    embedded with `self.embedding_layer`.

    Returns:
      (final_output, attention_weights): the stacked per-step distributions
      from `_calc_final_dist` and the decoder's attention dictionary.
    """
    if q_embed is None:
      question_vecs = self.embedding_layer(q)
      context_vecs = self.embedding_layer(c)
      target_vecs = self.embedding_layer(tar)
    else:
      question_vecs, context_vecs, target_vecs = q_embed, c_embed, tar_embed

    encoded = self.encoder(question_vecs, context_vecs, training,
                           q_enc_padding_mask, c_enc_padding_mask)

    decoded, attention_weights, p_gens = self.decoder(
        target_vecs, encoded, context_vecs, training,
        look_ahead_mask, q_dec_padding_mask, c_dec_padding_mask)

    # Vocabulary distribution per target position.
    vocab_probs = tf.nn.softmax(self.final_layer(decoded))  # (batch_size, tar_seq_len, vocab_size)

    # Average the last decoder layer's block-2 attention over the heads axis.
    last_block_key = 'decoder_layer{}_block2'.format(self.num_layers)
    copy_probs = tf.reduce_sum(attention_weights[last_block_key], axis=1)/self.num_heads

    # _calc_final_dist works on per-step lists, so split along the time axis
    # and stack the results back together afterwards.
    per_step_dists = _calc_final_dist(
        c_ext,
        tf.unstack(vocab_probs, axis=1),
        tf.unstack(copy_probs, axis=1),
        tf.unstack(p_gens, axis=1),
        max_oov_len, self.vocab_size, batch_size)
    final_output = tf.stack(per_step_dists, axis=1)

    return final_output, attention_weights

In [0]:
# Smoke test: build a Transformer around embeddingLayer and run it on random ids.
# NOTE(review): vocab_size here is tokenizer_all.vocab_size, while embeddingLayer
# was created with tokenizer_all.vocab_size + 2 - confirm the mismatch is harmless.
sample_transformer = Transformer(
    num_layers=4, d_model=d_model, num_heads=4, dff=2048, 
    vocab_size=tokenizer_all.vocab_size, pe_input_q=10000, pe_input_c=10000, pe_target=6000, embedding_layer=embeddingLayer)

temp_q = tf.random.uniform((BATCH_SIZE, 38), dtype=tf.int32, minval=0, maxval=200)
temp_c = tf.random.uniform((BATCH_SIZE, 38), dtype=tf.int32, minval=0, maxval=200)
temp_target = tf.random.uniform((BATCH_SIZE, 36), dtype=tf.int32, minval=0, maxval=200)

# temp_c is passed twice: as the context ids and as the extended-vocabulary
# context ids (c_ext).  The *_embed arguments are None, so the model embeds
# the ids itself with embeddingLayer.
fn_out, _ = sample_transformer(temp_q, temp_c, temp_c, temp_target, training=False, 
                               q_enc_padding_mask=None, 
                               c_enc_padding_mask=None,
                               look_ahead_mask=None,
                               q_dec_padding_mask=None,
                               c_dec_padding_mask=None,
                               q_embed = None,
                               c_embed = None,
                               tar_embed = None,
                               batch_size=BATCH_SIZE,
)

# Last axis is vocab_size + max_oov_len (max_oov_len defaults to 0 here).
fn_out.shape  # (batch_size, tar_seq_len, target_vocab_size)

### GloVE Embedding Layer

In [0]:
# prepare GloVE embedding matrix
# from keras.initializers import Constant

# num_words = len(tokenizer_all.tokens)
# embedding_matrix = np.zeros((tokenizer_all.vocab_size + 2, d_model))
# for i in range(0, num_words-1):
#     word = tokenizer_all.tokens[i]

#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         embedding_matrix[i] = embedding_vector

# # load pre-trained word embeddings into an Embedding layer
# # note that we set trainable = False so as to keep the embeddings fixed
# embeddingLayer = tf.keras.layers.Embedding(tokenizer_all.vocab_size + 2,
#                             d_model,
#                             embeddings_initializer=Constant(embedding_matrix),
#                             trainable=False)

# num_words = len(tokenizer_all.subwords)
# embedding_matrix = np.zeros((tokenizer_all.vocab_size + 2, d_model))
# for i in range(0, num_words-1):
#     word = tokenizer_all.subwords[i]
#     if word[-1] == '_':
#       word = word[0:-1]

#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         embedding_matrix[i] = embedding_vector

# # load pre-trained word embeddings into an Embedding layer
# # note that we set trainable = False so as to keep the embeddings fixed
# embeddingLayer = tf.keras.layers.Embedding(tokenizer_all.vocab_size + 2,
#                             d_model,
#                             embeddings_initializer=Constant(embedding_matrix),
#                             trainable=False)

## Set hyperparameters

In [0]:
# Hyperparameters passed to the Transformer built in the training cell.
num_layers = 4
dff = 256
num_heads = 4
# NOTE(review): "+ 2" presumably covers the start/stop decoding ids - confirm.
vocab_size = tokenizer_all.vocab_size + 2
dropout_rate = 0.05

## Optimizer, Loss, and Metrics

In [0]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warm-up then inverse-square-root learning-rate schedule.

  lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

  Args:
    d_model: model width; stored as a float32 tensor.
    warmup_steps: number of steps over which the rate rises linearly.
  """
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    """Return the learning rate for `step` (scalar or tensor of steps)."""
    # Optimizers may pass an integer step counter; rsqrt only accepts floating
    # point input, so cast first (a no-op for float32 input).
    step = tf.cast(step, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

# Learning-rate schedule and Adam optimizer used by train_step.
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)
# Separate schedule instance used only to plot the curve over 40000 steps.
temp_learning_rate_schedule = CustomSchedule(d_model)

plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")

## Training and checkpointing

In [0]:
# The model outputs probabilities (softmax + pointer mixture), hence
# from_logits=False; reduction='none' keeps one loss value per target position
# so padding can be masked out below.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction='none')

def loss_function(real, pred):
  """Masked sparse cross-entropy averaged over non-padding target tokens.

  Args:
    real: integer target ids; id 0 is treated as padding.
    pred: predicted probability distributions for every target position.

  Returns:
    Scalar loss: the summed loss of the non-padding positions divided by the
    number of non-padding positions (0 when every position is padding).
  """
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  # Normalise by the number of real tokens.  A plain reduce_mean would also
  # divide by the padded positions and make the loss depend on how much
  # padding a batch happens to contain.
  return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

# Running metrics updated by train_step and reset at the start of every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

#embeddingLayer = tf.keras.layers.Embedding(vocab_size, d_model)
# embeddingLayer = tf.keras.layers.Embedding(vocab_size,
#                                           d_model,
#                                           embeddings_initializer=Constant(embedding_matrix),
#                                           trainable=False)
#elmo_embed_layer = hub.KerasLayer('https://tfhub.dev/google/elmo/3', signature="tokens", trainable=False, output_key="elmo")
# The model that is actually trained.  `elmo_embed_layer` must already be
# defined elsewhere in the notebook (its definition above is commented out).
# NOTE(review): the pe_* sizes are set to vocab_size rather than to a maximum
# sequence length; the visible Encoder_QA/Decoder_QA never apply the positional
# encodings, so this only affects how large the stored tables are - confirm.
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          vocab_size,
                          pe_input_q=vocab_size, 
                          pe_input_c=vocab_size,
                          pe_target=vocab_size,
                          rate=dropout_rate,
                          embedding_layer=elmo_embed_layer)

# Per-epoch history filled by the training loop and plotted afterwards.
# (`loss` is also the name of a local variable inside train_step.)
loss = []
accuracy = []
e = []

#checkpoint_path = "./checkpoint/train3/v11"
checkpoint_path = "/content/drive/My Drive/W266/FInalProject/transformer_v7_checkpoints/l_squad_elmo_4l_b64_lvocab_slr"


# Checkpoint both the model and the optimizer state.
ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)
#status = ckpt.restore(tf.train.latest_checkpoint(checkpoint_path))
#print(status)

# Keeps only the five most recent checkpoints under checkpoint_path.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

In [0]:
# Persist the tokenizer under a name matching the checkpoint directory used above.
tokenizer_all.save_to_file("/content/drive/My Drive/W266/FInalProject/transformer_v7_vocab/l_squad_elmo_4l_b64_lvocab_slr")

The checkpoint path and the checkpoint manager are created above. The manager is used to save a checkpoint at the end of every epoch, and the next cells restore the latest checkpoint if one exists.

In [0]:
# Echo the sequence-length limits defined at the top of the notebook.
print(f'MAX_Q_LEN: {MAX_Q_LEN}')
print(f'MAX_C_LEN: {MAX_C_LEN}')
print(f'MAX_A_LEN: {MAX_A_LEN}')

In [0]:
# If a checkpoint exists under checkpoint_path, restore the latest one
# (model and optimizer state) before training continues.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

In [0]:
# Number of passes over train_dataset made by the training loop.
EPOCHS = 8

In [0]:
# Fixed input signature so tf.function traces train_step only once.  The batch
# dimension is pinned to BATCH_SIZE; sequence lengths are left dynamic except
# for the target, which is always MAX_A_LEN ids long.
train_step_signature = [
    tf.TensorSpec(shape=(BATCH_SIZE, None), dtype=tf.int32),  # question ids
    tf.TensorSpec(shape=(BATCH_SIZE, None), dtype=tf.int32),  # question ids, extended vocabulary
    tf.TensorSpec(shape=(BATCH_SIZE, None), dtype=tf.int32),  # context ids
    tf.TensorSpec(shape=(BATCH_SIZE, None), dtype=tf.int32),  # context ids, extended vocabulary
    tf.TensorSpec(shape=None, dtype=tf.int32),  # max context OOV count (shape left unspecified)
    tf.TensorSpec(shape=(BATCH_SIZE, MAX_A_LEN), dtype=tf.int32),  # target ids
    tf.TensorSpec(shape=(BATCH_SIZE, None, d_model), dtype=tf.float32),  # question embeddings
    tf.TensorSpec(shape=(BATCH_SIZE, None, d_model), dtype=tf.float32),  # context embeddings
    tf.TensorSpec(shape=(BATCH_SIZE, MAX_A_LEN-1, d_model), dtype=tf.float32),  # target embeddings
]

@tf.function(input_signature=train_step_signature)
def train_step(q, q_e, c, c_e, max_c_oov_len, tar_inp_1, q_embed, c_embed, tar_embed):
  """Run one optimisation step and update the running metrics.

  The target ids are split teacher-forcing style: every id but the last is the
  decoder input and every id but the first is the expected output.  `q_e` is
  accepted to match the signature but is not used.
  """
  decoder_input = tar_inp_1[:, :-1]
  expected_output = tar_inp_1[:, 1:]

  masks = create_masks(q, c, decoder_input)
  q_enc_padding_mask, c_enc_padding_mask, combined_mask, q_dec_padding_mask, c_dec_padding_mask = masks

  with tf.GradientTape() as tape:
    predictions, _ = transformer(q, c, c_e, decoder_input,
                                 True,
                                 q_enc_padding_mask,
                                 c_enc_padding_mask,
                                 combined_mask,
                                 q_dec_padding_mask,
                                 c_dec_padding_mask,
                                 q_embed, c_embed, tar_embed,
                                 max_oov_len=max_c_oov_len)
    step_loss = loss_function(expected_output, predictions)

  # Read the variable list after the forward pass, once the model is built.
  grads = tape.gradient(step_loss, transformer.trainable_variables)
  optimizer.apply_gradients(zip(grads, transformer.trainable_variables))

  train_loss(step_loss)
  train_accuracy(expected_output, predictions)

In [0]:
# Training loop: one pass over train_dataset per epoch, a checkpoint after
# every epoch, and per-epoch loss/accuracy appended to the history lists.
for epoch in range(EPOCHS):
  start = time.time()
  
  # Metrics accumulate across batches, so start every epoch from zero.
  train_loss.reset_states()
  train_accuracy.reset_states()
  
  # question, context -> answer
  # NOTE(review): train_step's input signature pins the batch dimension to
  # BATCH_SIZE, so train_dataset presumably drops a final partial batch - confirm.
  for (batch, (q, c, tar)) in enumerate(train_dataset):
    # Embeddings are computed outside train_step and passed in as tensors.
    q_embed, c_embed, tar_embed = get_elmo_embeddings(q['question_tokens'], q['enc_q_len'], c['context_tokens'], c['enc_c_len'], tar['answer_tokens'], tar['dec_len'])
    train_step(q['enc_q_input'], q['extended_enc_q_input'], c['enc_c_input'], c['extended_enc_c_input'], c['max_c_oov_len'], tar['dec_input'],
               q_embed,
               c_embed,
               tar_embed)

    # Progress report every 50 batches (running means since the epoch started).
    if batch % 50 == 0:
      print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
          epoch + 1, batch, train_loss.result(), train_accuracy.result()))
      
  # A checkpoint is written after every epoch (the every-other-epoch
  # condition below is disabled).
  #if (epoch + 1) % 2 == 0:
  ckpt_save_path = ckpt_manager.save()
  print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                         ckpt_save_path))
    
  print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))


  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
  # History for the plots; `e` stores the zero-based epoch index.
  loss.append(train_loss.result())
  accuracy.append(train_accuracy.result())
  e.append(epoch)

In [0]:
# Plot the per-epoch accuracy and loss collected by the training loop.
import matplotlib
# Clear the current figure first so earlier plots do not bleed into this one.
matplotlib.pyplot.clf()
matplotlib.pyplot.plot(e, accuracy)
matplotlib.pyplot.xlabel('epochs')
matplotlib.pyplot.ylabel('accuracy')
matplotlib.pyplot.show()

matplotlib.pyplot.plot(e, loss)
matplotlib.pyplot.xlabel('epochs')
matplotlib.pyplot.ylabel('loss')
matplotlib.pyplot.show()

## Evaluate

In [0]:
def convert_token_ids_to_text(token_ids, batch_size):
  """Decode the first `batch_size` rows of `token_ids` with `tokenizer_all`.

  Returns a list of single-element lists, one per row, each wrapping the
  decoded text of that row.
  """
  return [[tokenizer_all.decode(token_ids[row])] for row in range(batch_size)]

In [0]:
def predict(q_embed, c_embed, enc_q_input, enc_c_input, ext_c_input, max_c_oov_len):
  """Greedy-decode MAX_A_LEN tokens for one batch of BATCH_SIZE examples.

  Every step re-embeds the tokens generated so far with `elmo_embed_layer`,
  runs the transformer, and appends the arg-max id of the last position.
  Decoding always runs for MAX_A_LEN steps; there is no early stop.

  Returns:
    (output, attention_weights): the generated ids including the leading start
    id, and the attention dictionary of the final decoding step.
  """
  # Every sequence starts with the start-decoding id (tokenizer_all.vocab_size)
  # and its text form '<s>'.
  generated_ids = tf.tile([[tokenizer_all.vocab_size]], [BATCH_SIZE, 1])
  generated_tokens = tf.tile([['<s>']], [BATCH_SIZE, 1])

  for step in range(MAX_A_LEN):
    # All sequences in the batch currently hold step + 1 tokens.
    lengths = tf.convert_to_tensor(np.repeat(step + 1, BATCH_SIZE), tf.int32)
    target_embed = elmo_embed_layer({"tokens": generated_tokens, "sequence_len": lengths})

    masks = create_masks(enc_q_input, enc_c_input, generated_ids)
    q_enc_padding_mask, c_enc_padding_mask, combined_mask, q_dec_padding_mask, c_dec_padding_mask = masks

    predictions, attention_weights = transformer(enc_q_input, enc_c_input, ext_c_input, generated_ids,
                                                 False,
                                                 q_enc_padding_mask,
                                                 c_enc_padding_mask,
                                                 combined_mask,
                                                 q_dec_padding_mask,
                                                 c_dec_padding_mask,
                                                 q_embed,
                                                 c_embed,
                                                 target_embed,
                                                 max_oov_len=max_c_oov_len)

    # Keep only the distribution of the newest position and take its arg-max.
    last_step = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
    next_id = tf.cast(tf.argmax(last_step, axis=-1), tf.int32)

    # Feed the prediction back in, both as an id and as decoded text.
    generated_ids = tf.concat([generated_ids, next_id], axis=-1)
    next_text = tf.convert_to_tensor(convert_token_ids_to_text(next_id, BATCH_SIZE), tf.string)
    generated_tokens = tf.concat([generated_tokens, next_text], axis=-1)

  return generated_ids, attention_weights



In [0]:
#import shutil
 #!gsutil cp -r gs://checkpoints /drive/My Drive/W266/FInalProject/transformer_v2_checkpoints/
#shutil.copytree("./checkpoint/train3/v11", "drive/My Drive/W266/FInalProject/transformer_v7_checkpoints/s_elmo") 


In [0]:
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from w266_gutenberg_quiz.library.utility.squad_sparknotes import SquADDataSetSparkNotes

# Load the SparkNotes quiz data and build the evaluation dataset.
data_set = SquADDataSetSparkNotes(data_path="w266_gutenberg_quiz/data/sp_squad_document_qa_passages/test.data")

# Hold-out fraction and seed for the split.  Both are passed to
# train_test_split below so the split is reproducible across runs.
test_size=.1
random_state=42
(question_dev, question_val,
 context_dev, context_val,
 answer_dev, answer_val,
 mc_answers_a_dev, mc_answers_a_val,
 mc_answers_b_dev, mc_answers_b_val,
 mc_answers_c_dev, mc_answers_c_val,
 mc_answers_d_dev, mc_answers_d_val) = train_test_split(
    data_set.questions,
    data_set.contexts,
    data_set.answers,
    data_set.mc_answers_a,
    data_set.mc_answers_b,
    data_set.mc_answers_c,
    data_set.mc_answers_d,
    test_size=test_size,
    random_state=random_state)

filtered_question_dev = []
filtered_question_test = []
filtered_context_dev = []
filtered_context_test = []
filtered_answer_dev = []
filtered_answer_test = []

fil_mc_answers_a_dev=[]
fil_mc_answers_b_dev=[]
fil_mc_answers_c_dev=[]
fil_mc_answers_d_dev=[]

# Strips every run of punctuation/underscore characters; compiled once
# because it is applied to every question, context and answer.
_punctuation_re = re.compile(r'([^\s\w]|_)+')

## For GPU memory purposes - had to limit the size of the question, passage, and answers
max_answer_char_length = 50
max_question_char_length = 200
max_context_char_length = 2000
empty_a = 0

# Dev split: drop examples with an over-long question/answer or an empty
# answer, truncate over-long texts, then strip punctuation.
for i in range(len(question_dev)):
  if len(answer_dev[i]) >= max_answer_char_length or len(question_dev[i]) >= max_question_char_length:
    continue
  if len(answer_dev[i]) == 0:
    continue

  if len(context_dev[i]) > max_context_char_length:
    context_dev[i] = context_dev[i][:max_context_char_length]
  # Over-long questions are cut to 100 characters.  The previous code cut the
  # *context* to 100 characters in this branch, which discarded almost the
  # whole passage whenever the question was long.
  if len(question_dev[i]) > 100:
    question_dev[i] = question_dev[i][:100]

  filtered_question_dev.append(_punctuation_re.sub('', question_dev[i]))
  filtered_context_dev.append(_punctuation_re.sub('', context_dev[i]))
  filtered_answer_dev.append(_punctuation_re.sub('', answer_dev[i]))
  fil_mc_answers_a_dev.append(mc_answers_a_dev[i])
  fil_mc_answers_b_dev.append(mc_answers_b_dev[i])
  fil_mc_answers_c_dev.append(mc_answers_c_dev[i])
  fil_mc_answers_d_dev.append(mc_answers_d_dev[i])

# Held-out split: over-long contexts are dropped rather than truncated, and an
# empty answer is replaced by the placeholder '8' instead of being skipped.
for i in range(len(question_val)):
  if (len(answer_val[i]) < max_answer_char_length
      and len(question_val[i]) < max_question_char_length
      and len(context_val[i]) < max_context_char_length):
    if len(answer_val[i]) == 0:
      answer_val[i]='8'
    filtered_question_test.append(_punctuation_re.sub('', question_val[i]))
    filtered_context_test.append(_punctuation_re.sub('', context_val[i]))
    filtered_answer_test.append(_punctuation_re.sub('', answer_val[i]))


# dev_examples keeps the multiple-choice options alongside each example;
# dev_dataset feeds only question/context/answer to the model.
dev_examples = tf.data.Dataset.from_tensor_slices((filtered_question_dev, filtered_context_dev, filtered_answer_dev, fil_mc_answers_a_dev, fil_mc_answers_b_dev, fil_mc_answers_c_dev, fil_mc_answers_d_dev))
dev_dataset = batch_generator(example_generator, np.column_stack((filtered_question_dev, filtered_context_dev, filtered_answer_dev)), training=False)
dev_dataset = dev_dataset.prefetch(tf.data.experimental.AUTOTUNE)


In [0]:
# ELMo layer loaded with the "default" signature and output key, used below to
# embed whole answer strings (the commented-out elmo_embed_layer definition in
# the training cell uses signature="tokens" / output_key="elmo" instead).
# NOTE(review): `hub` is not imported in the visible part of the notebook -
# presumably tensorflow_hub, imported elsewhere; confirm.
elmo_pred_embed_layer = hub.KerasLayer('https://tfhub.dev/google/elmo/3', signature="default", trainable=False, output_key="default")

In [0]:
# Generate predicted answers for every batch of dev_dataset and embed each
# decoded answer with elmo_pred_embed_layer.
predicted_answers = []
predicted_answer_embeddings = []

for (batch, (q, c, tar)) in enumerate(dev_dataset):
  #print(q['enc_q_input'])
  # Only the question/context embeddings are needed; predict() embeds the
  # partially generated answer itself at every decoding step.
  q_embed, c_embed, _ = get_elmo_embeddings(q['question_tokens'], q['enc_q_len'], c['context_tokens'], c['enc_c_len'], tar['answer_tokens'], tar['dec_len']) 
  results, att = predict(q_embed, c_embed, q['enc_q_input'], c['enc_c_input'], c['extended_enc_c_input'], c['max_c_oov_len'])
  for i in range(0, len(results)):

    predicted_answer_text = tokenizer_all.decode(results[i])
    # Remove the literal substring 'UNK' from the decoded text.
    predicted_answer_text = predicted_answer_text.replace('UNK', '')
    # The embedding layer is called on a batch of one string per answer.
    predicted_answer_embedding = elmo_pred_embed_layer(tf.expand_dims(predicted_answer_text, axis=0))

    predicted_answers.append(predicted_answer_text)
    predicted_answer_embeddings.append(predicted_answer_embedding)

### SparkNotes Quiz Scoring

In [0]:
def get_answer_choices_by_embedding(predicted_embedding, mc_answers_a_dev, mc_answers_b_dev, mc_answers_c_dev, mc_answers_d_dev, embed_layer):
  """Compare the predicted answer embedding with each multiple-choice option.

  Every option is wrapped into a batch of one with tf.expand_dims, embedded
  with embed_layer, and then compared to predicted_embedding through
  cosine_similarity.

  Args:
    predicted_embedding: embedding of the predicted answer; passed as the
      first argument to cosine_similarity.
    mc_answers_a_dev: answer option A.
    mc_answers_b_dev: answer option B.
    mc_answers_c_dev: answer option C.
    mc_answers_d_dev: answer option D.
    embed_layer: callable that embeds an expanded option.

  Returns:
    A list of the four cosine_similarity results, ordered A, B, C, D.
  """
  options = (mc_answers_a_dev, mc_answers_b_dev, mc_answers_c_dev, mc_answers_d_dev)

  # Embed all options first, then score them, in A-D order.
  option_embeddings = [embed_layer(tf.expand_dims(option, axis=0)) for option in options]
  return [cosine_similarity(predicted_embedding, option_embedding) for option_embedding in option_embeddings]

def get_answer_choices_by_bleu(predicted_answer, mc_answers_a_dev, mc_answers_b_dev, mc_answers_c_dev, mc_answers_d_dev):
  """Compute a unigram BLEU score of the prediction against each option.

  Each option is converted with .numpy().decode() and used as the single
  reference for nltk's sentence_bleu, with predicted_answer as the hypothesis
  and weights=[1].

  NOTE(review): the reference and hypothesis are passed to sentence_bleu
  without being split into tokens, so the score is presumably computed over
  characters rather than words -- confirm this is intended.

  Args:
    predicted_answer: the predicted answer, used as the BLEU hypothesis.
    mc_answers_a_dev: answer option A; must support .numpy().decode().
    mc_answers_b_dev: answer option B; must support .numpy().decode().
    mc_answers_c_dev: answer option C; must support .numpy().decode().
    mc_answers_d_dev: answer option D; must support .numpy().decode().

  Returns:
    A list of four BLEU scores, ordered A, B, C, D.
  """
  options = (mc_answers_a_dev, mc_answers_b_dev, mc_answers_c_dev, mc_answers_d_dev)
  return [
      nltk.translate.bleu_score.sentence_bleu([option.numpy().decode()], predicted_answer, weights=([1]))
      for option in options
  ]

In [0]:

def evaluate_sparknotes_quiz(predicted_answers, predicted_answer_embeddings, sp_quiz_questions, elmo_pred_embed_layer):
  """Score predicted answers against multiple-choice quiz questions.

  For each question the prediction is matched to one of the four options in
  two ways: by embedding similarity (get_answer_choices_by_embedding) and by
  unigram BLEU (get_answer_choices_by_bleu). A match counts as correct when
  it picks the option that equals the reference answer. BLEU-1 and BLEU-4 of
  the prediction against the reference answer are collected as well.

  NOTE(review): the reference and prediction are handed to sentence_bleu
  without being split into tokens, so these BLEU scores are presumably
  character-level -- confirm this is intended.

  Args:
    predicted_answers: predicted answers, aligned by position with
      sp_quiz_questions.
    predicted_answer_embeddings: embeddings of those predictions, in the same
      order. Scoring stops as soon as these run out.
    sp_quiz_questions: iterable of 7-tuples (question, context, answer,
      option_a, option_b, option_c, option_d). The answer and the options
      must support .numpy().decode().
    elmo_pred_embed_layer: layer used to embed the options.

  Returns:
    A tuple (quiz_embed_accuracy, quiz_bleu1_accuracy, bleu1_scores,
    bleu4_scores). Both accuracies are 0.0 when no question was scored.
  """
  num_scored = 0
  num_embed_correct = 0
  num_bleu_correct = 0

  bleu1_scores = []
  bleu4_scores = []

  for _question, _context, answer_dev, mc_answers_a_dev, mc_answers_b_dev, mc_answers_c_dev, mc_answers_d_dev in sp_quiz_questions:
    if num_scored >= len(predicted_answer_embeddings):
      break

    # Locate the option holding the reference answer. Option A (index 0) is
    # the default, so it is also what is assumed when B, C and D all differ.
    correct_index = 0
    for option_index, option in enumerate((mc_answers_b_dev, mc_answers_c_dev, mc_answers_d_dev), start=1):
      if option == answer_dev:
        correct_index = option_index
        break

    embed_sims = get_answer_choices_by_embedding(predicted_answer_embeddings[num_scored], mc_answers_a_dev, mc_answers_b_dev, mc_answers_c_dev, mc_answers_d_dev, elmo_pred_embed_layer)
    bleu1_sims = get_answer_choices_by_bleu(predicted_answers[num_scored], mc_answers_a_dev, mc_answers_b_dev, mc_answers_c_dev, mc_answers_d_dev)

    reference = answer_dev.numpy().decode()
    bleu1_scores.append(nltk.translate.bleu_score.sentence_bleu([reference], predicted_answers[num_scored], weights=[1]))
    bleu4_scores.append(nltk.translate.bleu_score.sentence_bleu([reference], predicted_answers[num_scored], weights=(.25, .25, .25, .25)))

    if np.argmax(embed_sims) == correct_index:
      num_embed_correct += 1
    if np.argmax(bleu1_sims) == correct_index:
      num_bleu_correct += 1

    num_scored += 1

  # Nothing scored (no questions or no predictions): report zero accuracy
  # instead of dividing by zero.
  if num_scored == 0:
    return 0.0, 0.0, bleu1_scores, bleu4_scores

  quiz_embed_accuracy = num_embed_correct / num_scored
  quiz_bleu1_accuracy = num_bleu_correct / num_scored

  return quiz_embed_accuracy, quiz_bleu1_accuracy, bleu1_scores, bleu4_scores

In [0]:
# Score the predictions generated above against the dev quiz questions.
(
    quiz_embed_accuracy,
    quiz_bleu1_accuracy,
    bleu1_scores,
    bleu4_scores,
) = evaluate_sparknotes_quiz(
    predicted_answers,
    predicted_answer_embeddings,
    dev_examples,
    elmo_pred_embed_layer,
)

In [0]:
# Report both quiz accuracies and the mean BLEU-1 / BLEU-4 scores, one per line.
print(
    f'quiz accuracy: {quiz_embed_accuracy}',
    f'quiz bleu1 accuracy: {quiz_bleu1_accuracy}',
    f'bleu1: {np.mean(bleu1_scores)}',
    f'bleu4: {np.mean(bleu4_scores)}',
    sep='\n',
)

## BiDAF - Baseline

In [0]:
from allennlp.predictors import Predictor
# Load the BiDAF predictor from its published AllenNLP model archive.
predictor = Predictor.from_path(
    "https://allennlp.s3.amazonaws.com/models/bidaf-model-2017.09.15-charpad.tar.gz"
)

In [0]:
# Collect the BiDAF baseline's answers. Both lists are reset here, so any
# predictions stored under these names by an earlier cell are discarded.
predicted_answers = []
predicted_answer_embeddings = []

# NOTE(review): the loop targets reuse the names of the module-level
# filtered_*_dev / fil_mc_answers_*_dev collections that dev_examples was
# built from. After this loop those names refer to the last example's values
# instead of the full collections, so re-running the dev_examples cell would
# misbehave. Consider using distinct loop variable names.
for filtered_question_dev, filtered_context_dev, filtered_answer_dev, fil_mc_answers_a_dev, fil_mc_answers_b_dev, fil_mc_answers_c_dev, fil_mc_answers_d_dev in dev_examples:

  # The predictor is given the decoded question and context text.
  results = predictor.predict(filtered_question_dev.numpy().decode(), passage=filtered_context_dev.numpy().decode())
  # 'best_span_str' is the entry of the prediction result used as the answer text.
  predicted_answer_text = results['best_span_str']
  # expand_dims wraps the single string into a batch of one for the ELMo layer.
  predicted_answer_embedding = elmo_pred_embed_layer(tf.expand_dims(predicted_answer_text, axis=0))

  predicted_answers.append(predicted_answer_text)
  predicted_answer_embeddings.append(predicted_answer_embedding)