a very good explanation of the BIDAF architecture : 

https://towardsdatascience.com/the-definitive-guide-to-bi-directional-attention-flow-d0e96e9e666b

character embedding with CNN :

https://towardsdatascience.com/besides-word-embedding-why-you-need-to-know-character-embedding-6096a34a3b10
https://github.com/makcedward/nlp/blob/master/sample/nlp-character_embedding.ipynb

To run this notebook you should have run the bidaf_preprocessing one.  
You should as well modify all paths

In [1]:
import tensorflow as tf
import pandas as pd
import os
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, TimeDistributed, Layer, Softmax, Concatenate, Dropout, Conv1D, GlobalMaxPooling1D
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tqdm import tqdm
import numpy as np
import pickle
import nltk
import json
nltk.download('punkt')
from nltk import word_tokenize
import gensim.downloader as gloader
import math

import sys
sys.path.append(os.path.abspath('../'))
from utils.datasets import SQUAD_dataset

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\coren\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\coren\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
path_word_tokenizer = os.path.abspath('../utils/tokenizers/word_tokenizer.pkl')
with open(path_word_tokenizer, 'rb') as handle:
  tokenizer = pickle.load(handle)

path_char_tokenizer = os.path.abspath('../utils/tokenizers/char_tokenizer.pkl')
with open(path_char_tokenizer, 'rb') as char_handle:
  char_tokenizer = pickle.load(char_handle)

In [3]:
train_dataset = SQUAD_dataset.from_file('../utils/datasets/train_dataset.pkl')
valid_dataset = SQUAD_dataset.from_file('../utils/datasets/valid_dataset.pkl')

In [4]:
train_dataset

SQUAD_dataset : questions : (10, 25), contexts : (10, 400), char_questions : (10, 25, 15), char_contexts : (10, 400, 15), index : (10, 1)

In [5]:
print(len(train_dataset))
len(valid_dataset)

6961


1742

In [6]:
# globals variables
QUESTION_MAXLEN = 25
CONTEXT_MAXLEN = 400
EMBEDDING_SIZE = 300 # we can try different embedding size (50, 100, 300) or even try word2vec or fastext instead of glove
WORD_VOCAB_LEN = len(tokenizer.word_index) + 1 # +1 for the pad token
BATCH_SIZE = 10
EPOCHS = 10
CHAR_VOCAB_LEN = char_tokenizer.num_words # PAD token and UNK token included
WORD_MAXLEN = 15
LR = 0.0005
N_FILTERS = EMBEDDING_SIZE
FILTER_SIZE = 3
CHAR_EMBEDDING_SIZE = 8

In [7]:
def download_glove_embedding(embedding_dimension = 50):

  """
  download glove model
  """

  download_path = 'glove-wiki-gigaword-{}'.format(embedding_dimension)
  try:
    emb_model = gloader.load(download_path)
  except ValueError as e:
      print('Glove: 50, 100, 200, 300')
      raise e
  return emb_model

def build_embedding_matrix(tokenizer, path_embedding_matrix, glove_model = None):

  """
  build the word embedding matrix based on the glove vocabulary
  """

  if os.path.exists(path_embedding_matrix):

    embedding_matrix = np.load(path_embedding_matrix)
    return embedding_matrix

  else:

    if glove_model == None:
      glove_model = download_glove_embedding(EMBEDDING_SIZE)

    embedding_matrix = np.zeros((WORD_VOCAB_LEN, EMBEDDING_SIZE))

    for w,i in tokenizer.word_index.items():

      if w in glove_model.vocab:
        embedding_matrix[i,:] = glove_model.get_vector(w)
      else:
        embedding_matrix[i,:] = np.random.randn(1, EMBEDDING_SIZE)

    del glove_model # we don't need it anymore

    np.save(path_embedding_matrix, embedding_matrix)

    return embedding_matrix

def build_char_embedding_matrix(char_tokenizer):

  """
  build the character embedding matrix
  """

  char_embedding_matrix = np.zeros((CHAR_VOCAB_LEN,CHAR_VOCAB_LEN - 1))  # we have 199 characters that we have to one hot so each character has 199 dimensions

  for char, i in char_tokenizer.word_index.items():
    if i <= 199:
      char_embedding_matrix[i][i - 1] = 1
    else:
      break
  return char_embedding_matrix

We build the embedding matrix.  
We can also initialize a char_embedding_matrix, or we can let the model learn these embeddings.

In [8]:
path_embedding_matrix = os.path.abspath('../utils/data/embedding.npy')
embedding_matrix = build_embedding_matrix(tokenizer, path_embedding_matrix)

# instead of one hot encode char tokens maybe we can use glove or randomly fill the matrix
# these embeddings should be trainable
# https://github.com/minimaxir/char-embeddings
#char_embedding_matrix = build_char_embedding_matrix(char_tokenizer)

Then we define all layers of our model

In [9]:
# utils/layers
class WordEmbedding(Layer):
    
    def __init__(self, input_dim, output_dim, input_len, embedding_matrix, trainable = False, mask_zero = True, **kwargs):
        
        super(WordEmbedding, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.input_len = input_len
        self.embedding_matrix = embedding_matrix
        self.trainable = trainable
        self.mask_zero = mask_zero

        self.word_embed = Embedding(
            input_dim = self.input_dim,
            output_dim = self.output_dim,
            weights = [self.embedding_matrix],
            trainable = self.trainable,
            input_length = self.input_len,
            mask_zero = self.mask_zero,
        )

    def build(self, input_shape):
      self.built = True

    def call(self, inputs):
        input = inputs
        return self.word_embed(input) 
    
    # inplement this method in order to get a serializable layer as part of a Functional model
    def get_config(self):
        # the base Layer class takes some keywords arguments like name and dtype, it is good to include 
        # them in the config (so we call the parent method and use the update method)
        config = super().get_config().copy()
        config.update({
            'input_dim': self.input_dim,
            'output_dim': self.output_dim,
            'input_len': self.input_len, 
            'trainable': self.trainable,
            'mask_zero': self.mask_zero
        })
        return config

    @classmethod
    def from_config(cls, config):
      return cls(**config)

In [10]:
# utils/layers
class CharEmbedding(Layer):
    
    def __init__(self, input_dim, output_dim, input_len, **kwargs):
        
        super(CharEmbedding, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.input_len = input_len
        self.char_embed = Embedding(
            input_dim = self.input_dim, 
            output_dim = self.output_dim,  
            input_length = self.input_len
        )
        # This wrapper allows to apply a layer to every temporal slice of an input.
        # so we apply the same Embedding to every timestep (index 1) independently
        self.timed = TimeDistributed(self.char_embed)
        

    def build(self, input_shape):
        self.built = True

    def call(self, inputs):
        return self.timed(inputs)
            
    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'input_dim': self.input_dim,
            'output_dim': self.output_dim,
            'input_len': self.input_len, 
        })
        return config

    @classmethod
    def from_config(cls, config):
      return cls(**config)

In [11]:
# utils/layers
class CharCNN(Layer):
    
    def __init__(self, n_filters, filter_width, **kwargs):
        
        super(CharCNN, self).__init__(**kwargs)
        self.n_filters = n_filters
        self.filter_width = filter_width
        self.conv = Conv1D(self.n_filters, self.filter_width)
        self.pool = GlobalMaxPooling1D()
        self.timed = TimeDistributed(self.pool)
        ## add ReLU activation before max-pooling ?
    def build(self, input_shape):
        self.built = True

    def call(self, inputs):
        return self.timed(self.conv(inputs))
    
    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'n_filters': self.n_filters,
            'filter_width': self.filter_width, 
        })
        return config

    @classmethod
    def from_config(cls, config):
      return cls(**config)

In [12]:
# utils/layers
class HighwayNetwork(Layer):
    
    def __init__(self, hidden_size, **kwargs):
        
        super(HighwayNetwork, self).__init__(**kwargs)
        self.hidden_size = hidden_size
        self.normal = Dense(self.hidden_size, activation = 'relu') 
        self.transform_gate = Dense(self.hidden_size, activation = 'sigmoid')
        
    def build(self, input_shape):
        self.built = True

    def call(self, inputs):        
        
        n = self.normal(inputs)
        g = self.transform_gate(inputs)
        x = g*n + (1-g)*inputs 
        return x

    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'hidden_size': self.hidden_size, 
        })
        return config

    @classmethod
    def from_config(cls, config):
      return cls(**config)

In [13]:
# utils/layers
class ContextualEmbedding(Layer):
    
    def __init__(self, output_dim, **kwargs):
        
        super(ContextualEmbedding, self).__init__(**kwargs)
        self.output_dim = output_dim
        self.contextual = Bidirectional(LSTM(self.output_dim, return_sequences = True, dropout = 0.2))

    def build(self, input_shape):
        self.built = True 

    def call(self, inputs):
        return self.contextual(inputs)
    
    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'output_dim': self.output_dim,
        })
        return config
    
    @classmethod
    def from_config(cls, config):
      return cls(**config)

In [14]:
# utils/layers
class Modelling(Layer):
    
    def __init__(self, output_dim, **kwargs):
        
        super(Modelling, self).__init__(**kwargs)
        self.output_dim = output_dim
        self.modelling1 = Bidirectional(LSTM(self.output_dim, return_sequences = True, dropout = 0.2))
        self.modelling2 = Bidirectional(LSTM(self.output_dim, return_sequences = True, dropout = 0.2))
        
    def build(self, input_shape):
        self.built = True

    def call(self, inputs):
        return self.modelling2(self.modelling1(inputs))
    
    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'output_dim': self.output_dim,
        })
        return config

    @classmethod
    def from_config(cls, config):
      return cls(**config)

In [15]:
# utils/layers
class Start(Layer):
    
    def __init__(self, **kwargs):
        
        super(Start, self).__init__(**kwargs)
        self.dense = Dense(1, activation = 'linear', use_bias = False)
        self.dropout = Dropout(0.2)
        
    def build(self, input_shape):
        self.built = True

    def call(self, inputs):
        
        GM = inputs
        start = self.dense(GM)
        start = self.dropout(start)
        p1 = tf.nn.softmax(tf.squeeze(start, axis = 2))
        return p1

    def get_config(self):
      
      config = super().get_config().copy()
      return config

    @classmethod
    def from_config(cls, config):
      return cls(**config)

In [16]:
# utils/layers
class ModellingEnd(Layer):
    
    def __init__(self, output_dim, **kwargs):
        
        super(ModellingEnd, self).__init__(**kwargs)
        self.output_dim = output_dim
        self.end = Bidirectional(LSTM(self.output_dim, return_sequences = True, dropout = 0.2))
        
    def build(self, input_shape):
        self.built = True

    def call(self, inputs):
        
        G, M = inputs
        M2 = self.end(M)
        GM2 = tf.concat([G, M2], axis = 2)
        return GM2
    
    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'output_dim': self.output_dim,
        })
        return config

    @classmethod
    def from_config(cls, config):
      return cls(**config)

In [17]:
# utils/layers
class End(Layer):
    
    def __init__(self, **kwargs):
        
        super(End, self).__init__(**kwargs)
        self.dense = Dense(1, activation = 'linear', use_bias = False)
        self.dropout = Dropout(0.2)
        
    def build(self, input_shape):
        self.built = True

    def call(self, inputs):
        
        GM2 = inputs
        end = self.dense(GM2)
        end = self.dropout(end)
        p2 = tf.nn.softmax(tf.squeeze(end, axis = 2))
        
        return p2


    def get_config(self):

      config = super().get_config().copy()

      return config
    
    @classmethod
    def from_config(cls, config):
      return cls(**config)

In [18]:
# utils/models
class BIDAF(Model):

  """
  the BIDAF model
  """

  def __init__(self, 
               question_maxlen, 
               context_maxlen, 
               word_vocab_len, 
               embedding_size, 
               embedding_matrix, 
               char_vocab_len,
               word_maxlen, 
               n_filters, 
               filter_size, 
               char_embedding_size,
               word_tokenizer_path,
               char_tokenizer_path,
               **kwargs):
    
    
    super(BIDAF, self).__init__(name = 'BIDAF', **kwargs)

    self.question_maxlen = question_maxlen
    self.context_maxlen = context_maxlen
    self.word_vocab_len = word_vocab_len
    self.embedding_size = embedding_size
    self.embedding_matrix = embedding_matrix
    self.char_vocab_len = char_vocab_len
    self.char_embedding_size = char_embedding_size
    self.word_maxlen = word_maxlen
    self.n_filters = n_filters
    self.filter_size = filter_size

    with open(word_tokenizer_path, 'rb') as handle:
      self.word_tokenizer = pickle.load(handle)

    with open(char_tokenizer_path, 'rb') as handle:
      self.char_tokenizer = pickle.load(handle)

    self.similarity_weights = Dense(1, use_bias = False)

    # layers
    self.word_embedding = WordEmbedding(self.word_vocab_len, self.embedding_size, self.question_maxlen, self.embedding_matrix)
    self.char_embedding = CharEmbedding(self.char_vocab_len, self.char_embedding_size, self.word_maxlen)
    self.cnn = CharCNN(self.n_filters, self.filter_size)
    self.highway = HighwayNetwork(hidden_size = self.embedding_size + self.n_filters)
    self.contextual = ContextualEmbedding(self.embedding_size)
    self.modelling = Modelling(self.embedding_size)
    self.modelling_end = ModellingEnd(self.embedding_size)
    self.output_start = Start()
    self.ouput_end = End()

  def _get_tokens(self):

    self.question = self.word_tokenizer.texts_to_sequences([self._question])
    self.context = self.word_tokenizer.texts_to_sequences([self._context])
    self.context_ids = self.context

  def _get_padded_sequences(self):

    self.question = tf.keras.preprocessing.sequence.pad_sequences(self.question, maxlen = self.question_maxlen, padding = 'post', truncating = 'post')
    self.context = tf.keras.preprocessing.sequence.pad_sequences(self.context, maxlen = self.context_maxlen, padding = 'post', truncating = 'post')

  def make_prediction(self, question, context):

    self._question = word_tokenize(question)
    self._context = word_tokenize(context)

    self._get_tokens()
    self._get_padded_sequences()

    self.__get_tokens()
    self.__get_padded_sequences()

    start, end = self.predict([
                      self.question,
                      self.context,
                      self.question_char,
                      self.context_char
                ])
    
    start = start.argmax()
    end = end.argmax() + 1

    if start > end:
      start = end
      end = start

    answer = ''

    for i in range(start, end):
      answer += self.word_tokenizer.index_word[self.context_ids[0][i]] + ' '
    return answer.strip()

  def multi_predictions(self, datasets, path):
    predictions = {}

    for dataset in datasets:

      for batch in dataset:

        if len(batch) == 3:
          sequences = batch[0]
          id = batch[2][0].tolist()
        else:
          sequences = batch[0]
          id = batch[1][0].tolist()          

        qw, cw, qc, cc = sequences

        start, end = self.predict([qw, cw, qc, cc])

        start = start.argmax(axis = 1)
        end = end.argmax(axis = 1)

        answers = []

        for idx, (s, e) in enumerate(zip(start, end)):
          if s > e:
            s = e
            e = s

          answer = ''
          for i in range(s,e):
            answer += self.word_tokenizer.index_word[cw[idx][i]] + ' '
          answers.append(answer.strip())
        
        predictions.update({i.strip(): a for i,a in zip(id, answers)})
    
    with open(path, 'w') as handle:
      json.dump(predictions, handle)

    print(f' the file containing the predictions has been created in {path}')
    
  def __get_tokens(self):

    self.question_char = self.char_tokenizer.texts_to_sequences(self._question)
    self.context_char = self.char_tokenizer.texts_to_sequences(self._context)

  def __get_padded_sequences(self):

    # pad question at the character level
    v = tf.keras.preprocessing.sequence.pad_sequences(self.question_char, padding = 'post', truncating = 'post', maxlen = self.word_maxlen)
    to_add = self.question_maxlen - v.shape[0]
    add = np.zeros((to_add, self.word_maxlen))
    arr = np.vstack([v,add])
    self.question_char = arr

    # pad context at the character level
    v = tf.keras.preprocessing.sequence.pad_sequences(self.context_char, padding = 'post', truncating = 'post', maxlen = self.word_maxlen)
    to_add = self.context_maxlen - v.shape[0]
    add = np.zeros((to_add, self.word_maxlen))
    arr = np.vstack([v,add])
    self.context_char = arr

    self.question_char = tf.expand_dims(self.question_char, axis = 0)
    self.context_char = tf.expand_dims(self.context_char, axis = 0)


  def call(self, inputs, training = True):
    qw, cw, qc, cc = inputs  # (bs, q_len), (bs, ctx_len), (bs, q_len, w_len), (bs, ctx_len, w_len)

    # embedding always non-trainable
    qw = self.word_embedding(qw) # (bs, q_len, emb)
    cw = self.word_embedding(cw) # (bs, ctx_len, emb)

    qc = self.char_embedding(qc) # (bs, q_len, w_len, char_emb)
    cc = self.char_embedding(cc) # (bs, ctx_len, w_len, char_emb)

    qc = self.cnn(qc) # (bs, q_len, n_filters)
    cc = self.cnn(cc) # (bs, ctx_len, n_filters)

    H = tf.concat([cw, cc], axis = 2) # (bs, ctx_len, emb + n_filters)
    U = tf.concat([qw, qc], axis = 2) # (bs, q_len, emb + n_filters)

    # highway
    H = self.highway(H) # (bs, ctx_len, emb + n_filters)
    U = self.highway(U) # (bs, q_len, emb + n_filters)

    # contextual embedding
    H = self.contextual(H) # (bs, ctx_len, emb + n_filters)
    U = self.contextual(U) # (bs, q_len, emb + n_filters)

    # similarity matrix
    expand_h = tf.concat([[1, 1], [tf.shape(U)[1]], [1]], axis = 0) # [1, 1, q_len, 1]
    expand_u = tf.concat([[1], [tf.shape(H)[1]], [1, 1]], axis = 0) # [1, ctx_len, 1, 1]

    h = tf.tile(tf.expand_dims(H, axis = 2), expand_h) # (bs, ctx_len, q_len, emb + n_filters)
    u = tf.tile(tf.expand_dims(U, axis = 1), expand_u) # (bs, ctx_len, q_len, emb + n_filters)
    h_u = h * u # (bs, ctx_len, q_len, emb + n_filters)

    alpha = tf.concat([h, u, h_u], axis = -1) # (bs, ctx_len, q_len, 3 * (emb + n_filters))
    
    similarity_matrix = self.similarity_weights(alpha) # (bs, ctx_len, q_len, 1)
    similarity_matrix = tf.squeeze(similarity_matrix, 3) # (bs, ctx_len, q_len)

    # context to query attention
    attention_weights = tf.nn.softmax(similarity_matrix, axis = -1) # (bs, ctx_len, q_len)
    C2Q = K.batch_dot(attention_weights, U) # (bs, ctx_len, emb + n_filters)

    # query to context attention
    attention_weights = tf.nn.softmax(tf.math.reduce_max(similarity_matrix, axis = 2), axis = -1) # (bs, ctx_len)
    attention_weights = tf.expand_dims(attention_weights, axis = 1) # (bs, 1, ctx_len)
    Q2C = K.batch_dot(attention_weights, H) # (bs, 1, emb + n_filters)
    Q2C = tf.tile(Q2C, [1, tf.shape(H)[1], 1]) # (bs, ctx_len, emb + n_filters)

    # query aware representation
    G = tf.concat([H, C2Q, (H * C2Q), (H * Q2C)], axis = 2) # (bs, ctx_len, 4 * (emb + n_filters) )

    # modelling
    M = self.modelling(G) # (bs, ctx_len, emb + n_filters)

    # output
    M2 = self.modelling_end([G,M]) # (bs, ctx_len, emb + n_filters)

    # start prediction
    start = self.output_start(tf.concat([G, M], axis = 2)) # (bs, ctx_len)

    # end prediction
    end = self.ouput_end(M2) # (bs, ctx_len)

    return start, end

In [19]:
bidaf_model = BIDAF(
    QUESTION_MAXLEN,
    CONTEXT_MAXLEN,
    WORD_VOCAB_LEN,
    EMBEDDING_SIZE,
    embedding_matrix,
    CHAR_VOCAB_LEN,
    WORD_MAXLEN,
    N_FILTERS,
    FILTER_SIZE,
    CHAR_EMBEDDING_SIZE,
    path_word_tokenizer,
    path_char_tokenizer
)

In [20]:
loss_function = tf.keras.losses.CategoricalCrossentropy(reduction = 'auto')
optimizer = tf.keras.optimizers.Nadam(learning_rate = LR)

In [21]:
# https://udai.gitbook.io/practical-ml/nn/training-and-debugging-of-nn <- useful blog about machine learning / deep learning
# steps to be performed in each training step
@tf.function
def train_step(model, input_vector, output_vector, loss_fn):
    with tf.GradientTape() as tape:
        # forward propagation
        output_predicted = model(input_vector, training = True)
        # loss
        loss_start = loss_function(output_vector[0], output_predicted[0])
        loss_end = loss_function(output_vector[1], output_predicted[1])
        loss_final = loss_start + loss_end
    # getting gradients
    gradients = tape.gradient(loss_final, model.trainable_variables)
    # applying gradients
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss_start, loss_end, output_predicted, gradients

In [22]:
# https://udai.gitbook.io/practical-ml/nn/training-and-debugging-of-nn
# steps to be performed in each validation step
@tf.function
def val_step(model, input_vector, output_vector, loss_fn):
    # getting output of validation data
    output_predicted = model(input_vector, training = False)
    # loss calculation
    loss_start = loss_function(output_vector[0], output_predicted[0])
    loss_end = loss_function(output_vector[1], output_predicted[1])
    return loss_start, loss_end, output_predicted

In [23]:
def f1_score(y_true, y_pred):    # taken from old keras source code
    
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    
    f1_val = 2 * (precision * recall) / (precision + recall + K.epsilon())
    
    return f1_val

In [24]:
# defining functions to compute the mean loss for each epoch
train_start_loss = tf.keras.metrics.Mean(name = 'train_start_loss')
train_end_loss = tf.keras.metrics.Mean(name = 'train_end_loss')
val_start_loss = tf.keras.metrics.Mean(name = 'val_start_loss')
val_end_loss = tf.keras.metrics.Mean(name = 'val_end_loss')
train_start_f1 = tf.keras.metrics.Mean(name = 'train_start_f1')
train_end_f1 = tf.keras.metrics.Mean(name = 'train_end_f1')
val_start_f1 = tf.keras.metrics.Mean(name = 'val_start_f1')
val_end_f1 = tf.keras.metrics.Mean(name = 'val_end_f1')
train_start_acc = tf.keras.metrics.CategoricalAccuracy(name = 'train_start_acc')
train_end_acc = tf.keras.metrics.CategoricalAccuracy(name = 'train_end_acc')
val_start_acc = tf.keras.metrics.CategoricalAccuracy(name = 'val_start_acc')
val_end_acc = tf.keras.metrics.CategoricalAccuracy(name = 'val_end_acc')

In [25]:
best_loss = 100 # we initialize a loss value for model checkpoint

In [20]:
# don't run the next cell if your model is already trained
# don't run this cell if your model need to be trained but run the next one
bidaf_model.load_weights('../utils/models/weights/bidaf_weights')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x19f743e62e0>

In [None]:
for epoch in range(EPOCHS):
    
    # resetting the states of the loss and metrics
    train_start_loss.reset_states()
    train_end_loss.reset_states()
    val_start_loss.reset_states()
    val_end_loss.reset_states()
    train_start_f1.reset_states()
    train_end_f1.reset_states()
    val_start_f1.reset_states()
    val_end_f1.reset_states()
    train_start_acc.reset_states()
    train_end_acc.reset_states()
    val_start_acc.reset_states()
    val_end_acc.reset_states()
    
    # iterating over train data batch by batch
    for text_seq, label_seq, _ in tqdm(iterable = train_dataset, total = len(train_dataset)):
        # train step
        loss_start_, loss_end_, pred_out, gradients = train_step(bidaf_model, text_seq, label_seq, loss_function)
        # adding loss to train loss
        train_start_loss(loss_start_)
        train_end_loss(loss_end_)
        
        # calculating f1 for batch
        f1_start = f1_score(label_seq[0], pred_out[0])
        f1_end = f1_score(label_seq[1], pred_out[1])
        train_start_f1(f1_start)
        train_end_f1(f1_end)
        train_start_acc(label_seq[0], pred_out[0])
        train_end_acc(label_seq[1], pred_out[1])
    
    # validation data
    for text_seq_val, label_seq_val, _ in valid_dataset:
        # getting val output
        loss_val_start, loss_val_end, pred_out_val = val_step(bidaf_model, text_seq_val, label_seq_val, loss_function)
        
        val_start_loss(loss_val_start)
        val_end_loss(loss_val_end)
        
        # calculating metric
        f1_start_val = f1_score(label_seq_val[0], pred_out_val[0])
        f1_end_val = f1_score(label_seq_val[1], pred_out_val[1])
        val_start_f1(f1_start_val)
        val_end_f1(f1_end_val)
        val_start_acc(label_seq_val[0], pred_out_val[0])
        val_end_acc(label_seq_val[1], pred_out_val[1])
    
   
    # printing
    template = '''Epoch {}, Train Start Loss: {:0.6f}, Train Start Acc : {:0.5f}, Start F1 Score: {:0.5f}, Train End Loss: {:0.6f}, Train End Acc : {:0.5f}, End F1 Score: {:0.5f},
    Val Start Loss: {:0.6f}, Val Start Acc : {:0.5f}, Val Start F1 Score: {:0.5f}, Val End Loss: {:0.6f}, Val End Acc : {:0.5f}, Val End F1 Score: {:0.5f}'''

    print(template.format(epoch + 1, train_start_loss.result(), train_start_acc.result(), train_start_f1.result(), 
                          train_end_loss.result(), train_end_acc.result(), train_end_f1.result(),
                          val_start_loss.result(), val_start_acc.result(), val_start_f1.result(),
                          val_end_loss.result(), val_end_acc.result(), val_end_f1.result()))


    if (val_start_loss.result() + val_end_loss.result()) < best_loss:
      print('Saving weights...')
      bidaf_model.save_weights('../utils/models/weights/bidaf_weights')
      print('\n Done !')
      best_loss = (val_start_loss.result() + val_end_loss.result())

100%|██████████| 6961/6961 [44:21<00:00,  2.62it/s]


Epoch 1, Train Start Loss: 3.715324, Train Start Acc : 0.21323, Start F1 Score: 0.10070, Train End Loss: 3.507812, Train End Acc : 0.23734, End F1 Score: 0.11095,
    Val Start Loss: 2.235232, Val Start Acc : 0.43508, Val Start F1 Score: 0.28522, Val End Loss: 2.044395, Val End Acc : 0.46603, Val End F1 Score: 0.31757
Saving weights...


  0%|          | 0/6961 [00:00<?, ?it/s]


 Done !


100%|██████████| 6961/6961 [43:44<00:00,  2.65it/s]


Epoch 2, Train Start Loss: 2.681711, Train Start Acc : 0.42797, Start F1 Score: 0.37533, Train End Loss: 2.526541, Train End Acc : 0.45785, End F1 Score: 0.41619,
    Val Start Loss: 1.780379, Val Start Acc : 0.53696, Val Start F1 Score: 0.44636, Val End Loss: 1.579908, Val End Acc : 0.57331, Val End F1 Score: 0.52164
Saving weights...


  0%|          | 0/6961 [00:00<?, ?it/s]


 Done !


100%|██████████| 6961/6961 [43:42<00:00,  2.65it/s]


Epoch 3, Train Start Loss: 2.409225, Train Start Acc : 0.48303, Start F1 Score: 0.45983, Train End Loss: 2.260953, Train End Acc : 0.51763, End F1 Score: 0.50738,
    Val Start Loss: 1.675886, Val Start Acc : 0.55200, Val Start F1 Score: 0.49601, Val End Loss: 1.510953, Val End Acc : 0.58657, Val End F1 Score: 0.55100
Saving weights...


  0%|          | 0/6961 [00:00<?, ?it/s]


 Done !


100%|██████████| 6961/6961 [43:38<00:00,  2.66it/s]


Epoch 4, Train Start Loss: 2.222225, Train Start Acc : 0.52494, Start F1 Score: 0.51462, Train End Loss: 2.092869, Train End Acc : 0.55336, End F1 Score: 0.55884,
    Val Start Loss: 1.646435, Val Start Acc : 0.55774, Val Start F1 Score: 0.52987, Val End Loss: 1.514827, Val End Acc : 0.59433, Val End F1 Score: 0.58174
Saving weights...


  0%|          | 0/6961 [00:00<?, ?it/s]


 Done !


100%|██████████| 6961/6961 [43:52<00:00,  2.64it/s]
  0%|          | 0/6961 [00:00<?, ?it/s]

Epoch 5, Train Start Loss: 2.065901, Train Start Acc : 0.55566, Start F1 Score: 0.55514, Train End Loss: 1.943553, Train End Acc : 0.58259, End F1 Score: 0.59834,
    Val Start Loss: 1.676296, Val Start Acc : 0.55545, Val Start F1 Score: 0.53732, Val End Loss: 1.542550, Val End Acc : 0.58778, Val End F1 Score: 0.57692


100%|██████████| 6961/6961 [43:41<00:00,  2.66it/s]
  0%|          | 0/6961 [00:00<?, ?it/s]

Epoch 6, Train Start Loss: 1.927224, Train Start Acc : 0.58334, Start F1 Score: 0.59266, Train End Loss: 1.814614, Train End Acc : 0.61016, End F1 Score: 0.63235,
    Val Start Loss: 1.717615, Val Start Acc : 0.55085, Val Start F1 Score: 0.54641, Val End Loss: 1.594208, Val End Acc : 0.59122, Val End F1 Score: 0.58632


100%|██████████| 6961/6961 [43:36<00:00,  2.66it/s]
  0%|          | 0/6961 [00:00<?, ?it/s]

Epoch 7, Train Start Loss: 1.831880, Train Start Acc : 0.60262, Start F1 Score: 0.61802, Train End Loss: 1.711476, Train End Acc : 0.63170, End F1 Score: 0.65915,
    Val Start Loss: 1.758022, Val Start Acc : 0.55453, Val Start F1 Score: 0.55285, Val End Loss: 1.607760, Val End Acc : 0.58761, Val End F1 Score: 0.58862


 16%|█▋        | 1146/6961 [07:11<36:28,  2.66it/s]

In [28]:
def print_predictions(batch):

  """
  utility function to visualize some predictions
  """

  idx = np.random.randint(BATCH_SIZE)
  samples = valid_dataset[batch]

  sequences, labels, _ = samples

  qw = sequences[0][idx]
  cw = sequences[1][idx]
  qc = sequences[2][idx]
  cc = sequences[3][idx]

  real_start = labels[0][idx]
  real_end = labels[1][idx]

  """
  Function that takes record numbers as input and predicts the answer for that record
  """

  print('Question:')
  for i in qw:
    if i == 0:
      break
    else:
      print(tokenizer.index_word[i], end = ' ')

  print('\nContext:')
  for i in cw:
    if i == 0:
      break
    else:
      print(tokenizer.index_word[i], end = ' ')
      
  print('\nPredicted Answer:')
  _qw = qw.reshape(1, qw.shape[0])
  _cw = cw.reshape(1, cw.shape[0])
  _qc = np.expand_dims(qc, axis = 0)
  _cc = np.expand_dims(cc, axis = 0)
  start, end = bidaf_model.predict((_qw, _cw, _qc, _cc))
  start = start.argmax()
  end = end.argmax() + 1

  if start > end:
    start = end
    end = start

  for i in range(start, end ):
    print(tokenizer.index_word[cw[i]], end = ' ')
  print('\n')

In [29]:
data_points = [8,15,52,152,332]
for i in data_points:
  print_predictions(i)

Question:
in how many scenarios will sydney remain higher than melbourne in population beyond 2056 ? 
Context:
in recent years , melton , wyndham and casey , part of the melbourne statistical division , have recorded the highest growth rate of all local government areas in australia . melbourne could overtake sydney in population by 2028 , the abs has projected in two scenarios that sydney will remain larger than melbourne beyond 2056 , albeit by a margin of less than 3 % compared to a margin of 12 % today . melbourne 's population could overtake that of sydney by 2037 or 2039 , according to the first scenario projected by the abs ; primarily due to larger levels of internal migration losses assumed for sydney . another study claims that melbourne will surpass sydney in population by 2040 . 
Predicted Answer:
two scenarios 

Question:
who owned the rights to oswald ? 
Context:
universal owned the rights to the `` oswald the lucky rabbit '' character , although walt disney and ub iwerks

In [30]:
question = 'In what country is Normandy located?'
context = "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ('Norman' comes from 'Norseman') raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."

In [31]:
bidaf_model.make_prediction(question,context)

'france .'

In [32]:
path_save_validation = os.path.abspath('../utils/data/predictions.json')
bidaf_model.multi_predictions([valid_dataset], path_save_validation)

 the file containing the predictions has been created in /content/drive/MyDrive/NLP/BIDAF/utils/data/predictions.json


In [33]:
# the evaluate.py file can be downloaded on the SQUAD website https://rajpurkar.github.io/SQuAD-explorer/
!python3 drive/MyDrive/NLP/BIDAF/utils/data/evaluate.py drive/MyDrive/NLP/BIDAF/utils/data/valid_set.json drive/MyDrive/NLP/BIDAF/utils/data/predictions.json

{
  "exact": 51.07678171481077,
  "f1": 65.8105272912672,
  "total": 17413,
  "HasAns_exact": 51.07678171481077,
  "HasAns_f1": 65.8105272912672,
  "HasAns_total": 17413
}


With the (mock) unseen dataset 

In [34]:
unseen_data = SQUAD_dataset.from_file('../utils/datasets/unseen_dataset.pkl')

In [35]:
path_save_unseen = os.path.abspath('../utils/data/unseen_predictions.json')
bidaf_model.multi_predictions([unseen_data], path_save_unseen)

 the file containing the predictions has been created in /content/drive/MyDrive/NLP/BIDAF/utils/data/unseen_predictions.json


**FURTHER WORK**:
* try with GRU instead of LSTM (GRU are usually faster)
* make batches with different padding size (so far, `CONTEXT_MAXLEN`, `WORD_MAXLEN` and `QUESTION_MAXLEN` are the same for each batch, while we could create local variables for each batch )
* try others models (QANet, BERT,  Multi-Perspective Context Matching, ... )
* try different initialization methods of the char_embedding matrix