Credit: https://github.com/ParikhKadam/bidaf-keras

In [1]:
import tensorflow as tf
import json
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

In [2]:
# Display the TensorFlow version this notebook is executed with.
tf.__version__

'2.0.0'

In [3]:
# Credit to : https://www.kaggle.com/sanjay11100/squad-stanford-q-a-json-to-pandas-dataframe
# Modified to include first answer_start and text for dev set

def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-style json file into a DataFrame with one row per answer.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1

    Returns a DataFrame whose first column holds the question id, followed by
    'question', 'context', the fields of the deepest json level and 'c_id',
    an integer code that identifies each distinct context string.
    """
    # pandas >= 1.0 exposes json_normalize at the top level; the older
    # pd.io.json location is only looked up when the new one is missing.
    normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

    if verbose:
        print("Reading the json file")
    # The context manager closes the handle; JSON text is UTF-8 by
    # specification, so do not depend on the platform default encoding.
    with open(input_file_path, encoding='utf-8') as handle:
        file = json.load(handle)
    if verbose:
        print("processing...")

    # One frame per nesting level: answers, questions, paragraphs.
    js = normalize(file, record_path)
    m = normalize(file, record_path[:-1])
    r = normalize(file, record_path[:-2])

    # Repeat each context once per question and each question id once per
    # answer so the three levels line up row by row.
    idx = np.repeat(r['context'].values, r['qas'].str.len().values)
    ndx = np.repeat(m['id'].values, m['answers'].str.len().values)
    m['context'] = idx
    js['q_idx'] = ndx

    # `axis` is passed by keyword: recent pandas releases reject it positionally.
    main = pd.concat(
        [m[['id','question','context']].set_index('id'), js.set_index('q_idx')],
        axis=1, sort=False).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main


def squad_json_to_dataframe_dev(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-style json file into a DataFrame with one row per question.

    Unlike the train variant, every question keeps its full 'answers' list and
    only the first answer is expanded into 'answer_start' and 'text'.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it (messages and progress bar) default is 1

    Returns a DataFrame with the columns 'id', 'question', 'context',
    'answers', 'c_id', 'answer_start' and 'text'.
    Raises IndexError when a question has an empty answers list.
    """
    # pandas >= 1.0 exposes json_normalize at the top level; the older
    # pd.io.json location is only looked up when the new one is missing.
    normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

    if verbose:
        print("Reading the json file")
    # The context manager closes the handle; JSON text is UTF-8 by
    # specification, so do not depend on the platform default encoding.
    with open(input_file_path, encoding='utf-8') as handle:
        file = json.load(handle)
    if verbose:
        print("processing...")

    # Only the question and paragraph levels are needed here; the answers
    # stay nested inside the 'answers' column of the question level.
    m = normalize(file, record_path[:-1])
    r = normalize(file, record_path[:-2])

    # Repeat each context once per question it belongs to.
    m['context'] = np.repeat(r['context'].values, r['qas'].str.len().values)

    # Explicit copy so the column assignments below never target a view of `m`.
    main = m[['id','question','context','answers']].copy()
    main['c_id'] = main['context'].factorize()[0]

    answer_lists = main['answers'].values
    if verbose:
        # The progress bar is part of the verbose output only.
        answer_lists = tqdm_notebook(answer_lists)
    first_answers = [answers[0] for answers in answer_lists]

    main['answer_start'] = [answer['answer_start'] for answer in first_answers]
    main['text'] = [answer['text'] for answer in first_answers]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [4]:
# Locations of the SQuAD v1.1 json files, relative to the notebook.
TRAIN_JSON_PATH = './data/train-v1.1.json'
DEV_JSON_PATH = './data/dev-v1.1.json'

# Train keeps one row per answer, dev one row per question.
df_train = squad_json_to_dataframe_train(input_file_path = TRAIN_JSON_PATH)
df_dev = squad_json_to_dataframe_dev(input_file_path = DEV_JSON_PATH)

Reading the json file
processing...
shape of the dataframe is (87599, 6)
Done
Reading the json file
processing...


HBox(children=(IntProgress(value=0, max=10570), HTML(value='')))


shape of the dataframe is (10570, 7)
Done


In [5]:
# Number of leading rows kept from each split.
TRAIN_NUM_SAMPLES = 10  # use df_train.shape[0] for the full training set
DEV_NUM_SAMPLES = 10  # use df_dev.shape[0] for the full dev set

# Take an explicit copy of the leading rows: later cells add columns to these
# frames, and assigning into a plain slice of the full frame triggers pandas'
# SettingWithCopyWarning and leaves it unclear which object is modified.
df_train = df_train.iloc[:TRAIN_NUM_SAMPLES].copy()
df_dev = df_dev.iloc[:DEV_NUM_SAMPLES].copy()

In [6]:
from tqdm import tqdm_notebook
from nltk import word_tokenize


def _with_token_spans(frame):
    """Return a copy of ``frame`` whose answer span is expressed in token indices.

    frame: DataFrame with the columns 'context', 'text' and 'answer_start',
    where 'answer_start' is the character offset of the answer in 'context'.

    The returned frame has
      * 'answer_char_start': the original character offset,
      * 'answer_start': number of tokens in the context before the answer,
      * 'answer_end': 'answer_start' plus the number of tokens in the answer.

    The character offset is preserved in its own column so that running this
    cell again starts from the raw offsets instead of re-interpreting the
    already converted token indices as character positions.
    """
    if 'answer_char_start' in frame.columns:
        char_starts = frame['answer_char_start'].values
    else:
        char_starts = frame['answer_start'].values

    token_starts = []
    token_ends = []
    rows = zip(frame['context'].values, char_starts, frame['text'].values)
    # zip() has no length, so the total is passed explicitly for the progress bar.
    for context, char_start, answer_text in tqdm_notebook(rows, total = frame.shape[0]):
        tokens_before_answer = len(word_tokenize(context[:char_start]))
        token_starts.append(tokens_before_answer)
        # NOTE(review): this end index is exclusive (one past the last answer
        # token), so it can reach the tokenized context length; the labels are
        # later one-hot encoded with MAX_CONTEXT_LENGTH classes - confirm that
        # an exclusive end is intended.
        token_ends.append(tokens_before_answer + len(word_tokenize(answer_text)))

    return frame.assign(answer_char_start = char_starts,
                        answer_start = token_starts,
                        answer_end = token_ends)


df_train = _with_token_spans(df_train)
df_dev = _with_token_spans(df_dev)


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [7]:
def _longest_in_tokens(texts):
    """Return the token count (nltk ``word_tokenize``) of the longest string in ``texts``."""
    return max([len(word_tokenize(text)) for text in texts])


# Longest question / context of each split, measured in tokens.
max_q_train = _longest_in_tokens(df_train.question.values)
max_c_train = _longest_in_tokens(df_train.context.values)

max_q_dev = _longest_in_tokens(df_dev.question.values)
max_c_dev = _longest_in_tokens(df_dev.context.values)


# Lengths shared by both splits.
MAX_QUESTION_LENGTH = max(max_q_train, max_q_dev)
MAX_CONTEXT_LENGTH = max(max_c_train, max_c_dev)

In [8]:
from pymagnitude import MagnitudeUtils
from scripts import MagnitudeVectors

# Word vectors queried by BatchGenerator to embed the tokenized text.
# NOTE(review): the meaning of the argument 50 is defined in
# scripts.MagnitudeVectors, which is not visible here - confirm that the loaded
# vectors have EMBED_LENGTH dimensions, as the model inputs expect.
vectors = MagnitudeVectors(50).load_vectors()

Will download magnitude files from the server if they aren't avaialble locally.. So, grab a cup of coffee while the downloading is under progress..


In [9]:
# Targets are (start token index, end token index); inputs are
# (context, question) raw strings. Both are pairs of aligned numpy arrays.
y_train = (df_train['answer_start'].values, df_train['answer_end'].values)
x_train = (df_train['context'].values, df_train['question'].values)

y_dev = (df_dev['answer_start'].values, df_dev['answer_end'].values)
x_dev = (df_dev['context'].values, df_dev['question'].values)

In [10]:
BATCH_SIZE = 2


def _batches_needed(num_samples, batch_size):
    """Return how many batches of ``batch_size`` cover ``num_samples``, rounding up."""
    return int(np.ceil(num_samples / float(batch_size)))


num_batches_per_epoch_train = _batches_needed(TRAIN_NUM_SAMPLES, BATCH_SIZE)
num_batches_per_epoch_dev = _batches_needed(DEV_NUM_SAMPLES, BATCH_SIZE)

In [11]:
from tensorflow.keras.utils import Sequence

class BatchGenerator(Sequence):
    """Generator for fit_generator.

    Serves batches of embedded (context, question) pairs together with one-hot
    encoded answer start / end labels.

    X: pair ``(contexts, questions)`` of aligned, sliceable collections of raw strings.
    Y: pair ``(answer_starts, answer_ends)`` of aligned, sliceable collections of
       integer labels.
    batch_size: number of samples per batch.
    MAX_CONTEXT_LENGTH: length contexts are padded to; also the number of
       classes of both one-hot label arrays.
    MAX_QUESTION_LENGTH: length questions are padded to.
    NUM_SAMPLES: number of leading samples of ``X`` / ``Y`` that are served.

    Uses the notebook-level names ``vectors``, ``word_tokenize`` and
    ``MagnitudeUtils``.
    """

    def __init__(self, X, Y, batch_size, MAX_CONTEXT_LENGTH , MAX_QUESTION_LENGTH, NUM_SAMPLES):
        self.batch_size = batch_size
        self.MAX_CONTEXT_LENGTH = MAX_CONTEXT_LENGTH
        self.MAX_QUESTION_LENGTH = MAX_QUESTION_LENGTH
        self.NUM_SAMPLES = NUM_SAMPLES
        # Ceiling division: a trailing partial batch is served rather than
        # dropped, so every one of NUM_SAMPLES samples is seen once per epoch.
        self.num_batches = (self.NUM_SAMPLES + self.batch_size - 1) // self.batch_size
        self.X = X
        self.Y = Y

    def __len__(self):
        """Return the number of batches per epoch."""
        return self.num_batches

    def __getitem__(self, index):
        """Return batch ``index`` as ``((contexts, questions), (starts, ends))``."""
        # Slices are 0-based and end-exclusive, so batch `index` covers samples
        # [index * batch_size, (index + 1) * batch_size). No extra offset is
        # added: it would skip sample 0 and shorten the final batch.
        start_index = index * self.batch_size
        end_index = min(start_index + self.batch_size, self.NUM_SAMPLES)

        X_context_batch = vectors.query([word_tokenize(q) for q in self.X[0][start_index:end_index]], pad_to_length = self.MAX_CONTEXT_LENGTH)
        X_question_batch = vectors.query([word_tokenize(q) for q in self.X[1][start_index:end_index]], pad_to_length = self.MAX_QUESTION_LENGTH)

        Y_start_batch = MagnitudeUtils.to_categorical(self.Y[0][start_index:end_index], self.MAX_CONTEXT_LENGTH)
        Y_end_batch = MagnitudeUtils.to_categorical(self.Y[1][start_index:end_index], self.MAX_CONTEXT_LENGTH)

        return ((X_context_batch, X_question_batch), (Y_start_batch, Y_end_batch))
    

In [12]:
# One generator per split; both pad to the same context / question lengths.
train_batch = BatchGenerator(
    X = x_train,
    Y = y_train,
    batch_size = BATCH_SIZE,
    MAX_CONTEXT_LENGTH = MAX_CONTEXT_LENGTH,
    MAX_QUESTION_LENGTH = MAX_QUESTION_LENGTH,
    NUM_SAMPLES = TRAIN_NUM_SAMPLES,
)
dev_batch = BatchGenerator(
    X = x_dev,
    Y = y_dev,
    batch_size = BATCH_SIZE,
    MAX_CONTEXT_LENGTH = MAX_CONTEXT_LENGTH,
    MAX_QUESTION_LENGTH = MAX_QUESTION_LENGTH,
    NUM_SAMPLES = DEV_NUM_SAMPLES,
)

In [43]:
from tensorflow.keras import backend as K

# Reset the Keras backend session before the model is built in the cells below.
K.clear_session()

In [44]:
def prepare_for_end_prob(inputs):
    """Build the input representation of the span-end decoder.

    inputs: sequence of four tensors
        (encoded_context, merged_context, modeled_context, span_begin_probabilities).

    The modeled context is summed along its second-to-last axis, weighted by
    the span-begin probabilities; that summary is repeated along axis 1 as many
    times as ``encoded_context`` has entries on axis 1. The result is the
    concatenation of the merged context, the modeled context, the repeated
    summary and the element-wise product of modeled context and summary.
    """
    encoded_context, merged_context, modeled_context, span_begin_probabilities = inputs

    # Probability-weighted sum of the modeled context.
    begin_weights = K.expand_dims(span_begin_probabilities, axis=-1)
    begin_summary = K.sum(begin_weights * modeled_context, -2)

    # Re-insert axis 1 and repeat the summary along it.
    begin_summary = K.expand_dims(begin_summary, axis=1)
    num_positions = K.shape(encoded_context)[1]
    repeats = K.concatenate([[1], [num_positions], [1]], axis=0)
    begin_summary = K.tile(begin_summary, repeats)

    context_times_summary = modeled_context * begin_summary
    return K.concatenate(
        [merged_context, modeled_context, begin_summary, context_times_summary])


In [45]:
EMBED_LENGTH = 300
DROPOUT_RATE = 0.2

from tensorflow.keras.layers import Input, LSTM, Bidirectional, Concatenate, TimeDistributed, Dense, Softmax, Flatten, Lambda, Multiply, Add, Dropout
from tensorflow.keras.models import Model 
from tensorflow.keras.optimizers import Adam, Adadelta
from tensorflow.keras.activations import linear
from layers import Similarity, C2QAttention, Q2CAttention, MergedContext, SpanBegin, SpanEnd, Highway

######## INPUT LAYER #########
# NOTE(review): EMBED_LENGTH has to equal the dimensionality of the arrays that
# `vectors.query` returns in BatchGenerator - confirm against MagnitudeVectors.
context_input = Input(shape = (MAX_CONTEXT_LENGTH, EMBED_LENGTH), dtype = 'float32', name = 'context_input')
question_input = Input(shape = (MAX_QUESTION_LENGTH, EMBED_LENGTH), dtype = 'float32', name = 'question_input')


####### HIGHWAY LAYER #########
# The same Highway instance is wrapped twice, so question and context share it.
highway_layer = Highway(name='highway_1')

question_layer = TimeDistributed(highway_layer, name='highway_qtd')
question_embedding = question_layer(question_input)

passage_layer = TimeDistributed(highway_layer, name='highway__ptd')
context_embedding = passage_layer(context_input)


######## CONTEXTUAL EMBEDDING LAYER ########
# One bidirectional encoder is applied to both the question and the context.
encoder_layer = Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout = DROPOUT_RATE), name='bidirectional_encoder')
encoded_question = encoder_layer(question_embedding)
encoded_context = encoder_layer(context_embedding)


######## SIMILARITY LAYER ########
similarity_matrix = Similarity(name='similarity_layer')([encoded_context, encoded_question])

####### ATTENTION LAYER #########
context_to_query_attention = C2QAttention(name='context_to_query_attention')([
            similarity_matrix, encoded_question])
query_to_context_attention = Q2CAttention(name='query_to_context_attention')([
            similarity_matrix, encoded_context])

###### MERGE ATTENTIONS ########
merged_context = MergedContext(name='merged_context')(
            [encoded_context, context_to_query_attention, query_to_context_attention])

###### MODELLING LAYER #########
modeled_context = Bidirectional(LSTM(64,return_sequences=True, recurrent_dropout = DROPOUT_RATE), name='decoder')(merged_context)

###### OUTPUT LAYER SPAN BEGIN#########
# Keras layers (Concatenate / Flatten) are used instead of raw backend ops so
# that every node of the functional graph is an explicit, named layer.
span_begin_concat = Concatenate(axis = -1, name = 'span_begin_input')([merged_context, modeled_context])
span_begin_weights = TimeDistributed(Dense(1), name = 'Dense_span_begin')(span_begin_concat)
# NOTE(review): dropout is applied to the per-token scores right before the
# softmax - confirm this placement is intended.
span_begin_weights = Dropout(DROPOUT_RATE)(span_begin_weights)
# Dense(1) leaves a trailing axis of size 1; Flatten removes it so the softmax
# runs over the context positions.
span_begin_scores = Flatten(name = 'span_begin_scores')(span_begin_weights)
span_begin_probabilities = Softmax(name = 'span-begin-output')(span_begin_scores)

###### OUTPUT LAYER SPAN END#########
span_end_representation = Lambda(prepare_for_end_prob)((encoded_context, merged_context, modeled_context, span_begin_probabilities))
span_end_representation = Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout = DROPOUT_RATE), name='output_end_prob_decoder')(span_end_representation)

span_end_input = Concatenate(axis = -1, name = 'span_end_input')([merged_context, span_end_representation])
span_end_weights = TimeDistributed(Dense(1), name = 'Dense_span_end')(span_end_input)
span_end_weights = Dropout(DROPOUT_RATE)(span_end_weights)
span_end_scores = Flatten(name = 'span_end_scores')(span_end_weights)
span_end_probabilities = Softmax(name = 'span-end-output')(span_end_scores)



model = Model([context_input, question_input], [span_begin_probabilities, span_end_probabilities])

In [46]:
# Print the layer-by-layer summary of the model assembled above.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
context_input (InputLayer)      [(None, 249, 300)]   0                                            
__________________________________________________________________________________________________
question_input (InputLayer)     [(None, 15, 300)]    0                                            
__________________________________________________________________________________________________
highway__ptd (TimeDistributed)  (None, 249, 300)     180600      context_input[0][0]              
__________________________________________________________________________________________________
highway_qtd (TimeDistributed)   (None, 15, 300)      180600      question_input[0][0]             
______________________________________________________________________________________________

In [47]:
from tensorflow.keras.optimizers import Adam, Adadelta

# Adadelta with a learning rate of 0.5; categorical cross-entropy is the loss
# and accuracy the reported metric.
model.compile(
    optimizer = Adadelta(learning_rate = 0.5),
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)

In [48]:
# CSVLogger, Model Checkpoint(Test save model), 2-highway, Dev set

In [51]:
import os

from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint

CHECKPOINT_PATH = './logs/saved_models/bidaf-weights-best.h5'
TRAINING_LOG_PATH = './logs/training.log'

# Create the output directories up front, so a fresh checkout does not fail
# when the first checkpoint or log line is written to a missing directory.
for _output_path in (CHECKPOINT_PATH, TRAINING_LOG_PATH):
    os.makedirs(os.path.dirname(_output_path), exist_ok = True)

# Keep only the weights of the epoch with the lowest validation loss.
checkpoint = ModelCheckpoint(CHECKPOINT_PATH, save_best_only = True, save_weights_only = True, mode = 'min', monitor = 'val_loss', verbose = 1)
# Append per-epoch metrics to the log file instead of overwriting it.
logger = CSVLogger(TRAINING_LOG_PATH, append = True)

In [52]:
# The step counts are read from the Sequence objects themselves, so they can
# never disagree with the number of batches the generators actually serve.
history = model.fit_generator(generator = train_batch,
                    steps_per_epoch = len(train_batch),
                    epochs = 2,
                    validation_data = dev_batch,
                    validation_steps = len(dev_batch),
                    workers = 2,
                    use_multiprocessing = True,
                    shuffle = True,
                    callbacks = [checkpoint, logger])

Epoch 1/2
Epoch 00001: val_loss improved from inf to 10.94897, saving model to ./logs/saved_models/bidaf-weights-best.h5
Epoch 2/2
Epoch 00002: val_loss improved from 10.94897 to 10.92802, saving model to ./logs/saved_models/bidaf-weights-best.h5
