Credit : https://github.com/ParikhKadam/bidaf-keras

In [1]:
import tensorflow as tf
import json
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
    # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
tf.__version__

'2.0.0'

In [4]:
# Credit to : https://www.kaggle.com/sanjay11100/squad-stanford-q-a-json-to-pandas-dataframe
# Modified to include first answer_start and text for dev set

def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.io.json.json_normalize(file , record_path )
    m = pd.io.json.json_normalize(file, record_path[:-1] )
    r = pd.io.json.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
    ndx  = np.repeat(m['id'].values,m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    main = pd.concat([ m[['id','question','context']].set_index('id'),js.set_index('q_idx')],1,sort=False).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main


def squad_json_to_dataframe_dev(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.io.json.json_normalize(file , record_path )
    m = pd.io.json.json_normalize(file, record_path[:-1] )
    r = pd.io.json.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
#     ndx  = np.repeat(m['id'].values,m['answers'].str.len())
    m['context'] = idx
#     js['q_idx'] = ndx
    main = m[['id','question','context','answers']].set_index('id').reset_index()
    main['c_id'] = main['context'].factorize()[0]
    answer_start = []
    answer_text = []

    for answers in tqdm_notebook(main['answers'].values):
        answer_start.append(answers[0]['answer_start'])
        answer_text.append(answers[0]['text'])

    main['answer_start'] = answer_start
    main['text'] = answer_text
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [5]:
df_train = squad_json_to_dataframe_train('./data/train-v1.1.json')
df_dev = squad_json_to_dataframe_dev('./data/dev-v1.1.json')

Reading the json file
processing...
shape of the dataframe is (87599, 6)
Done
Reading the json file
processing...


HBox(children=(IntProgress(value=0, max=10570), HTML(value='')))


shape of the dataframe is (10570, 7)
Done


In [6]:
TRAIN_NUM_SAMPLES = df_train.shape[0]
DEV_NUM_SAMPLES = df_dev.shape[0]


#df_train = df_train[:TRAIN_NUM_SAMPLES]
#df_dev = df_dev[:DEV_NUM_SAMPLES]

In [10]:
from tqdm import tqdm_notebook
from nltk import word_tokenize

answer_start = []
answer_end = []
for i in tqdm_notebook(range(df_train.shape[0])): 
    context_split = word_tokenize(df_train.context.values[i][:df_train.answer_start.values[i]])
    answer_start.append(len(context_split))
    answer_end.append(len(context_split) + len(word_tokenize(df_train.text.values[i])) -1)
df_train['answer_end'] = answer_end
df_train['answer_start'] = answer_start

answer_start = []
answer_end = []
for i in tqdm_notebook(range(df_dev.shape[0])): 
    context_split = word_tokenize(df_dev.context.values[i][:df_dev.answer_start.values[i]])
    answer_start.append(len(context_split))
    answer_end.append(len(context_split) + len(word_tokenize(df_dev.text.values[i])) -1)
df_dev['answer_end'] = answer_end
df_dev['answer_start'] = answer_start


HBox(children=(IntProgress(value=0, max=87599), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10570), HTML(value='')))




In [7]:
from pymagnitude import MagnitudeUtils, Magnitude
#from scripts import MagnitudeVectors

#vectors = MagnitudeVectors(50).load_vectors()
vectors = Magnitude('./data/magnitude/glove.6B.100d.magnitude')

In [11]:
df_train.question = [word_tokenize(q) for q in tqdm_notebook(df_train.question.values)]
df_train.context = [word_tokenize(q) for q in tqdm_notebook(df_train.context.values)]

df_dev.question = [word_tokenize(q) for q in tqdm_notebook(df_dev.question.values)]
df_dev.context = [word_tokenize(q) for q in tqdm_notebook(df_dev.context.values)]


HBox(children=(IntProgress(value=0, max=87599), HTML(value='')))




HBox(children=(IntProgress(value=0, max=87599), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10570), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10570), HTML(value='')))




In [12]:
import joblib

joblib.dump(df_train, './data/df_train.h5')
joblib.dump(df_dev, './data/df_dev.h5')

['./data/df_dev.h5']

In [8]:
import joblib

df_train = joblib.load('./data/df_train.h5')
df_dev = joblib.load('./data/df_dev.h5')

In [9]:
y_train = df_train.answer_start.values, df_train.answer_end.values
x_train = df_train.context.values, df_train.question.values

y_dev = df_dev.answer_start.values, df_dev.answer_end.values
x_dev = df_dev.context.values, df_dev.question.values

In [10]:
BATCH_SIZE = 32
num_batches_per_epoch_train = int(np.ceil(TRAIN_NUM_SAMPLES /float(BATCH_SIZE)))
num_batches_per_epoch_dev = int(np.ceil(DEV_NUM_SAMPLES /float(BATCH_SIZE))) 

In [11]:
def train_gen():
    for i in range(TRAIN_NUM_SAMPLES):
        if not (i % BATCH_SIZE):
            context_pad_length = max([len(t) for t in x_train[0][i:i+BATCH_SIZE]])
            question_pad_length = max([len(t) for t in x_train[1][i:i+BATCH_SIZE]])

        X_context_batch = vectors.query(x_train[0][i], pad_to_length = context_pad_length)
        X_question_batch = vectors.query(x_train[1][i], pad_to_length = question_pad_length)

        Y_start_batch = tf.keras.utils.to_categorical(y_train[0][i],context_pad_length)
        Y_end_batch = tf.keras.utils.to_categorical(y_train[1][i],context_pad_length)
        
        yield ((tf.constant(X_context_batch, dtype = 'float16'), tf.constant(X_question_batch, dtype = 'float16')), 
               (tf.constant(Y_start_batch, dtype = 'int8'), tf.constant(Y_end_batch, dtype = 'int8')))
        
def dev_gen():
    for i in range(DEV_NUM_SAMPLES):
        if not (i%BATCH_SIZE):
            context_pad_length = max([len(t) for t in x_dev[0][i:i+BATCH_SIZE]])
            question_pad_length = max([len(t) for t in x_dev[1][i:i+BATCH_SIZE]])

        X_context_batch = vectors.query(x_dev[0][i], pad_to_length = context_pad_length)
        X_question_batch = vectors.query(x_dev[1][i], pad_to_length = question_pad_length)

        Y_start_batch = tf.keras.utils.to_categorical(y_dev[0][i],context_pad_length)
        Y_end_batch =tf.keras.utils.to_categorical(y_dev[1][i],context_pad_length)
        
        yield ((tf.constant(X_context_batch, dtype = 'float16'), tf.constant(X_question_batch, dtype = 'float16')), 
               (tf.constant(Y_start_batch, dtype = 'int8'), tf.constant(Y_end_batch, dtype = 'int8')))
       
    

In [12]:
dataset_train = tf.data.Dataset.from_generator(train_gen, ((tf.float16, tf.float16), (tf.int8, tf.int8))).batch(BATCH_SIZE).repeat().prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
dataset_dev = tf.data.Dataset.from_generator(dev_gen, ((tf.float16, tf.float16), (tf.int8, tf.int8))).batch(BATCH_SIZE).repeat().prefetch(buffer_size=tf.data.experimental.AUTOTUNE)


In [13]:
from tensorflow.keras import backend as K

K.set_floatx('float16')
K.set_epsilon(1e-4)

K.clear_session()

In [14]:
import os 
!mkdir ./logs/temp_weights

class SaveWeights(tf.keras.callbacks.Callback):
    def on_train_batch_end(self, batch, logs=None):
        if not (batch% int(num_batches_per_epoch_train/10)):
            print('Saving Temp Weights')
            self.model.save_weights('./logs/temp_weights/weights-batch-{}.h5'.format(batch))


save_weights = SaveWeights()

mkdir: cannot create directory ‘./logs/temp_weights’: File exists


In [15]:
def prepare_for_end_prob(inputs):
    encoded_context, merged_context, modeled_context, span_begin_probabilities = inputs
    weighted_sum = K.sum(K.expand_dims(span_begin_probabilities, axis=-1) * modeled_context, -2)
    passage_weighted_by_predicted_span = K.expand_dims(weighted_sum, axis=1)
    tile_shape = K.concatenate([[1], [K.shape(encoded_context)[1]], [1]], axis=0)
    passage_weighted_by_predicted_span = K.tile(passage_weighted_by_predicted_span, tile_shape)
    multiply1 = modeled_context * passage_weighted_by_predicted_span
    span_end_representation = K.concatenate(
            [merged_context, modeled_context, passage_weighted_by_predicted_span, multiply1])

    return span_end_representation


In [16]:
EMBED_LENGTH = 100
DROPOUT_RATE = 0.2

from tensorflow.keras.layers import Input, LSTM, Bidirectional, Concatenate, TimeDistributed, Dense, Softmax, Flatten, Lambda, Multiply, Add, Dropout, SpatialDropout1D
from tensorflow.keras.models import Model 
from tensorflow.keras.optimizers import Adam, Adadelta
from tensorflow.keras.activations import linear
from layers import Similarity, C2QAttention, Q2CAttention, MergedContext, SpanBegin, SpanEnd, Highway

######## INPUT LAYER #########
context_input = Input(shape = (None, EMBED_LENGTH), dtype = 'float16', name = 'context_input')
question_input = Input(shape = (None, EMBED_LENGTH), dtype = 'float16', name = 'question_input')


####### HIGHWAY LAYER #########

highway_layer = Highway(name='highway_1')
                        
question_layer = TimeDistributed(highway_layer, name='highway_qtd')
question_embedding = question_layer(question_input)
                        
passage_layer = TimeDistributed(highway_layer, name='highway__ptd')
context_embedding = passage_layer(context_input)

    ## LAYER 2 ##

highway_layer_2 = Highway(name='highway_2')
                        
question_layer_2 = TimeDistributed(highway_layer_2, name='highway_qtd_2')
question_embedding = question_layer(question_embedding)
                        
passage_layer_2 = TimeDistributed(highway_layer_2, name='highway__ptd_2')
context_embedding = passage_layer(context_embedding)


######## CONTEXTUAL EMBEDDING LAYER ########
encoder_layer = Bidirectional(LSTM(64, return_sequences=True), name='bidirectional_encoder')
encoded_question = encoder_layer(question_embedding)
encoded_context = encoder_layer(context_embedding)
encoded_question = SpatialDropout1D(0.2)(encoded_question)
encoded_context = SpatialDropout1D(0.2)(encoded_context)


######## SIMILARITY LAYER ########
similarity_matrix = Similarity(name='similarity_layer')([encoded_context, encoded_question])

####### ATTENTION LAYER #########
context_to_query_attention = C2QAttention(name='context_to_query_attention')([
            similarity_matrix, encoded_question])
query_to_context_attention = Q2CAttention(name='query_to_context_attention')([
            similarity_matrix, encoded_context])

###### MERGE ATTENTIONS ########
merged_context = MergedContext(name='merged_context')(
            [encoded_context, context_to_query_attention, query_to_context_attention])

###### MODELLING LAYER #########
modeled_context = Bidirectional(LSTM(64,return_sequences=True), name='decoder')(merged_context)
modeled_context = SpatialDropout1D(0.2)(modeled_context)


###### OUTPUT LAYER SPAN BEGIN#########
span_begin_concat = K.concatenate([merged_context, modeled_context], axis = -1)
span_begin_weights = TimeDistributed(Dense(1), name = 'Dense_span_begin')(span_begin_concat)
span_begin_weights = Dropout(0.2)(span_begin_weights)
span_begin_probabilities = Softmax(name = 'span-begin-output')(K.squeeze(span_begin_weights, axis=-1))

span_end_representation = Lambda(prepare_for_end_prob)((encoded_context, merged_context, modeled_context, span_begin_probabilities))
span_end_representation = Bidirectional(LSTM(64, return_sequences=True), name='output_end_prob_decoder')(span_end_representation)

span_end_input = K.concatenate([merged_context, span_end_representation])
span_end_weights = TimeDistributed(Dense(1), name = 'Dense_span_end')(span_end_input)
span_end_weights = Dropout(0.2)(span_end_weights)
span_end_probabilities = Softmax(name = 'span-end-output')(K.squeeze(span_end_weights, axis=-1))



model = Model([context_input, question_input], [span_begin_probabilities, span_end_probabilities])

In [17]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
context_input (InputLayer)      [(None, None, 100)]  0                                            
__________________________________________________________________________________________________
question_input (InputLayer)     [(None, None, 100)]  0                                            
__________________________________________________________________________________________________
highway__ptd (TimeDistributed)  (None, None, 100)    20200       context_input[0][0]              
                                                                 highway__ptd[0][0]               
__________________________________________________________________________________________________
highway_qtd (TimeDistributed)   (None, None, 100)    20200       question_input[0][0]         

In [36]:
model.load_weights('./logs/temp_weights/weights-batch-324.h5')

In [37]:
from tensorflow.keras.optimizers import Adam, Adadelta, Nadam

model.compile(optimizer = Adam(0.5), loss = 'categorical_crossentropy' , metrics = ['accuracy'])

In [38]:
# CSVLogger, Model Checkpoint(Test save model), 2-highway, Dev set

In [39]:
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint

checkpoint = ModelCheckpoint('./logs/saved_models/bidaf-weights-best.h5', save_best_only = True, save_weights_only = True, mode = 'min', monitor = 'val_loss', verbose = 1)
logger = CSVLogger('./logs/training.log', append = True)

In [40]:
history = model.fit_generator(generator = dataset_train,
                    steps_per_epoch = num_batches_per_epoch_train/3, 
                    epochs = 2, 
                    validation_data = dataset_dev, 
                    validation_steps = num_batches_per_epoch_dev,
                    #workers = 8,
                    #use_multiprocessing = True,
                    #shuffle = True,
                    callbacks = [checkpoint, logger, save_weights])

Epoch 1/2
Saving Temp Weights
135/912 [===>..........................] - ETA: 51:01 - loss: 0.0677 - span-begin-output_loss: 0.0346 - span-end-output_loss: 0.0331 - span-begin-output_accuracy: 0.0197 - span-end-output_accuracy: 0.0051

KeyboardInterrupt: 