Credit : https://github.com/ParikhKadam/bidaf-keras

In [1]:
import tensorflow as tf
import json
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import subprocess as sp
import os

In [2]:
SET_FLOAT_DTYPE = 'float32'

if SET_FLOAT_DTYPE == 'float32':
    TF_DATATYPE = tf.float32
else:
    TF_DATATYPE = tf.float16
    

In [3]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
    # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [4]:
tf.__version__

'2.0.0'

In [5]:
# Credit to : https://www.kaggle.com/sanjay11100/squad-stanford-q-a-json-to-pandas-dataframe
# Modified to include first answer_start and text for dev set

def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.io.json.json_normalize(file , record_path )
    m = pd.io.json.json_normalize(file, record_path[:-1] )
    r = pd.io.json.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
    ndx  = np.repeat(m['id'].values,m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    main = pd.concat([ m[['id','question','context']].set_index('id'),js.set_index('q_idx')],1,sort=False).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main


def squad_json_to_dataframe_dev(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.io.json.json_normalize(file , record_path )
    m = pd.io.json.json_normalize(file, record_path[:-1] )
    r = pd.io.json.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
#     ndx  = np.repeat(m['id'].values,m['answers'].str.len())
    m['context'] = idx
#     js['q_idx'] = ndx
    main = m[['id','question','context','answers']].set_index('id').reset_index()
    main['c_id'] = main['context'].factorize()[0]
    answer_start = []
    answer_text = []

    for answers in tqdm_notebook(main['answers'].values):
        answer_start.append(answers[0]['answer_start'])
        answer_text.append(answers[0]['text'])

    main['answer_start'] = answer_start
    main['text'] = answer_text
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [6]:
TRAIN_NUM_SAMPLES = 40000 #df_train.shape[0]
DEV_NUM_SAMPLES = 10000 #df_dev.shape[0]

if not os.path.isfile('./data/data.h5'):
    df_train = squad_json_to_dataframe_train('./data/train-v1.1.json')
    df_dev = squad_json_to_dataframe_dev('./data/dev-v1.1.json')
 
    #df_train = df_train[:TRAIN_NUM_SAMPLES]
    #df_dev = df_dev[:DEV_NUM_SAMPLES]
    TRAIN_NUM_SAMPLES = df_train.shape[0]
    DEV_NUM_SAMPLES = df_dev.shape[0]

else:
    df_train = pd.read_hdf('./data/data.h5', 'train')
    df_dev = pd.read_hdf('./data/data.h5', 'dev')
    
idx = np.random.permutation(df_train.shape[0])
df_train = df_train.iloc[idx]

Reading the json file
processing...
shape of the dataframe is (87599, 6)
Done
Reading the json file
processing...


HBox(children=(IntProgress(value=0, max=10570), HTML(value='')))


shape of the dataframe is (10570, 7)
Done


In [7]:
from tqdm import tqdm_notebook
from nltk import word_tokenize

if not os.path.isfile('./data/data.h5'):
    answer_start = []
    answer_end = []
    for i in tqdm_notebook(range(df_train.shape[0])): 
        context_split = word_tokenize(df_train.context.values[i][:df_train.answer_start.values[i]])
        answer_start.append(len(context_split))
        answer_end.append(len(context_split) + len(word_tokenize(df_train.text.values[i])) -1)
    df_train['answer_end'] = answer_end
    df_train['answer_start'] = answer_start

    answer_start = []
    answer_end = []
    for i in tqdm_notebook(range(df_dev.shape[0])): 
        context_split = word_tokenize(df_dev.context.values[i][:df_dev.answer_start.values[i]])
        answer_start.append(len(context_split))
        answer_end.append(len(context_split) + len(word_tokenize(df_dev.text.values[i])) -1)
    df_dev['answer_end'] = answer_end
    df_dev['answer_start'] = answer_start

    df_train.question = [word_tokenize(q) for q in tqdm_notebook(df_train.question.values)]
    df_train.context = [word_tokenize(q) for q in tqdm_notebook(df_train.context.values)]

    df_dev.question = [word_tokenize(q) for q in tqdm_notebook(df_dev.question.values)]
    df_dev.context = [word_tokenize(q) for q in tqdm_notebook(df_dev.context.values)]
    
    df_train.to_hdf('./data/data.h5', 'train')
    df_dev.to_hdf('./data/data.h5', 'dev')

HBox(children=(IntProgress(value=0, max=87599), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10570), HTML(value='')))




HBox(children=(IntProgress(value=0, max=87599), HTML(value='')))




HBox(children=(IntProgress(value=0, max=87599), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10570), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10570), HTML(value='')))




your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['index', 'question', 'context', 'text']]

  pytables.to_hdf(path_or_buf, key, self, **kwargs)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['id', 'question', 'context', 'answers', 'text']]

  pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [8]:
from pymagnitude import MagnitudeUtils, Magnitude
#from scripts import MagnitudeVectors

#vectors = MagnitudeVectors(50).load_vectors()
vectors = Magnitude('./data/magnitude/glove.6B.100d.magnitude')

In [9]:
y_train = df_train.answer_start.values, df_train.answer_end.values
x_train = df_train.context.values, df_train.question.values

y_dev = df_dev.answer_start.values, df_dev.answer_end.values
x_dev = df_dev.context.values, df_dev.question.values

In [10]:
BATCH_SIZE = 8
num_batches_per_epoch_train = int(np.ceil(TRAIN_NUM_SAMPLES /float(BATCH_SIZE)))
num_batches_per_epoch_dev = int(np.ceil(DEV_NUM_SAMPLES /float(BATCH_SIZE))) 

In [11]:
import gc

def train_gen():
    for i in range(TRAIN_NUM_SAMPLES):
        if not (i % BATCH_SIZE):
            context_pad_length = max([len(t) for t in x_train[0][i:i+BATCH_SIZE]])
            question_pad_length = max([len(t) for t in x_train[1][i:i+BATCH_SIZE]])

        X_context_batch = vectors.query(x_train[0][i], pad_to_length = context_pad_length)
        X_question_batch = vectors.query(x_train[1][i], pad_to_length = question_pad_length)

        Y_start_batch = tf.keras.utils.to_categorical(y_train[0][i],context_pad_length).reshape(1,-1)
        Y_end_batch = tf.keras.utils.to_categorical(y_train[1][i],context_pad_length).reshape(1,-1)
        Y_batch = np.concatenate([Y_start_batch, Y_end_batch])
        #Y_batch = np.concatenate([y_train[0][i], y_train[1][i]])
        
        yield ((tf.constant(X_context_batch, dtype = SET_FLOAT_DTYPE), tf.constant(X_question_batch, dtype = SET_FLOAT_DTYPE)), 
               tf.constant(Y_batch, dtype = SET_FLOAT_DTYPE)) #,
        
        
def dev_gen():
    for i in range(DEV_NUM_SAMPLES):
        if not (i%BATCH_SIZE):
            context_pad_length = max([len(t) for t in x_dev[0][i:i+BATCH_SIZE]])
            question_pad_length = max([len(t) for t in x_dev[1][i:i+BATCH_SIZE]])

        X_context_batch = vectors.query(x_dev[0][i], pad_to_length = context_pad_length)
        X_question_batch = vectors.query(x_dev[1][i], pad_to_length = question_pad_length)

        Y_start_batch = tf.keras.utils.to_categorical(y_dev[0][i],context_pad_length).reshape(1,-1)
        Y_end_batch =tf.keras.utils.to_categorical(y_dev[1][i],context_pad_length).reshape(1,-1)
        Y_batch = np.concatenate([Y_start_batch, Y_end_batch])

        
        
        yield ((tf.constant(X_context_batch, dtype = SET_FLOAT_DTYPE), tf.constant(X_question_batch, dtype = SET_FLOAT_DTYPE)), 
               tf.constant(Y_batch, dtype = SET_FLOAT_DTYPE)) #, y_dev[1][i]]
        
    

In [12]:
dataset_train = tf.data.Dataset.from_generator(train_gen, ((TF_DATATYPE, TF_DATATYPE), TF_DATATYPE)).batch(BATCH_SIZE).repeat().prefetch(1)
dataset_dev = tf.data.Dataset.from_generator(dev_gen, ((TF_DATATYPE, TF_DATATYPE), TF_DATATYPE)).batch(BATCH_SIZE).repeat().prefetch(1)


In [13]:
from tensorflow.keras import backend as K


K.set_floatx(SET_FLOAT_DTYPE)
#K.set_epsilon(1e-4)

K.clear_session()

In [14]:
def prepare_for_end_prob(inputs):
    encoded_context, merged_context, modeled_context, span_begin_probabilities = inputs
    weighted_sum = K.sum(K.expand_dims(span_begin_probabilities, axis=-1) * modeled_context, -2)
    passage_weighted_by_predicted_span = K.expand_dims(weighted_sum, axis=1)
    tile_shape = K.concatenate([[1], [K.shape(encoded_context)[1]], [1]], axis=0)
    passage_weighted_by_predicted_span = K.tile(passage_weighted_by_predicted_span, tile_shape)
    multiply1 = modeled_context * passage_weighted_by_predicted_span
    span_end_representation = K.concatenate(
            [merged_context, modeled_context, passage_weighted_by_predicted_span, multiply1])

    return span_end_representation


In [15]:
EMBED_LENGTH = 100
DROPOUT_RATE = 0.2

from tensorflow.keras.layers import Input, Add, LSTM, Bidirectional, Concatenate, TimeDistributed, Dense, Softmax, Flatten, Lambda, Multiply, Add, Dropout, SpatialDropout1D
from tensorflow.keras.models import Model 
from tensorflow.keras.optimizers import Adam, Adadelta
from tensorflow.keras.activations import linear
from layers import Similarity, C2QAttention, Q2CAttention, MergedContext, SpanBegin, SpanEnd, Highway

######## INPUT LAYER #########
context_input = Input(shape = (None, EMBED_LENGTH), dtype = SET_FLOAT_DTYPE, name = 'context_input')
question_input = Input(shape = (None, EMBED_LENGTH), dtype = SET_FLOAT_DTYPE, name = 'question_input')

#highway_layer = Highway(name='highway_1')                       
skip_conn_question = question_input
skip_conn_context = context_input

resnet_nonskip = Dense(EMBED_LENGTH, name = 'resnet-nonskip')
context_resnet = resnet_nonskip(context_input)
question_resnet = resnet_nonskip(question_input)

question_embedding = Add()([question_resnet, skip_conn_question])                       
context_embedding = Add()([context_resnet, skip_conn_context])

skip_conn_question = question_embedding
skip_conn_context = context_embedding

resnet_nonskip_2 = Dense(EMBED_LENGTH, name = 'resnet-nonskip-2')
context_resnet = resnet_nonskip_2(context_embedding)
question_resnet = resnet_nonskip_2(question_embedding)

question_embedding = Add()([question_resnet, skip_conn_question])                       
context_embedding = Add()([context_resnet, skip_conn_context])


#question_embedding = highway_layer(question_input)                       
#context_embedding = highway_layer(context_input)

#highway_layer_2 = Highway(name='highway_2')                       
#question_embedding = highway_layer_2(question_embedding)                       
#context_embedding = highway_layer_2(context_embedding)

#context_embedding = context_input
#question_embedding = question_input


######## CONTEXTUAL EMBEDDING LAYER ########
encoder_layer = Bidirectional(LSTM(EMBED_LENGTH, return_sequences=True, recurrent_initializer='glorot_uniform'))
encoded_question = encoder_layer(question_embedding)
encoded_context = encoder_layer(context_embedding)

######## SIMILARITY LAYER ########
similarity_matrix = Similarity(name='similarity_layer')([encoded_context, encoded_question])


####### ATTENTION LAYER #########
context_to_query_attention = C2QAttention(name='context_to_query_attention')([
           similarity_matrix, encoded_question])
query_to_context_attention = Q2CAttention(name='query_to_context_attention')([
            similarity_matrix, encoded_context])

###### MERGE ATTENTIONS ########
context_concat = K.concatenate([encoded_context, context_to_query_attention, query_to_context_attention], axis = -1)
merged_context = Dense(500, name = 'merged_context')(context_concat)

#merged_context = MergedContext(name='merged_context')(
#            [encoded_context, context_to_query_attention, query_to_context_attention])

###### MODELLING LAYER #########
modeled_context = Bidirectional(LSTM(EMBED_LENGTH,return_sequences=True, recurrent_initializer='glorot_uniform'), name='decoder')(merged_context)
modeled_context = SpatialDropout1D(0.2)(modeled_context)


span_begin_concat = K.concatenate([merged_context, modeled_context], axis = -1)
span_begin_weights = Dense(1, name = 'Dense_span_begin')(span_begin_concat)
span_begin_weights = Dropout(0.2)(span_begin_weights)
span_begin_probabilities = Softmax(name = 'span-begin-output')(K.squeeze(span_begin_weights, axis=-1))

span_end_representation = Lambda(prepare_for_end_prob)((encoded_context, merged_context, modeled_context, span_begin_probabilities))
span_end_representation = Bidirectional(LSTM(EMBED_LENGTH, return_sequences=True, recurrent_initializer='glorot_uniform'), name='output_end_prob_decoder')(span_end_representation)
modeled_context = SpatialDropout1D(0.2)(modeled_context)

span_end_input = K.concatenate([merged_context, span_end_representation])
span_end_weights = Dense(1, name = 'Dense_span_end')(span_end_input)
span_end_weights = Dropout(0.2)(span_end_weights)
span_end_probabilities = Softmax(name = 'span-end-output')(K.squeeze(span_end_weights, axis=-1))

output = Lambda(lambda x : K.stack([x[0], x[1]], axis = 1), name = 'output', dtype = 'float32')([span_begin_probabilities, span_end_probabilities])

model = Model([context_input, question_input], output)

In [16]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
context_input (InputLayer)      [(None, None, 100)]  0                                            
__________________________________________________________________________________________________
question_input (InputLayer)     [(None, None, 100)]  0                                            
__________________________________________________________________________________________________
resnet-nonskip (Dense)          (None, None, 100)    10100       context_input[0][0]              
                                                                 question_input[0][0]             
__________________________________________________________________________________________________
add_1 (Add)                     (None, None, 100)    0           resnet-nonskip[0][0]         

In [17]:
import os 
!mkdir ./logs/temp_weights

class SaveWeights(tf.keras.callbacks.Callback):
    def on_train_batch_end(self, batch, logs=None):
        #print(self.model.layers[2].get_weights()[0])

        if not (batch% int(num_batches_per_epoch_train/10)):
            
            print('Saving Temp Weights')
            self.model.save_weights('./logs/temp_weights/weights-batch-{}.h5'.format(batch))


save_weights = SaveWeights()

mkdir: cannot create directory ‘./logs/temp_weights’: File exists


In [18]:
from tensorflow.keras.losses import sparse_categorical_crossentropy
def negative_avg_log_error(y_true, y_pred):
    
    def sum_of_log_probabilities(true_and_pred):
        y_true_start, y_true_end, y_pred_start, y_pred_end = true_and_pred
        
        start_probability = K.sum(y_true_start * y_pred_start, axis = -1)
        end_probability = K.sum(y_true_end * y_pred_end, axis = -1)
        #start_probability = tf.gather(y_pred_start, K.cast(y_true[0], dtype= tf.int32))
        #end_probability = tf.gather(y_pred_start, K.cast(y_true[1], dtype= tf.int32))
        return (K.log(start_probability) + K.log(end_probability))
    
    y_true_start = y_true[:, 0, :]
    y_true_end = y_true[:, 0, :]
    y_pred_start = y_pred[:,0, :]
    y_pred_end = y_pred[:,1,:]
    
    batch_probability_sum = K.map_fn(sum_of_log_probabilities, (y_true_start, y_true_end, y_pred_start, y_pred_end), dtype = SET_FLOAT_DTYPE)
    
    return -K.mean(batch_probability_sum, axis = 0) 

In [19]:
from tensorflow.keras.optimizers import Adam, Adadelta, Nadam

model.compile(optimizer = Adadelta(learning_rate = 0.5), loss = negative_avg_log_error, metrics = ['accuracy', tf.keras.metrics.AUC()])


In [20]:
memory_usage = []
class MemoryCheck(tf.keras.callbacks.Callback):
    def on_epoch_end(self, batch, logs=None):
        mem = sp.check_output('nvidia-smi | grep python', shell=True).split()[-2].decode('utf-8')
        memory_usage.append(int(mem[:-3]))
        print(' ' + mem)

mem_check = MemoryCheck()

In [21]:
# CSVLogger, Model Checkpoint(Test save model), 2-highway, Dev set

In [22]:
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint

checkpoint = ModelCheckpoint('./logs/saved_models/bidaf-weights-best.h5', save_best_only = True, save_weights_only = True, mode = 'min', monitor = 'val_loss', verbose = 1)
logger = CSVLogger('./logs/training.log', append = True)

In [23]:
history = model.fit_generator(generator = dataset_train,
                    steps_per_epoch = num_batches_per_epoch_train, 
                    epochs = 10, 
                    validation_data = dataset_dev, 
                    validation_steps = num_batches_per_epoch_dev,
                    #workers = 8,
                    #use_multiprocessing = True,
                    #shuffle = True,
                    callbacks = [mem_check])

Epoch 1/10
   10/10950 [..............................] - ETA: 1:58:39 - loss: 10.3434 - accuracy: 0.0250 - auc: 0.6344

KeyboardInterrupt: 