# Accuracy calculation and Production testing

In [1]:
# packages used for processing: 
import cPickle as pickle # for reading the data
import matplotlib.pyplot as plt # for visualization
import numpy as np

# for operating system related stuff
import os
import sys # for memory usage of objects
from subprocess import check_output

# the boss of frameworks
import tensorflow as tf

# for dataset building:
import collections

# for regex based preprocessing
import re

# to plot the images inline
%matplotlib inline

In [2]:
# Input data files are available in the "../Data/" directory.

def exec_command(cmd):
    '''
        run an external command and echo whatever it wrote to stdout
        in the python console
        @params
        cmd = the command to run, given as a list of the program
              followed by its arguments
              ex: ['ls', '../input']
    '''
    raw_output = check_output(cmd)
    text = raw_output.decode("utf8")
    print(text)

In [3]:
# check the structure of the project directory
exec_command(['ls', '..'])

Data
Models
Scripts



In [4]:
''' Set the constants for the script '''

# various paths of the files
data_path = "../Data/WikiSQL/data" # the data path

# paths of the training questions file and the training tables file
train_files = {
    "questions": os.path.join(data_path, "train.jsonl"),
    "tables": os.path.join(data_path, "train.tables.jsonl")
}

# parent directory under which each model gets its own sub-directory
base_model_path = '../Models'

processed_data_file_path = os.path.join(data_path, "processed.pickle")
# pickle that holds the "queries", "questions", "dictionary" and "reverse_dictionary" entries
plug_and_play_data_file_path = os.path.join(data_path, "plug_and_play.pickle")

# constants:
model_name = "Model3" # sub-directory of base_model_path and prefix of the checkpoint files
matcher_regex = r"[\w']+|[.,!?;\"]" # matches a run of word characters/apostrophes, or one punctuation mark
vocab_size = 55000 # total words in our vocabulary
lstm_hidden_state_size = 512 # hidden state size
seqs_length = 85 # fixed length that every encoded sequence is clipped / padded to
# number of training-loop iterations per run
# NOTE(review): the training loop slices one batch per iteration, so these are steps rather than full passes over the data
no_of_epochs = 5000
batch_size = 512 # number of examples sliced from the dataset per training iteration
checkpoint_factor = 50 # log the loss, write a summary and save the model every 50 iterations

In [5]:
# check the contents of the data path
exec_command(['ls', data_path])

dev.db
dev.jsonl
dev.tables.jsonl
plug_and_play.pickle
processed.pickle
test.db
test.jsonl
test.tables.jsonl
train.db
train.jsonl
train.tables.jsonl



In [6]:
# create a function to unpickle the data into a python object
def unpickle(pickle_file):
    '''
        function to unpickle the pickle file into a python compatible object
        @param
        pickle_file => the pickle file path
        @return => the unpickled object
    '''
    # Pickle streams are binary data: open with "rb" so the bytes are never
    # subjected to newline translation or text decoding.
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    with open(pickle_file, "rb") as dumper:
        return pickle.load(dumper)

In [7]:
# read the plug-and-play pickle once and pull out the four entries used by this notebook
data = unpickle(plug_and_play_data_file_path)

# encoded query / question sequences
dqueries, dquestions = data["queries"], data["questions"]

# word -> id and id -> word lookup tables
dictionary, reverse_dictionary = data["dictionary"], data["reverse_dictionary"]

In [37]:
# The loader graph:
# Computation graph defining the network architecture of the model
#
# NOTE(review): the TensorFlow documentation describes the encoder/decoder inputs of
# embedding_rnn_seq2seq as lists (one entry per time step) of 1-D int32 tensors of
# shape [batch_size]. Here each list holds a single placeholder of shape (seqs_length),
# so the seqs_length axis looks like it is consumed as the batch axis of a one-step
# sequence -- confirm that this is intended.

graph = tf.Graph()

with graph.as_default():
    
    # placeholders for the encoder input and for the decoder input / target sequence
    with tf.variable_scope("input"):
        input_data = [tf.placeholder(tf.int32, shape=(seqs_length), name="input_sequences")] # single-element list of placeholders
        input_translation = [tf.placeholder(tf.int32, shape=(seqs_length), name="ideal_output_sequences")]
        
        # one-hot encoding of the target ids over the whole dictionary, used as labels by the loss
        loss_targets = tf.one_hot(tf.squeeze(input_translation), depth=len(dictionary.keys()))

    # create the embedding_rnn_seq2seq model
    outputs, states = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq (
                        input_data, # encoder input
                        input_translation, # decoder input
                        tf.contrib.rnn.LSTMCell(lstm_hidden_state_size),
                        len(dictionary.keys()), # number of encoder symbols
                        len(dictionary.keys()), # number of decoder symbols
                        128, # embedding size
                        feed_previous = True # per the TF docs: only the first decoder input is used, the rest come from previous outputs
                      )
    
    # The output is now required for calculating the loss.
    with tf.variable_scope("loss"):
        # mean softmax cross-entropy between the one-hot targets and the output logits
        # (an earlier attempt used the mean absolute difference between outputs and targets:
        #  loss = tf.reduce_mean(tf.abs(tf.stack(outputs) - tf.stack(input_translation)), name="mean_loss"))
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                labels = loss_targets,
                logits = outputs
            ))
        loss_summary = tf.summary.scalar("loss_summary", loss)
        
    with tf.variable_scope("prediction"):
        # index of the highest-scoring dictionary entry for each of the seqs_length positions
        prediction = tf.argmax(tf.squeeze(outputs), axis = 1)
        
    # the train module for running the optimization op
    train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    
    # single op that evaluates every summary registered in this graph
    all_summaries = tf.summary.merge_all()
    
    # op that initializes all variables; used when no checkpoint is restored
    init = tf.global_variables_initializer()

In [38]:
prediction

<tf.Tensor 'prediction/ArgMax:0' shape=(85,) dtype=int64>

In [39]:
# define a function to encode a question
def encode(string_question):
    '''
        function to encode a given question string
        @param 
        string_question => the question string (an empty string is allowed)
        @return => list (sequence) of exactly seqs_length encoded integers 
    '''
    
    # drop a single trailing question mark if there is one;
    # endswith() is safe on an empty string, unlike indexing with [-1]
    if string_question.endswith('?'):
        string_question = string_question[:-1]
    
    # lower-case the sentence, split it on whitespace and look every word up
    # in the vocabulary; words that are not in the vocabulary map to 'UNK'
    encoded = [
        dictionary[word] if word in dictionary else dictionary['UNK']
        for word in string_question.lower().split()
    ]
    
    # clip the length to the seqs_length
    encoded = encoded[:seqs_length]
    
    # pad with trailing blanks up to seqs_length
    missing = seqs_length - len(encoded)
    if missing > 0:
        encoded.extend([dictionary['<blank>']] * missing)
        
    # return the encoded list
    return encoded

In [40]:
# function to decode the encoded query:
def decode(encoded_list):
    '''
        function to decode the integer sequence
        @param
        encoded_list => the sequence of integers to decode (may be empty)
        @return => the string formed by decoding the input sequence, with every
                   "<blank>" token removed (the separating spaces are kept)
    '''
    
    # map every id back to its word; ids missing from the reverse dictionary become 'UNK'
    decoded_list = [
        reverse_dictionary[word] if word in reverse_dictionary else 'UNK'
        for word in encoded_list
    ]
    
    # join() gives the same result as folding with x + " " + y, but also
    # handles an empty sequence and does not rely on the Python 2 builtin reduce
    decoded_string = " ".join(decoded_list)
    
    # return the decoded string
    return decoded_string.replace("<blank>", "")

In [41]:
# function to generate a mock decoder_input_string
def generate_mock_decoder_input_sequence():
    '''
        build a placeholder decoder input: the '<go>' token followed by blanks
        @return => list of seqs_length ids
    '''
    blank_id = dictionary["<blank>"]
    
    # start from an all-blank sequence of the required length
    sequence = [blank_id] * seqs_length
    
    # the very first position carries the '<go>' symbol
    sequence[0] = dictionary['<go>']
    
    return sequence

In [13]:
question = "Who is the President of India?"
encoded_question = encode(question)

In [14]:
decode(encoded_question)

u'who is the president of india                                                                               '

In [15]:
print generate_mock_decoder_input_sequence()

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# restore the model and try running a few sample questions on the given graph

In [49]:
# 1-based number of the training run; the training loop covers iterations
# (itera - 1) * no_of_epochs up to (but excluding) itera * no_of_epochs
itera = 2
# offset into the dataset at which the next training batch starts
global_index = 0
# number of examples; the batch offset wraps around at this value
data_size = len(dquestions)

In [50]:
# Training cell.
#
# Opens a session on `graph`, restores the most recent checkpoint found in
# <base_model_path>/<model_name> when there is one (otherwise initializes the
# variables) and then runs the iterations that belong to run number `itera`.
# Every `checkpoint_factor` iterations the loss is printed, a TensorBoard
# summary is written and the model is saved.
#
# WARNING: this is the main training cell. It will take a really long time on
# low-end machines. It should however not exhaust memory, because only a small
# slice of the data is taken per iteration.

with tf.Session(graph=graph) as sess:
    # `global_index` is a notebook-level name, so the plain assignment further down
    # already updates it; a `global` statement has no effect at module scope.

    # The saver object for saving and loading the model
    saver = tf.train.Saver(max_to_keep=2)

    # directory that holds both the checkpoints and the TensorBoard event files
    model_path = os.path.join(base_model_path, model_name)

    # create the summary_writer for tensorboard
    tensorboard_writer = tf.summary.FileWriter(model_path, graph=sess.graph)

    try:
        # latest_checkpoint() returns None when no usable checkpoint exists, which
        # avoids handing None to saver.restore() if the index file is stale
        latest_checkpoint = tf.train.latest_checkpoint(model_path)
        if latest_checkpoint is not None:
            # load the weights from the model instead of running the initializer
            saver.restore(sess, latest_checkpoint)
        else:
            # initialize all the variables
            sess.run(init)

        for ep in range((itera - 1) * no_of_epochs, itera * no_of_epochs):  # start the loop

            start = global_index
            end = start + batch_size

            questions_raw = dquestions[start: end]
            queries_raw = dqueries[start: end]

            input_questions_batch = list(np.array(questions_raw).reshape(len(questions_raw), seqs_length))
            input_translate_batch = list(np.array(queries_raw).reshape(len(queries_raw), seqs_length))

            # advance the batch offset, wrapping around at the end of the data
            global_index = (global_index + batch_size) % data_size

            # construct the feed dictionary
            # NOTE(review): input_data and input_translation are single-element lists,
            # and zip() stops at the shorter argument, so only the first row of each
            # batch slice is actually fed -- confirm that this is intended.
            combined_dict = dict(zip(input_data, input_questions_batch))
            combined_dict.update(zip(input_translation, input_translate_batch))

            # execute the training op
            _, cost = sess.run([train_op, loss], feed_dict=combined_dict)

            if (ep + 1) % checkpoint_factor == 0:
                # print the log statements:
                print("epoch: " + str(ep + 1))
                print("=================================================================================================")
                print("=================================================================================================")
                print('loss = {}'.format(cost))
                print("\n=========================================================================================\n")
                print("=================================================================================================")
                print("=================================================================================================")

                # run the summary op also
                summary = sess.run(all_summaries, feed_dict=combined_dict)

                # add the generated summary to the fileWriter
                tensorboard_writer.add_summary(summary, (ep + 1))

                # save the model trained so far:
                saver.save(sess, os.path.join(model_path, model_name), global_step = (ep + 1))
    finally:
        # flush and release the event file even when training is interrupted
        tensorboard_writer.close()

INFO:tensorflow:Restoring parameters from ../Models/Model3/Model3-5250


KeyboardInterrupt: 

In [51]:
# create an interactive Session to run the model
sess = tf.InteractiveSession(graph=graph)

In [52]:
# create a saver object
saver = tf.train.Saver(max_to_keep = 2)

In [53]:
# synthesize the model_path
model_path = os.path.join(base_model_path, model_name)

In [54]:
# restore the model
# latest_checkpoint() returns None when the directory holds no usable checkpoint,
# so saver.restore() is never called with None even if a stale index file exists
latest_checkpoint = tf.train.latest_checkpoint(model_path)
if latest_checkpoint is not None:
    # load the weights from the model
    # instead of global variable initializer, restore the graph:
    saver.restore(sess, latest_checkpoint)

else:
    print("Something is wrong with the saved model! You will have to train the model again")

INFO:tensorflow:Restoring parameters from ../Models/Model3/Model3-5250


In [55]:
# obtain the predictions for a sample question:
question

'Who is the President of India?'

In [64]:
sess.run(prediction, feed_dict={input_data[0]: encode(question), 
                                input_translation[0]: generate_mock_decoder_input_sequence()})

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])