
# Script to create a seq2seq model for training on the given data
-----------------------------------------------------------------------------------------------------------------
# Technology used: Tensorflow 

I start with the usual cells for utility purposes.

In [1]:
# packages used for processing: 
import cPickle as pickle # for reading the data
import matplotlib.pyplot as plt # for visualization
import numpy as np

# for operating system related stuff
import os
import sys # for memory usage of objects
from subprocess import check_output

# the boss of frameworks
import tensorflow as tf

# for dataset building:
import collections

# for regex based preprocessing
import re

# to plot the images inline
%matplotlib inline

In [2]:
# Input data files are available in the "../Data/" directory.

def exec_command(cmd):
    '''
        function to execute a shell command and see it's 
        output in the python console
        @params
        cmd = the command to be executed along with the arguments
              ex: ['ls', '../input']
    '''
    print(check_output(cmd).decode("utf8"))

In [3]:
# check the structure of the project directory
exec_command(['ls', '..'])

Data
Models
Scripts



In [4]:
''' Set the constants for the script '''

# various paths of the files
data_path = "../Data/WikiSQL/data" # the data path

train_files = {
    "questions": os.path.join(data_path, "train.jsonl"),
    "tables": os.path.join(data_path, "train.tables.jsonl")
}

base_model_path = '../Models'

processed_data_file_path = os.path.join(data_path, "processed.pickle")
plug_and_play_data_file_path = os.path.join(data_path, "plug_and_play.pickle")

# constants:
matcher_regex = r"[\w']+|[.,!?;\"]"
vocab_size = 55000 # total words in our vocabulary
lstm_hidden_state_size = 512 # hidden state size
seqs_length = 85
no_of_epochs = 5000
batch_size = 128 # we look at only 64 examples in a single batch            
checkpoint_factor = 50 # save the model after every 5 epochs

In [5]:
# check the contents of the data path
exec_command(['ls', data_path])

dev.db
dev.jsonl
dev.tables.jsonl
plug_and_play.pickle
processed.pickle
test.db
test.jsonl
test.tables.jsonl
train.db
train.jsonl
train.tables.jsonl



In [6]:
# create a function to unpickle the data into a python object
def unpickle(pickle_file):
    '''
        function to unpickle the pickle file into a python compatible object
        @param
        pickle => the pickle file path
        @return => the unpickled object
    '''
    with open(pickle_file) as dumper:
        return pickle.load(dumper)

In [7]:
# load the processed pickled data into the script.
data = unpickle(processed_data_file_path)

queries = data["queries"]
questions = data["questions"]

In [8]:
len(questions), len(queries)

(61297, 61297)

### Since all are questions, we can safely drop the question mark at the end of the question sentences
### Another reason for doing this is that some examples have a question mark while others don't
### Besides, there doesn't seem to be anything unique that can be learnt by adding it to the vocabulary

In [9]:
# drop question mark from end of all the questions if there exists one
for index in range(len(questions)):
    orig = questions[index]
    orig_ans = queries[index]
    
    # remove the question mark if it exists
    if(orig[-1] == '?'):
        # remove the question mark from the end of the sentence.
        orig = orig[:-1]
        
    # make everything lowercase:
    questions[index] = orig.lower()
    queries[index] = orig_ans.lower()

In [10]:
for _ in range(5):
    # now: check the ideal output for a random question from the dataset
    random_index = np.random.randint(questions.shape[0])

    print "Random sample from the dataset:\n"

    # print the natural language question:
    print "Natural_Language question: " + questions[random_index]
    print "SQL query for the same   : " + queries[random_index] + "\n\n"

Random sample from the dataset:

Natural_Language question: what grade was the 2.4km run (min:sec) of 13:01-13:40
SQL query for the same   : select grade from <table> where 2.4km run (min:sec) = 13:01-13:40


Random sample from the dataset:

Natural_Language question: what is karen handel polling at in the insideradvantage poll where john oxendine is at 15%
SQL query for the same   : select karen handel from <table> where poll source = insideradvantage and john oxendine = 15%


Random sample from the dataset:

Natural_Language question: name the country that has ken doherty
SQL query for the same   : select country from <table> where athlete = ken doherty


Random sample from the dataset:

Natural_Language question: what is the adjusted gdp when the nominal gdp per capita is 2874
SQL query for the same   : select gdp adjusted ($ billions) from <table> where gdp per capita nominal ($) = 2874


Random sample from the dataset:

Natural_Language question: which surface has a tournament of 

In [11]:
type(questions), type(queries)

(numpy.ndarray, numpy.ndarray)

In [12]:
# function to build the dataset for the given task:

def build_dataset(words, questions, queries, n_words):
    """Process raw inputs into a dataset."""
    count = [['<blank>', 0], ['<go>', 1], ['<eos>', 2], ['UNK', -1]] # start with this list.
    count.extend(collections.Counter(words).most_common(n_words - 1)) # this is inplace. i.e. has a side effect

    dictionary = dict() # initialize the dictionary to empty one
    # fill this dictionary with the most frequent words
    for word, _ in count:
        dictionary[word] = len(dictionary)
  
    # loop to replace all the rare words by the UNK token
    data_questions = list() # start with empty list
    data_queries = list()
    unk_count = 0 # counter for keeping track of the unknown words
    for question, query in zip(questions, queries):
        
        # first transform the question
        data = [] # initilalize to empty
        for word in question:
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0  # dictionary['UNK']
                unk_count += 1
            data.append(index)
        data_questions.append(data)
        
        # now transform the query
        data = [] # initilalize to empty
        for word in query:
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0  # dictionary['UNK']
                unk_count += 1
            data.append(index)
        data_queries.append(data)

    count[0][1] = unk_count # replace the earlier -1 by the so calculated unknown count

    print("Total rare words replaced: ", unk_count) # log the total replaced rare words
  
    # construct the reverse dictionary for the original dictionary
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    # return all the relevant stuff	
    return data_questions, data_queries, count, dictionary, reversed_dictionary

In [13]:
# split the sentences into words
split_questions = map(lambda x: x.split(), questions)
split_queries = map(lambda x: x.split(), queries)

#small loop to put all the words together:
all_splits = map(lambda x: x.split(), list(questions) + list(queries))
all_words = []
for split_sentence in all_splits:
    all_words += split_sentence

In [14]:
print "All words in the dataset   : " + str(len(all_words))
print "unique words in the dataset: " + str(len(list(set(all_words))))

All words in the dataset   : 1473073
unique words in the dataset: 55713


In [15]:
dquestions, dqueries, count, dictionary, reverse_dictionary = build_dataset(all_words, 
                                                                        split_questions, split_queries, vocab_size)

('Total rare words replaced: ', 714)


In [16]:
print (dquestions[:3], dqueries[:3])

([[184, 178, 11, 5, 281, 79, 19, 189, 424], [11, 12, 5, 441, 101, 7, 5, 161, 101, 1582, 23, 173, 155], [11, 12, 5, 265, 19, 189, 424]], [[8, 281, 6, 9, 7, 441, 53789, 4, 189, 424], [8, 441, 101, 6, 9, 7, 281, 4, 161, 101, 1582, 23, 173, 155], [8, 265, 6, 9, 7, 4677, 4, 189, 424]])


In [17]:
# add go and eos entry to every sequence. and then pad the sequence to the fixed length
index = 0
for (dquestion, dquery) in zip(dquestions, dqueries):
    dquery = [1] + dquery + [2]
    
    # pad the dquestion
    while(len(dquestion) != seqs_length):
        dquestion += [0]
    
    # pad the dquery:
    while(len(dquery) != seqs_length):
        dquery += [0]
    
    dqueries[index] = dquery
    dquestions[index] = dquestion
    index += 1

In [18]:
sum(map(lambda x: len(x), dquestions)) / len(dquestions), sum(map(lambda x: len(x), dqueries)) / len(dqueries)

(85, 85)

In [19]:
# again visualize a few sequences:
print [reverse_dictionary[i] for i in dquestions[0]]

[u'tell', u'me', u'what', u'the', u'notes', u'are', u'for', u'south', u'australia', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>', '<blank>']


In [20]:
# pickle the proper data so that we can use it in a plug and play manner
plug_and_play_data = {
    "dictionary": dictionary,
    "reverse_dictionary": reverse_dictionary,
    "questions": dquestions,
    "queries": dqueries
}

if(not os.path.isfile(plug_and_play_data_file_path)):
    with open(plug_and_play_data_file_path, 'wb') as dumping:
        pickle.dump(plug_and_play_data, dumping, pickle.HIGHEST_PROTOCOL)
else:
    print "The data has been pickled"

The data has been pickled


# So, now the data is processsed and ready for LSTM training

In [73]:
# Computation graph defining the network architecture of the model

graph = tf.Graph()

with graph.as_default():
    
    # constant to hold the training data and labels
    with tf.variable_scope("input"):
        input_data = [tf.placeholder(tf.int32, shape=(seqs_length), name="input_sequences")] # list of placeholders
        input_translation = [tf.placeholder(tf.int32, shape=(seqs_length), name="ideal_output_sequences")]
        
        loss_targets = tf.one_hot(tf.stack(input_translation), depth=len(dictionary.keys()))

    # create the basic_rnn_seq2seq
    outputs, states = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq (
                        input_data, # encoder input
                        input_translation, # decoder input
                        tf.contrib.rnn.LSTMCell(lstm_hidden_state_size),
                        len(dictionary.keys()),
                        len(dictionary.keys()),
                        128,
                        feed_previous = True
                      )
    
    # The output is now required for calculating the loss.
    with tf.variable_scope("loss"):
        # we use the euclidean loss as the measure of minimization
        # loss = tf.reduce_mean(tf.abs(tf.stack(outputs) - tf.stack(input_translation)), name="mean_loss")
        
        # changed the loss to softmax_cross_entropy_with_logits
        loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(
                labels = loss_targets,
                logits = outputs
            ))
        loss_summary = tf.summary.scalar("loss_summary", loss)
        
    with tf.variable_scope("prediction"):
        # define the op to calculate the predictions
        prediction = tf.argmax(tf.stack(outputs), axis = -1)
        
    
    # the train module for running the optimization op
    train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    
    all_summaries = tf.summary.merge_all()
    
    init = tf.global_variables_initializer()
    

In [74]:
prediction

<tf.Tensor 'prediction/ArgMax:0' shape=(1, 85) dtype=int64>

In [75]:
model_name = "Model3"
itera = 1

In [76]:
global_index = 0
data_size = len(dquestions)

In [77]:
# function to decode the encoded query:
def decode(encoded_list):
    '''
        function to decode the integer sequence
        @param
        encoded_list => the sequence of integers to decode
        @return => the string fromed by decoding the input sequence
    '''
    
    # generate the decoded words from the given list:
    decoded_list = [] # start with empty list
    for word in encoded_list:
        if(word in reverse_dictionary):
            decoded_list.append(reverse_dictionary[word])
        else:
            decoded_list.append('UNK')
    
    decoded_string = reduce(lambda x, y: x + " " + y, decoded_list)
    
    # return the decoded string
    return decoded_string.replace("<blank>", "")

In [None]:
# time to run this session:
'''
code snippet to run a tensorflow session for performing the training.
'''

''' 
    WARNING WARNING WARNING!!! This is the main training cell. 
    This cell will take a really really long time on low-end machines. It will however not crash your pc, since 
    I have bootstrapped the training in such a way that it loads a small chunk of data at a time to train.
'''

with tf.Session(graph=graph) as sess:
    # bring the global_index into current scope:
    global global_index
    
    # The saver object for saving and loading the model
    saver = tf.train.Saver(max_to_keep=2)
    
    # the path where the model will be saved 
    # let's visualize this graph in tensorboard:
    model_path = os.path.join(base_model_path, model_name)
    
    # create the summary_writer for tensorboard
    tensorboard_writer = tf.summary.FileWriter(model_path, graph=sess.graph)
    
    if(os.path.isfile(os.path.join(model_path, "checkpoint"))):
        # load the weights from the model
        # instead of global variable initializer, restore the graph:
        saver.restore(sess, tf.train.latest_checkpoint(model_path))
        
    else:
        
        # initialize all the variables
        sess.run(init)
    
    for ep in range((itera - 1) * no_of_epochs, itera * no_of_epochs):  # start the loop 
        
        start = global_index
        end = start + batch_size
            
        questions_raw = dquestions[start: end]
        queries_raw = dqueries[start: end]
            
        input_questions_batch = list(np.array(questions_raw).reshape(len(questions_raw), seqs_length))
        input_translate_batch = list(np.array(queries_raw).reshape(len(queries_raw), seqs_length))
            
        global_index = (global_index + batch_size) % data_size
        
        # construct the feed dictionary
        ques_dict = {i: d for (i, d) in zip(input_data, input_questions_batch)}
        quer_dict = {i: d for (i, d) in zip(input_translation, input_translate_batch)}
        combined_dict = dict(ques_dict.items() + quer_dict.items())
        
        # execute the training op
        _, cost = sess.run([train_op, loss], feed_dict=combined_dict)
        
        if((ep + 1) % checkpoint_factor == 0):
            # print the log statements:
            print "epoch: " + str(ep + 1)
            print "================================================================================================="
            print "================================================================================================="
            print('loss = {}'.format(cost))
            # run a random prediction:
            random_index = np.random.randint(batch_size)
            ques = input_questions_batch[random_index]
            ideal_quer = input_translate_batch[random_index]
            mock = np.array([dictionary['<go>']] + [dictionary['blank'] for _ in range(seqs_length - 1)])
            
            quer = sess.run(prediction, feed_dict={input_data[0]: ques, input_translation[0]: mock})
            
            print "Input Question  : " + decode(ques)
            print "Ideal Output    : " + decode(ideal_quer)
            print "Output Received : " + decode(quer[0])
            
            print "\n=========================================================================================\n"
            print "================================================================================================="
            print "================================================================================================="
            
            # run the summary op also
            summary = sess.run(all_summaries, feed_dict=combined_dict)

            # add the generated summary to the fileWriter
            tensorboard_writer.add_summary(summary, (ep + 1))
            
            # save the model trained so far:
            saver.save(sess, os.path.join(model_path, model_name), global_step = (ep + 1))

# The training is now complete. Let's calculate the accuracy and also, try feeding in some random inputs to test how well the model works in the next notebook.