# Creating Seq2Seq model using the unit dynamic_rnn layers and LSTM cells

---------------------------------------------------------------------------------------------------
# Technology used: Tensorflow

In [1]:
# bring in the big daddy of machine learning and deep learning
import tensorflow as tf

# The usual suspects:
import cPickle as pickle # for reading the data
import matplotlib.pyplot as plt # for visualization
import numpy as np

# to plot the images inline
%matplotlib inline

# # for operating system related stuff
import os
import sys # for memory usage of objects
from subprocess import check_output

In [2]:
# Input data files are available in the "../Data/" directory.

def exec_command(cmd):
    '''
        function to execute a shell command and see it's 
        output in the python console
        @params
        cmd = the command to be executed along with the arguments
              ex: ['ls', '../input']
    '''
    print(check_output(cmd).decode("utf8"))

In [3]:
# check the structure of the project directory
exec_command(['ls', '..'])

Data
Models
Scripts



In [4]:
''' Set the constants for the script '''

# various paths of the files
data_path = "../Data/WikiSQL/data" # the data path

train_files = {
    "questions": os.path.join(data_path, "train.jsonl"),
    "tables": os.path.join(data_path, "train.tables.jsonl")
}

base_model_path = '../Models'

processed_data_file_path = os.path.join(data_path, "processed.pickle")
plug_and_play_data_file_path = os.path.join(data_path, "plug_and_play.pickle")

# constants:
seqs_length = 85
hidden_state_size = 128
embedding_size = 256
batch_size = 64
no_of_epochs = 1000
checkpoint_factor = 10

In [5]:
# check the contents of the data path
exec_command(['ls', data_path])

dev.db
dev.jsonl
dev.tables.jsonl
plug_and_play.pickle
processed.pickle
test.db
test.jsonl
test.tables.jsonl
train.db
train.jsonl
train.tables.jsonl



In [6]:
# create a function to unpickle the data into a python object
def unpickle(pickle_file):
    '''
        function to unpickle the pickle file into a python compatible object
        @param
        pickle => the pickle file path
        @return => the unpickled object
    '''
    with open(pickle_file) as dumper:
        return pickle.load(dumper)

## Let's start by unpickling the plug and play file.

In [7]:
data = unpickle(plug_and_play_data_file_path)

In [8]:
# unroll the elements from the packed data
dictionary = data["dictionary"]
reverse_dictionary = data["reverse_dictionary"]
dquestions = data["questions"]
dqueries = data["queries"]

In [9]:
np.array(dquestions).shape, np.array(dqueries).shape

((61297, 85), (61297, 85))

In [17]:
# create placeholders for encoder inputs and outputs:
tf.reset_default_graph()

encoder_input = [tf.placeholder(tf.int32, shape=(seqs_length), name="Input_sequence") for _ in range(batch_size)]
decoder_input = [tf.placeholder(tf.int32, shape=(seqs_length), name="Target_sequence") for _ in range(batch_size)]

decoder_input_tensor = tf.cast(tf.stack(decoder_input), tf.float32)

In [18]:
proj_weights = tf.Variable(tf.truncated_normal(shape=(3, len(dictionary))))
proj_biases = tf.Variable(tf.zeros(shape=(len(dictionary))))

In [19]:
# create the embedding_seq2seq model which is used with the tf.nn.GRUCell
# in case this cell is run multiple times,

cell = tf.nn.rnn_cell.LSTMCell(hidden_state_size)
outputs, states = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq (
        encoder_input, 
        decoder_input,
        cell,
        len(dictionary),
        len(dictionary),
        embedding_size,
        output_projection = (proj_weights, proj_biases),
        feed_previous = True
      )

ValueError: Dimensions must be equal, but are 128 and 3 for 'embedding_rnn_seq2seq/embedding_rnn_decoder/rnn_decoder/loop_function/xw_plus_b/MatMul' (op: 'MatMul') with input shapes: [85,128], [3,55003].

In [13]:
np.array(outputs).shape, outputs[0].shape

((64,), TensorShape([Dimension(85), Dimension(128)]))

In [34]:
output_tensor = tf.stack(outputs)
output_tensor

<tf.Tensor 'stack_2:0' shape=(64, 85, 128) dtype=float32>

In [33]:
# define the weights and biases for projecting the dimemsions
final_weights = tf.Variable(tf.truncated_normal(shape=(hidden_state_size, 1)))
final_biases = tf.Variable(tf.zeros(shape=(1)))

In [36]:
output_tensor = tf.map_fn(lambda x: tf.matmul(x, final_weights) + final_biases, output_tensor)
output_tensor = tf.squeeze(output_tensor)

In [39]:
output_tensor, decoder_input_tensor # the dimensions match! so now we can use it for calculating loss

(<tf.Tensor 'Squeeze_1:0' shape=(64, 85) dtype=float32>,
 <tf.Tensor 'Cast:0' shape=(64, 85) dtype=float32>)

In [40]:
states.c

<tf.Tensor 'embedding_rnn_seq2seq/embedding_rnn_decoder/rnn_decoder/rnn_decoder/lstm_cell/lstm_cell_63/add_1:0' shape=(85, 128) dtype=float32>

In [41]:
states.h

<tf.Tensor 'embedding_rnn_seq2seq/embedding_rnn_decoder/rnn_decoder/rnn_decoder/lstm_cell/lstm_cell_63/mul_2:0' shape=(85, 128) dtype=float32>

In [42]:
loss = tf.reduce_mean(tf.squared_difference(output_tensor, decoder_input_tensor), name="loss")
loss_summary = tf.summary.scalar("Loss", loss)

In [43]:
optimizer = tf.train.AdamOptimizer()
train_step = optimizer.minimize(loss)

In [44]:
# initializer op for the graph
init = tf.global_variables_initializer()
all_summaries = tf.summary.merge_all()

# Let's try visualizing the graph in tensorboard to check if all the wirings are correct

In [45]:
itera = 1
model_name = "Model3"
data_size = len(dquestions)

In [46]:
visualizer_path = os.path.join(base_model_path, model_name)
tensorboard_writer = tf.summary.FileWriter(visualizer_path, sess.graph, filename_suffix=".bot")

# Alright, the wiring looks okay at the first look. Let's start training the model!

In [47]:
global_index = 0

In [48]:
# The session runner cell:
'''
code snippet to run a tensorflow session for performing the training.
'''

''' 
    WARNING WARNING WARNING!!! This is the main training cell. 
    This cell will take a really really long time on low-end machines. It will however not crash your pc, since 
    I have bootstrapped the training in such a way that it loads a small chunk of data at a time to train.
'''

with tf.Session() as sess:
    # bring the global_index into current scope:
    global global_index
    
    # The saver object for saving and loading the model
    saver = tf.train.Saver(max_to_keep=2)
    
    # the path where the model will be saved 
    ''' Currently using the Model no. 2 '''
    # let's visualize this graph in tensorboard:
    model_path = os.path.join(base_model_path, model_name)
    
    if(os.path.isfile(os.path.join(model_path, "checkpoint"))):
        # load the weights from the model
        # instead of global variable initializer, restore the graph:
        saver.restore(sess, tf.train.latest_checkpoint(model_path))
        
    else:
        
        # initialize all the variables
        sess.run(init)
    
    for ep in range((itera - 1) * no_of_epochs, itera * no_of_epochs):  # start the loop 
        
        start = global_index
        end = start + batch_size
            
        questions_raw = dquestions[start: end]
        queries_raw = dqueries[start: end]
            
        global_index = (global_index + batch_size) % data_size
        
        # construct the feed dictionary
        ques_dict = {i: d for (i, d) in zip(encoder_input, questions_raw)}
        quer_dict = {i: d for (i, d) in zip(decoder_input, queries_raw)}
        combined_dict = dict(ques_dict.items() + quer_dict.items())
        
        # execute the training op
        _, cost = sess.run([train_step, loss], feed_dict=combined_dict)
        
        if((ep + 1) % checkpoint_factor == 0):
            # print the log statements:
            print "epoch: " + str(ep + 1)
            print "================================================================================================="
            print "================================================================================================="
            print('loss = {}'.format(cost))
            print "\n=========================================================================================\n"
            print "================================================================================================="
            print "================================================================================================="
            
            # run the summary op also
            summary = sess.run(all_summaries, feed_dict=combined_dict)

            # add the generated summary to the fileWriter
            tensorboard_writer.add_summary(summary, (ep + 1))
            
            # save the model trained so far:
            saver.save(sess, os.path.join(model_path, model_name), global_step = (ep + 1))

epoch: 10
loss = 8590258.0


epoch: 20
loss = 4888804.0


epoch: 30
loss = 7846376.5


epoch: 40
loss = 7039440.5


epoch: 50
loss = 3569902.75


epoch: 60
loss = 9106089.0


epoch: 70
loss = 12809460.0


epoch: 80
loss = 5550352.5


epoch: 90
loss = 11707415.0


epoch: 100
loss = 8912952.0


epoch: 110
loss = 9867705.0


epoch: 120
loss = 11138838.0


epoch: 130
loss = 9770695.0


epoch: 140
loss = 5204987.5


epoch: 150
loss = 12867747.0


epoch: 160
loss = 7759093.5


epoch: 170
loss = 6811445.5




epoch: 180
loss = 8077971.0


epoch: 190
loss = 7005283.5


epoch: 200
loss = 10219593.0


epoch: 210
loss = 10619173.0


epoch: 220
loss = 6855763.5


epoch: 230
loss = 8810130.0


epoch: 240
loss = 8488360.0


epoch: 250
loss = 8119549.5


epoch: 260
loss = 6680278.5


epoch: 270
loss = 7994903.5


epoch: 280
loss = 4993389.5


epoch: 290
loss = 2705012.75


epoch: 300
loss = 4289438.5


epoch: 310
loss = 5700191.0


epoch: 320
loss = 7061551.5


epoch: 330
loss = 9301330.0




epoch: 340
loss = 7052962.5


epoch: 350
loss = 5097945.0


epoch: 360
loss = 6433445.0


epoch: 370
loss = 5734353.5


epoch: 380
loss = 6717685.5


epoch: 390
loss = 3777068.5


epoch: 400
loss = 7209726.5


epoch: 410
loss = 7311528.5


epoch: 420
loss = 8420218.0


epoch: 430
loss = 5299102.0


epoch: 440
loss = 11118746.0


epoch: 450
loss = 9024522.0


epoch: 460
loss = 6864376.5


epoch: 470
loss = 9634773.0


epoch: 480
loss = 5732134.0


epoch: 490
loss = 5398076.0




epoch: 500
loss = 4669407.5


epoch: 510
loss = 5623746.5


epoch: 520
loss = 4881076.5


epoch: 530
loss = 6071260.5


epoch: 540
loss = 7382319.5


epoch: 550
loss = 6095815.0


epoch: 560
loss = 12562822.0


epoch: 570
loss = 10368860.0


epoch: 580
loss = 7569588.0


epoch: 590
loss = 4971739.0


epoch: 600
loss = 5387962.0


epoch: 610
loss = 6264020.5


epoch: 620
loss = 4048196.25


epoch: 630
loss = 4754331.0


epoch: 640
loss = 3949751.75


epoch: 650
loss = 7344845.0




epoch: 660
loss = 5162297.0


epoch: 670
loss = 8410520.0


epoch: 680
loss = 6274788.0


epoch: 690
loss = 6065816.0


epoch: 700
loss = 9326611.0


epoch: 710
loss = 7490053.5




KeyboardInterrupt: 