# In this notebook, I demonstrate how to create a multi-layered RNN using TensorFlow's dynamic_rnn function.
-------------------------------------------------------------------------------------------------------------------

# Technology used: Tensorflow

## The dataset used for this notebook is the Movie Review Sentiment Analysis dataset from Kaggle
link -> https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews

I start with the usual utility cells and then proceed with preliminary data analysis

In [1]:
# packages used for processing: 
import cPickle as pickle # for reading the data
import matplotlib.pyplot as plt # for visualization
import numpy as np

# for operating system related stuff
import os
import sys # for memory usage of objects
from subprocess import check_output

# the boss of frameworks
import tensorflow as tf

# for dataset building:
import collections

# to plot the images inline
%matplotlib inline

In [2]:
# Input data files are available in the "../Data/" directory.

def exec_command(cmd):
    '''
        Run an external command and echo whatever it wrote to stdout
        in the python console.
        @params
        cmd = the argument list handed to subprocess.check_output
              ex: ['ls', '../input']
        @return => None; the utf8-decoded output is printed, not returned
    '''
    raw_output = check_output(cmd)  # bytes captured from the child's stdout
    decoded_output = raw_output.decode("utf8")
    print(decoded_output)
    

In [3]:
# check the structure of the project directory
exec_command(['ls', '..'])

Data
Models
Scripts



In [4]:
''' Set the constants for the script '''

# various paths of the files
data_path = "../Data/sentiment_analysis_kaggle/" # the data path

# raw tsv files, keyed by split name
data_files = {
    "train": os.path.join(data_path, "train.tsv"),
    "test": os.path.join(data_path, "test.tsv")
}

base_model_path = '../Models' # root directory for everything the model writes

model_name = 'Sentiment_Analysis_Model_1'

# the tensorboard logs and the checkpoints share one directory: <base_model_path>/<model_name>
tensorboard_log_dir = os.path.join(base_model_path, model_name)

model_save_path = tensorboard_log_dir

# checkpoint file prefix that is handed to the saver in the training loop
model_save_filename = os.path.join(model_save_path, model_name)

# pickle holding the already shuffled and split data
plug_and_play_data_file_path = os.path.join(data_path, "plug_and_play.pickle")

# constants:
vocabulary_size = 15000 # rows of the embedding matrix; also passed to build_vocabulary as n_words
PAD = 0 # id appended by pad() to fill up the shorter sequences of a batch
hidden_cell_state_size = 512 # size of the LSTM cell_state
embedding_size = 128 # width of one word embedding
num_classes = 5 # number of sentiment classes (depth of the one-hot labels)
learning_rate = 0.001 # passed to the Adam optimizer
batch_size = 64 # number of examples per minibatch
no_of_epochs = 3 # full passes over the training minibatches
check_point_factor = 2 # report the loss and write a summary every 2 minibatches (the model itself is saved once per epoch)

In [5]:
# There doesn't seem to be any obvious package to load this tsv file. So, I am writing a 
# function myself to extract the training data from the train.tsv file

def get_train_data(file_path, feed_back = True, feed_back_factor = 100):
    '''
        function to load the tab separated data from the given file_path and generate a proper
        data_structure for it.
        @param
        file_path => the path where the file is located
        feed_back => when True, a progress message is printed every feed_back_factor data lines
        feed_back_factor => number of data lines between two progress messages
        @return => a list of dictionaries of the formatted data, one per data line. The keys are the
                   lower-cased column names of the header line; columns 0, 1 and 3 are converted
                   to int, column 2 is kept as a string
        @raises => IndexError if the header or a non-blank data line has fewer than 4 columns,
                   ValueError if one of the integer columns cannot be parsed
    '''
    data = [] # initialize the data to an empty list

    # open the file and start reading it line by line
    with open(file_path, "r") as data_file:
        heading = data_file.readline()
        dict_keys = heading.lower().split() # split the heading to generate the keys for the dicts

        # parse the remaining lines and convert them to structured dictionaries
        count = 0 # number of data lines parsed so far (for feedback purposes)
        for line in data_file:
            stripped_line = line.strip()
            if not stripped_line:
                # blank lines (e.g. a trailing newline at the end of the file) carry no data
                continue

            count += 1

            # split the line at '\t'
            vals = stripped_line.split("\t")

            # now, put the parsed values in the dict
            element_dict = {
                dict_keys[0]: int(vals[0]),
                dict_keys[1]: int(vals[1]),
                dict_keys[2]: vals[2],
                dict_keys[3]: int(vals[3]),
            }

            # append this newly formed dictionary to the data list
            data.append(element_dict)

            # single-argument print(...) behaves the same on python 2 and python 3
            if(feed_back and count % feed_back_factor == 0):
                print("Currently processing line number: " + str(count))
                print("data extracted from this line: " + str(element_dict) + "\n\n")

    # return the so created data
    return data

In [6]:
data = get_train_data(data_files["train"], feed_back=False)

In [7]:
data[0]

{'phrase': 'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
 'phraseid': 1,
 'sentenceid': 1,
 'sentiment': 1}

In [8]:
# report how many phrases were parsed from the training file
print("Total data examples to train on: " + str(len(data)))

Total data examples to train on: 156060


In [9]:
# collect every lower-cased, whitespace separated token of every phrase (duplicates are kept),
# this is the raw material for developing the vocabulary:
words = []
for elem in data:
    tokens = elem["phrase"].lower().split()
    words.extend(tokens)

print("total words in the dataset: " + str(len(words)))

total words in the dataset: 1124157


In [10]:
# de-duplicate the tokens (the order of the resulting list is arbitrary)
unique_words = list(set(words))
print("Unique words in the dataset: " + str(len(unique_words)))

Unique words in the dataset: 16531


build the dataset: <br>
using the helper from https://github.com/akanimax/machine-learning-helpers/blob/master/text/create_vocabulary_for_text.py

In [11]:
import collections # only dependency for this function

'''
    Note: this function assumes the input as a list of words in a meaningful sequence. This function can be easily modified for handling list of sequences as input for special cases of seq2seq models.
'''

def build_vocabulary(words, n_words):
    """
        Process raw inputs into a dataset.
        @param
        words => the list of all the words in the dataset
        n_words => the total vocabulary size, *including* the two reserved tokens
                   'BNK' (id 0) and 'UNK' (id 1). Every id handed out is therefore
                   smaller than n_words (as long as n_words >= 2; the two reserved
                   tokens are always present)
        @return => word_count, words_dictionary, words_reverse_dictionary
                   word_count is a list of [word, count] entries in id order; the reserved
                   tokens carry the placeholder counts 0 and -1
    """
    count = [['BNK', 0], ['UNK', -1]] # start with this list.
    reserved_tokens = set(token for token, _ in count)

    # the reserved tokens already occupy two ids, so only the remaining slots may be filled
    # with regular words; otherwise the largest id would be n_words and fall outside an
    # embedding matrix that has n_words rows
    num_regular_words = max(n_words - len(count), 0)

    # a dataset word that happens to equal a reserved token must not be counted as a regular
    # word, it would overwrite the reserved id and make two words share one id
    frequencies = collections.Counter(word for word in words if word not in reserved_tokens)
    count.extend(frequencies.most_common(num_regular_words)) # this is inplace. i.e. has a side effect

    # fill the dictionary with the reserved tokens followed by the most frequent words
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    # construct the reverse dictionary for the original dictionary
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    # return all the relevant stuff
    return count, dictionary, reversed_dictionary

In [12]:
# obtain the count, dictionary and the reverse_dictionary for the given dataset. 
count, dictionary, reverse_dictionary = build_vocabulary(words, vocabulary_size)

## Now, use the vocabulary to transform the data into sequences. 

In [13]:
def transform_dicts(in_dicts, dictionary):
    '''
        Function to transform the input lists of words into numerical sequences using the dictionary
        @param
        in_dicts => the list of input data dictionaries; each one needs a "phrase" (string)
                    and a "sentiment" entry
        dictionary => the word mapping from every word to an integer. It needs an 'UNK' entry
                      as soon as a phrase contains a word that is not in the mapping
        @return => input_sequences, sentiment class of those sequences
                   (two lists of the same length as in_dicts; every sequence is a plain list of ints)
    '''
    input_sequences = []; labels = [] # initialize them to empty ones

    # iterate over the data that was passed in (not over a notebook level global)
    for elem_dict in in_dicts:
        # same tokenization as used for building the vocabulary: lower-case, split on whitespace
        tokens = elem_dict["phrase"].lower().split()

        # transform the tokens using the dictionary; a list (instead of map) keeps the
        # result indexable and reusable on python 3 as well
        seq = [dictionary[token] if token in dictionary else dictionary['UNK'] for token in tokens]

        input_sequences.append(seq); labels.append(elem_dict["sentiment"])

    # return the so formed input_sequences and the labels
    return input_sequences, labels

In [14]:
input_sequences, sentiment_labels = transform_dicts(data, dictionary)

In [15]:
assert len(input_sequences) == len(sentiment_labels), "Data has been corrupted. Seqs and labels length not equal"
len(input_sequences), len(sentiment_labels)

(156060, 156060)

# In this notebook, we shuffle the data only once and save it permanently, so that the distributions of the train, test and dev sets don't change

I have done this process and saved the data in the plug_and_play.pickle file. Simply load that file to get the shuffled and split data

# use the function from the helpers repo to pickle the data:

link to function -> https://github.com/akanimax/machine-learning-helpers/blob/master/pickling_unpickling/pickling_operations.py

In [16]:
# function to unpickle the given file and load the obj back into the python environment
def unPickleIt(pickle_path): # might throw the file not found exception
    '''
        Load and return the python object that is stored in a pickle file.
        @param
        pickle_path => location of the pickle file on disk
        @return => whatever object was serialized into that file
        NOTE: unpickling can execute arbitrary code, only use this on files you trust.
    '''
    # binary mode is required for pickles; the file is closed as soon as the load is done
    with open(pickle_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [17]:
data_dict = unPickleIt(plug_and_play_data_file_path)

In [18]:
# unpack the saved splits from the unpickled dictionary
# (presumably the *_X entries hold the integer sequences and the *_Y entries the class ids -- TODO confirm)
train_X = data_dict['train_X']
train_Y = data_dict['train_Y']
test_X = data_dict['test_X']
test_Y = data_dict['test_Y']
# NOTE: from here on `labels` is whatever the pickle stored under 'labels'
labels = data_dict['labels']

In [19]:
labels

{0: 'negative',
 1: 'somewhat negative',
 2: 'neutral',
 3: 'somewhat positive',
 4: 'positive'}

In [20]:
# write a function to pad the batch of sequences into a fixed length (by padding) numpy array
def pad(seqs, pad_value=None):
    '''
        function to convert the list of seqs into a batch tensor (padding the batch to a nice length)
        @param
        seqs => the list of variable length scalar sequences (lists or tuples); it is not modified
        pad_value => the filler that is appended to the shorter sequences. When left as None
                     the notebook level PAD constant is used
        @return => The batch tensor converted using the seqs: a numpy array of shape
                   (length of the longest sequence, number of sequences), i.e. one sequence per column
        @raises => ValueError if seqs is empty
    '''
    if pad_value is None:
        pad_value = PAD

    if len(seqs) == 0:
        raise ValueError("pad() needs at least one sequence to build a batch")

    # the longest sequence decides the length of the whole batch
    max_length = max(len(seq) for seq in seqs)

    # extend every sequence in one step (instead of growing it one element at a time)
    converted_seqs = [list(seq) + [pad_value] * (max_length - len(seq)) for seq in seqs]

    # return the numpy array corresponding to the converted_seqs; the transpose puts the
    # time axis first, which is what the time_major RNN layers consume
    return np.array(converted_seqs).T

# So, now the setup is done. Let's move on to the actual dynamic_rnn_building
## I will use the InteractiveSession() to work with this

If anything down below goes wrong, try executing from this reset point.

In [21]:
tf.reset_default_graph()

In [22]:
# create placeholders for input_sequences and input_labels
with tf.variable_scope("Inputs"):
    # batch of word ids with both dimensions left open; the RNN layers below are built with
    # time_major=True, so the first axis is time and the second one is the batch
    tf_input_seqs = tf.placeholder(tf.int32, shape=(None, None), name='input_sequences')

    # one integer class id per example. NOTE: (None) is just None, which leaves the shape
    # completely unknown; the one-element tuple (None,) is what pins the placeholder to rank 1
    tf_senti_labs = tf.placeholder(tf.int32, shape=(None,), name='sentiment_labels')

    # (batch, num_classes) one-hot version of the labels for the cross entropy loss
    one_hot_encoded_senti_labs = tf.one_hot(tf_senti_labs, depth=num_classes, axis=1)

### Embedding Time!

In [23]:
with tf.variable_scope("Embedding"):
    # lookup table with one embedding_size wide row per word id, uniformly initialized.
    # NOTE(review): every id that is fed in has to be smaller than vocabulary_size -- verify this
    # against the dictionary that produced the data
    embedding_matrix = tf.get_variable("embedding_matrix", 
                                shape=(vocabulary_size, embedding_size), initializer=tf.random_uniform_initializer())

    # obtain the embedded version of input (adds a trailing axis of size embedding_size)
    embedded_tf_input_seqs = tf.nn.embedding_lookup(embedding_matrix, tf_input_seqs)

In [24]:
embedded_tf_input_seqs

<tf.Tensor 'Embedding/embedding_lookup:0' shape=(?, ?, 128) dtype=float32>

### Model Time!

In [25]:
# create single layer of dynamic_rnn:
# a peephole LSTM that is unrolled over the embedded input sequences.
# NOTE(review): no sequence_length is passed, so the recurrence presumably also runs over the
# padded positions of the shorter sequences -- consider feeding the true lengths
with tf.variable_scope("RNN_layer_1"):
    lay1_outputs, lay1_states = tf.nn.dynamic_rnn (
                                    tf.nn.rnn_cell.LSTMCell(hidden_cell_state_size, use_peepholes = True),
                                    embedded_tf_input_seqs,
                                    time_major = True, # the batch size is along the columns
                                    dtype = tf.float32 # we have to specify this since there is no initial state
                                )

In [26]:
# print the tensor information of lay1_outputs and lay1_states
print (lay1_outputs, lay1_states)

(<tf.Tensor 'RNN_layer_1/rnn/TensorArrayStack/TensorArrayGatherV3:0' shape=(?, ?, 512) dtype=float32>, LSTMStateTuple(c=<tf.Tensor 'RNN_layer_1/rnn/while/Exit_2:0' shape=(?, 512) dtype=float32>, h=<tf.Tensor 'RNN_layer_1/rnn/while/Exit_3:0' shape=(?, 512) dtype=float32>))


In [27]:
# time to build the second layer of the dynamic_rnn:
# for using a different LSTM cell for this layer, the variable scope of this layer needs to be different 
# from the earlier layer
# NOTE(review): this layer does not start from a zero state, it is seeded with the *final* (c, h)
# state of layer 1 -- confirm that this hand-over is intended
with tf.variable_scope("RNN_layer_2"):
    lay2_outputs, lay2_states = tf.nn.dynamic_rnn (
                                    tf.nn.rnn_cell.LSTMCell(hidden_cell_state_size, use_peepholes=True),
                                    lay1_outputs, # output of the previous layer is input to this layer
                                    time_major = True,
                                    initial_state = lay1_states
                                )

In [28]:
print (lay2_outputs, lay2_states)

(<tf.Tensor 'RNN_layer_2/rnn/TensorArrayStack/TensorArrayGatherV3:0' shape=(?, ?, 512) dtype=float32>, LSTMStateTuple(c=<tf.Tensor 'RNN_layer_2/rnn/while/Exit_2:0' shape=(?, 512) dtype=float32>, h=<tf.Tensor 'RNN_layer_2/rnn/while/Exit_3:0' shape=(?, 512) dtype=float32>))


In [29]:
# time to build the last layer of the dynamic lstm network
# only the final state is kept, the per time step outputs of this layer are discarded.
# NOTE: the scope is spelled "RNN_layer3" (no second underscore, unlike the two layers before);
# renaming it would also rename the variables created inside it.
with tf.variable_scope("RNN_layer3"):
    _, lay3_states = tf.nn.dynamic_rnn (
                        tf.nn.rnn_cell.LSTMCell(hidden_cell_state_size, use_peepholes=True),
                        lay2_outputs, # output of the previous layer is again the input of this layer
                        time_major = True,
                        initial_state = lay2_states # seeded with the final state of layer 2
                     )

In [30]:
# the final hidden state (h) of the last layer is the representation of the whole sequence;
# print the tensor to check its shape:
pre_projected_logits = lay3_states.h
print(pre_projected_logits)

Tensor("RNN_layer3/rnn/while/Exit_3:0", shape=(?, 512), dtype=float32)


In [31]:
# we will have to project this in order to obtain the logits from the lay3_states.h tensor
# (a single dense layer: hidden_cell_state_size -> num_classes)
with tf.variable_scope("Final_projection"):
    parameters = {
        # NOTE(review): random_normal_initializer() is used with its default arguments here;
        # a smaller spread may give smaller initial logits -- worth checking
        'weights': tf.get_variable('Final_weights', shape=(hidden_cell_state_size, num_classes), 
                                   initializer=tf.random_normal_initializer()),
        # shape (1, num_classes) so that the bias row broadcasts over the batch axis
        'biases' : tf.get_variable('Final_biases', shape=(1, num_classes), initializer=tf.zeros_initializer())
    }

    # obtain the last layer activations:
    projected_logits = tf.matmul(pre_projected_logits, parameters['weights']) + parameters['biases']

In [32]:
projected_logits

<tf.Tensor 'Final_projection/add:0' shape=(?, 5) dtype=float32>

### Loss Time!

In [33]:
with tf.variable_scope("Loss"):
    # calculate the loss between the projected_logits and the one_hot encoded labels
    # (the op is fed the raw logits; reduce_mean then averages the per example losses)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=projected_logits, 
                                                                    labels=one_hot_encoded_senti_labs))
    # scalar summary, so that the loss can be plotted in tensorboard
    loss_summary = tf.summary.scalar("loss", loss)

### Training Time!

In [34]:
# define the optimizer
with tf.variable_scope("Trainer"):
    # running train_step performs one Adam update of the variables w.r.t. the loss
    train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

In [35]:
# define the predictions
with tf.variable_scope("Predictions"):
    # manually take the softmax of the projected logits to obtain the predictions:
    # (the loss above consumes the raw logits, so this op is only needed for inference)
    predictions = tf.nn.softmax(projected_logits) 

### Errands Time!

In [36]:
# define the init op
# (it is created after the Trainer cell, so variables created by the optimizer are presumably covered as well)
with tf.variable_scope("INIT"):
    init = tf.global_variables_initializer()

In [37]:
# define the merged summaries op
# (merges the summaries that were defined so far; in the cells above that is the loss scalar)
with tf.variable_scope("ALL_SUMMARIES"):
    all_summaries = tf.summary.merge_all()

# Let's visualize this graph in Tensorboard to make sure everything is properly wired

In [38]:
sess = tf.InteractiveSession()

In [39]:
# run the init op
sess.run(init)

In [40]:
# create the tensorboard_visualizer:
# the session graph is passed along so that it can be inspected in tensorboard.
# NOTE: the variable name is misspelled ("wirter"); it is kept because the training loop
# refers to the writer by exactly this name.
tensorboard_wirter = tf.summary.FileWriter(logdir=tensorboard_log_dir, graph=sess.graph, filename_suffix='.bot')

# The graph looks cool! Let's move forward with the actual training of this model:

In [41]:
total_train_examples = len(train_X)

In [42]:
# create minibatches of the train_X and train_Y
minibatches = [] # initialize to empty list

# walk over the data in strides of batch_size; slicing clips the last
# minibatch to whatever is left over
for start in range(0, total_train_examples, batch_size):
    end = start + batch_size
    minibatches.append((train_X[start: end], train_Y[start: end]))

In [None]:
len(minibatches[-1][0]), len(minibatches[-1][1]), len(minibatches[-2][0]), len(minibatches[-2][1])

(33, 33, 64, 64)

In [None]:
# keep only the 3 most recent checkpoints on disk
saver = tf.train.Saver(max_to_keep=3)

# number of minibatches in one epoch; needed to keep the summary step increasing across epochs
batches_per_epoch = len(minibatches)

# start the training iterations
for epoch in range(no_of_epochs):
    print("Currently doing: epoch " + str(epoch + 1))
    print("=======================================================================================================")

    # iterate over every minibatch
    for iteration, (mini_train_X, mini_train_Y) in enumerate(minibatches):
        # pad the batch only once and reuse the same feed for the update and for the report
        feed_dict = {tf_input_seqs: pad(mini_train_X), tf_senti_labs: mini_train_Y}

        # compute the train_step.
        sess.run(train_step, feed_dict=feed_dict)

        # if it is checkpoint factor:
        if((iteration + 1) % check_point_factor == 0):
            # compute the cost and the summary for cost (on the batch that was just trained on)
            summaries, cost = sess.run((all_summaries, loss), feed_dict=feed_dict)

            # print a small message for feedback
            print(str(iteration + 1) + ".) Current Loss: " + str(cost))

            # add the summary. The step counts the minibatches seen since the start of the
            # training; restarting it at every epoch would put the points of different epochs
            # on top of each other in tensorboard
            tensorboard_wirter.add_summary(summaries, global_step=(epoch * batches_per_epoch + iteration + 1))

    print("=======================================================================================================\n\n")

    # save the model after every epoch
    saver.save(sess, model_save_filename, global_step=(epoch + 1))

Currently doing: epoch 1
2.) Current Loss: 22.7918
4.) Current Loss: 11.7678
6.) Current Loss: 4.61158
8.) Current Loss: 4.82146
10.) Current Loss: 1.6809
12.) Current Loss: 2.65704
14.) Current Loss: 2.6665
16.) Current Loss: 1.73891
18.) Current Loss: 1.80513
20.) Current Loss: 1.68673
22.) Current Loss: 1.45863
24.) Current Loss: 1.39958
26.) Current Loss: 1.47218
28.) Current Loss: 1.53066
30.) Current Loss: 1.24003
32.) Current Loss: 1.35464
34.) Current Loss: 1.53404
36.) Current Loss: 1.32031
38.) Current Loss: 1.3367
40.) Current Loss: 1.46536
42.) Current Loss: 1.38062
44.) Current Loss: 1.29986
46.) Current Loss: 1.30489
48.) Current Loss: 1.18321
50.) Current Loss: 1.11161
52.) Current Loss: 1.39637
54.) Current Loss: 1.20431
56.) Current Loss: 1.11043
58.) Current Loss: 1.3613
60.) Current Loss: 1.12724
62.) Current Loss: 1.28499
64.) Current Loss: 1.20053
66.) Current Loss: 1.18349
68.) Current Loss: 1.35467
70.) Current Loss: 1.2182
72.) Current Loss: 1.24563
74.) Current