# In this Notebook, I'll write the script for training the Order-Planner Model defined in the base referenced paper
-------------------------------------------------------------------------------------------------------------------
link to paper -> https://arxiv.org/abs/1709.00155

-------------------------------------------------------------------------------------------------------------------
# Technology used: Tensorflow

as usual, I'll start with the utility cells:

In [63]:
# packages used for processing: 
import matplotlib.pyplot as plt # for visualization
import numpy as np

# for operating system related stuff
import os
import sys # for memory usage of objects
from subprocess import check_output

# The tensorflow_graph_package for this implementation
from Summary_Generator.Tensorflow_Graph.utils import *
from Summary_Generator.Text_Preprocessing_Helpers.pickling_tools import *

# import tensorflow temporarily:
import tensorflow as tf

# to plot the images inline
%matplotlib inline

In [3]:
# Input data files are available in the "../Data/" directory.

def exec_command(cmd):
    '''
        Run a shell command and echo its output in the python console.
        @params
        cmd = the command to run, given as a list holding the program
              followed by its arguments
              ex: ['ls', '../input']
    '''
    # check_output returns the captured stdout as bytes, so decode it before printing
    raw_output = check_output(cmd)
    decoded_output = raw_output.decode("utf8")
    print(decoded_output)

In [4]:
# check the structure of the project directory by listing the parent folder
# (this runs the external `ls` program through exec_command)
exec_command(['ls', '..'])

Data
LICENSE
Literature
README.md
Scripts
TensorFlow_implementation



In [5]:
np.random.seed(3) # seed numpy's global random generator for device-independent, consistent behaviour

In [59]:
''' Set the constants for the script '''

# root directory that holds all the data files
data_path = "../Data"

# the train.* data files, keyed by the kind of content each one holds
data_files_paths = dict(
    table_content=os.path.join(data_path, "train.box"),
    nb_sentences=os.path.join(data_path, "train.nb"),
    train_sentences=os.path.join(data_path, "train.sent")
)

# pickle file with the processed data, and the base directory used for model paths
plug_and_play_data_file = os.path.join(data_path, "plug_and_play.pickle")
base_model_path = "Models"

# constants for this script:
# percentage handed to the train/dev split
train_percentage = 90

## Unpickle the processed data file and create the train_dev partitions for it

In [7]:
# load the processed data from the plug-and-play pickle file
# NOTE(review): unPickleIt presumably wraps pickle loading -- only point it at trusted files; confirm
data = unPickleIt(plug_and_play_data_file)

In [8]:
# pull the encoded sequences and their companion dictionaries out of the loaded data:
# one (encodings, dict) pair each for the fields, the contents and the labels
field_encodings, field_dict = data['field_encodings'], data['field_dict']
content_encodings, content_dict = data['content_encodings'], data['content_dict']
label_encodings, label_dict = data['label_encodings'], data['label_dict']

## create a randomized cell that prints a complete sample to verify the sanity of the processed data

In [37]:
# choose the position of the sample to display at random
total_samples = len(field_encodings)
random_index = np.random.randint(total_samples)

# extract the three parts of this random sample in one go
random_field_sample, content_sample, label_sample = (
    field_encodings[random_index],
    content_encodings[random_index],
    label_encodings[random_index]
)

# decode the table into (field, content) pairs through the two dictionaries
decoded_table = [(field_dict[field], content_dict[content]) for (field, content) in zip(random_field_sample, content_sample)]

# decode the summary through the label dictionary
decoded_summary = [label_dict[label] for label in label_sample]

# print the extracted sample in meaningful format
print("Table Contents: ")
print(decoded_table)

print("\n")
print("Summary: ")
print(decoded_summary)

Table Contents: 
[('image', '<none>'), ('birthdate', '20'), ('birthdate', 'november'), ('birthdate', '1972'), ('birthplace', 'emporia'), ('birthplace', ','), ('birthplace', 'virginia'), ('position', 'defensive'), ('position', 'lineman'), ('number', '97'), ('college', 'north'), ('college', 'carolina'), ('heightft', '6'), ('heightin', '3'), ('weightlbs', '295'), ('undraftedyear', '1995'), ('stats', 'y'), ('databasefootball', 'parkerid01'), ('pfr', '<none>'), ('probowls', '<none>'), ('years', '1995\xc2\xa01996-2000\xc2\xa02001'), ('years', '2002-2003\xc2\xa02004'), ('teams', 'san'), ('teams', 'diego'), ('teams', 'chargers'), ('teams', 'seattle'), ('teams', 'seahawks'), ('teams', 'new'), ('teams', 'england'), ('teams', 'patriots'), ('teams', 'baltimore'), ('teams', 'ravens'), ('teams', 'san'), ('teams', 'francisco'), ('teams', '49ers'), ('articletitle', 'riddick'), ('articletitle', 'parker')]


Summary: 
['<start>', 'riddick', 'parker', '-lrb-', 'born', 'november', '20', ',', '1972', 'in',

run the above cell multiple times to satisfy yourself that the data is still sane.

## Perform random shuffling of the input data

In [39]:
# Shuffle the inputs and the labels in unison. Every input pairs a field sequence with
# its content sequence. The pairs are materialised into a list because under Python 3
# zip() returns a one-shot iterator that supports neither len() nor indexing; on
# Python 2 the extra list() is a harmless copy.
# NOTE(review): assumes synch_random_shuffle_non_np needs a sized sequence -- confirm.
X, Y = synch_random_shuffle_non_np(list(zip(field_encodings, content_encodings)), label_encodings)

## Perform train_dev_splitting of the given data:

In [60]:
# split the shuffled data into training and dev partitions, passing train_percentage to the splitter
train_X, train_Y, dev_X, dev_Y = split_train_dev(X, Y, train_percentage)

In [62]:
# report the number of examples that ended up in each partition
print("Number of Examples in Training set: ", len(train_X))
print("Number of Examples in the dev  set: ", len(dev_X))

('Number of Examples in Training set: ', 9)
('Number of Examples in the dev  set: ', 1)


# Building graph temporarily:

step 0: Set the Hyper constants for the graph building process

In [95]:
# Hyper constants consumed by the graph building cells:

# seed handed to the random initializers for consistent, debuggable behaviour
seed_value = 3

# vocabulary sizes: the number of entries in each of the three dictionaries
field_vocab_size, content_vocab_size, label_vocab_size = (
    len(field_dict),
    len(content_dict),
    len(label_dict)
)

# Embeddings size: the field and content embeddings share one width
field_embedding_size = 256
content_embedding_size = field_embedding_size
label_embedding_size = 256 # this is same as the other two (for now)

# LSTM hidden state sizes: cell state and hidden state are equal (for now)
lstm_cell_state_size = 512
hidden_state_size = lstm_cell_state_size

In [153]:
# graph reset point: clears the default graph, so re-running from here starts
# the graph building from an empty graph
tf.reset_default_graph()

step 1: Create placeholders for the computations in the graph

In [154]:
# Placeholders through which the input data is fed to the graph:
with tf.variable_scope("Input_data"):
    # the three encoded sequences are int32 with both dimensions left unspecified
    tf_field_encodings = tf.placeholder(
        dtype=tf.int32, shape=(None, None), name="input_field_encodings"
    )
    tf_content_encodings = tf.placeholder(
        dtype=tf.int32, shape=(None, None), name="input_content_encodings"
    )
    tf_label_encodings = tf.placeholder(
        dtype=tf.int32, shape=(None, None), name="input_label_encodings"
    )

    # lengths of the input sequences, one int32 per sequence
    # (needed because the sequences are padded to form a tensor)
    tf_input_seqs_lengths = tf.placeholder(
        dtype=tf.int32, shape=(None,), name="input_sequence_lengths"
    )

In [155]:
# check tf_field_encodings: show the placeholder's name, shape and dtype
print(tf_field_encodings)

Tensor("Input_data/input_field_encodings:0", shape=(?, ?), dtype=int32)


step 2: Obtain Embeddings for the input and the output sequences

In [156]:
# Embeddings for the given input data:
# NOTE(review): both initializers below are built with the same seed_value --
# confirm that seeding the two tables identically is intended.
with tf.variable_scope("Input_Embedder"):
    # field side: a (field_vocab_size x field_embedding_size) table drawn uniformly from [-1, 1)
    field_embedding_matrix = tf.get_variable(
        name="field_embedding_matrix",
        shape=(field_vocab_size, field_embedding_size),
        dtype=tf.float32,
        initializer=tf.random_uniform_initializer(minval=-1, maxval=1, seed=seed_value)
    )

    # look up one embedding vector per field id
    tf_field_embedded = tf.nn.embedding_lookup(
        params=field_embedding_matrix,
        ids=tf_field_encodings,
        name="field_embedder"
    )

    # content side: a (content_vocab_size x content_embedding_size) table drawn uniformly from [-1, 1)
    content_embedding_matrix = tf.get_variable(
        name="content_embedding_matrix",
        shape=(content_vocab_size, content_embedding_size),
        dtype=tf.float32,
        initializer=tf.random_uniform_initializer(minval=-1, maxval=1, seed=seed_value)
    )

    # look up one embedding vector per content id
    tf_content_embedded = tf.nn.embedding_lookup(
        params=content_embedding_matrix,
        ids=tf_content_encodings,
        name="content_embedder"
    )

In [157]:
# inspect the two embedded input tensors
print("Embedded_Input_Tensors: ", tf_field_embedded, tf_content_embedded)

('Embedded_Input_Tensors: ', <tf.Tensor 'Input_Embedder/field_embedder:0' shape=(?, ?, 256) dtype=float32>, <tf.Tensor 'Input_Embedder/content_embedder:0' shape=(?, ?, 256) dtype=float32>)


In [158]:
# Embeddings for the label (summary sentences):
with tf.variable_scope("Label_Embedder"):
    # a (label_vocab_size x label_embedding_size) table drawn uniformly from [-1, 1)
    label_embedding_matrix = tf.get_variable(
        name="label_embedding_matrix",
        shape=(label_vocab_size, label_embedding_size),
        dtype=tf.float32,
        initializer=tf.random_uniform_initializer(minval=-1, maxval=1, seed=seed_value)
    )

    # look up one embedding vector per label id
    tf_label_embedded = tf.nn.embedding_lookup(
        params=label_embedding_matrix,
        ids=tf_label_encodings,
        name="label_embedder"
    )

In [159]:
# inspect the embedded label tensor
print("Embedded_Label_Tensors: ", tf_label_embedded)

('Embedded_Label_Tensors: ', <tf.Tensor 'Label_Embedder/label_embedder:0' shape=(?, ?, 256) dtype=float32>)


In [160]:
# Join the field and content embeddings along the last (channel) axis
# to obtain the combined input tensor
with tf.variable_scope("Input_Concatenator"):
    tf_field_content_embedded = tf.concat(
        values=[tf_field_embedded, tf_content_embedded],
        axis=-1,
        name="concatenator"
    )

In [161]:
# inspect the concatenated tensor that is fed to the encoder
print("Final_Input_to_the_Encoder: ", tf_field_content_embedded)

('Final_Input_to_the_Encoder: ', <tf.Tensor 'Input_Concatenator/concatenator:0' shape=(?, ?, 512) dtype=float32>)


step 3: Create the encoder RNN to obtain the encoded input sequences.

In [162]:
with tf.variable_scope("Encoder"):
    # a single LSTM cell; every constructor argument other than the size is left at its default
    encoder_cell = tf.nn.rnn_cell.LSTMCell(lstm_cell_state_size)

    # run the cell over the combined input embeddings; sequence_length carries the
    # per-example lengths fed through tf_input_seqs_lengths
    encoded_input, final_state = tf.nn.dynamic_rnn(
        encoder_cell,
        tf_field_content_embedded,
        sequence_length=tf_input_seqs_lengths,
        dtype=tf.float32
    )

In [163]:
# inspect the per-step encoder outputs
print("Encoded_vectors_bank for attention mechanism: ", encoded_input)

('Encoded_vectors_bank for attention mechanism: ', <tf.Tensor 'Encoder/rnn/transpose:0' shape=(?, ?, 512) dtype=float32>)


In [164]:
# inspect the final state returned by the encoder
print("Final_state obtained from the last step of encoder: ", final_state)

('Final_state obtained from the last step of encoder: ', LSTMStateTuple(c=<tf.Tensor 'Encoder/rnn/while/Exit_2:0' shape=(?, 512) dtype=float32>, h=<tf.Tensor 'Encoder/rnn/while/Exit_3:0' shape=(?, 512) dtype=float32>))


## Create a stub_session to generate the graph visualization

In [165]:
# name of this model; joined with base_model_path to form the model directory
model_name = "Model_1"

In [166]:
# directory of this model: <base_model_path>/<model_name>
model_path = os.path.join(base_model_path, model_name)

In [168]:
with tf.Session() as sess:
    tensorboard_writer = tf.summary.FileWriter(model_path, graph=sess.graph, filename_suffix=".bot")