# In this notebook, I preprocess the data and generate a plug_and_play pickle file from it
-------------------------------------------------------------------------------------------------------------------
# Technology used: basic preprocessing tools

### usual utility cells

In [1]:
# packages used for processing: 
import matplotlib.pyplot as plt # for visualization
import numpy as np

# for pickling the data
import cPickle as pickle

# for operating system related stuff
import os
import sys # for memory usage of objects
from subprocess import check_output

# import the Text preprocessing helper to obtain the lists of field_name:content_word pairs
from Summary_Generator.Text_Preprocessing_Helpers.utils import *
from Summary_Generator.Tensorflow_Graph.utils import *

# to plot the images inline
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Input data files are available in the "../Data/" directory.

def exec_command(cmd):
    '''
        run a shell command and show its output
        in the python console
        @params
        cmd = the command to run together with its arguments
              ex: ['ls', '../input']
    '''
    # check_output hands back the raw bytes the command wrote to stdout
    raw_output = check_output(cmd)
    # decode them so that readable text is printed instead of a bytes repr
    decoded_output = raw_output.decode("utf8")
    print(decoded_output)

In [3]:
# check the structure of the project directory by listing the parent directory
exec_command(['ls', '..'])

Data
LICENSE
Literature
README.md
Scripts
TensorFlow_implementation



In [4]:
np.random.seed(3) # seed NumPy's global random generator for device-independent, consistent behaviour

In [5]:
''' Set the constants for the script '''

# directory that holds all the data files
data_path = "../Data"

# logical name -> path of each training data file inside data_path
data_files_paths = {
    key: os.path.join(data_path, file_name)
    for key, file_name in (
        ("table_content", "train.box"),
        ("nb_sentences", "train.nb"),
        ("train_sentences", "train.sent"),
    )
}

# model directory name and the path of the plug_and_play pickle file
base_model_path = "Models"
plug_and_play_data_file = os.path.join(data_path, "plug_and_play.pickle")

# constants for the preprocessing script
# (presumably the percentage of the data used for training -- confirm where it is consumed)
train_percentage = 95

## Extract the data from the related files and properly structure it

In [6]:
# load the table contents through the imported helper; it returns three lists
# (field/content pairs, field words, content words) that are inspected in the next cell
field_content_words, field_words, content_words = prepare_input_data(data_files_paths['table_content'])

In [7]:
# check that all three lists look right by printing a sample of each:
# the second entry of field_content_words and the first ten field / content words
print("Field_content_words: ", field_content_words[1])
print("Field_words: ", field_words[:10])
print("Content_words: ", content_words[:10])

('Field_content_words: ', ['name hui', 'name jun', 'image <none>', 'imagesize <none>', 'caption <none>', 'fullname hui', 'fullname jun', 'education <none>', 'nationality <none>', 'playingstyle <none>', 'birthdate <none>', 'birthplace <none>', 'deathdate <none>', 'deathplace <none>', 'height <none>', 'weight <none>', 'medaltemplates <none>', 'articletitle hui', 'articletitle jun'])
('Field_words: ', ['type', 'name', 'name', 'name', 'name', 'title', 'title', 'title', 'title', 'title'])
('Content_words: ', ['pope', 'michael', 'iii', 'of', 'alexandria', '56th', 'pope', 'of', 'alexandria', '&'])


In [8]:
# extract only the lengths of the field_content_words entries and delete
# field_content_words afterwards in order to free up resources.
# The lengths are materialised as a real list because they are printed here and
# reused more than once later on; a bare map() is a one-shot iterator on Python 3.
pair_lengths = list(map(len, field_content_words))
print(pair_lengths)
del field_content_words

[68, 19, 99, 53, 40, 25, 59, 57, 37, 76]


In [9]:
# build the label sentences from the 'nb_sentences' and 'train_sentences' files
# through the imported helper
label_sentences = prepare_input_labels(data_files_paths['nb_sentences'], data_files_paths['train_sentences'])

In [10]:
# label_sentences are concatenated properly to obtain the decoder sentences;
# show the first three, each followed by an empty line
for sent in label_sentences[:3]:
    print(sent + '\n')

<start> pope michael iii of alexandria -lrb- also known as khail iii -rrb- was the coptic pope of alexandria and patriarch of the see of st. mark -lrb- 880 -- 907 -rrb- . in 882 , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community . this building was at one time believed to have later become the site of the cairo geniza . <eos>

<start> hui jun is a male former table tennis player from china . <eos>

<start> okan Öztürk -lrb- born 30 november 1977 -rrb- is a turkish professional footballer . he currently plays as a striker for yeni malatyaspor . <eos>



In [11]:
# tokenize the field words and the content words independently; each call returns the
# encoded data, a dictionary, a reverse dictionary and the vocabulary size
# (presumably word <-> index mappings -- confirm in prepare_tokenizer)
train_data_field, field_dict, rev_field_dict, vocab_size_field = prepare_tokenizer(field_words)
train_data_content, content_dict, rev_content_dict, vocab_size_content = prepare_tokenizer(content_words)

In [12]:
# print the field vocabulary size next to the sizes of both field dictionaries for comparison
print(vocab_size_field, len(rev_field_dict), len(field_dict))
# display the first three encoded entries of the field data and of the content data
train_data_field[:3], train_data_content[:3]

(106, 106, 106)


(array([[62],
        [ 2],
        [ 2]]), array([[11],
        [37],
        [38]]))

In [13]:
# use the group function to bring the data together:
# first flatten the column-shaped token arrays into plain python lists.
# np.ravel is used instead of np.squeeze because squeeze collapses an array that holds
# a single token into a 0-d array, whose tolist() is a bare number rather than a list.
field_seq = np.ravel(train_data_field).tolist()
content_seq = np.ravel(train_data_content).tolist()

# then regroup both flat sequences with the same pair_lengths
field_sequences = group_tokenized_sequences(field_seq, pair_lengths)
content_sequences = group_tokenized_sequences(content_seq, pair_lengths)

In [15]:
# print the first two entries of the field_sequences and of the content_sequences:
print(field_sequences[:2])
print(content_sequences[:2])

[[62, 2, 2, 2, 2, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 7, 5, 37, 37, 37, 38, 38, 38, 31, 31, 27, 27, 63, 64, 4, 65, 3, 18, 18, 18, 23, 23, 23, 23, 23, 23, 50, 32, 32, 32, 20, 20, 20, 20, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 28, 66, 6, 6, 6, 6, 6], [2, 2, 7, 33, 5, 29, 29, 67, 50, 68, 4, 3, 18, 9, 39, 51, 52, 6, 6]]
[[11, 37, 38, 4, 20, 83, 11, 4, 20, 84, 85, 4, 7, 86, 4, 87, 39, 1, 1, 40, 8, 88, 12, 13, 41, 89, 42, 90, 42, 1, 1, 1, 1, 91, 12, 13, 41, 92, 4, 43, 93, 7, 94, 95, 44, 96, 97, 43, 39, 21, 45, 12, 13, 5, 14, 98, 22, 7, 44, 99, 6, 1, 1, 11, 37, 38, 4, 20], [23, 24, 1, 1, 1, 23, 24, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 23, 24]]


## Check if the defined pad_sequences function works properly

In [16]:
# pad the grouped field sequences with the imported pad_sequences helper
padded_field_sequences = pad_sequences(field_sequences)
# note: the value printed below is the shape of the padded result, not a single length
print("Length of padded_sequences: ", padded_field_sequences.shape)

('Length of padded_sequences: ', (10, 99))


## Perform structuring of the label_sentences:

Step 1: convert the label_sentences into a single flat list (order preserved) in order to tokenize it

In [17]:
# extract the length information from the label_sentences
# (each label sentence is a string, so these are character counts, not word counts).
# The result is materialised as a list so that it can be sliced on Python 2 and 3 alike.
label_sentences_lengths = list(map(len, label_sentences))
print(label_sentences_lengths[:3])

[450, 71, 155]


In [18]:
# Flatten the label sentences into one list of words (order preserved).
# The words are collected in a single pass: concatenating the per-sentence lists one
# by one with reduce would copy the growing result for every sentence (quadratic time)
# and would fail on an empty label_sentences. The generator expression keeps its loop
# variables out of the notebook namespace.
label_words_list = list(word for sent in label_sentences for word in sent.split())
print(label_words_list[:10])

['<start>', 'pope', 'michael', 'iii', 'of', 'alexandria', '-lrb-', 'also', 'known', 'as']


Step 2: Use the previously defined prepare_tokenizer function to transform the input words into numeric sequences

In [19]:
# Use the tokenizer function to obtain the token and processed data for label_sentences
# NOTE(review): the whole sentences (label_sentences) are passed here, not the flat
# label_words_list built in step 1 -- confirm that this is intended
train_data_label, label_dict, rev_label_dict, vocab_size_label = prepare_tokenizer(label_sentences)
# turn the returned data into plain (nested) python lists
train_data_label = train_data_label.tolist()

In [22]:
# show the first three tokenized label sentences
print(train_data_label[:3])

[[8, 32, 33, 34, 4, 35, 5, 47, 23, 17, 36, 34, 6, 12, 3, 48, 32, 4, 35, 10, 49, 4, 3, 50, 4, 51, 52, 5, 53, 54, 55, 6, 1, 13, 56, 2, 3, 57, 4, 58, 2, 59, 60, 61, 2, 62, 36, 15, 63, 64, 65, 2, 66, 67, 15, 68, 7, 69, 10, 70, 71, 72, 15, 3, 73, 74, 75, 1, 76, 77, 12, 78, 79, 80, 81, 15, 82, 37, 83, 3, 84, 4, 3, 85, 86, 1, 9], [8, 87, 88, 11, 7, 89, 24, 90, 91, 25, 18, 92, 1, 9], [8, 93, 94, 5, 14, 95, 26, 96, 6, 11, 7, 97, 19, 98, 1, 20, 99, 100, 17, 7, 101, 27, 102, 103, 1, 9]]
