# In this Notebook, I'll preprocess the data and generate a plug_and_play pickle file for it
-------------------------------------------------------------------------------------------------------------------
# Technology used: basic preprocessing tools

### usual utility cells

In [1]:
# packages used for processing: 
import matplotlib.pyplot as plt # for visualization
import numpy as np

# for pickling the data
import cPickle as pickle

# for operating system related stuff
import os
import sys # for memory usage of objects
from subprocess import check_output

# to plot the images inline
%matplotlib inline

In [2]:
# Input data files are available in the "../Data/" directory.

def exec_command(cmd):
    '''
        function to execute a shell command and see it's 
        output in the python console
        @params
        cmd = the command to be executed along with the arguments
              ex: ['ls', '../input']
    '''
    print(check_output(cmd).decode("utf8"))

In [3]:
# check the structure of the project directory
exec_command(['ls', '..'])

Data
LICENSE
Literature
README.md
Scripts
TensorFlow_implementation



In [4]:
np.random.seed(3) # set this seed for a device independant consistent behaviour

In [5]:
''' Set the constants for the script '''

# various paths of the files
data_path = "../Data" # the data path

data_files_paths = {
    "table_content": os.path.join(data_path, "train.box"),
    "nb_sentences" : os.path.join(data_path, "train.nb"),
    "train_sentences": os.path.join(data_path, "train.sent")
}

base_model_path = "Models"
plug_and_play_data_file = os.path.join(data_path, "plug_and_play.pickle")

# constants for the preprocessing script
train_percentage = 95

## perform structuring of the data extracted from the related files

In [6]:
# import the Text preprocessing helper to obtain the lists of field_name:content_word pairs
from Summary_Generator.Text_Preprocessing_Helpers.utils import *

Using TensorFlow backend.


In [7]:
field_content_words, field_words, content_words = prepare_input_data(data_files_paths['table_content'])

In [8]:
# check if all the three lists are proper by printing them out
print("Field_content_words: ", field_content_words[0])
print("Field_words: ", field_words[:10])
print("Content_words: ", content_words[:10])

('Field_content_words: ', ['type pope', 'name michael', 'name iii', 'name of', 'name alexandria', 'title 56th', 'title pope', 'title of', 'title alexandria', 'title &', 'title patriarch', 'title of', 'title the', 'title see', 'title of', 'title st.', 'title mark', 'enthroned 25', 'enthroned april', 'enthroned 880', 'ended 16', 'ended march', 'ended 907', 'predecessor shenouda', 'predecessor i', 'successor gabriel', 'successor i', 'birthplace egypt', 'deathdate 16', 'deathdate march', 'deathdate 907', 'buried monastery', 'buried of', 'buried saint', 'buried macarius', 'buried the', 'buried great', 'nationality egyptian', 'religion coptic', 'religion orthodox', 'religion christian', 'residence saint', 'residence mark', "residence 's", 'residence church', 'feastday 16', 'feastday march', 'feastday -lrb-', 'feastday 20', 'feastday baramhat', 'feastday in', 'feastday the', 'feastday coptic', 'feastday calendar', 'feastday -rrb-', 'articletitle pope', 'articletitle michael', 'articletitle ii

In [9]:
label_sentences = prepare_input_labels(data_files_paths['nb_sentences'], data_files_paths['train_sentences'])

In [10]:
# label_sentences are concatenated properly to obtain the decoder sentences.
for sent in label_sentences[:3]: print(sent + '\n')

<start> pope michael iii of alexandria -lrb- also known as khail iii -rrb- was the coptic pope of alexandria and patriarch of the see of st. mark -lrb- 880 -- 907 -rrb- . in 882 , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community . this building was at one time believed to have later become the site of the cairo geniza . <eos>

<start> hui jun is a male former table tennis player from china . <eos>

<start> okan Öztürk -lrb- born 30 november 1977 -rrb- is a turkish professional footballer . he currently plays as a striker for yeni malatyaspor . <eos>



In [11]:
train_data_field, field_dict, rev_field_dict, vocab_size_field = prepare_tokenizer(field_words)

In [18]:
print("Train_data_field: ", train_data_field.shape)
print train_data_field, vocab_size_field, rev_field_dict, field_dict

('Train_data_field: ', (472, 1))
[[56]
 [ 2]
 [ 2]
 [ 2]
 [ 2]
 [12]
 [12]
 [12]
 [12]
 [12]
 [12]
 [12]
 [12]
 [12]
 [12]
 [12]
 [12]
 [35]
 [35]
 [35]
 [36]
 [36]
 [36]
 [29]
 [29]
 [26]
 [26]
 [ 3]
 [22]
 [22]
 [22]
 [23]
 [23]
 [23]
 [23]
 [23]
 [23]
 [57]
 [37]
 [37]
 [37]
 [20]
 [20]
 [20]
 [20]
 [13]
 [13]
 [13]
 [13]
 [13]
 [13]
 [13]
 [13]
 [13]
 [13]
 [ 4]
 [ 4]
 [ 4]
 [ 4]
 [ 4]
 [ 2]
 [ 2]
 [30]
 [30]
 [ 4]
 [ 4]
 [ 2]
 [ 2]
 [30]
 [30]
 [58]
 [ 5]
 [ 5]
 [ 5]
 [ 3]
 [ 3]
 [ 3]
 [48]
 [48]
 [59]
 [27]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [15]
 [15]
 [15]
 [15]
 [15]
 [15]
 [15]
 [15]
 [15]
 [15]
 [15]
 [16]
 [16]
 [16]
 [16]
 [16]
 [16]
 [16]
 [16]
 [16]
 [16]
 [16]
 [19]
 [19]
 [19]
 [19]
 [19]
 [19]
 [19]
 