# In this Notebook, I'll preprocess the data and generate a plug_and_play pickle file for it
-------------------------------------------------------------------------------------------------------------------
# Technology used: basic preprocessing tools

### usual utility cells

In [1]:
# packages used for processing: 
import numpy as np

# for operating system related stuff
import os
import sys # for memory usage of objects
from subprocess import check_output

# import the Text preprocessing helper to obtain the lists of field_name:content_word pairs
from Summary_Generator.Text_Preprocessing_Helpers.utils import *
from Summary_Generator.Tensorflow_Graph.utils import *
from Summary_Generator.Text_Preprocessing_Helpers.pickling_tools import *

Using TensorFlow backend.


In [2]:
# Input data files are available in the "../Data/" directory.

def exec_command(cmd):
    '''
        Run a shell command and echo what it wrote to its standard
        output in the python console.
        @params
        cmd = the command to run, given as a list holding the program
              followed by its arguments
              ex: ['ls', '../input']
    '''
    raw_output = check_output(cmd)
    print(raw_output.decode("utf8"))

In [3]:
# check the structure of the project directory by listing the parent folder
# (this runs the external `ls` program, so it must be available on the PATH)
exec_command(['ls', '..'])

Data
LICENSE
Literature
README.md
Scripts
TensorFlow_implementation



In [4]:
np.random.seed(3) # fix numpy's global seed for device-independent, consistent behaviour

In [5]:
''' Set the constants for the script '''

# root folder that holds the raw data files
data_path = "../Data"

# raw input files, keyed by the role they play in the preprocessing
data_files_paths = dict(
    (role, os.path.join(data_path, file_name))
    for role, file_name in [
        ("table_content", "train.box"),
        ("nb_sentences", "train.nb"),
        ("train_sentences", "train.sent"),
    ]
)

# location of the saved models and of the pickle file produced by this notebook
base_model_path = "Models"
plug_and_play_data_file = os.path.join(data_path, "plug_and_play.pickle")

# constants for the preprocessing script
train_percentage = 95

## Extract the data from the related files and properly structure it

In [6]:
# parse the table-content file (train.box) into three lists; their exact structure is
# defined by the imported prepare_input_data helper, which is not shown in this notebook
field_content_words, field_words, content_words = prepare_input_data(data_files_paths['table_content'])

In [7]:
# check that all three lists look right by printing a sample of each:
# the second entry of field_content_words and the first ten field / content words
print("Field_content_words: ", field_content_words[1])
print("Field_words: ", field_words[:10])
print("Content_words: ", content_words[:10])

('Field_content_words: ', ['name hui', 'name jun', 'image <none>', 'imagesize <none>', 'caption <none>', 'fullname hui', 'fullname jun', 'education <none>', 'nationality <none>', 'playingstyle <none>', 'birthdate <none>', 'birthplace <none>', 'deathdate <none>', 'deathplace <none>', 'height <none>', 'weight <none>', 'medaltemplates <none>', 'articletitle hui', 'articletitle jun'])
('Field_words: ', ['type', 'name', 'name', 'name', 'name', 'title', 'title', 'title', 'title', 'title'])
('Content_words: ', ['pope', 'michael', 'iii', 'of', 'alexandria', '56th', 'pope', 'of', 'alexandria', '&'])


In [8]:
# extract only the lengths of the field_content_words and delete the field_content_words in order
# to free up resources.
# NOTE: the lengths are materialized as a real list on purpose. pair_lengths is printed here and
# later reused to group both the field and the content sequences, so it has to survive several
# passes; a bare map(...) is a one-shot iterator on Python 3.
pair_lengths = list(map(len, field_content_words))
print(pair_lengths)
del field_content_words

[68, 19, 99, 53, 40, 25, 59, 57, 37, 76]


In [9]:
# build the label sentences from the sentence-count file (train.nb) and the sentences file
# (train.sent); how the two are combined is defined by the imported prepare_input_labels helper
label_sentences = prepare_input_labels(data_files_paths['nb_sentences'], data_files_paths['train_sentences'])

In [10]:
# label_sentences are concatenated properly to obtain the decoder sentences.
# show the first three of them, leaving a blank line after each one
for sent in label_sentences[:3]:
    print(sent + '\n')

<start> pope michael iii of alexandria -lrb- also known as khail iii -rrb- was the coptic pope of alexandria and patriarch of the see of st. mark -lrb- 880 -- 907 -rrb- . in 882 , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community . this building was at one time believed to have later become the site of the cairo geniza . <eos>

<start> hui jun is a male former table tennis player from china . <eos>

<start> okan Öztürk -lrb- born 30 november 1977 -rrb- is a turkish professional footballer . he currently plays as a striker for yeni malatyaspor . <eos>



In [11]:
# tokenize the field words and the content words independently; each call yields four values,
# unpacked here as the encoded data, two lookup dictionaries and a vocabulary size
# (their exact semantics are defined by the imported prepare_tokenizer helper)
train_data_field, field_dict, rev_field_dict, vocab_size_field = prepare_tokenizer(field_words)
train_data_content, content_dict, rev_content_dict, vocab_size_content = prepare_tokenizer(content_words)

In [12]:
# sanity check: print the field vocabulary size next to the sizes of both field dictionaries,
# then display the first three encoded field / content entries
print(vocab_size_field, len(rev_field_dict), len(field_dict))
train_data_field[:3], train_data_content[:3]

(106, 106, 106)


(array([[62],
        [ 2],
        [ 2]]), array([[11],
        [37],
        [38]]))

In [13]:
# use the group function to bring the data together:
# the tokenized data is first flattened into plain python lists. np.ravel always yields a 1-D
# array, so tolist() returns a flat list even when only one token is present (np.squeeze would
# collapse such a (1, 1) array to a 0-d array, and tolist() would then return a bare scalar).
field_seq = np.ravel(train_data_field).tolist()
content_seq = np.ravel(train_data_content).tolist()
# both flat lists are regrouped with the same per-table lengths
field_sequences, content_sequences = (group_tokenized_sequences(field_seq, pair_lengths),
                                         group_tokenized_sequences(content_seq, pair_lengths))

In [14]:
# inspect the field dictionary; the call form of print is used, like in every other cell,
# so that the cell is valid on both Python 2 and Python 3
print(field_dict)

{0: '<unk>', 1: 'years', 2: 'name', 3: 'birthplace', 4: 'birthdate', 5: 'caption', 6: 'articletitle', 7: 'image', 8: 'clubs', 9: 'deathplace', 10: 'teams', 11: 'label', 12: 'associatedacts', 13: 'title', 14: 'feastday', 15: 'caps', 16: 'goals', 17: 'statlabel', 18: 'deathdate', 19: 'beatifieddate', 20: 'residence', 21: 'pcupdate', 22: 'titles', 23: 'buried', 24: 'coach', 25: 'yearsactive', 26: 'spouse', 27: 'successor', 28: 'almamater', 29: 'fullname', 30: 'position', 31: 'predecessor', 32: 'religion', 33: 'imagesize', 34: 'debutteam', 35: 'statyear', 36: 'genre', 37: 'enthroned', 38: 'ended', 39: 'height', 40: 'event', 41: 'dateofhighestranking', 42: 'dateofcurrentranking', 43: 'updated', 44: 'team', 45: 'statvalue', 46: 'origin', 47: 'veneratedin', 48: 'beatifiedby', 49: 'patronage', 50: 'nationality', 51: 'weight', 52: 'medaltemplates', 53: 'currentclub', 54: 'plays', 55: 'website', 56: 'highestranking', 57: 'currentranking', 58: 'occupation', 59: 'number', 60: 'debutdate', 61: 'col

In [15]:
# print the first two grouped sequences of field_sequences and of content_sequences:
print(field_sequences[:2])
print(content_sequences[:2])

[[62, 2, 2, 2, 2, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 7, 5, 37, 37, 37, 38, 38, 38, 31, 31, 27, 27, 63, 64, 4, 65, 3, 18, 18, 18, 23, 23, 23, 23, 23, 23, 50, 32, 32, 32, 20, 20, 20, 20, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 28, 66, 6, 6, 6, 6, 6], [2, 2, 7, 33, 5, 29, 29, 67, 50, 68, 4, 3, 18, 9, 39, 51, 52, 6, 6]]
[[11, 37, 38, 4, 20, 83, 11, 4, 20, 84, 85, 4, 7, 86, 4, 87, 39, 1, 1, 40, 8, 88, 12, 13, 41, 89, 42, 90, 42, 1, 1, 1, 1, 91, 12, 13, 41, 92, 4, 43, 93, 7, 94, 95, 44, 96, 97, 43, 39, 21, 45, 12, 13, 5, 14, 98, 22, 7, 44, 99, 6, 1, 1, 11, 37, 38, 4, 20], [23, 24, 1, 1, 1, 23, 24, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 23, 24]]


## Check if the defined pad_sequences function works properly

In [16]:
# pad the grouped field sequences and print the shape of the padded result
# (note: despite the "Length" label, the value printed is the full .shape)
padded_field_sequences = pad_sequences(field_sequences)
print("Length of padded_sequences: ", padded_field_sequences.shape)

('Length of padded_sequences: ', (10, 99))


## Perform structuring of the label_sentences:

Step 1: convert the label_sentences into a single flat list (order preserved) in order to tokenize it

In [17]:
# extract the length information (number of whitespace-separated tokens) from the label_sentences.
# A real list is built on purpose: it is sliced right below and used again later to regroup the
# tokenized labels, which a one-shot map(...) iterator (Python 3) would not support.
label_sentences_lengths = [len(sentence.split()) for sentence in label_sentences]
print(label_sentences_lengths[:3])

[87, 14, 26]


In [18]:
# Flatten all the label sentences into a single list of words (order preserved).
# One nested comprehension does this in linear time. Folding the per-sentence lists together
# with reduce(lambda x, y: x + y, ...) re-copies the growing list at every step (quadratic),
# fails on an empty input, and relies on `reduce`, which is not a builtin on Python 3.
label_words_list = [word for sentence in label_sentences for word in sentence.split()]
print(label_words_list[:10])

['<start>', 'pope', 'michael', 'iii', 'of', 'alexandria', '-lrb-', 'also', 'known', 'as']


Step 2: Use the earlier defined function prepare_tokenizer to transform the input words into numeric sequences

In [19]:
# Use the tokenizer function to obtain the tokenized data, the two lookup dictionaries and the
# vocabulary size for the label words (the no-op self-assignment of train_data_label is removed)
train_data_label, label_dict, rev_label_dict, vocab_size_label = prepare_tokenizer(label_words_list)

In [20]:
# inspect the shape of the tokenized label data
print(train_data_label.shape)

(461, 1)


In [21]:
# use the group tokenized sequences function to restructure the tokenized input:
# np.ravel always yields a 1-D array, so tolist() returns a flat list even for a single token
# (np.squeeze would turn a (1, 1) array into a 0-d array and tolist() into a bare scalar)
label_seq = np.ravel(train_data_label).tolist()
label_sequences = group_tokenized_sequences(label_seq, label_sentences_lengths)

## Finally, perform the pickling of the Processed data

In [22]:
# bundle everything that has to be persisted into one dictionary for the pickle file:
pickling_data = dict(
    # ---- input structured data ----

    # field encodings and their lookup data
    field_encodings=field_sequences,
    field_dict=field_dict,
    field_rev_dict=rev_field_dict,
    field_vocab_size=vocab_size_field,

    # content encodings and their lookup data
    content_encodings=content_sequences,
    content_dict=content_dict,
    content_rev_dict=rev_content_dict,
    content_vocab_size=vocab_size_content,

    # ---- label summary sentences ----

    # label encodings and their lookup data
    label_encodings=label_sequences,
    label_dict=label_dict,
    label_rev_dict=rev_label_dict,
    label_vocab_size=vocab_size_label,
)

Use the pickling helper from this repository -> https://github.com/akanimax/machine-learning-helpers to perform pickling and unpickling. The code has been copied as-is and packaged in the Text_Preprocessing_Helpers module of this implementation.

In [23]:
# pickle the dictionary defined above at the plug_and_play_data_file path
# (pickleIt comes from the imported pickling_tools helper module)
pickleIt(pickling_data, plug_and_play_data_file)

The file has been pickled at: ../Data/plug_and_play.pickle


Thus, the purpose of this notebook is now complete. We can directly use this pickled data and start building the tensorflow graph to go forward.

## See you in the graph building module! Hasta la vista!