## Padding
            Padding makes every encoded sentence in the corpus the same length, so that they can all be stored together in one rectangular array.

### Padding using numpy

In [2]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy corpus: each inner list holds the words of one sentence.
preprocessed_sentences = [
    ['barber', 'person'],
    ['barber', 'good', 'person'],
    ['barber', 'huge', 'person'],
    ['knew', 'secret'],
    ['secret', 'kept', 'huge', 'secret'],
    ['huge', 'secret'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'secret'],
    ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
    ['barber', 'went', 'huge', 'mountain'],
]


# Fit the tokenizer on the corpus, then replace every word with its integer id
# (the printed result is a list of integer lists, one per sentence).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_sentences)
encoded = tokenizer.texts_to_sequences(preprocessed_sentences)
print(encoded)



[[1, 5], [1, 8, 5], [1, 3, 5], [9, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 10, 1, 11], [1, 12, 3, 13]]


In [3]:
# Every row must end up the same length, so find the longest encoded sentence.

max_len = max(map(len, encoded))
print('max length :', max_len)

max length : 7


In [5]:
# Right-pad every sentence shorter than max_len with zeros.
#
# A new list is built for each row instead of appending to the lists held in
# `encoded`: that way `encoded` still contains the original, unpadded
# sequences after this cell runs, and re-running cells in any order gives the
# same result.
padded_np = np.array(
    [sentence + [0] * (max_len - len(sentence)) for sentence in encoded]
)
padded_np

array([[ 1,  5,  0,  0,  0,  0,  0],
       [ 1,  8,  5,  0,  0,  0,  0],
       [ 1,  3,  5,  0,  0,  0,  0],
       [ 9,  2,  0,  0,  0,  0,  0],
       [ 2,  4,  3,  2,  0,  0,  0],
       [ 3,  2,  0,  0,  0,  0,  0],
       [ 1,  4,  6,  0,  0,  0,  0],
       [ 1,  4,  6,  0,  0,  0,  0],
       [ 1,  4,  2,  0,  0,  0,  0],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 1, 12,  3, 13,  0,  0,  0]])

### Padding using the preprocessing tool in Keras

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Re-encode from the raw sentences so this section starts from fresh,
# unpadded integer sequences.
encoded=tokenizer.texts_to_sequences(preprocessed_sentences)
print(encoded)

[[1, 5], [1, 8, 5], [1, 3, 5], [9, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 10, 1, 11], [1, 12, 3, 13]]


In [7]:
# Default behaviour, spelled out: zeros are inserted in front of each sequence
# until every row is as long as the longest one.
padded = pad_sequences(encoded, padding='pre')
padded

array([[ 0,  0,  0,  0,  0,  1,  5],
       [ 0,  0,  0,  0,  1,  8,  5],
       [ 0,  0,  0,  0,  1,  3,  5],
       [ 0,  0,  0,  0,  0,  9,  2],
       [ 0,  0,  0,  2,  4,  3,  2],
       [ 0,  0,  0,  0,  0,  3,  2],
       [ 0,  0,  0,  0,  1,  4,  6],
       [ 0,  0,  0,  0,  1,  4,  6],
       [ 0,  0,  0,  0,  1,  4,  2],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 0,  0,  0,  1, 12,  3, 13]])

            This result differs from the NumPy version because, by default, pad_sequences does not add the zeros at the back of each document; it adds them at the front. To pad at the back instead, pass the argument padding='post'.

In [8]:
# padding='post' puts the zeros after each sequence instead of before it.
padded=pad_sequences(encoded,padding='post')
padded

array([[ 1,  5,  0,  0,  0,  0,  0],
       [ 1,  8,  5,  0,  0,  0,  0],
       [ 1,  3,  5,  0,  0,  0,  0],
       [ 9,  2,  0,  0,  0,  0,  0],
       [ 2,  4,  3,  2,  0,  0,  0],
       [ 3,  2,  0,  0,  0,  0,  0],
       [ 1,  4,  6,  0,  0,  0,  0],
       [ 1,  4,  6,  0,  0,  0,  0],
       [ 1,  4,  2,  0,  0,  0,  0],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 1, 12,  3, 13,  0,  0,  0]])

In [10]:
# Confirm that the Keras result matches the manual NumPy padding.
# np.array_equal also requires the shapes to agree, so arrays of different
# shapes give False instead of being broadcast against each other.
np.array_equal(padded, padded_np)


True

In [11]:
# The target length does not have to be the longest document in the corpus.
# If, say, one outlier has 5000 tokens while the rest average about 20, cap
# the length with `maxlen` (e.g. maxlen=20) rather than padding everything to 5000.
# Sequences longer than maxlen are cut at the front (truncating='pre', the default).

padded = pad_sequences(encoded, maxlen=5, padding='post', truncating='pre')
padded

array([[ 1,  5,  0,  0,  0],
       [ 1,  8,  5,  0,  0],
       [ 1,  3,  5,  0,  0],
       [ 9,  2,  0,  0,  0],
       [ 2,  4,  3,  2,  0],
       [ 3,  2,  0,  0,  0],
       [ 1,  4,  6,  0,  0],
       [ 1,  4,  6,  0,  0],
       [ 1,  4,  2,  0,  0],
       [ 3,  2, 10,  1, 11],
       [ 1, 12,  3, 13,  0]])

            Documents shorter than 5 are padded with zeros, and documents longer than 5 are cut down to 5, so some data is lost. For example, the second-to-last sentence was originally [7, 7, 3, 2, 10, 1, 11], but it is now [3, 2, 10, 1, 11]: by default the words at the front are dropped. If you would rather drop the words at the end, use the truncating argument. With truncating='post', the trailing words are removed instead.

In [12]:
# truncating='post' drops the trailing ids of over-long sequences, keeping the first `maxlen` of them.
padded = pad_sequences(encoded, padding='post', truncating='post', maxlen=5)
padded

array([[ 1,  5,  0,  0,  0],
       [ 1,  8,  5,  0,  0],
       [ 1,  3,  5,  0,  0],
       [ 9,  2,  0,  0,  0],
       [ 2,  4,  3,  2,  0],
       [ 3,  2,  0,  0,  0],
       [ 1,  4,  6,  0,  0],
       [ 1,  4,  6,  0,  0],
       [ 1,  4,  2,  0,  0],
       [ 7,  7,  3,  2, 10],
       [ 1, 12,  3, 13,  0]])

In [13]:
# To pad with a value other than zero, pick an integer that no word uses.

last_value = len(tokenizer.word_index) + 1 # one more than the number of entries in word_index
print(last_value)

14


In [14]:
# Use last_value instead of 0 as the filler for the padded positions.
padded = pad_sequences(encoded, padding='post', value=last_value)
padded

array([[ 1,  5, 14, 14, 14, 14, 14],
       [ 1,  8,  5, 14, 14, 14, 14],
       [ 1,  3,  5, 14, 14, 14, 14],
       [ 9,  2, 14, 14, 14, 14, 14],
       [ 2,  4,  3,  2, 14, 14, 14],
       [ 3,  2, 14, 14, 14, 14, 14],
       [ 1,  4,  6, 14, 14, 14, 14],
       [ 1,  4,  6, 14, 14, 14, 14],
       [ 1,  4,  2, 14, 14, 14, 14],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 1, 12,  3, 13, 14, 14, 14]])

In [16]:
# Now pad with 7 instead.
# NOTE: 7 also occurs as a word id in `encoded` (see the sentence
# [7, 7, 3, 2, 10, 1, 11]), so in the result the padding cannot be told apart
# from that word.

padded = pad_sequences(encoded, padding='post', value=7)
padded

array([[ 1,  5,  7,  7,  7,  7,  7],
       [ 1,  8,  5,  7,  7,  7,  7],
       [ 1,  3,  5,  7,  7,  7,  7],
       [ 9,  2,  7,  7,  7,  7,  7],
       [ 2,  4,  3,  2,  7,  7,  7],
       [ 3,  2,  7,  7,  7,  7,  7],
       [ 1,  4,  6,  7,  7,  7,  7],
       [ 1,  4,  6,  7,  7,  7,  7],
       [ 1,  4,  2,  7,  7,  7,  7],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 1, 12,  3, 13,  7,  7,  7]])