# 패딩 Padding
---

In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenized corpus: each inner list holds the tokens of one preprocessed sentence.
preprocessed_sentences = [
    ['barber', 'person'],
    ['barber', 'good', 'person'],
    ['barber', 'huge', 'person'],
    ['knew', 'secret'],
    ['secret', 'kept', 'huge', 'secret'],
    ['huge', 'secret'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'secret'],
    ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
    ['barber', 'went', 'huge', 'mountain'],
]

# Integer encoding: fit a Keras Tokenizer on the corpus so that later cells
# can convert the sentences into sequences of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_sentences)


2023-12-22 16:03:11.419579: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Numpy 패딩

In [2]:
# Convert every tokenized sentence into a list of integer word indices.
encoded = tokenizer.texts_to_sequences(preprocessed_sentences)

# The longest sentence defines the common width of the padded matrix.
max_len = max(len(item) for item in encoded)
print('최대 길이: ', max_len)

# Right-pad each sentence with zeros up to max_len. New rows are built instead
# of appending to the inner lists, so `encoded` keeps the raw (ragged)
# sequences and only `padded_np` holds the padded, rectangular result.
padded_np = np.array(
    [sentence + [0] * (max_len - len(sentence)) for sentence in encoded]
)
padded_np

최대 길이:  7


array([[ 1,  5,  0,  0,  0,  0,  0],
       [ 1,  8,  5,  0,  0,  0,  0],
       [ 1,  3,  5,  0,  0,  0,  0],
       [ 9,  2,  0,  0,  0,  0,  0],
       [ 2,  4,  3,  2,  0,  0,  0],
       [ 3,  2,  0,  0,  0,  0,  0],
       [ 1,  4,  6,  0,  0,  0,  0],
       [ 1,  4,  6,  0,  0,  0,  0],
       [ 1,  4,  2,  0,  0,  0,  0],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 1, 12,  3, 13,  0,  0,  0]])

## Keras 전처리 도구 pad_sequences 활용
    from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Re-encode the corpus so this cell starts from the raw integer sequences.
encoded = tokenizer.texts_to_sequences(preprocessed_sentences)

# With no extra arguments pad_sequences uses padding='pre':
# zeros are inserted in front of the shorter sequences.
padded = pad_sequences(sequences=encoded)
padded

array([[ 0,  0,  0,  0,  0,  1,  5],
       [ 0,  0,  0,  0,  1,  8,  5],
       [ 0,  0,  0,  0,  1,  3,  5],
       [ 0,  0,  0,  0,  0,  9,  2],
       [ 0,  0,  0,  2,  4,  3,  2],
       [ 0,  0,  0,  0,  0,  3,  2],
       [ 0,  0,  0,  0,  1,  4,  6],
       [ 0,  0,  0,  0,  1,  4,  6],
       [ 0,  0,  0,  0,  1,  4,  2],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 0,  0,  0,  1, 12,  3, 13]], dtype=int32)

In [4]:
# padding='post' appends the zeros after the tokens of each sequence.
padded_post = pad_sequences(sequences=encoded, padding='post')
padded_post

array([[ 1,  5,  0,  0,  0,  0,  0],
       [ 1,  8,  5,  0,  0,  0,  0],
       [ 1,  3,  5,  0,  0,  0,  0],
       [ 9,  2,  0,  0,  0,  0,  0],
       [ 2,  4,  3,  2,  0,  0,  0],
       [ 3,  2,  0,  0,  0,  0,  0],
       [ 1,  4,  6,  0,  0,  0,  0],
       [ 1,  4,  6,  0,  0,  0,  0],
       [ 1,  4,  2,  0,  0,  0,  0],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 1, 12,  3, 13,  0,  0,  0]], dtype=int32)

In [5]:
# padding='pre' spelled out explicitly: zeros go in front of each sequence.
padded_pre = pad_sequences(sequences=encoded, padding='pre')
padded_pre

array([[ 0,  0,  0,  0,  0,  1,  5],
       [ 0,  0,  0,  0,  1,  8,  5],
       [ 0,  0,  0,  0,  1,  3,  5],
       [ 0,  0,  0,  0,  0,  9,  2],
       [ 0,  0,  0,  2,  4,  3,  2],
       [ 0,  0,  0,  0,  0,  3,  2],
       [ 0,  0,  0,  0,  1,  4,  6],
       [ 0,  0,  0,  0,  1,  4,  6],
       [ 0,  0,  0,  0,  1,  4,  2],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 0,  0,  0,  1, 12,  3, 13]], dtype=int32)

### truncating= : maxlen보다 긴 시퀀스를 앞('pre') 또는 뒤('post')에서 자르기

In [6]:
# maxlen=5 fixes the output width at 5. Sequences longer than that are cut,
# and truncating='post' drops the tokens from the END of the sequence.
# Shorter sequences are still zero-padded at the front (padding='pre').
padded_truncated = pad_sequences(
    encoded,
    maxlen=5,
    padding='pre',
    truncating='post',
)
padded_truncated

array([[ 0,  0,  0,  1,  5],
       [ 0,  0,  1,  8,  5],
       [ 0,  0,  1,  3,  5],
       [ 0,  0,  0,  9,  2],
       [ 0,  2,  4,  3,  2],
       [ 0,  0,  0,  3,  2],
       [ 0,  0,  1,  4,  6],
       [ 0,  0,  1,  4,  6],
       [ 0,  0,  1,  4,  2],
       [ 7,  7,  3,  2, 10],
       [ 0,  1, 12,  3, 13]], dtype=int32)

In [7]:
# truncating='pre' is the default setting (written out here for clarity):
# over-long sequences lose tokens from the FRONT, keeping the last maxlen tokens.
padded_truncated = pad_sequences(
    encoded,
    maxlen=5,
    padding='post',
    truncating='pre',
)
padded_truncated

array([[ 1,  5,  0,  0,  0],
       [ 1,  8,  5,  0,  0],
       [ 1,  3,  5,  0,  0],
       [ 9,  2,  0,  0,  0],
       [ 2,  4,  3,  2,  0],
       [ 3,  2,  0,  0,  0],
       [ 1,  4,  6,  0,  0],
       [ 1,  4,  6,  0,  0],
       [ 1,  4,  2,  0,  0],
       [ 3,  2, 10,  1, 11],
       [ 1, 12,  3, 13,  0]], dtype=int32)

### value= n : 0이 아닌 다른 숫자로 패딩하기

In [8]:
# Padding id = one more than the number of entries in the tokenizer's word
# index (14 here, which is above every index used in the encoded sequences).
last_value = 1 + len(tokenizer.word_index)
print(last_value)

# Post-pad with that id instead of the default 0.
padded = pad_sequences(encoded, value=last_value, padding='post')
padded

14


array([[ 1,  5, 14, 14, 14, 14, 14],
       [ 1,  8,  5, 14, 14, 14, 14],
       [ 1,  3,  5, 14, 14, 14, 14],
       [ 9,  2, 14, 14, 14, 14, 14],
       [ 2,  4,  3,  2, 14, 14, 14],
       [ 3,  2, 14, 14, 14, 14, 14],
       [ 1,  4,  6, 14, 14, 14, 14],
       [ 1,  4,  6, 14, 14, 14, 14],
       [ 1,  4,  2, 14, 14, 14, 14],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 1, 12,  3, 13, 14, 14, 14]], dtype=int32)