Transformer 모델의 구성 -> Transformer_Implement.ipynb

In [1]:
import import_ipynb
import Transformer_Implement as ti
from CommonModule.Handle_Dir import mkdir_p, del_folder
from CommonModule.Embedding import Embedding
from CommonModule.Encoder import IntegerEncoder

import re
import os
import numpy as np
from pathlib import Path
from glob import iglob
from glove import Corpus, Glove
import tensorflow as tf

importing Jupyter notebook from Transformer_Implement.ipynb
importing Jupyter notebook from /data/ksb/TestSampleDir/articleSummary-Jupyter/Transformer/CommonModule/Handle_Dir.ipynb
importing Jupyter notebook from /data/ksb/TestSampleDir/articleSummary-Jupyter/Transformer/CommonModule/Embedding.ipynb
importing Jupyter notebook from /data/ksb/TestSampleDir/articleSummary-Jupyter/Transformer/CommonModule/Encoder.ipynb


GPU Setting -> GPU 1번 사용

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_visible_devices(gpus[1], 'GPU')
    except RuntimeError as e:
        print(e)

경로 설정

In [2]:
BASE_DIR = "/data/ksb/TestSampleDir"
DATA_BASE_DIR = os.path.join(BASE_DIR, 'articles')

PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR,"Preprocessed-Data")

SUMMARY_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR,"Summary-Preprocessed-Data")

MODEL_BASE_DIR = os.path.join(os.path.join(BASE_DIR, 'articleSummary-Jupyter'), 'Word-Embedding-Model')
glove_model_path = os.path.join(MODEL_BASE_DIR, 'glove-256.model')
corpus_model_path = os.path.join(MODEL_BASE_DIR, 'corpus-256.model')

말뭉치와 함께 임베딩 모델 불러오기

In [3]:
glove = Glove.load(glove_model_path)
corpus = Corpus.load(corpus_model_path)

In [4]:
tr_embedding = Embedding(PREPROCESSED_PATH, corpus=corpus, glove=glove, model='GloVe')
train_encoded_list = tr_embedding.get_embedded_list()

In [5]:
# 예외 상황 - 중앙일보만 테스트

import csv

train_encoded_list = []

test_media_list = [os.path.join(PREPROCESSED_PATH, '중앙일보.csv')]
for _, proc_path in enumerate(test_media_list):
    f = open(proc_path, 'r', newline="\n", encoding="utf-8")
            
    for [article_idx, title, contents] in csv.reader(f):  
        content = contents.split("\t")
                
        vec = [glove.word_vectors[corpus.dictionary[token]] 
                for sent in content for token in sent.split() if token in corpus.dictionary]
        train_encoded_list.append(np.array(vec))
    f.close()

In [6]:
te_embedding = Embedding(SUMMARY_PREPROCESSED_PATH, corpus=corpus, glove=glove, model='GloVe')
test_encoded_list = te_embedding.get_embedded_list()

In [7]:
options = {
    'model-type' : 'GloVe',
    'inv_wv' : None,
    'corpus' : corpus
}
output_encoded_list = IntegerEncoder(list(iglob(os.path.join(SUMMARY_PREPROCESSED_PATH, '**.csv'), recursive=False)), options).encoder()


In [8]:
D_MODEL = 256
LAYER_NUM = 6
NUM_HEADS = 8
DFF = 512
VOCAB_SIZE = len(corpus.dictionary)

BATCH_SIZE = 64
BUFFER_SIZE = 20000

WARMUP_STEPS = 50
EPOCHS = 70

In [9]:
get_max_length = lambda x : np.max([len(x[idx]) for idx in range(len(x))])
MAX_LEN = get_max_length(train_encoded_list)
MAX_LEN

374

기사의 시작과 끝을 알리는 <*SOS*>,<*EOS*>을 붙이고,  
nd-array로 저장하기 위해 패딩을 수행한다.

In [10]:
# Padding
traing_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    train_encoded_list, maxlen=MAX_LEN, padding='post')
test_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    test_encoded_list, maxlen=MAX_LEN, padding='post')
output_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    output_encoded_list, maxlen=MAX_LEN, padding='post')


In [11]:
print('Contents Shape : {}'.format(traing_encoded_matrix.shape))
print('Summaries Shape : {}'.format(test_encoded_matrix.shape))
print('Output Shape : {}'.format(output_encoded_matrix.shape))

Contents Shape : (4853, 374, 256)
Summaries Shape : (4853, 374, 256)
Output Shape : (4853, 374)


정수 인코딩을 수행한 결과행렬에 정수를 임베딩 벡터로 변환한다.  
이의 결과로 (59604, 4392, 256)의 dimension을 가진다.

In [12]:
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'encoder_inputs': traing_encoded_matrix, # Encoder Input
        'decoder_inputs': test_encoded_matrix[:, :-1, :] # Decoder Input
    },
    {
        # Decoder Output, Remove <SOS>
        'Output': output_encoded_matrix[:, 1:]  
    },
))
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [13]:
for dict_1, dict_2 in dataset.as_numpy_iterator():
    print("encoder input : {enc}, decoder input : {dec}".format(enc= dict_1['encoder_inputs'].shape,
                                                               dec=dict_1['decoder_inputs'].shape))
    print("output shape : {}".format(dict_2['Output'].shape))

encoder input : (64, 374, 256), decoder input : (64, 373, 256)
output shape : (64, 373)
encoder input : (64, 374, 256), decoder input : (64, 373, 256)
output shape : (64, 373)
encoder input : (64, 374, 256), decoder input : (64, 373, 256)
output shape : (64, 373)
encoder input : (64, 374, 256), decoder input : (64, 373, 256)
output shape : (64, 373)
encoder input : (64, 374, 256), decoder input : (64, 373, 256)
output shape : (64, 373)
encoder input : (64, 374, 256), decoder input : (64, 373, 256)
output shape : (64, 373)
encoder input : (64, 374, 256), decoder input : (64, 373, 256)
output shape : (64, 373)
encoder input : (64, 374, 256), decoder input : (64, 373, 256)
output shape : (64, 373)
encoder input : (64, 374, 256), decoder input : (64, 373, 256)
output shape : (64, 373)
encoder input : (64, 374, 256), decoder input : (64, 373, 256)
output shape : (64, 373)
encoder input : (64, 374, 256), decoder input : (64, 373, 256)
output shape : (64, 373)
encoder input : (64, 374, 256), 

학습률 Learning Rate 조정

In [14]:
lrate_scheduler = ti.CustomSchedule(d_model=D_MODEL)

Optimizer 정의

In [15]:
beta_1 = 0.9  
beta_2 = 0.98
epsilon = 10 ** -9

optimizer = tf.keras.optimizers.Adam(lrate_scheduler, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

Transformer 모델 정의

In [16]:
with tf.device('/CPU:0'):
    model = ti.Transformer(
        vocab_size=VOCAB_SIZE,
        layer_num=LAYER_NUM,
        dff=DFF,
        d_model=D_MODEL,
        num_heads=NUM_HEADS).get_transformer()

(1, 174477, 256)
encoder_layer_0 sub-layer 1
[Input] Q shape : (None, None, 256), K shape : (None, None, 256), V shape : (None, None, 256)

[Dense] Q shape : (None, None, 256), K shape : (None, None, 256), V shape : (None, None, 256)

[Splited] Q shape : (None, 8, None, 32), K shape : (None, 8, None, 32), V shape : (None, 8, None, 32)

encoder_layer_0 sub-layer 2
encoder_layer_1 sub-layer 1
[Input] Q shape : (None, None, 256), K shape : (None, None, 256), V shape : (None, None, 256)

[Dense] Q shape : (None, None, 256), K shape : (None, None, 256), V shape : (None, None, 256)

[Splited] Q shape : (None, 8, None, 32), K shape : (None, 8, None, 32), V shape : (None, 8, None, 32)

encoder_layer_1 sub-layer 2
encoder_layer_2 sub-layer 1
[Input] Q shape : (None, None, 256), K shape : (None, None, 256), V shape : (None, None, 256)

[Dense] Q shape : (None, None, 256), K shape : (None, None, 256), V shape : (None, None, 256)

[Splited] Q shape : (None, 8, None, 32), K shape : (None, 8, None, 

In [17]:
from tensorflow.python.keras import backend as K
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops

def sparse_categorical_accuracy(y_true, y_pred):
    y_pred = ops.convert_to_tensor_v2_with_dispatch(y_pred)
    y_true = ops.convert_to_tensor_v2_with_dispatch(y_true)

    y_pred_rank = y_pred.shape.ndims
    y_true_rank = y_true.shape.ndims
    # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
    if (y_true_rank is not None) and (y_pred_rank is not None) and (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))):
        y_true = array_ops.squeeze(y_true, [-1])
    y_pred = math_ops.argmax(y_pred, axis=-1)

    # If the predicted output and actual output types don't match, force cast them
    # to match.
    if K.dtype(y_pred) != K.dtype(y_true):
        y_pred = math_ops.cast(y_pred, K.dtype(y_true))
    return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())

In [18]:
def loss_function(y_true, y_pred):
    print("loss func", y_true.shape, y_pred.shape)
    y_true = tf.reshape(y_true, shape=(-1, MAX_LEN - 1))
    loss = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)
    print("loss func", y_true.shape, y_pred.shape)
    return tf.reduce_mean(loss)

def accuracy(y_true, y_pred):
    # 레이블의 크기는 (batch_size, MAX_LENGTH - 1)
    print("accuracy ", y_true.shape, y_pred.shape)
    y_true = tf.reshape(y_true, shape=(-1, MAX_LEN - 1))
    print("accuracy ", y_true.shape, y_pred.shape)
    return sparse_categorical_accuracy(y_true, y_pred)

model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])
model.summary()

Model: "Transformer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     [(None, None, 256)]  0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, None, 256)]  0                                            
__________________________________________________________________________________________________
Encoder (Functional)            (None, None, 256)    3162624     encoder_inputs[0][0]             
__________________________________________________________________________________________________
Decoder (Functional)            (None, None, 256)    4744704     decoder_inputs[0][0]             
                                                                 Encoder[0][0]          

In [19]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=10, min_lr=0.00005, verbose=1)

model.fit(dataset, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=2, shuffle=True, callbacks=[reduce_lr])


Epoch 1/70
loss func (None, 373) (None, 373, 174477)
loss func (None, 373) (None, 373, 174477)
optimizer Tensor("Adam/Cast:0", shape=(), dtype=float32, device=/job:localhost/replica:0/task:0/device:CPU:0)
optimizer return Tensor("Adam/mul_1:0", shape=(), dtype=float32, device=/job:localhost/replica:0/task:0/device:CPU:0)
accuracy  (None, 373) (None, 373, 174477)
accuracy  (None, 373) (None, 373, 174477)
loss func (None, 373) (None, 373, 174477)
loss func (None, 373) (None, 373, 174477)
optimizer Tensor("Adam/Cast:0", shape=(), dtype=float32, device=/job:localhost/replica:0/task:0/device:CPU:0)
optimizer return Tensor("Adam/mul_1:0", shape=(), dtype=float32, device=/job:localhost/replica:0/task:0/device:CPU:0)
accuracy  (None, 373) (None, 373, 174477)
accuracy  (None, 373) (None, 373, 174477)


ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted:  OOM when allocating tensor with shape[64,8,374,374] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node Transformer/Encoder/encoder_layer_0/tf.nn.softmax/Softmax (defined at <ipython-input-19-a392481a996e>:4) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[gradient_tape/Transformer/Decoder/decoder_layer_3/dense_71/Tensordot/MatMul/MatMul_1/_1478]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted:  OOM when allocating tensor with shape[64,8,374,374] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node Transformer/Encoder/encoder_layer_0/tf.nn.softmax/Softmax (defined at <ipython-input-19-a392481a996e>:4) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_40633]

Function call stack:
train_function -> train_function
