Transformer 모델의 구성 -> Transformer_Implement.ipynb

In [1]:
import import_ipynb
import Transformer_Implement as ti
from CommonModule.Handle_Dir import mkdir_p, del_folder
from CommonModule.Embedding import Embedding
from CommonModule.Encoder import IntegerEncoder

import re
import os
import numpy as np
from pathlib import Path
from glob import iglob
from glove import Corpus, Glove
import tensorflow as tf

importing Jupyter notebook from Transformer_Implement.ipynb
importing Jupyter notebook from /data/ksb/TestSampleDir/articleSummary-Jupyter/Transformer/CommonModule/Handle_Dir.ipynb
importing Jupyter notebook from /data/ksb/TestSampleDir/articleSummary-Jupyter/Transformer/CommonModule/Embedding.ipynb
importing Jupyter notebook from /data/ksb/TestSampleDir/articleSummary-Jupyter/Transformer/CommonModule/Encoder.ipynb


GPU Setting -> GPU 1번 사용

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_visible_devices(gpus[1], 'GPU')
    except RuntimeError as e:
        print(e)

경로 설정

In [2]:
BASE_DIR = "/data/ksb/TestSampleDir"
DATA_BASE_DIR = os.path.join(BASE_DIR, 'articles')

PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR,"Preprocessed-Data")

SUMMARY_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR,"Summary-Preprocessed-Data")

WORD_EMBEDDING_DIR = os.path.join(os.path.join(BASE_DIR, 'articleSummary-Jupyter'), 'Word-Embedding-Model')
MODEL_DIR = os.path.join(os.path.join(BASE_DIR, 'articleSummary-Jupyter'), 'Transformer-Model')

glove_model_path = os.path.join(WORD_EMBEDDING_DIR, 'glove-256.model')
input_corpus_model_path = os.path.join(WORD_EMBEDDING_DIR, 'input-corpus-256.model')
summary_corpus_model_path = os.path.join(WORD_EMBEDDING_DIR, 'summary-corpus-256.model')


말뭉치와 함께 임베딩 모델 불러오기

In [3]:
glove = Glove.load(glove_model_path)
input_corpus = Corpus.load(input_corpus_model_path)
summary_corpus = Corpus.load(summary_corpus_model_path)

In [5]:
origin_embedding = Embedding(PREPROCESSED_PATH, corpus=input_corpus, glove=glove, model='GloVe')
origin_encoded_list = origin_embedding.get_embedded_list()

In [5]:
len(origin_encoded_list)

145344

In [6]:
summary_embedding = Embedding(SUMMARY_PREPROCESSED_PATH, corpus=summary_corpus, glove=glove, model='GloVe')
summary_encoded_list = summary_embedding.get_embedded_list()

In [7]:
len(summary_encoded_list)

145145

In [8]:
options = {
    'model-type' : 'GloVe',
    'inv_wv' : None,
    'corpus' : input_corpus
}
output_encoded_list = IntegerEncoder(list(iglob(os.path.join(SUMMARY_PREPROCESSED_PATH, '**.csv'), recursive=False)), options).encoder()


In [9]:
D_MODEL = 256
LAYER_NUM = 6
NUM_HEADS = 8
DFF = 512
INPUT_VOCAB_SIZE = len(input_corpus.dictionary)
SUMMARY_VOCAB_SIZE = len(summary_corpus.dictionary)

BATCH_SIZE = 64
BUFFER_SIZE = 20000

WARMUP_STEPS = 50
EPOCHS = 70

In [12]:
get_max_length = lambda x : np.max([x[idx].shape[0] for idx in range(len(x))]) 
MAX_LEN = get_max_length(origin_encoded_list)
MAX_LEN

672

In [12]:
get_idx_max_length = lambda x : np.argmax([len(x[idx]) for idx in range(len(x))]) 
idx = get_idx_max_length(origin_encoded_list)
idx

41594

In [36]:
def split_list(split_num, length):
    split_size = length // split_num
    
    split_range = []
    for num in range(split_num):
        if num == split_num -1:
            split_range.append(np.arange(num * split_size, length))
        else:
            split_range.append(np.arange(num * split_size, (num + 1) * split_size))
    return split_range

In [6]:
# Padding
origin_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    origin_encoded_list, maxlen=MAX_LEN, padding='post')
summary_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    summary_encoded_list, maxlen=MAX_LEN, padding='post')
output_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    output_encoded_list, maxlen=MAX_LEN, padding='post')

In [11]:
print('Contents Shape : {}'.format(origin_encoded_matrix.shape))
print('Summaries Shape : {}'.format(summary_encoded_matrix.shape))
print('Output Shape : {}'.format(output_encoded_matrix.shape))

Contents Shape : (4853, 349, 256)
Summaries Shape : (4853, 349, 256)
Output Shape : (4853, 349)


In [14]:
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'encoder_inputs': origin_encoded_matrix, # Encoder Input
        'decoder_inputs': summary_encoded_matrix[:, :-1, :] # Decoder Input
    },
    {
        # Decoder Output, Remove <SOS>
        'Output': output_encoded_matrix[:, 1:]  
    },
))
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [15]:
for dict_1, dict_2 in dataset.as_numpy_iterator():
    print("encoder input : {enc}, decoder input : {dec}".format(enc= dict_1['encoder_inputs'].shape,
                                                               dec=dict_1['decoder_inputs'].shape))
    print("output shape : {}".format(dict_2['Output'].shape))

encoder input : (64, 349, 256), decoder input : (64, 348, 256)
output shape : (64, 348)
encoder input : (64, 349, 256), decoder input : (64, 348, 256)
output shape : (64, 348)
encoder input : (64, 349, 256), decoder input : (64, 348, 256)
output shape : (64, 348)
encoder input : (64, 349, 256), decoder input : (64, 348, 256)
output shape : (64, 348)
encoder input : (64, 349, 256), decoder input : (64, 348, 256)
output shape : (64, 348)
encoder input : (64, 349, 256), decoder input : (64, 348, 256)
output shape : (64, 348)
encoder input : (64, 349, 256), decoder input : (64, 348, 256)
output shape : (64, 348)
encoder input : (64, 349, 256), decoder input : (64, 348, 256)
output shape : (64, 348)
encoder input : (64, 349, 256), decoder input : (64, 348, 256)
output shape : (64, 348)
encoder input : (64, 349, 256), decoder input : (64, 348, 256)
output shape : (64, 348)
encoder input : (64, 349, 256), decoder input : (64, 348, 256)
output shape : (64, 348)
encoder input : (64, 349, 256), 

학습률 Learning Rate 조정

In [19]:
lrate_scheduler = ti.CustomSchedule(d_model=D_MODEL)

Optimizer 정의

In [20]:
beta_1 = 0.9  
beta_2 = 0.98
epsilon = 10 ** -9

optimizer = tf.keras.optimizers.Adam(lrate_scheduler, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

Transformer 모델 정의

In [21]:
with tf.device('/CPU:0'):
    model = ti.Transformer(
        vocab_size=VOCAB_SIZE,
        layer_num=LAYER_NUM,
        dff=DFF,
        d_model=D_MODEL,
        num_heads=NUM_HEADS,
        dropout = 0.3).get_transformer()

(1, 63677, 256)
encoder_layer_0 sub-layer 1
[Input] Q shape : (None, None, 256), K shape : (None, None, 256), V shape : (None, None, 256)

[Dense] Q shape : (None, None, 256), K shape : (None, None, 256), V shape : (None, None, 256)

[Splited] Q shape : (None, 8, None, 32), K shape : (None, 8, None, 32), V shape : (None, 8, None, 32)

encoder_layer_0 sub-layer 2
encoder_layer_1 sub-layer 1
[Input] Q shape : (None, None, 256), K shape : (None, None, 256), V shape : (None, None, 256)

[Dense] Q shape : (None, None, 256), K shape : (None, None, 256), V shape : (None, None, 256)

[Splited] Q shape : (None, 8, None, 32), K shape : (None, 8, None, 32), V shape : (None, 8, None, 32)

encoder_layer_1 sub-layer 2
encoder_layer_2 sub-layer 1
[Input] Q shape : (None, None, 256), K shape : (None, None, 256), V shape : (None, None, 256)

[Dense] Q shape : (None, None, 256), K shape : (None, None, 256), V shape : (None, None, 256)

[Splited] Q shape : (None, 8, None, 32), K shape : (None, 8, None, 3

In [22]:
from tensorflow.python.keras import backend as K
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops

def sparse_categorical_accuracy(y_true, y_pred):
    y_pred = ops.convert_to_tensor_v2_with_dispatch(y_pred)
    y_true = ops.convert_to_tensor_v2_with_dispatch(y_true)

    y_pred_rank = y_pred.shape.ndims
    y_true_rank = y_true.shape.ndims
    # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
    if (y_true_rank is not None) and (y_pred_rank is not None) and (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))):
        y_true = array_ops.squeeze(y_true, [-1])
    y_pred = math_ops.argmax(y_pred, axis=-1)

    # If the predicted output and actual output types don't match, force cast them
    # to match.
    if K.dtype(y_pred) != K.dtype(y_true):
        y_pred = math_ops.cast(y_pred, K.dtype(y_true))
    return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())

In [23]:
def loss_function(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, MAX_LEN - 1))
    loss = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)
    print("loss func", y_true.shape, y_pred.shape)
    return tf.reduce_mean(loss)

In [24]:
with tf.device('/CPU:0'):
    model.compile(optimizer=optimizer, loss=loss_function, metrics=['loss'])
    model.summary()

Model: "Transformer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     [(None, None, 256)]  0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, None, 256)]  0                                            
__________________________________________________________________________________________________
Encoder (Functional)            (None, None, 256)    3162624     encoder_inputs[0][0]             
__________________________________________________________________________________________________
Decoder (Functional)            (None, None, 256)    4744704     decoder_inputs[0][0]             
                                                                 Encoder[0][0]          

In [25]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
reduce_lr = ReduceLROnPlateau(monitor='accuracy', factor=0.5, patience=10, min_lr=0.00005, verbose=1)

with tf.device('/GPU:0'):
    model.fit(dataset, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=2, shuffle=True, validation_data=val_dataset)


Epoch 1/70
loss func (None, 348) (None, 348, 63677)
loss func (None, 348) (None, 348, 63677)
optimizer Tensor("Adam/Cast:0", shape=(), dtype=float32, device=/job:localhost/replica:0/task:0/device:CPU:0)
optimizer return Tensor("Adam/mul_1:0", shape=(), dtype=float32, device=/job:localhost/replica:0/task:0/device:CPU:0)
accuracy  (None, 348) (None, 348, 63677)
accuracy  (None, 348) (None, 348, 63677)
loss func (None, 348) (None, 348, 63677)
loss func (None, 348) (None, 348, 63677)
optimizer Tensor("Adam/Cast:0", shape=(), dtype=float32, device=/job:localhost/replica:0/task:0/device:CPU:0)
optimizer return Tensor("Adam/mul_1:0", shape=(), dtype=float32, device=/job:localhost/replica:0/task:0/device:CPU:0)
accuracy  (None, 348) (None, 348, 63677)
accuracy  (None, 348) (None, 348, 63677)
loss func (None, 348) (None, 348, 63677)
loss func (None, 348) (None, 348, 63677)
accuracy  (None, 348) (None, 348, 63677)
accuracy  (None, 348) (None, 348, 63677)
61/61 - 134s - loss: 0.7052 - accuracy: 5

Epoch 68/70
61/61 - 114s - loss: 0.5713 - accuracy: 6.8175e-04 - val_loss: 0.5748 - val_accuracy: 6.8175e-04
Epoch 69/70
61/61 - 114s - loss: 0.5711 - accuracy: 6.8175e-04 - val_loss: 0.5741 - val_accuracy: 6.8175e-04
Epoch 70/70
61/61 - 115s - loss: 0.5718 - accuracy: 6.8175e-04 - val_loss: 0.5747 - val_accuracy: 6.8175e-04


In [7]:
mkdir_p(MODEL_DIR)
model_num = len(list(iglob(os.path.join(MODEL_DIR, '**.h5'), recursive=False)))
model_path = os.path.join(MODEL_DIR, 'transformer-{}.h5'.format(model_num))

model.save(model_path)