GPU Setting -> GPU 1번 사용

In [1]:
import tensorflow as tf
# Restrict TensorFlow to a single physical GPU. Index 1 is preferred; when
# fewer than two GPUs are present, fall back to the last available one instead
# of failing with an uncaught IndexError.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    target_gpu = gpus[min(1, len(gpus) - 1)]
    try:
        tf.config.experimental.set_visible_devices(target_gpu, 'GPU')
    except RuntimeError as e:
        # Report the failure and keep going with the default device visibility.
        print(e)

Transformer 모델의 구성 -> Transformer_Implement.ipynb

In [41]:
import import_ipynb
import Transformer_Implement as ti

import re
import os
import numpy as np
from pathlib import Path
from glove import Corpus, Glove

경로 설정

In [31]:
# Root of the working tree; every other path in this cell is derived from it.
BASE_DIR = "/data/ksb/TestDir"
DATA_BASE_DIR = os.path.join(BASE_DIR, 'sample_articles')

# Article data directories.
PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Preprocessed-Data")
MINI_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Mini-Preprocessed-Data")

# Summary data directories.
SUMMARY_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Summary-Preprocessed-Data")
MINI_SUMMARY_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Mini-Summary-Preprocessed-Data")

# Locations of the saved word-embedding model and its corpus.
MODEL_BASE_DIR = os.path.join(BASE_DIR, 'articleSummary-Jupyter', 'Word-Embedding')
glove_model_path = os.path.join(MODEL_BASE_DIR, 'glove-256.model')
corpus_model_path = os.path.join(MODEL_BASE_DIR, 'corpus-256.model')

In [32]:
def mkdir_p(path):
    """Create directory ``path`` together with any missing parents.

    An already existing directory is not an error. Any other failure,
    including ``path`` existing as a non-directory, raises ``OSError``.
    """
    os.makedirs(path, exist_ok=True)

def del_folder(path):
    """Recursively delete the directory tree at ``path``.

    Best-effort: any ``Exception`` raised while deleting is printed and
    suppressed rather than propagated.
    """
    import shutil
    try:
        shutil.rmtree(path)
    except Exception as err:
        print(err)

In [33]:
def get_switched_root_dir(path, save_root_dir):
    """Return the directory of ``path`` re-rooted under ``save_root_dir``.

    ``path`` is expected to look like ``<root>/<sub_dir>/<file>``, where
    ``<root>`` sits at the same depth as ``save_root_dir``. The path component
    at that depth is replaced by the last component of ``save_root_dir`` and
    the result is cut right after ``<sub_dir>``.

    The depth is taken from ``save_root_dir`` itself rather than from fixed
    indices, so the function keeps working when the base directory moves to a
    different depth.

    Raises:
        IndexError: if ``path`` has fewer components than ``save_root_dir``.
    """
    root_tokens = save_root_dir.rstrip(os.sep).split(os.sep)
    root_idx = len(root_tokens) - 1

    path_token = path.split(os.sep)
    path_token[root_idx] = root_tokens[root_idx]

    # Keep everything up to and including the component right below the root.
    return os.sep.join(path_token[:root_idx + 2])

In [34]:
class ArticleCopier:
    """Holds the lines of one text file and writes copies of them elsewhere."""

    def __init__(self, path):
        """Read every line of the UTF-8 file at ``path`` into ``self.lines``."""
        with open(path, 'r', encoding='utf-8') as src:
            self.lines = list(src)

    def copy_article(self, save_dir):
        """Write the stored lines to a new ``<n>.txt`` file inside ``save_dir``.

        ``save_dir`` is created when missing; ``n`` is the number of entries
        the directory holds before the file is written.
        """
        mkdir_p(save_dir)
        entry_count = len(os.listdir(save_dir))
        target = os.path.join(save_dir, str(entry_count) + ".txt")

        with open(target, 'w', encoding='utf-8') as dst:
            dst.write("".join(self.lines))

def load_proc_data(summary_path):
    """Copy the article that corresponds to each summary file into the mini data set.

    ``summary_path`` must contain one sub-directory per media name, each holding
    summary files. For every summary file, the file with the same name and media
    directory under ``PREPROCESSED_PATH`` is read and written into the matching
    media directory under ``MINI_PREPROCESSED_PATH`` (see ``ArticleCopier`` for
    how the copy is named).

    Media directories are visited in sorted order; files inside a media
    directory are visited in ``os.listdir`` order.
    """
    origin_root = PREPROCESSED_PATH
    new_root = MINI_PREPROCESSED_PATH

    for media_name in sorted(os.listdir(summary_path)):
        media_path = os.path.join(summary_path, media_name)

        for article_name in os.listdir(media_path):
            summary_file = os.path.join(media_path, article_name)

            origin_path = os.path.join(
                get_switched_root_dir(summary_file, origin_root), article_name)
            new_dir = get_switched_root_dir(summary_file, new_root)

            ArticleCopier(origin_path).copy_article(new_dir)

In [35]:
#del_folder(MINI_PREPROCESSED_PATH)
#MINI_PREPROCESSED_PATH

In [36]:
#load_proc_data(MINI_SUMMARY_PREPROCESSED_PATH)

말뭉치와 함께 임베딩 모델 불러오기

In [37]:
# Load the saved GloVe model and corpus objects from the configured paths.
# `corpus.dictionary` is used further down as the token -> integer id mapping
# and `glove.word_vectors` as the id -> vector lookup.
glove = Glove.load(glove_model_path)
corpus = Corpus.load(corpus_model_path)

FileNotFoundError: [Errno 2] No such file or directory: '/data/ksb/TestDir/articleSummary-Jupyter/Word-Embedding/glove-256.model'

In [None]:
class RawTextReader:
    """Iterable over the text pieces of a UTF-8 file.

    Every line of the file is split on the pattern held in ``rgxSplitter`` and
    the resulting pieces are yielded one by one (empty pieces included).
    """

    def __init__(self, filepath):
        self.filepath = filepath
        # NOTE(review): this pattern matches the two literal characters "/n",
        # not a newline; it may be a typo for a newline escape. Confirm against
        # the data format before changing it.
        self.rgxSplitter = re.compile("/n")

    def __iter__(self):
        # The context manager closes the handle once iteration finishes or the
        # generator is closed, instead of leaving the file open.
        with open(self.filepath, encoding='utf-8') as src:
            for line in src:
                for s in self.rgxSplitter.split(line):
                    yield s

In [None]:
class IntegerEncoder:
    """Encodes every article below a directory as an array of dictionary ids.

    ``filepath`` must contain one sub-directory per media name, each holding
    article files. ``corpus`` must expose a ``dictionary`` mapping token -> id.
    """

    def __init__(self, filepath, corpus):
        self.filepath = filepath
        self.corpus = corpus

    def get_encoded_vec_matrix(self):
        """Return one integer array per article.

        Tokens are obtained by whitespace-splitting each non-empty piece that
        ``RawTextReader`` yields; tokens missing from the dictionary are skipped.

        Returns:
            A regular ``np.ndarray`` when all articles have the same number of
            tokens, otherwise a 1-D object array whose elements are the
            per-article integer arrays.
        """
        # Use the corpus handed to the constructor, not a module-level global.
        dictionary = self.corpus.dictionary

        # NOTE(review): rows follow os.listdir order, which is not guaranteed to
        # be stable across directories; verify that callers pairing two encoders
        # row-by-row get matching orders.
        encoded_vec_list = []
        for media_name in os.listdir(self.filepath):
            media_path = os.path.join(self.filepath, media_name)

            for article_name in os.listdir(media_path):
                reader = RawTextReader(os.path.join(media_path, article_name))
                vec = np.array(
                    [dictionary[token]
                     for sent in reader if sent
                     for token in sent.split()
                     if token in dictionary],
                    dtype=int)  # keeps token-less articles integer-typed
                encoded_vec_list.append(vec)

        if len({len(vec) for vec in encoded_vec_list}) <= 1:
            return np.array(encoded_vec_list)

        # Rows of different length: build the object array explicitly, because
        # np.array() on ragged input raises on recent NumPy versions.
        ragged = np.empty(len(encoded_vec_list), dtype=object)
        for idx, vec in enumerate(encoded_vec_list):
            ragged[idx] = vec
        return ragged

In [None]:
# Model hyper-parameters, passed to ti.Transformer further down
# (D_MODEL is also passed to ti.LearningRate).
D_MODEL = 256
LAYER_NUM = 6
NUM_HEADS = 8
DFF = 512
# Number of entries in the corpus dictionary; the special tokens are added later.
VOCAB_SIZE = len(corpus.dictionary)

# Batch size and shuffle buffer size used by the tf.data pipeline.
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# Learning-rate warm-up steps and number of training epochs.
WARMUP_STEPS = 50
EPOCHS = 70

In [38]:
# <SOS>/<EOS> take the two ids right after the corpus dictionary. They are
# derived from the dictionary itself rather than from VOCAB_SIZE, so re-running
# this cell after VOCAB_SIZE has been increased still yields the same ids.
START_TOKEN, END_TOKEN = [len(corpus.dictionary)], [len(corpus.dictionary) + 1]

In [39]:
print('<SOS> Token : {}'.format(START_TOKEN))
print('<EOS> Token : {}'.format(END_TOKEN))

# Assigned from the dictionary size (instead of `+= 2`) so that re-running this
# cell does not keep growing the vocabulary size.
VOCAB_SIZE = len(corpus.dictionary) + 2
print('Vocabulary Size : {}'.format(VOCAB_SIZE))

<SOS> Token : [65850]
<EOS> Token : [65851]
Vocabulary Size : 65852


전처리 된 59,604개의 기사 본문 데이터에 정수 인코딩을 수행한다.  

정수 인코딩 수행 후에 (59604, None)의 크기를 가지는 Matrix가 되며,   
각 인코딩된 값을 임베딩 벡터 값으로 대체한 후에는 (59604, `max_token`, 256)의 크기를 가지는 Tensor가 된다.  
- `max_token`은 각 기사 본문의 토큰 수 중 가장 큰 값을 의미한다.

각 기사의 토큰에 대해 정수 인코딩을 수행한 결과 행렬을 `train_encoded_matrix`, `target_encoded_matrix`로 저장한다.

In [40]:
# Integer-encode every article under MINI_PREPROCESSED_PATH with the corpus
# dictionary, then display the shape of the result.
train_encoder = IntegerEncoder(MINI_PREPROCESSED_PATH, corpus)
train_encoded_matrix = train_encoder.get_encoded_vec_matrix()
train_encoded_matrix.shape

KeyboardInterrupt: 

In [None]:
def get_max_length(x):
    """Return the length of the longest row in ``x``."""
    return np.max([len(row) for row in x])

MAX_LEN = get_max_length(train_encoded_matrix)
MAX_LEN

In [None]:
# Integer-encode every summary under MINI_SUMMARY_PREPROCESSED_PATH with the
# same corpus dictionary, then display the shape of the result.
target_encoder = IntegerEncoder(MINI_SUMMARY_PREPROCESSED_PATH, corpus)
target_encoded_matrix = target_encoder.get_encoded_vec_matrix()
target_encoded_matrix.shape

In [None]:
# MAX_LEN becomes the longer of the article and summary maxima.
target_max_len = get_max_length(target_encoded_matrix)
MAX_LEN = max(MAX_LEN, target_max_len)
MAX_LEN

기사의 시작과 끝을 알리는 <*SOS*>,<*EOS*>을 붙이고,  
nd-array로 저장하기 위해 패딩을 수행한다.

In [None]:
def encode_and_padding(inputs, outputs):
    """Wrap every sequence in START_TOKEN/END_TOKEN and pad to ``MAX_LEN``.

    Args:
        inputs: iterable of integer-id sequences (article bodies).
        outputs: iterable of integer-id sequences (summaries), paired with
            ``inputs`` element by element.

    Returns:
        Tuple ``(tokenized_inputs, tokenized_outputs)`` as returned by
        ``pad_sequences`` with ``maxlen=MAX_LEN`` and post-padding.
    """
    tokenized_inputs, tokenized_outputs = [], []

    for content, summary in zip(inputs, outputs):
        # list(...) is essential: the rows are NumPy arrays, and `list + ndarray`
        # performs a broadcast addition (shifting every id) instead of
        # concatenating the start/end tokens.
        tokenized_inputs.append(START_TOKEN + list(content) + END_TOKEN)
        tokenized_outputs.append(START_TOKEN + list(summary) + END_TOKEN)

    # NOTE(review): with the two extra tokens the longest sequence is MAX_LEN + 2
    # long, so maxlen=MAX_LEN truncates it (pad_sequences truncates from the
    # front by default). Consider maxlen=MAX_LEN + 2 if the model allows it.
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=MAX_LEN, padding='post')
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_outputs, maxlen=MAX_LEN, padding='post')

    return tokenized_inputs, tokenized_outputs

In [None]:
# Add the start/end tokens and pad both the articles and the summaries.
contents, summaries = encode_and_padding(train_encoded_matrix, target_encoded_matrix)

print('Contents Shape : {}'.format(contents.shape))
print('Summaries Shape : {}'.format(summaries.shape))

정수 인코딩을 수행한 결과 행렬의 각 정수를 임베딩 벡터로 변환한다.  
그 결과 (59604, 4392, 256)의 dimension을 가진다.

SOS, EOS 토큰에 대한 임베딩 벡터 값도 있어야 한다.

In [None]:
def embedding_vector(int_enc_vec):
    """Look up ``glove.word_vectors`` for every id in ``int_enc_vec``.

    Returns the looked-up vectors stacked into one NumPy array, in input order.
    """
    vectors = []
    for token_id in int_enc_vec:
        vectors.append(glove.word_vectors[token_id])
    return np.array(vectors)

def embedding_matrix(int_enc_matrix):
    """Convert every row of integer ids into its array of embedding vectors.

    Returns:
        A regular ``np.ndarray`` when all rows produce arrays of the same shape,
        otherwise a 1-D object array holding one embedding array per row.
    """
    row_list = [embedding_vector(row_vec) for row_vec in int_enc_matrix]

    if len({np.shape(emb) for emb in row_list}) <= 1:
        return np.array(row_list)

    # Rows of different length: fill an object array element by element, because
    # np.array() on ragged input raises on recent NumPy versions.
    ragged = np.empty(len(row_list), dtype=object)
    for idx, emb in enumerate(row_list):
        ragged[idx] = emb
    return ragged

In [None]:
# Replace the integer ids of the (unpadded, token-less) encoded matrices with
# their embedding vectors.
train_embedded_matrix = embedding_matrix(train_encoded_matrix)
target_embedded_matrix = embedding_matrix(target_encoded_matrix)

print('Train Embedded Matrix Shape : {}'.format(train_embedded_matrix.shape))
print('Target Embedded Matrix Shape : {}'.format(target_embedded_matrix.shape))

In [None]:
# Build the tf.data pipeline from the padded id matrices: the decoder input is
# the summary without its last position, the target is the summary without its
# first position (shifted by one).
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'encoder_inputs': contents, # Encoder input: padded article ids
        'decoder_inputs': summaries[:, :-1] # Decoder input: summary minus last position
    },
    {
        # Decoder output: summary minus first position (removes <SOS>)
        'Output': summaries[:, 1:]  
    },
))
# Cache, shuffle, batch and prefetch, in that order.
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

학습률 Learning Rate 조정

In [None]:
# Learning-rate object from Transformer_Implement, configured with the model
# width and warm-up steps; it is handed to the Adam optimizer below.
lrate_scheduler = ti.LearningRate(d_model=D_MODEL, warmup_steps=WARMUP_STEPS)

Optimizer 정의

In [None]:
# Adam hyper-parameters, named once and passed through so the values cannot
# drift apart from what the optimizer actually receives.
beta_1 = 0.9
beta_2 = 0.98
epsilon = 1e-9

optimizer = tf.keras.optimizers.Adam(lrate_scheduler, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon)

Transformer 모델 정의

In [None]:
# Build the Transformer from Transformer_Implement with the hyper-parameters
# defined above; get_transformer() returns the object that is compiled and
# trained below.
model = ti.Transformer(
    vocab_size=VOCAB_SIZE,
    layer_num=LAYER_NUM,
    dff=DFF,
    d_model=D_MODEL,
    num_heads=NUM_HEADS).get_transformer()

In [None]:
# Compile with the Adam optimizer defined above and print the layer summary.
# NOTE(review): 'categorical_crossentropy' expects one-hot targets; if the model
# is trained on the integer-id targets built in `dataset`, a sparse categorical
# loss is presumably needed — confirm against the model's output layer.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
!cp ./Transformer-Train-GloVe.ipynb ./Transformer-Train-Word2Vec.ipynb

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
# fit() below receives no validation data, so no 'val_accuracy' is ever logged
# and a callback monitoring it could never trigger; monitor the training
# accuracy instead.
# NOTE(review): the optimizer is built with `lrate_scheduler`; if that is a
# Keras learning-rate schedule, ReduceLROnPlateau may be unable to adjust the
# learning rate — confirm.
reduce_lr = ReduceLROnPlateau(monitor='accuracy', factor=0.5, patience=10, min_lr=0.00005, verbose=1)

# NOTE(review): training uses the embedded matrices rather than the `dataset`
# pipeline prepared earlier — verify which input the model expects.
model.fit(x=train_embedded_matrix, y=target_embedded_matrix, batch_size=16, epochs=EPOCHS, shuffle=True, callbacks=[reduce_lr])