In [57]:
import import_ipynb
import Transformer_Implement as ti
import tensorflow as tf

import re
import os
import numpy as np

from glove import Corpus, Glove

경로 설정

In [16]:
# Root directory that holds both the article data and the saved embedding models.
BASE_DIR = "/data/TestDir"

# Article data: full bodies and their summaries, both already preprocessed.
DATA_BASE_DIR = os.path.join(BASE_DIR, 'sample_articles')
PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Preprocessed-Data")
SUMMARY_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Summary-Preprocessed-Data")

# Saved word-embedding artifacts (GloVe model and the corpus it was built from).
MODEL_BASE_DIR = os.path.join(BASE_DIR, 'articleSummary-Jupyter', 'Word-Embedding')
glove_model_path, corpus_model_path = (
    os.path.join(MODEL_BASE_DIR, model_file)
    for model_file in ('glove.model', 'corpus.model')
)

말뭉치와 함께 임베딩 모델 불러오기

In [17]:
# Restore the previously saved GloVe embedding model and its corpus from the
# paths configured above. `corpus.dictionary` is used further down for the
# vocabulary size and for the token -> integer id lookup.
glove = Glove.load(glove_model_path)
corpus = Corpus.load(corpus_model_path)

In [18]:
class RawTextReader:
    """Re-iterable reader that yields the text fragments of one UTF-8 file.

    Every line read from the file is split with ``rgxSplitter`` and each
    resulting piece is yielded in file order. Lines keep their trailing
    newline character because the splitter pattern does not match it.
    """

    def __init__(self, filepath):
        """Store the path to read and compile the fragment splitter.

        Args:
            filepath: Path of the text file to iterate over.
        """
        self.filepath = filepath
        # NOTE(review): this pattern matches the two literal characters "/n",
        # not a newline. It looks like a typo for "\n"; it is left unchanged
        # here because the on-disk format of the preprocessed files is not
        # visible from this notebook -- confirm before changing it.
        self.rgxSplitter = re.compile("/n")

    def __iter__(self):
        """Yield each fragment of each line of the file.

        Yields:
            str: One piece of a line, as produced by ``rgxSplitter.split``.
        """
        # The context manager guarantees the handle is closed, including when
        # the consumer abandons the generator early. The previous bare
        # ``open()`` leaked one descriptor per article.
        with open(self.filepath, encoding='utf-8') as source:
            for line in source:
                yield from self.rgxSplitter.split(line)

In [19]:
class IntegerEncoder:
    """Turn every article under a two-level directory tree into a vector of token ids.

    The directory walk expects ``filepath/<media>/<article>``: each entry of
    ``filepath`` is listed as a directory and each entry inside it is read as
    one article file.
    """

    def __init__(self, filepath, corpus):
        """Store the data root and the corpus used for the id lookup.

        Args:
            filepath: Root directory containing one sub-directory per media outlet.
            corpus: Object whose ``dictionary`` attribute maps token -> integer id.
        """
        self.filepath = filepath
        self.corpus = corpus

    def get_encoded_vec_matrix(self):
        """Encode every article into an integer vector.

        Tokens are obtained by whitespace-splitting each non-empty fragment
        produced by ``RawTextReader``; tokens missing from the corpus
        dictionary are dropped.

        Returns:
            numpy.ndarray: 1-D array of dtype ``object`` with one entry per
            article; each entry is a 1-D ``int64`` array of token ids (its
            length varies per article).
        """
        # Use the corpus handed to this instance, not whatever global happens
        # to be named `corpus` in the notebook.
        dictionary = self.corpus.dictionary

        encoded_vec_list = []
        # os.listdir() returns entries in arbitrary order. Rows produced by two
        # encoders (articles / summaries) are paired by position later on, so
        # the traversal order is made deterministic by sorting.
        for media_name in sorted(os.listdir(self.filepath)):
            media_path = os.path.join(self.filepath, media_name)

            for article_name in sorted(os.listdir(media_path)):
                reader = RawTextReader(os.path.join(media_path, article_name))
                token_ids = [
                    dictionary[token]
                    for sent in reader
                    if sent  # skip empty fragments
                    for token in sent.split()
                    if token in dictionary
                ]
                encoded_vec_list.append(np.array(token_ids, dtype=np.int64))

        # Rows have different lengths; np.array(list_of_ragged_arrays) raises on
        # recent NumPy releases, so the object array is filled explicitly.
        matrix = np.empty(len(encoded_vec_list), dtype=object)
        for row, vec in enumerate(encoded_vec_list):
            matrix[row] = vec
        return matrix

In [50]:
# --- Transformer architecture (forwarded to ti.Transformer below) ---
VOCAB_SIZE = len(corpus.dictionary)  # entries in the loaded corpus dictionary
D_MODEL = 256                        # d_model; also given to the LR schedule
NUM_HEADS = 8                        # num_heads
LAYER_NUM = 6                        # layer_num
DFF = 512                            # dff

# --- tf.data input pipeline ---
BUFFER_SIZE = 20000                  # shuffle buffer size
BATCH_SIZE = 64                      # examples per batch

# --- Training ---
EPOCHS = 70                          # epochs passed to model.fit
WARMUP_STEPS = 50                    # warmup_steps of the LR schedule

In [42]:
# <SOS>/<EOS> take the two ids directly after the corpus vocabulary.
# They are derived from the corpus itself rather than from VOCAB_SIZE, because
# VOCAB_SIZE is increased by 2 in a later cell: re-running this cell afterwards
# would otherwise shift both ids outside the embedding range.
START_TOKEN, END_TOKEN = [len(corpus.dictionary)], [len(corpus.dictionary) + 1]

In [55]:
print('<SOS> Token : {}'.format(START_TOKEN))
print('<EOS> Token : {}'.format(END_TOKEN))

# Account for the two special tokens. Recomputed from the corpus instead of
# `VOCAB_SIZE += 2`, so running this cell more than once cannot keep inflating
# the vocabulary size.
VOCAB_SIZE = len(corpus.dictionary) + 2
print('Vocabulary Size : {}'.format(VOCAB_SIZE))

<SOS> Token : [65848]
<EOS> Token : [65849]
Vocabulary Size : 65850


전처리된 99,237개의 기사 본문 데이터에 정수 인코딩을 수행한다.  

정수 인코딩 수행 후에 (99237, None)의 크기를 가지는 Matrix가 되며,   
각 인코딩된 값을 임베딩 벡터 값으로 대체한 후에는 (99237, `max_token`, 256)의 크기를 가지는 Tensor가 된다.  
- `max_token`은 각 기사 본문의 토큰 수 중 가장 큰 값을 의미한다.

In [30]:
# Integer-encode every preprocessed article body; the result holds one
# variable-length id vector per article.
train_encoder = IntegerEncoder(filepath=PREPROCESSED_PATH, corpus=corpus)
train_encoded_matrix = train_encoder.get_encoded_vec_matrix()
train_encoded_matrix.shape



(99237,)

In [56]:
def get_max_length(x):
    """Return the length of the longest sequence in ``x``.

    Args:
        x: Iterable of sized sequences (lists, arrays, ...).

    Returns:
        int: The largest ``len(seq)`` found, or 0 when ``x`` is empty.
    """
    # max(..., default=0) avoids the ValueError that np.max raises on an empty
    # collection, and avoids building an intermediate list of lengths.
    return max((len(seq) for seq in x), default=0)
# Token count of the longest article body; used further down as the padding
# length in encode_and_padding.
MAX_LEN = get_max_length(train_encoded_matrix)
MAX_LEN

5813

In [None]:
# Integer-encode the preprocessed summaries with the same corpus dictionary.
# IntegerEncoder only defines get_encoded_vec_matrix(); the previous call to
# get_encoded_vec_list() raised AttributeError.
target_encoder = IntegerEncoder(SUMMARY_PREPROCESSED_PATH, corpus)
target_encoded_matrix = target_encoder.get_encoded_vec_matrix()

In [None]:
# Widen MAX_LEN if any summary is longer than the longest article body.
# (The former `a if cond` without an `else` branch was a SyntaxError.)
target_max_len = get_max_length(target_encoded_matrix)
MAX_LEN = max(MAX_LEN, target_max_len)

In [47]:
def encode_and_padding(inputs, outputs, max_len=None):
    """Wrap each sequence in <SOS>/<EOS> ids and pad everything to one length.

    Args:
        inputs: Iterable of article token-id sequences.
        outputs: Iterable of summary token-id sequences, paired with
            ``inputs`` by position (extra items on either side are ignored,
            as with ``zip``).
        max_len: Length of every padded row. Defaults to ``MAX_LEN + 2`` so
            the longest sequence still fits once both special tokens are added.

    Returns:
        tuple: ``(padded_inputs, padded_outputs)`` as returned by
        ``pad_sequences`` with post-padding.
    """
    if max_len is None:
        # Without the +2, pad_sequences' default 'pre' truncation cuts the
        # <SOS> token off the longest sequence.
        max_len = MAX_LEN + 2

    tokenized_inputs, tokenized_outputs = [], []

    for content, summary in zip(inputs, outputs):
        # Build plain lists. `START_TOKEN + ndarray` is NOT a concatenation:
        # NumPy broadcasts it into an element-wise addition, which silently
        # corrupted every token id and added no <SOS>/<EOS> at all.
        tokenized_inputs.append([*START_TOKEN, *content, *END_TOKEN])
        tokenized_outputs.append([*START_TOKEN, *summary, *END_TOKEN])

    # NOTE(review): padding uses pad_sequences' default fill value; if the
    # corpus dictionary also assigns that id to a real token, padding and that
    # token are indistinguishable -- TODO confirm against corpus.dictionary.
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=max_len, padding='post')
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_outputs, maxlen=max_len, padding='post')

    return tokenized_inputs, tokenized_outputs

In [None]:
# Add the special tokens and pad articles and summaries to a common length.
contents, summaries = encode_and_padding(
    inputs=train_encoded_matrix,
    outputs=target_encoded_matrix,
)

for shape_report in (
    'Contents Shape : {}'.format(contents.shape),
    'Summaries Shape : {}'.format(summaries.shape),
):
    print(shape_report)

In [49]:
# Teacher-forcing layout: the decoder reads the summary without its last
# position and is trained to predict the summary without its first position
# (i.e. shifted left by one, dropping <SOS>).
# NOTE(review): the dictionary keys presumably have to match the input/output
# layer names of the model built by ti.Transformer -- confirm there.
dataset = (
    tf.data.Dataset.from_tensor_slices((
        {
            'encoder_inputs': contents,
            'decoder_inputs': summaries[:, :-1],
        },
        {
            'Output': summaries[:, 1:],
        },
    ))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

NameError: name 'tf' is not defined

In [37]:
# Learning-rate schedule defined in the local Transformer_Implement notebook,
# configured with the model width and the number of warm-up steps; it is handed
# to the Adam optimizer below.
# NOTE(review): presumably the warm-up schedule from "Attention Is All You
# Need" -- confirm in Transformer_Implement.
lrate_scheduler = ti.LearningRate(d_model=D_MODEL, warmup_steps=WARMUP_STEPS)

NameError: name 'D_MODEL' is not defined

In [None]:
# Adam hyperparameters. They are now actually passed to the optimizer; before,
# these names were defined but the call below hard-coded duplicate literals, so
# editing them had no effect.
beta_1 = 0.9
beta_2 = 0.98
epsilon = 1e-9  # same literal the optimizer call used previously

optimizer = tf.keras.optimizers.Adam(
    lrate_scheduler, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon)

In [None]:
# Build the Keras model from the Transformer defined in the local
# Transformer_Implement notebook, using the hyperparameters configured above.
model = ti.Transformer(
    vocab_size=VOCAB_SIZE,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    layer_num=LAYER_NUM,
    dff=DFF,
).get_transformer()

In [None]:
# The targets fed by the dataset ('Output') are integer token ids produced by
# pad_sequences, not one-hot vectors, so the sparse variant of the loss is
# required; 'categorical_crossentropy' expects one-hot targets.
# NOTE(review): a string loss assumes the model emits probabilities (softmax),
# and padded positions are not masked out of the loss/accuracy -- confirm both
# against Transformer_Implement.
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Train on the batched tf.data pipeline for EPOCHS passes over the data.
model.fit(dataset, epochs=EPOCHS)