In [115]:
import import_ipynb
import RC_Transformer
from CommonModule.Handle_Dir import mkdir_p, del_folder
from CommonModule.Embedding import DocumentEmbedding
from CommonModule.Encoder import IntegerEncoder


import re
import os
import numpy as np
from pathlib import Path
from glob import iglob
import tensorflow as tf
import sentencepiece as spm
import csv

In [116]:
BASE_DIR = "/docker/data/ksb/TestSampleDir"
DATA_BASE_DIR = os.path.join(BASE_DIR, 'articles')

PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR,"Preprocessed-Data")

SUMMARY_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR,"Summary-Preprocessed-Data")

WORD_EMBEDDING_DIR = os.path.join(os.path.join(BASE_DIR, 'articleSummary-Jupyter'), 'Word-Embedding-Model')
MODEL_DIR = os.path.join(os.path.join(BASE_DIR, 'articleSummary-Jupyter'), 'Transformer-Model')

In [136]:
D_MODEL = 128

In [137]:
sp = spm.SentencePieceProcessor()
model_num = len(list(iglob('**.vocab', recursive=False))) -1
sp.Load('spm-{}.model'.format(model_num))

True

In [138]:
VOCAB_SIZE = 70000

In [139]:
options = {
    'model-type' : 'Sentence-Piece',
    'inv_wv' : None,
    'corpus' : None,
    'spm' : sp
}
input_encoded_list = IntegerEncoder(options=options, filepaths=list(iglob(os.path.join(PREPROCESSED_PATH, '**.csv'), recursive=False))).encoder()
output_encoded_list = IntegerEncoder(options=options, filepaths=list(iglob(os.path.join(SUMMARY_PREPROCESSED_PATH, '**.csv'), recursive=False))).encoder()


In [140]:
LAYER_NUM = 6
NUM_HEADS = 8
DFF = 512

BATCH_SIZE = 64
BUFFER_SIZE = 5000

WARMUP_STEPS = 50
EPOCHS = 30

In [141]:
get_max_length = lambda x : np.max([len(line) for line in x])

MAX_LEN = get_max_length(input_encoded_list)
MAX_LEN

250

In [142]:
# Padding
input_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    input_encoded_list, maxlen=MAX_LEN, padding='post')
summary_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    output_encoded_list, maxlen=MAX_LEN, padding='post')

In [143]:
print('Contents Shape : {}'.format(input_encoded_matrix.shape))
print('Summaries Shape : {}'.format(summary_encoded_matrix.shape))

Contents Shape : (57632, 250)
Summaries Shape : (57632, 250)


In [144]:
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs': input_encoded_matrix, # Encoder Input
        'dec_inputs': summary_encoded_matrix[:, :] # Decoder Input
    },
    {
        # Decoder Output, Remove <SOS>
        'outputs': summary_encoded_matrix[:, :]  
    },
))

In [145]:
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [146]:
lrate_scheduler = CustomSchedule(d_model=D_MODEL)

In [147]:
beta_1 = 0.9  
beta_2 = 0.98
epsilon = 10 ** -9

optimizer = tf.keras.optimizers.Adam(lrate_scheduler, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [148]:
model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=LAYER_NUM,
    dff=DFF,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout = 0.3)

(None, None, 128)
(None, None, 128)


In [149]:
def loss_function(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, MAX_LEN))

    loss = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true, y_pred)

    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)

    return tf.reduce_mean(loss)

In [2]:
checkpoint_dirpath = os.path.join(MODEL_DIR, "RC_Transformer")
checkpoint_dirpath = "RC_Transformer"

mkdir_p(checkpoint_dirpath)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_dirpath,
    save_weights_only=True,
    monitor='loss',
    mode='max',
    save_best_only=True)

In [155]:
from tensorflow.keras.losses import sparse_categorical_crossentropy

model.compile(optimizer=optimizer, loss=loss_function)
model.summary()

Model: "transformer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_33 (Embedding)        (None, None, 128)    8960000     inputs[0][0]                     
__________________________________________________________________________________________________
tf.math.multiply_46 (TFOpLambda (None, None, 128)    0           embedding_33[0][0]               
__________________________________________________________________________________________________
positional_encoding_33 (Positio (None, None, 128)    0           tf.math.multiply_46[0][0]        
________________________________________________________________________________________

In [None]:
model.fit(dataset, batch_size=BATCH_SIZE, epochs=15, verbose=2, shuffle=True, callbacks=[model_checkpoint_callback])

Epoch 1/10
901/901 - 1163s - loss: 1.0460
Epoch 2/10
901/901 - 1152s - loss: 0.7337
Epoch 3/10
901/901 - 1151s - loss: 0.3987
Epoch 4/10
901/901 - 1151s - loss: 0.1873
Epoch 5/10
901/901 - 1151s - loss: 0.1012
Epoch 6/10
901/901 - 1151s - loss: 0.0750
Epoch 7/10
901/901 - 1150s - loss: 0.0683
Epoch 8/10
901/901 - 1150s - loss: 0.0655
Epoch 9/10
901/901 - 1151s - loss: 0.0632
Epoch 10/10
