In [3]:
import import_ipynb
import S2S
from CommonModule.Handle_Dir import mkdir_p, del_folder
from CommonModule.Encoder import IntegerEncoder

import numpy as np
import tensorflow as tf

import os
from glob import iglob
import sentencepiece as spm

import csv

In [4]:
BASE_DIR = "/docker/data/ksb/TestSampleDir"
DATA_BASE_DIR = os.path.join(BASE_DIR, 'articles')

TITLE_PREPROCESSED_PATH= os.path.join(DATA_BASE_DIR,"Title-Preprocessed-Data")
PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR,"Preprocessed-Data")

SUMMARY_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR,"Summary-Preprocessed-Data")

WORD_EMBEDDING_DIR = os.path.join(BASE_DIR, 'Word-Embedding-Model')
MODEL_DIR = os.path.join(BASE_DIR, 'Transformer-Model')

In [15]:
sp = spm.SentencePieceProcessor()
model_num = len(list(iglob('spm-headline-*.vocab', recursive=False))) -1
sp.Load('spm-headline-{}.model'.format(model_num))

True

In [16]:
VOCAB_SIZE = 70000
D_MODEL = 128

In [17]:
options = {
    'model-type' : 'Sentence-Piece',
    'inv_wv' : None,
    'corpus' : None,
    'spm' : sp
}
output_encoded_list = IntegerEncoder(options=options, filepaths=list(iglob(os.path.join(TITLE_PREPROCESSED_PATH, '**.csv'), recursive=False))).encoder()
input_encoded_list = IntegerEncoder(options=options, filepaths=list(iglob(os.path.join(SUMMARY_PREPROCESSED_PATH, '**.csv'), recursive=False))).encoder()


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [18]:
def loss_function(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, MAX_LEN))

    loss = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true, y_pred)

    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)

    return tf.reduce_mean(loss)

In [19]:
model = seq2seq(src_vocab_size=VOCAB_SIZE, tar_vocab_size=VOCAB_SIZE, embedding_size= D_MODEL, units=256)
model.compile(optimizer="rmsprop", loss=loss_function)
model.summary()

Model: "sequence-to-sequence"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
decoder inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder (Functional)            [(None, 256), (None, 9354240     encoder inputs[0][0]             
__________________________________________________________________________________________________
decoder (Functional)            (None, None, 256)    9354240     decoder inputs[0][0]             
                                                                 encoder[0][0] 

In [20]:
get_max_length = lambda x : np.max([len(line) for line in x])

MAX_LEN = get_max_length(input_encoded_list)
MAX_LEN

145

In [21]:
get_max_length(output_encoded_list)

25

In [22]:
BATCH_SIZE = 64
BUFFER_SIZE = 5000

EPOCHS = 30

In [23]:
# Padding
input_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    input_encoded_list, maxlen=MAX_LEN, padding='post')
output_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    output_encoded_list, maxlen=MAX_LEN, padding='post')

In [24]:
print('Summary Shape : {}'.format(input_encoded_matrix.shape))
print('Title Shape : {}'.format(output_encoded_matrix.shape))

Summary Shape : (46571, 145)
Title Shape : (46571, 145)


In [25]:
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'encoder inputs': input_encoded_matrix, # Encoder Input
        'decoder inputs': output_encoded_matrix[:, :] # Decoder Input
    },
    {
        # Decoder Output, Remove <SOS>
        'outputs': output_encoded_matrix[:, :]  
    },
))

In [26]:
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [27]:
for dict_1, dict_2 in dataset.as_numpy_iterator():
    print("encoder input : {enc}, decoder input : {dec}".format(enc= dict_1['encoder inputs'].shape,
                                                               dec=dict_1['decoder inputs'].shape))
    print("output shape : {}".format(dict_2['outputs'].shape))

encoder input : (64, 145), decoder input : (64, 145)
output shape : (64, 145)
encoder input : (64, 145), decoder input : (64, 145)
output shape : (64, 145)
encoder input : (64, 145), decoder input : (64, 145)
output shape : (64, 145)
encoder input : (64, 145), decoder input : (64, 145)
output shape : (64, 145)
encoder input : (64, 145), decoder input : (64, 145)
output shape : (64, 145)
encoder input : (64, 145), decoder input : (64, 145)
output shape : (64, 145)
encoder input : (64, 145), decoder input : (64, 145)
output shape : (64, 145)
encoder input : (64, 145), decoder input : (64, 145)
output shape : (64, 145)
encoder input : (64, 145), decoder input : (64, 145)
output shape : (64, 145)
encoder input : (64, 145), decoder input : (64, 145)
output shape : (64, 145)
encoder input : (64, 145), decoder input : (64, 145)
output shape : (64, 145)
encoder input : (64, 145), decoder input : (64, 145)
output shape : (64, 145)
encoder input : (64, 145), decoder input : (64, 145)
output shap

In [28]:
model.fit(dataset, batch_size=BATCH_SIZE, epochs=15, verbose=2, shuffle=True)

Epoch 1/15
728/728 - 388s - loss: 0.3258
Epoch 2/15
728/728 - 384s - loss: 0.1424
Epoch 3/15
728/728 - 383s - loss: 0.0811
Epoch 4/15
728/728 - 381s - loss: 0.0507
Epoch 5/15
728/728 - 378s - loss: 0.0336
Epoch 6/15
728/728 - 378s - loss: 0.0238
Epoch 7/15
728/728 - 378s - loss: 0.0179
Epoch 8/15
728/728 - 378s - loss: 0.0139
Epoch 9/15
728/728 - 378s - loss: 0.0112
Epoch 10/15
728/728 - 378s - loss: 0.0095
Epoch 11/15
728/728 - 379s - loss: 0.0083
Epoch 12/15
728/728 - 378s - loss: 0.0073
Epoch 13/15
728/728 - 378s - loss: 0.0064
Epoch 14/15
728/728 - 379s - loss: 0.0056
Epoch 15/15
728/728 - 377s - loss: 0.0049


<tensorflow.python.keras.callbacks.History at 0x7fa6b9d039b0>