In [2]:
import import_ipynb
from S2S_Attention import Encoder, Decoder
from Transformer.CommonModule.Handle_Dir import mkdir_p, del_folder
from Transformer.CommonModule.Encoder import IntegerEncoder

import numpy as np
import tensorflow as tf

import os
from glob import iglob
import sentencepiece as spm

import csv

importing Jupyter notebook from /data/ksb/TestSampleDir/articleSummary-Jupyter/Transformer/CommonModule/Handle_Dir.ipynb
importing Jupyter notebook from /data/ksb/TestSampleDir/articleSummary-Jupyter/Transformer/CommonModule/Encoder.ipynb


In [3]:
BASE_DIR = "/data/ksb/"
DATA_BASE_DIR = os.path.join(BASE_DIR, 'sample_articles')
SRC_BASE_DIR = os.path.join(BASE_DIR, 'TestSampleDir')

TITLE_PREPROCESSED_PATH= os.path.join(DATA_BASE_DIR,"Title-Preprocessed-Data")
PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR,"Preprocessed-Data")

SUMMARY_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR,"Summary-Preprocessed-Data")

WORD_ENCODING_DIR = os.path.join(os.path.join(SRC_BASE_DIR, 'articleSummary-Jupyter'), 'Word-Encoding-Model')
MODEL_DIR = os.path.join(SRC_BASE_DIR, 'Transformer-Model')

In [4]:
sp = spm.SentencePieceProcessor()
model_num = len(list(iglob(os.path.join(WORD_ENCODING_DIR, 'spm-input-*.vocab'), recursive=False))) -1
sp.Load(os.path.join(WORD_ENCODING_DIR, 'spm-input-{}.model').format(model_num))


True

In [5]:
VOCAB_SIZE = 70000
D_MODEL = 128

In [6]:
options = {
    'model-type' : 'Sentence-Piece',
    'inv_wv' : None,
    'corpus' : None,
    'spm' : sp
}
output_encoded_list = IntegerEncoder(options=options, filepaths=list(iglob(os.path.join(TITLE_PREPROCESSED_PATH, '**.csv'), recursive=False))).encoder()
input_encoded_list = IntegerEncoder(options=options, filepaths=list(iglob(os.path.join(SUMMARY_PREPROCESSED_PATH, '**.csv'), recursive=False))).encoder()


In [7]:
get_max_length = lambda x : np.max([len(line) for line in x])

MAX_LEN = get_max_length(input_encoded_list)
MAX_LEN

190

In [8]:
SUMMARY_MAX_LEN = get_max_length(output_encoded_list)
SUMMARY_MAX_LEN

33

In [9]:
BATCH_SIZE = 64
BUFFER_SIZE = 5000

EPOCHS = 30
ENC_UNITS = 128
DEC_UNITS = ENC_UNITS * 2

START_TOKEN = [sp.bos_id()]
END_TOKEN = [sp.eos_id()]

In [10]:
input_encoded_list = list(map(lambda list_ : START_TOKEN + list_ + END_TOKEN, input_encoded_list))
output_encoded_list = list(map(lambda list_ : START_TOKEN + list_ + END_TOKEN, output_encoded_list))

In [11]:
MAX_LEN+=2
SUMMARY_MAX_LEN+=2

In [12]:
# Padding
input_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    input_encoded_list, maxlen=MAX_LEN, padding='post')
output_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    output_encoded_list, maxlen=MAX_LEN, padding='post')

In [13]:
print('Summary Shape : {}'.format(input_encoded_matrix.shape))
print('Title Shape : {}'.format(output_encoded_matrix.shape))

Summary Shape : (28323, 192)
Title Shape : (28323, 192)


In [14]:
dataset = tf.data.Dataset.from_tensor_slices((input_encoded_matrix, output_encoded_matrix))

In [15]:
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [16]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 192]), TensorShape([64, 192]))

In [17]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [23]:
@tf.function
def train_step(inp, targ):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp)

        dec_hidden = enc_hidden[0]

        # print("dec_hidden shape : {}".format(dec_hidden))

        dec_input = tf.expand_dims(START_TOKEN * BATCH_SIZE, 1)

        # 교사 강요(teacher forcing) - 다음 입력으로 타겟을 피딩(feeding)합니다.
        for t in range(1, targ.shape[1]):
            # enc_output를 디코더에 전달합니다.
            #   print("enc_output shape : {}".format(enc_output))
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            #   print("dec_hidden shape : {}".format(dec_hidden.shape))
            loss += loss_function(targ[:, t], predictions)

            # 교사 강요(teacher forcing)를 사용합니다.
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [19]:
encoder = Encoder(VOCAB_SIZE, D_MODEL, ENC_UNITS, BATCH_SIZE, cell='lstm')

with tf.device('/GPU:1'):
    sample_output, sample_hidden = encoder(example_input_batch)
    print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
    print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden[0].shape))
    print ('Encoder Cell state shape: (batch size, units) {}'.format(sample_hidden[1].shape))

Encoder output shape: (batch size, sequence length, units) (64, 192, 256)
Encoder Hidden state shape: (batch size, units) (64, 256)
Encoder Cell state shape: (batch size, units) (64, 256)


In [20]:
decoder = Decoder(VOCAB_SIZE, D_MODEL, DEC_UNITS, BATCH_SIZE, cell='lstm')

with tf.device('/GPU:1'):
    hidden = sample_hidden[0]
    sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                          hidden, sample_output)

    print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 70000)


In [21]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [None]:
import time

EPOCHS = 10
BUFFER_SIZE = input_encoded_matrix.shape[0]
BATCH_SIZE = 64
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

with tf.device('/GPU:1'):
    for epoch in range(EPOCHS): 
        start = time.time()

#         enc_hidden = encoder.initialize_hidden_state()
        total_loss = 0

        for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ)
            total_loss += batch_loss

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch,
                                                       batch_loss.numpy()))
        # 에포크가 2번 실행될때마다 모델 저장 (체크포인트)
        if (epoch + 1) % 2 == 0:
            checkpoint.save(file_prefix = checkpoint_prefix)

        print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                          total_loss / steps_per_epoch))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.6764
Epoch 1 Batch 100 Loss 0.4685
Epoch 1 Batch 200 Loss 0.4890
Epoch 1 Batch 300 Loss 0.4790
Epoch 1 Batch 400 Loss 0.4476
Epoch 1 Loss 0.4900
Time taken for 1 epoch 595.5211997032166 sec

Epoch 2 Batch 0 Loss 0.4859
Epoch 2 Batch 100 Loss 0.4396
Epoch 2 Batch 200 Loss 0.4473
Epoch 2 Batch 300 Loss 0.4520
Epoch 2 Batch 400 Loss 0.4371
Epoch 2 Loss 0.4454
Time taken for 1 epoch 380.66272926330566 sec

Epoch 3 Batch 0 Loss 0.4624
Epoch 3 Batch 100 Loss 0.4278
Epoch 3 Batch 200 Loss 0.4096
Epoch 3 Batch 300 Loss 0.4292
Epoch 3 Batch 400 Loss 0.4141
Epoch 3 Loss 0.4205
Time taken for 1 epoch 386.43679022789 sec

Epoch 4 Batch 0 Loss 0.4272
Epoch 4 Batch 100 Loss 0.4231
Epoch 4 Batch 200 Loss 0.3939
Epoch 4 Batch 300 Loss 0.3936
Epoch 4 Batch 400 Loss 0.3839
Epoch 4 Loss 0.4006
Time taken for 1 epoch 387.2936415672302 sec

Epoch 5 Batch 0 Loss 0.4211
Epoch 5 Batch 100 Loss 0.3835
Epoch 5 Batch 200 Loss 0.3662
Epoch 5 Batch 300 Loss 0.3503
Epoch 5 Batch 400 Loss 0.38

In [None]:
def evaluate(sentence):
    attention_plot = np.zeros((SUMMARY_MAX_LEN, MAX_LEN))


    inputs = sp.encode_as_ids(sentence)
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=MAX_LEN,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''
    
    enc_out, enc_stats = encoder(inputs)
    
#     enc_hidden_stat = enc_stats

    enc_hidden_stat = enc_stats[0]
    enc_cell_stat = enc_stats[1]

    print(enc_hidden_stat.shape)

    dec_hidden = enc_hidden_stat
    dec_input = tf.expand_dims(START_TOKEN, 0)

    idx_list = []
    for t in range(SUMMARY_MAX_LEN):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # 나중에 어텐션 가중치를 시각화하기 위해 어텐션 가중치를 저장합니다.
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()
        # print(sp.decode_ids([predicted_id]))

        # result += sp.decode_ids([predicted_id])[0] + ' '
        idx_list.append(int(predicted_id))

        if [predicted_id] == END_TOKEN:
            return ' '.join(sp.decode_ids(idx_list)), sentence, attention_plot

        # 예측된 ID를 모델에 다시 피드합니다.
        dec_input = tf.expand_dims([predicted_id], 0)

    return ' '.join(sp.decode_ids(idx_list)), sentence, attention_plot

In [40]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.font_manager as fm

path = './NanumBarunGothic.ttf'

# 어텐션 가중치를 그리기 위한 함수입니다.
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(14,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'family': 'NanumBarunGothic', 'fontsize': 14}
    fontprop = fm.FontProperties(fname = path, size=18)


    ax.set_xticklabels([''] + sentence, fontproperties=fontprop, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontproperties=fontprop, fontdict=fontdict)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))


    plt.show()

In [41]:
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [1]:
translate("이 아이의 장기판매도 가능하다는 글도 덧붙였다 4분 뒤 우리집 내 딸 판매합니다 라는 제목의 글이 또 올라왔다")

NameError: name 'translate' is not defined