In [1]:
import import_ipynb
from Transformer import transformer
from Transformer import CustomSchedule
from CommonModule.Handle_Dir import mkdir_p, del_folder
from CommonModule.Encoder import IntegerEncoder
from CommonModule.Decoder import Decoder

import re
import os
import numpy as np
from pathlib import Path
from glob import iglob
import tensorflow as tf
import sentencepiece as spm
import csv
import pandas as pd

importing Jupyter notebook from RC_Transformer.ipynb
importing Jupyter notebook from /data/ksb/TestSampleDir/articleSummary-Jupyter/Transformer/CommonModule/Handle_Dir.ipynb
importing Jupyter notebook from /data/ksb/TestSampleDir/articleSummary-Jupyter/Transformer/CommonModule/Encoder.ipynb
importing Jupyter notebook from /data/ksb/TestSampleDir/articleSummary-Jupyter/Transformer/CommonModule/Decoder.ipynb


In [2]:
# Make only the first physical GPU visible to TensorFlow (no-op when no GPU is found).
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
except RuntimeError as e:
    # Report the failure and carry on with TensorFlow's default device setup.
    print(e)

In [3]:
# Root locations: articles live under DATA_BASE_DIR, notebook sources under SRC_BASE_DIR.
BASE_DIR = "/data/ksb"
DATA_BASE_DIR = os.path.join(BASE_DIR, 'sample_articles')
SRC_BASE_DIR = os.path.join(BASE_DIR, 'TestSampleDir')

# Validation inputs (articles and their summaries).
VAL_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Valid-Preprocessed-Data")
VAL_SUMMARY_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Valid-Summary-Preprocessed-Data")

# Training inputs (articles and their summaries).
PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Preprocessed-Data")
SUMMARY_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Summary-Preprocessed-Data")

# Destination for the generated-summary CSV files.
PREDICT_PATH = os.path.join(DATA_BASE_DIR, "Predict-Data")

# Tokenizer model files and Transformer checkpoints sit inside the notebook project tree.
WORD_ENCODING_DIR = os.path.join(SRC_BASE_DIR, 'articleSummary-Jupyter', 'Word-Encoding-Model')
MODEL_DIR = os.path.join(SRC_BASE_DIR, 'articleSummary-Jupyter', 'Transformer-Model')


In [4]:
# Model width; passed as d_model to both CustomSchedule and transformer() further down.
D_MODEL = 128

In [5]:
# Load the SentencePiece model whose index is (number of 'spm-input-*.vocab' files) - 1.
sp = spm.SentencePieceProcessor()
vocab_file_pattern = os.path.join(WORD_ENCODING_DIR, 'spm-input-*.vocab')
vocab_file_count = len(list(iglob(vocab_file_pattern, recursive=False)))
model_num = vocab_file_count - 1
# Kept as the cell's last expression so the result of Load() is displayed.
sp.Load(os.path.join(WORD_ENCODING_DIR, 'spm-input-{}.model').format(model_num))


True

In [6]:
# Vocabulary size handed to transformer(vocab_size=...).
# NOTE(review): hard-coded; presumably has to match the loaded SentencePiece model's vocabulary — confirm.
VOCAB_SIZE = 70000

In [7]:
# Encoder configuration: SentencePiece mode, so the word-vector/corpus slots stay empty.
options = {'model-type': 'Sentence-Piece', 'inv_wv': None, 'corpus': None, 'spm': sp}

# Integer-encode every article CSV found directly under PREPROCESSED_PATH ...
article_csv_paths = list(iglob(os.path.join(PREPROCESSED_PATH, '**.csv'), recursive=False))
input_encoded_list = IntegerEncoder(options=options, filepaths=article_csv_paths).encoder()

# ... and every summary CSV found directly under SUMMARY_PREPROCESSED_PATH.
summary_csv_paths = list(iglob(os.path.join(SUMMARY_PREPROCESSED_PATH, '**.csv'), recursive=False))
output_encoded_list = IntegerEncoder(options=options, filepaths=summary_csv_paths).encoder()


In [8]:
# Transformer architecture hyperparameters (forwarded to transformer() when the model is built).
LAYER_NUM = 6
NUM_HEADS = 8
DFF = 512

# tf.data pipeline settings: batch size and shuffle buffer size.
BATCH_SIZE = 64
BUFFER_SIZE = 5000

# Training-schedule settings.
# NOTE(review): WARMUP_STEPS is not referenced by any cell visible in this notebook — confirm it is still needed.
WARMUP_STEPS = 50
EPOCHS = 30

In [9]:
# BOS/EOS ids from the SentencePiece model, wrapped in one-element lists so they can be
# concatenated onto encoded sequences with `+`.
START_TOKEN = [sp.bos_id()]
END_TOKEN = [sp.eos_id()]

In [10]:
def get_max_length(x):
    """Return the length of the longest sequence in ``x`` (computed with ``np.max``)."""
    lengths = [len(line) for line in x]
    return np.max(lengths)

# Length of the longest encoded article in input_encoded_list; echoed as the cell output.
MAX_LEN = get_max_length(input_encoded_list)
MAX_LEN

250

In [11]:
# Length of the longest encoded summary in output_encoded_list; echoed as the cell output.
SUMMARY_MAX_LEN = get_max_length(output_encoded_list)
SUMMARY_MAX_LEN

190

In [12]:
# Wrap every encoded sequence in START/END tokens.
# Re-running this cell wraps the sequences again, because the names are rebound in place.
input_encoded_list = [START_TOKEN + tokens + END_TOKEN for tokens in input_encoded_list]
output_encoded_list = [START_TOKEN + tokens + END_TOKEN for tokens in output_encoded_list]

In [13]:
# Hold out 20% of the (article, summary) pairs; random_state fixes the split across runs.
from sklearn.model_selection import train_test_split
input_train, input_test, output_train, output_test = train_test_split(
    input_encoded_list, output_encoded_list, test_size=0.2, random_state=42)

In [14]:
# Make room for the START and END tokens added to every sequence.
# Not idempotent: re-running this cell increases the value again.
MAX_LEN += 2

In [15]:
# Make room for the START and END tokens added to every summary.
# Not idempotent: re-running this cell increases the value again.
SUMMARY_MAX_LEN +=2

In [16]:
# Right-pad ('post') the training articles and summaries with zeros to a fixed length.
# NOTE(review): summaries are padded to MAX_LEN rather than SUMMARY_MAX_LEN. loss_function
# reshapes labels to MAX_LEN-1, so the two must be changed together if this is revisited.
train_input_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    input_train, maxlen=MAX_LEN, padding='post')
train_summary_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    output_train, maxlen=MAX_LEN, padding='post')

In [17]:
# Right-pad ('post') the held-out articles and summaries with zeros to a fixed length.
# NOTE(review): summaries are padded to MAX_LEN rather than SUMMARY_MAX_LEN. loss_function
# reshapes labels to MAX_LEN-1, so the two must be changed together if this is revisited.
test_input_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    input_test, maxlen=MAX_LEN, padding='post')
test_summary_encoded_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    output_test, maxlen=MAX_LEN, padding='post')

In [18]:
# Sanity check: report the (rows, padded length) shape of each padded matrix.
print('Train Contents Shape : {}'.format(train_input_encoded_matrix.shape))
print('Train Summaries Shape : {}'.format(train_summary_encoded_matrix.shape))
print('Test Contents Shape : {}'.format(test_input_encoded_matrix.shape))
print('Test Summaries Shape : {}'.format(test_summary_encoded_matrix.shape))

Train Contents Shape : (22658, 252)
Train Summaries Shape : (22658, 252)
Test Contents Shape : (5665, 252)
Test Summaries Shape : (5665, 252)


In [19]:
# Training dataset with teacher forcing: the decoder input is the summary without its last
# column, and the label is the same summary shifted left by one (first column dropped).
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs': train_input_encoded_matrix, # Encoder Input
        'dec_inputs': train_summary_encoded_matrix[:, :-1] # Decoder Input
    },
    {
        # Decoder Output, Remove <SOS>
        'outputs': train_summary_encoded_matrix[:, 1:]  
    },
))

In [20]:
# Input pipeline: cache -> shuffle -> batch -> prefetch.
dataset = (
    dataset
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [21]:
# Validation dataset built from the held-out split, using the same teacher-forcing layout
# as the training data: decoder input drops the last column, labels drop the first.
val_features = {
    'inputs': test_input_encoded_matrix,  # Encoder Input
    'dec_inputs': test_summary_encoded_matrix[:, :-1],  # Decoder Input
}
val_labels = {
    # Decoder Output, Remove <SOS>
    'outputs': test_summary_encoded_matrix[:, 1:],
}
val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_features, val_labels))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [22]:
# Learning-rate schedule imported from the Transformer notebook. Only d_model is supplied here.
# NOTE(review): WARMUP_STEPS is not forwarded, so presumably the schedule's own default applies — confirm.
lrate_scheduler = CustomSchedule(d_model=D_MODEL)

In [23]:
# Adam hyperparameters. They are now actually passed to the optimizer; previously these
# names were defined and then shadowed by duplicated literals in the call.
beta_1 = 0.9
beta_2 = 0.98
epsilon = 1e-9

# Adam driven by the custom learning-rate schedule.
optimizer = tf.keras.optimizers.Adam(lrate_scheduler, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon)

In [24]:
# Build the Transformer from the hyperparameters defined above.
# The model is constructed inside a CPU device scope; dropout is fixed at 0.3 here.
with tf.device('/CPU'):
    model = transformer(
        vocab_size=VOCAB_SIZE,
        num_layers=LAYER_NUM,
        dff=DFF,
        d_model=D_MODEL,
        num_heads=NUM_HEADS,
        dropout = 0.3)

(None, None, 128)
(None, None, 128)


In [25]:
def loss_function(y_true, y_pred):
    """Sparse categorical cross-entropy on logits with padding positions zeroed out.

    Labels are reshaped to ``(-1, MAX_LEN-1)``. Positions whose label is 0 contribute
    a loss of 0. The result is the mean over *all* positions (masked ones included),
    not only over the non-padding ones.
    """
    labels = tf.reshape(y_true, shape=(-1, MAX_LEN-1))

    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    per_token_loss = cross_entropy(labels, y_pred)

    # 1.0 where the label is a real token, 0.0 where it is padding (id 0).
    keep = tf.cast(tf.not_equal(labels, 0), tf.float32)

    return tf.reduce_mean(tf.multiply(per_token_loss, keep))

In [26]:
# Location used as the checkpoint path for the model weights.
checkpoint_dirpath = os.path.join(MODEL_DIR, "RC_Transformer")

mkdir_p(checkpoint_dirpath)

# Keep only the best weights seen so far. The monitored quantity is the training loss,
# where lower is better, so the comparison mode must be 'min'. With 'max' the callback
# would only overwrite the checkpoint when the loss got worse.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_dirpath,
    save_weights_only=True,
    monitor='loss',
    mode='min',
    save_best_only=True)

In [30]:
# NOTE(review): sparse_categorical_crossentropy is imported but not used in this cell;
# the model is compiled with the custom masked loss_function instead.
from tensorflow.keras.losses import sparse_categorical_crossentropy

# Attach optimizer and loss, then print the layer summary.
model.compile(optimizer=optimizer, loss=loss_function)
model.summary()

Model: "transformer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 128)    8960000     inputs[0][0]                     
__________________________________________________________________________________________________
tf.math.multiply_1 (TFOpLambda) (None, None, 128)    0           embedding_1[0][0]                
__________________________________________________________________________________________________
positional_encoding_1 (Position (None, None, 128)    0           tf.math.multiply_1[0][0]         
________________________________________________________________________________________

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
# NOTE(review): reduce_lr is constructed but deliberately NOT registered in `callbacks`,
# matching the original cell. The optimizer is driven by a learning-rate schedule object,
# which presumably conflicts with a callback that rewrites the learning rate — confirm
# before enabling it.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.00005, verbose=1)

# Train on the batched tf.data pipelines.
# - epochs comes from the EPOCHS constant instead of a duplicated literal.
# - batch_size / shuffle are omitted: both datasets are already shuffled and batched,
#   and Keras documents these arguments as not applicable to tf.data.Dataset inputs.
model.fit(dataset, epochs=EPOCHS, verbose=2, validation_data=val_dataset,
          callbacks=[model_checkpoint_callback])


Epoch 1/30


In [None]:
# Encoder/decoder configuration used by the inference cells (SentencePiece mode).
option = {}
option['model-type'] = 'Sentence-Piece'
option['inv_wv'] = None
option['corpus'] = None
option['spm'] = sp

In [None]:
def evaluate(content, max_len):
    """Greedily decode a summary for one already-encoded article.

    Args:
        content: encoded article token ids; a batch axis is added in front before the
            model is called. Callers in this notebook pass ids that already include
            the START/END tokens.
        max_len: maximum number of decoding steps.

    Returns:
        A tensor with the batch axis removed, holding START_TOKEN followed by the
        predicted ids. The END token itself is never appended.
    """
    encoder_tokens = tf.expand_dims(content, axis=0)
    decoded = tf.expand_dims(START_TOKEN, axis=0)

    for _ in range(max_len):
        logits = model(inputs=[encoder_tokens, decoded], training=False)

        # Only the most recent position decides the next token.
        last_step = logits[:, -1:, :]
        next_id = tf.squeeze(tf.cast(tf.argmax(last_step, axis=-1), tf.int32))

        # Stop as soon as the model emits the end-of-sequence id.
        if tf.equal(next_id, END_TOKEN[0]):
            break

        decoded = tf.concat([decoded, tf.expand_dims([next_id], axis=0)], axis=-1)

    return tf.squeeze(decoded, axis=0)


In [None]:
def predict(content):
    """Generate a summary for an encoded article and decode it with ``Decoder``.

    Decoding runs for at most SUMMARY_MAX_LEN steps. The predicted ids are converted
    to a plain Python list before being passed to ``Decoder(option).decode``.
    """
    token_ids = evaluate(content, SUMMARY_MAX_LEN).numpy().tolist()
    return Decoder(option).decode(token_ids)

In [None]:
def saveCSVFile(baseDir, media, article_dist):
    """Write ``article_dist`` to ``<baseDir>/<media>.csv``.

    Any existing file is overwritten. The header row is omitted; the index is written.
    """
    destination = os.path.join(baseDir, media)
    destination += ".csv"

    article_dist.to_csv(destination, header=False, mode='w')

In [None]:
def get_media_name(filepath):
    """Return the file name of ``filepath`` up to (not including) its first dot."""
    # Text after the last os.sep (or the whole string when there is no separator).
    _, _, filename = filepath.rpartition(os.sep)
    # Text before the first '.', e.g. 'KBS.csv' -> 'KBS'.
    stem, _, _ = filename.partition(".")
    return stem

In [None]:
# Start from a clean prediction directory: call del_folder on it, then mkdir_p to recreate it.
# Order matters — the directory must be recreated after it has been deleted.
del_folder(PREDICT_PATH)
mkdir_p(PREDICT_PATH)

src data (기사 본문과 비교)

#### ROUGE-L Score  
reference : 기사 본문  
produced summary : RC-Transformer 생성 요약문 

In [None]:
import nltk.translate.bleu_score as bleu
from rouge import Rouge 
rouge = Rouge()

# One decoder is enough for the whole loop; it is built from the same `option` every time.
decoder_src = Decoder(option)

# Column order of the resulting frame.
source_rouge_columns = ['Origin Contents', 'Generated Summary',
                        'ROUGE-F1', 'ROUGE-Recall', 'ROUGE-Precision',
                        'ROUGE-F1-with-Target', 'ROUGE-Recall-with-Target', 'ROUGE-Precision-with-Target']

# Rows are collected in a list and turned into a DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
source_rouge_rows = []

for inp, outp in zip(input_test, output_test):
    input_sent = decoder_src.decode(inp)
    target_sent = decoder_src.decode(outp)
    
    predict_summary = predict(inp)
    
    print('Input: {}'.format(input_sent))
    print('Target : {}'.format(target_sent))
    print('Output: {}'.format(predict_summary))
    
    # ROUGE-L of the generated summary against the article body.
    rouge_scores = rouge.get_scores(predict_summary, input_sent)
    rouge_f = rouge_scores[0]['rouge-l']['f']
    print('Rouge F1: {}'.format(rouge_f * 100))
    
    rouge_r = rouge_scores[0]['rouge-l']['r']
    print('Rouge Recall: {}'.format(rouge_r * 100))
    
    rouge_p = rouge_scores[0]['rouge-l']['p']
    print('Rouge Precision: {}\n'.format(rouge_p * 100))
    
    # ROUGE-L of the target summary against the article body, for comparison.
    rouge_scores_with_target = rouge.get_scores(target_sent, input_sent)
    tar_rouge_f = rouge_scores_with_target[0]['rouge-l']['f']
    print('Rouge F1: {}'.format(tar_rouge_f * 100))
    
    tar_rouge_r = rouge_scores_with_target[0]['rouge-l']['r']
    print('Rouge Recall: {}'.format(tar_rouge_r * 100))
    
    tar_rouge_p = rouge_scores_with_target[0]['rouge-l']['p']
    print('Rouge Precision: {}\n'.format(tar_rouge_p * 100))
    
    source_rouge_rows.append({
        'Origin Contents' : input_sent, 'Generated Summary' : predict_summary,
        'ROUGE-F1': rouge_f, 'ROUGE-Recall': rouge_r, 'ROUGE-Precision': rouge_p, 
        'ROUGE-F1-with-Target': tar_rouge_f, 'ROUGE-Recall-with-Target': tar_rouge_r,
        'ROUGE-Precision-with-Target': tar_rouge_p})

generated_summary_dist = pd.DataFrame(source_rouge_rows, columns=source_rouge_columns)
    

target data (추출 요약문과 비교)

#### ROUGE-L Score  
reference : gensim summarize 라이브러리를 이용해 추출한 추출 요약문    
produced summary : RC-Transformer 생성 요약문 

In [None]:
rouge = Rouge()

# One decoder is enough for the whole loop; it is built from the same `option` every time.
decoder_src = Decoder(option)

# Column order of the resulting frame.
target_rouge_columns = ['Origin Contents', 'Target Summary', 'Generated Summary', 
                        'ROUGE F1', 'ROUGE Recall', 'ROUGE Precision']

# Rows are collected in a list and turned into a DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
target_rouge_rows = []

for inp, outp in zip(input_test, output_test):
    input_sent = decoder_src.decode(inp)
    target_sent = decoder_src.decode(outp)
    
    predict_summary = predict(inp)
    
    print('Input : {}'.format(input_sent))
    print('Target : {}'.format(target_sent))
    print('Output : {}'.format(predict_summary))
    
    # ROUGE-L of the generated summary against the target summary.
    rouge_scores = rouge.get_scores(predict_summary, target_sent)
    rouge_f = rouge_scores[0]['rouge-l']['f']
    print('Rouge F1: {}'.format(rouge_f * 100))
    
    rouge_r = rouge_scores[0]['rouge-l']['r']
    print('Rouge Recall: {}'.format(rouge_r * 100))
    
    rouge_p = rouge_scores[0]['rouge-l']['p']
    print('Rouge Precision: {}\n'.format(rouge_p * 100))
    
    target_rouge_rows.append({
        'Origin Contents' : input_sent, 'Target Summary' : target_sent,
        'Generated Summary' : predict_summary,
        'ROUGE F1': rouge_f, 'ROUGE Recall': rouge_r, 'ROUGE Precision': rouge_p})

generated_summary_target_dist = pd.DataFrame(target_rouge_rows, columns=target_rouge_columns)
    

In [None]:
import nltk.translate.bleu_score as bleu
from rouge import Rouge 
rouge = Rouge()

# Column order of the resulting frame.
validation_columns = ['Origin Contents', 'Generated Summary', 
                      'ROUGE F1', 'ROUGE Recall', 'ROUGE Precision']

# Rows are collected in a list and turned into a DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
validation_rows = []

# Fixed: the class is IntegerEncoder (the original spelled it 'IntergerEncoder', a NameError).
# Built once, since its arguments do not change between rows.
encoder = IntegerEncoder(options=option, filepaths=None)

for val_proc_path in iglob(os.path.join(VAL_PREPROCESSED_PATH, '**.csv'), recursive=False):

    media_name = get_media_name(val_proc_path)
    val_summary_path = os.path.join(VAL_SUMMARY_PREPROCESSED_PATH, media_name +".csv")
    print(media_name, val_proc_path)
    
    # Context managers make sure both files are closed, even if a row fails.
    with open(val_proc_path, 'r', newline="\n", encoding="utf-8") as f_src, \
         open(val_summary_path, 'r', newline="\n", encoding="utf-8") as f_tar:
        
        # Each row is expected to have exactly three columns; the third holds tab-separated text.
        for [_, title, contents], [_, _, target] in zip(csv.reader(f_src), csv.reader(f_tar)):
            content = contents.split("\t")
            target_summary = target.split("\t")
            
            input_sent = ' '.join(content)
            # Fixed: the original passed the undefined name `content_line` (NameError).
            # NOTE(review): `content` (the tab-split list) is assumed to be the intended
            # argument; if line_encoder expects a single string, pass `input_sent` — confirm.
            input_enc_sent = START_TOKEN + encoder.line_encoder(content) + END_TOKEN
            
            predict_summary = predict(input_enc_sent)
            target_summary = ' '.join(target_summary)
        
            print('Input: {}'.format(input_sent))
            print('Output: {}'.format(predict_summary))
        
            # ROUGE-L of the generated summary against the article body.
            rouge_scores = rouge.get_scores(predict_summary, input_sent)
            rouge_f = rouge_scores[0]['rouge-l']['f']
            print('Rouge F1: {}'.format(rouge_f * 100))
        
            rouge_r = rouge_scores[0]['rouge-l']['r']
            print('Rouge Recall: {}'.format(rouge_r * 100))
        
            rouge_p = rouge_scores[0]['rouge-l']['p']
            print('Rouge Precision: {}\n'.format(rouge_p * 100))
        
            validation_rows.append({
                'Origin Contents' : input_sent, 'Generated Summary' : predict_summary,
                'ROUGE F1': rouge_f, 'ROUGE Recall': rouge_r, 'ROUGE Precision': rouge_p})

validation_generated_dist = pd.DataFrame(validation_rows, columns=validation_columns)
    

In [None]:
# Save the source-comparison scores under their own file name. The following cell writes
# the target-comparison scores to 'KBS.csv', which previously overwrote this output.
saveCSVFile(PREDICT_PATH, 'KBS-source', generated_summary_dist)

In [None]:
# Write the target-comparison scores to <PREDICT_PATH>/KBS.csv (any existing file is overwritten).
saveCSVFile(PREDICT_PATH, 'KBS', generated_summary_target_dist)