In [22]:
import import_ipynb
from Transformer_Implement import Transformer, LearningRate
from CommonModule.Handle_Dir import mkdir_p, del_folder
from CommonModule.Embedding import Embedding
from CommonModule.Encoder import Encoder
from CommonModule.Decoder import Decoder

import os
import re
import csv
import pandas as pd
import numpy as np
from glob import iglob
import tensorflow as tf
from glove import Corpus, Glove


In [25]:
# Root of the workspace; every other location in this notebook is derived from it.
BASE_DIR = "/data/ksb/TestSampleDir"
DATA_BASE_DIR = os.path.join(BASE_DIR, 'articles')

# Article data folders under DATA_BASE_DIR. PREPROCESSED_PATH is scanned for
# input CSV files further down and PREDICT_PATH receives the generated CSVs.
PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Preprocessed-Data")
SUMMARY_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Summary-Preprocessed-Data")
PREDICT_PATH = os.path.join(DATA_BASE_DIR, "Predict-Data")

# Model artefact folders: GloVe/corpus files and Transformer checkpoints.
WORD_EMBEDDING_DIR = os.path.join(BASE_DIR, 'articleSummary-Jupyter', 'Word-Embedding-Model')
MODEL_DIR = os.path.join(BASE_DIR, 'articleSummary-Jupyter', 'Transformer-Model')


In [17]:
# Embedding width: part of the GloVe/corpus model file names and passed to the
# Transformer constructor as d_model.
D_MODEL = 128
# Second component of the GloVe/corpus model file names.
# NOTE(review): presumably the minimum word count used when the vocabulary was built - confirm.
MIN_COUNT = 10

# Upper bound on decoding steps that predict() hands to evaluate().
SUMMARY_MAX_LEN = 100

In [5]:
# Both artefacts live in WORD_EMBEDDING_DIR and carry the same
# "<d_model>-<mincount>" suffix in their file names.
glove_model_path = os.path.join(
    WORD_EMBEDDING_DIR,
    'glove-{d_model}-{mincount}.model'.format(d_model=D_MODEL, mincount=MIN_COUNT),
)
corpus_model_path = os.path.join(
    WORD_EMBEDDING_DIR,
    'input-corpus-{d_model}-{mincount}.model'.format(d_model=D_MODEL, mincount=MIN_COUNT),
)

In [6]:
# Load the saved GloVe model; its word_vectors array is indexed by token id in evaluate().
# NOTE(review): glove-python's load() is believed to unpickle the file - only
# point these paths at trusted files; confirm against the library source.
glove = Glove.load(glove_model_path)
# Load the saved corpus; corpus.dictionary is used below as the token -> id mapping.
corpus = Corpus.load(corpus_model_path)

In [7]:
# Sentence boundary markers, kept as one-element lists so they can be
# concatenated directly with a list of tokens.
START_TOKEN = ['<SOS>']
END_TOKEN = ['<EOS>']

# Vocabulary ids of the boundary markers, looked up in the corpus dictionary.
START_TOKEN_NUM = corpus.dictionary[START_TOKEN[0]]
END_TOKEN_NUM = corpus.dictionary[END_TOKEN[0]]

# Architecture settings passed to the Transformer constructor
# (layer_num, num_heads, dff and vocab_size respectively).
LAYER_NUM = 6
NUM_HEADS = 8
DFF = 512
VOCAB_SIZE = len(corpus.dictionary)

# NOTE(review): the four values below are not referenced in the visible cells of
# this notebook; presumably they mirror the training configuration - confirm.
BATCH_SIZE = 64
BUFFER_SIZE = 5000

WARMUP_STEPS = 50
EPOCHS = 70

Transformer 모델 생성

In [8]:
# Build the network inside an explicit CPU device scope; get_transformer()
# returns the object that is later called and loaded with weights as `model`.
with tf.device('/CPU:0'):
    model = Transformer(
        vocab_size=VOCAB_SIZE,
        d_model=D_MODEL,
        layer_num=LAYER_NUM,
        num_heads=NUM_HEADS,
        dff=DFF,
        dropout=0.3,
    ).get_transformer()

(1, 58112, 128)
encoder_layer_0 sub-layer 1
[Input] Q shape : (None, None, 128), K shape : (None, None, 128), V shape : (None, None, 128)

[Dense] Q shape : (None, None, 128), K shape : (None, None, 128), V shape : (None, None, 128)

[Splited] Q shape : (None, 8, None, 16), K shape : (None, 8, None, 16), V shape : (None, 8, None, 16)

encoder_layer_0 sub-layer 2
encoder_layer_1 sub-layer 1
[Input] Q shape : (None, None, 128), K shape : (None, None, 128), V shape : (None, None, 128)

[Dense] Q shape : (None, None, 128), K shape : (None, None, 128), V shape : (None, None, 128)

[Splited] Q shape : (None, 8, None, 16), K shape : (None, 8, None, 16), V shape : (None, 8, None, 16)

encoder_layer_1 sub-layer 2
encoder_layer_2 sub-layer 1
[Input] Q shape : (None, None, 128), K shape : (None, None, 128), V shape : (None, None, 128)

[Dense] Q shape : (None, None, 128), K shape : (None, None, 128), V shape : (None, None, 128)

[Splited] Q shape : (None, 8, None, 16), K shape : (None, 8, None, 1

학습된 가중치 로드

In [9]:
# Select the newest checkpoint by parsing the numeric suffix of every
# "model-checkpoint-<n>.index" file in MODEL_DIR. Counting the index files
# instead would break when checkpoints have been pruned, when unrelated
# ".index" files are present, or when the directory is empty (the computed
# number would then be -1 and the load would fail with an opaque error).
checkpoint_index_re = re.compile(r'model-checkpoint-(\d+)\.index')
checkpoint_nums = []
for index_path in iglob(os.path.join(MODEL_DIR, '*.index'), recursive=False):
    index_match = checkpoint_index_re.fullmatch(os.path.basename(index_path))
    if index_match:
        checkpoint_nums.append(int(index_match.group(1)))

if not checkpoint_nums:
    raise FileNotFoundError(
        "No 'model-checkpoint-<n>.index' files found in {}".format(MODEL_DIR))

checkpoint_num = max(checkpoint_nums)
checkpoint_path = os.path.join(MODEL_DIR, 'model-checkpoint-{}'.format(checkpoint_num))
print(checkpoint_path)

# Kept as the last expression so the returned load-status object is still displayed.
model.load_weights(checkpoint_path)

/data/ksb/TestSampleDir/articleSummary-Jupyter/Transformer-Model/model-checkpoint-0


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f98283ec9e8>

In [11]:
def evaluate(tokens, max_len):
    """Greedily decode a sequence of vocabulary ids for ``tokens``.

    The encoder input is the GloVe embedding of ``<SOS> + tokens + <EOS>``; the
    decoder input starts as the embedding of ``<SOS>`` alone and grows by one
    embedded prediction per step.

    Args:
        tokens: list of word tokens (it is concatenated with the one-element
            START_TOKEN / END_TOKEN lists).
        max_len: maximum number of decoding steps.

    Returns:
        A 1-D int32 tensor that starts with START_TOKEN_NUM followed by the
        predicted ids. The end-token id is never appended: decoding stops as
        soon as it is predicted, or after ``max_len`` steps.
    """

    def _embed_as_batch(token_list):
        # A fresh Embedding wrapper per call; the result gets a leading axis of size 1.
        embedded = Embedding(corpus=corpus, glove=glove, model='GloVe').get_embedded_list(token_list)
        return tf.expand_dims(embedded, axis=0)

    encoder_input = _embed_as_batch(START_TOKEN + tokens + END_TOKEN)
    decoder_input = _embed_as_batch(START_TOKEN)

    generated_ids = tf.expand_dims([START_TOKEN_NUM], axis=0)

    for step in range(max_len):
        logits = model(inputs=[encoder_input, decoder_input], training=False)

        # Only the last position along axis 1 decides the next token.
        last_step_logits = logits[:, -1:, :]
        next_id = tf.squeeze(tf.cast(tf.argmax(last_step_logits, axis=-1), tf.int32))
        print("index : {idx}, predicted : {pre}".format(idx=step, pre=next_id))

        # Look up the embedding of the predicted id so it can be fed back to the decoder.
        next_embedding = tf.expand_dims([glove.word_vectors[next_id]], axis=0)

        if tf.equal(next_id, END_TOKEN_NUM):
            break

        decoder_input = tf.concat([decoder_input, next_embedding], axis=1)
        generated_ids = tf.concat([generated_ids, tf.expand_dims([next_id], axis=0)], axis=-1)

    return tf.squeeze(generated_ids, axis=0)


In [18]:
def predict(tokens):
    """Generate a summary for ``tokens`` and echo the input and output.

    Runs evaluate() with SUMMARY_MAX_LEN decoding steps, converts the resulting
    id tensor to a NumPy array and hands it to the project's Decoder.

    Args:
        tokens: list of word tokens of the article.

    Returns:
        Whatever ``Decoder.decode`` returns; it is joined with spaces for
        printing, so it is an iterable of strings.
    """
    token_ids = evaluate(tokens, SUMMARY_MAX_LEN)

    id_decoder = Decoder({
        'model-type' : 'GloVe',
        'inv_wv' : None,
        'corpus' : corpus
    })
    predicted_sentence = id_decoder.decode(token_ids.numpy())

    print('Input: {}'.format(' '.join(tokens)))
    print('Output: {}'.format(' '.join(predicted_sentence)))

    return predicted_sentence

In [19]:
def saveCSVFile(baseDir, media, article_dist):
    """Write ``article_dist`` to ``<baseDir>/<media>.csv``.

    The file is overwritten if it exists. No header row is written; the
    DataFrame index is written as the first column.

    Args:
        baseDir: directory that receives the file.
        media: file name without the ".csv" extension.
        article_dist: pandas DataFrame to serialise.
    """
    csv_path = os.path.join(baseDir, media) + ".csv"
    article_dist.to_csv(csv_path, mode='w', header=False)

In [20]:
def get_media_name(filepath):
    """Return the file name in ``filepath`` up to its first dot.

    Everything before the last ``os.sep`` is dropped, then everything from the
    first "." onwards, so ".../KBS.csv" gives "KBS" and "a.b.csv" gives "a".
    """
    # rpartition/partition return the whole string when the separator is absent.
    filename = filepath.rpartition(os.sep)[2]
    return filename.partition(".")[0]

In [29]:
# Optional caps for quick smoke runs. Both default to 1, so the cell still
# processes a single article of a single file; set either one to None to
# process everything.
MAX_MEDIA_FILES = 1
MAX_ARTICLES_PER_FILE = 1

# Recreate the prediction directory before writing new results.
# NOTE(review): del_folder / mkdir_p are project helpers; presumably they remove
# the directory tree and create it again - confirm.
del_folder(PREDICT_PATH)
mkdir_p(PREDICT_PATH)

summary_columns = ['Title', 'Contents']
# Defined up front so the name exists even when no input file is found.
generated_summary_dist = pd.DataFrame(columns=summary_columns)

# Sorted so that the processing order (and therefore which files fall within
# MAX_MEDIA_FILES) does not depend on the file system's listing order.
proc_paths = sorted(iglob(os.path.join(PREPROCESSED_PATH, '**.csv'), recursive=False))

for file_idx, proc_path in enumerate(proc_paths):
    if MAX_MEDIA_FILES is not None and file_idx >= MAX_MEDIA_FILES:
        break

    media_name = get_media_name(proc_path)
    print(media_name, proc_path)

    # Rows are collected in a plain list and turned into a DataFrame once per
    # file: DataFrame.append is gone in pandas 2.0, and a per-file list keeps
    # one outlet's summaries out of the next outlet's CSV.
    summary_records = []

    # The context manager closes the file even if prediction raises.
    with open(proc_path, 'r', newline="\n", encoding="utf-8") as f:
        for row_idx, row in enumerate(csv.reader(f)):
            if MAX_ARTICLES_PER_FILE is not None and row_idx >= MAX_ARTICLES_PER_FILE:
                break

            # Each row is expected to hold exactly: index, title, contents.
            article_idx, title, contents = row
            print(article_idx, title)

            # Contents are tab-separated sentences of whitespace-separated tokens.
            content = contents.split("\t")
            tokens = [token for sent in content for token in sent.split()]

            predict_summary = ' '.join(predict(tokens))
            print(predict_summary)

            summary_records.append({'Title' : title, 'Contents' : predict_summary})

    generated_summary_dist = pd.DataFrame(summary_records, columns=summary_columns)
    saveCSVFile(PREDICT_PATH, media_name, generated_summary_dist)

KBS /data/ksb/TestSampleDir/articles/Preprocessed-Data/KBS.csv
0 이태원 클럽발 확진자 계속 증가…각 지자체 추가 행정명령--0
index : 0, predicted : 362
index : 1, predicted : 362
index : 2, predicted : 362
index : 3, predicted : 362
index : 4, predicted : 362
index : 5, predicted : 362
index : 6, predicted : 362
index : 7, predicted : 362
index : 8, predicted : 362
index : 9, predicted : 362
index : 10, predicted : 362
index : 11, predicted : 362
index : 12, predicted : 362
index : 13, predicted : 362
index : 14, predicted : 362
index : 15, predicted : 362
index : 16, predicted : 362
index : 17, predicted : 362
index : 18, predicted : 362
index : 19, predicted : 362
index : 20, predicted : 362
index : 21, predicted : 362
index : 22, predicted : 362
index : 23, predicted : 362
index : 24, predicted : 362
index : 25, predicted : 362
index : 26, predicted : 362
index : 27, predicted : 362
index : 28, predicted : 362
index : 29, predicted : 362
index : 30, predicted : 362
index : 31, predicted : 362
index : 32, pr