In [None]:
#%%
import numpy as np
from numpy.core.fromnumeric import shape
from numpy.core.numeric import NaN 
import pandas as pd
import re

from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import  LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from keras.models import Model, Input
from keras.callbacks import EarlyStopping
from keras import backend

from scipy.sparse.construct import random

from sklearn.model_selection import train_test_split

import tensorflow as tf

from nltk.corpus import stopwords

import matplotlib.pyplot as plt

import urllib
import warnings
from tensorflow.python import keras

from tensorflow.python.keras import activations
# Silence library warnings and widen the pandas column display so long
# review texts are not truncated when printed.
warnings.filterwarnings("ignore")
pd.set_option("display.max_colwidth", 200)

# `import urllib` on its own does not guarantee that the `urllib.request`
# submodule is loaded; import it explicitly so the download below does not
# depend on some other library having imported it first.
import urllib.request

# Download the attention layer implementation into the current working
# directory so that it can be imported as a local module.
# NOTE(review): this fetches and runs code from the `master` branch at run
# time; consider pinning a commit or vendoring the file.
urllib.request.urlretrieve("https://raw.githubusercontent.com/thushv89/attention_keras/master/src/layers/attention.py", filename="attention.py")
from attention import AttentionLayer

# Load the review dataset; the 'Text' and 'Summary' columns are used below.
dt = pd.read_csv(r'..\..\data\sample_data\Reviews.csv')

# Drop rows whose 'Text' value duplicates an earlier row.
dt.drop_duplicates(subset = ['Text'], inplace= True )
# Drop rows that contain NA values.
dt.dropna(axis=0, inplace=True)

# Dictionary of English contractions mapped to their expanded forms.
# Keys are matched against whitespace-separated tokens by the cleaning
# functions; both "I'd"-style and lowercase "i'd"-style keys are provided.
contraction_mapping = {
    "isn't": "is not",
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    # This entry was previously split across two physical lines inside the
    # string literal, which is a syntax error.
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Review text preprocessing
stop_words = set(stopwords.words('english'))
def text_cleansing(text):
    """Normalize a raw review text before tokenization.

    The text is lowercased, stripped of HTML markup, parenthesised
    fragments and double quotes; contractions are expanded via
    ``contraction_mapping``; possessive ``'s`` suffixes are removed; every
    non-letter character becomes a space; finally stop words and words
    shorter than four characters are dropped.

    Args:
        text: Raw review string.

    Returns:
        The remaining words joined by single spaces.
    """
    # Lowercase everything
    newS = text.lower()
    # Strip HTML tags
    newS = BeautifulSoup(newS, "lxml").text
    # Remove text inside parentheses
    newS = re.sub(r'\([^)]*\)', '', newS)
    # Remove double quotes
    newS = re.sub('"', '', newS)
    # Expand contractions
    newS = ' '.join(contraction_mapping.get(t, t) for t in newS.split(" "))
    # Remove possessive 's. The previous pattern r"s\b" lacked the
    # apostrophe and therefore chopped the final "s" off every word
    # (e.g. "delicious" -> "deliciou"); summary_cleansing uses r"'s\b".
    newS = re.sub(r"'s\b", "", newS)
    # Replace every non-letter character with a space
    newS = re.sub("[^a-zA-Z]", " ", newS)
    # Drop stop words and short words (fewer than 4 characters)
    long_words = [
        w for w in newS.split()
        if w not in stop_words and len(w) >= 4
    ]
    return " ".join(long_words)
# Clean every review and store the result as a new 'cleaned_text' column
cleaned_text = [text_cleansing(review) for review in dt['Text']]
dt['cleaned_text'] = cleaned_text

# Summary text cleaning
def summary_cleansing(text):
    """Normalize a raw summary string.

    Double quotes are removed, contractions are expanded via
    ``contraction_mapping``, possessive ``'s`` suffixes are stripped,
    non-letter characters become spaces and the result is lowercased.
    Single-character tokens are discarded.

    Args:
        text: Raw summary string.

    Returns:
        The kept tokens, each followed by one space (so a non-empty result
        ends with a trailing space; no tokens yields ``''``).
    """
    unquoted = re.sub('"', '', text)
    expanded = ' '.join(
        contraction_mapping.get(word, word) for word in unquoted.split(" ")
    )
    without_possessive = re.sub(r"'s\b", "", expanded)
    letters_only = re.sub("[^a-zA-Z]", " ", without_possessive).lower()
    return ''.join(
        token + ' ' for token in letters_only.split() if len(token) > 1
    )
# Clean every summary and store the result as a new 'cleaned_summary' column
cleaned_summary = [summary_cleansing(t) for t in dt['Summary']]
dt['cleaned_summary'] = cleaned_summary
# Turn empty summaries into NaN so dropna() removes those rows. The result
# is assigned back rather than using `dt[col].replace(..., inplace=True)`:
# that chained in-place call mutates a temporary Series under pandas
# Copy-on-Write and would leave `dt` unchanged.
dt['cleaned_summary'] = dt['cleaned_summary'].replace('', np.nan)
dt.dropna(axis=0, inplace=True)
# Wrap every summary in start/end marker tokens for the decoder.
dt['cleaned_summary'] = dt['cleaned_summary'].apply(lambda x: '_start_ ' + x + ' _end_')


# Plot the word-count distribution of the cleaned texts and summaries to
# help pick suitable maximum sequence lengths (optional step).
text_wc = [len(text.split()) for text in dt['cleaned_text']]
summary_wc = [len(summary.split()) for summary in dt['cleaned_summary']]

length_df = pd.DataFrame({'Text': text_wc, 'summary': summary_wc})
length_df.hist(bins=20)
plt.show()

# Maximum sequence lengths chosen after inspecting the histograms
max_text_len = 80
max_summary_len = 20

# Build the train/test split: 80% of the rows go to training, shuffled with
# a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    dt['cleaned_text'],
    dt['cleaned_summary'],
    train_size= 0.8,
    random_state= 0,
    shuffle= True
    )

def word_tokenizer(train, test, max_len):
    """Fit a Keras ``Tokenizer`` on ``train`` and encode both splits.

    Args:
        train: Iterable of training texts; the vocabulary is built from
            these only.
        test: Iterable of test texts, encoded with the same vocabulary.
        max_len: Length every encoded sequence is padded (at the end) or
            truncated to by ``pad_sequences``.

    Returns:
        A tuple ``(train_sequences, test_sequences, vocab_size, tokenizer)``
        where ``vocab_size`` is ``len(tokenizer.word_index) + 1``.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(list(train))

    def _encode(texts):
        # Words -> integer ids, then pad to a fixed length.
        return pad_sequences(
            fitted.texts_to_sequences(texts),
            maxlen=max_len,
            padding='post',
        )

    train_padded = _encode(train)
    test_padded = _encode(test)

    # One extra slot on top of the known words (presumably for the padding
    # id 0 that pad_sequences fills in by default).
    vocabulary_size = len(fitted.word_index) + 1
    return train_padded, test_padded, vocabulary_size, fitted

# Encode reviews and summaries with separate tokenizers. Note that the
# x_/y_ split variables are rebound from text collections to padded integer
# arrays here.
x_train, x_test,x_voc_size,x_tokenizer = word_tokenizer(x_train,x_test,max_text_len)
y_train, y_test,y_voc_size,y_tokenizer = word_tokenizer(y_train,y_test,max_summary_len)

# Model construction
# Reset any state previously created by Keras
backend.clear_session()
# Size used for both the embedding vectors and the LSTM units below
latent_dim = 512

# Encoder: fixed-length token ids -> embeddings
encoder_inputs = Input(shape=(max_text_len,)) 
enc_emb = Embedding(x_voc_size, latent_dim,trainable=True)(encoder_inputs) 

# LSTM 1 (returns the full output sequence plus its final states)
enc_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True) 
enc_output1, state_h1, state_c1 = enc_lstm1(enc_emb) 

# LSTM 2, stacked on the output sequence of LSTM 1
enc_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True) 
enc_output2, state_h2, state_c2 = enc_lstm2(enc_output1) 

# LSTM 3: its output sequence feeds the attention layer and its final
# states (state_h, state_c) initialise the decoder LSTM
enc_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True) 
enc_outputs, state_h, state_c= enc_lstm3(enc_output2) 

# Set up the decoder: variable-length token ids -> embeddings.
dec_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(y_voc_size, latent_dim, trainable=True)
dec_emb = dec_emb_layer(dec_inputs)

# Decoder LSTM, initialised with the final states of the last encoder LSTM.
# The two extra return values are the layer's final states; they are not
# used by the training graph.
dec_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
dec_outputs, decoder_fwd_state, decoder_back_state = dec_lstm(dec_emb, initial_state=[state_h, state_c])

# Attention over the encoder output sequence, queried by the decoder outputs.
# (A stray `4` expression statement that followed this call was removed.)
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([enc_outputs, dec_outputs])

# Concatenate the attention output with the decoder LSTM output
dec_concat_input = Concatenate(axis=-1, name='concat_layer')([dec_outputs, attn_out])

# Per-timestep softmax over the summary vocabulary
dec_dense = TimeDistributed(Dense(y_voc_size, activation='softmax'))
dec_outputs = dec_dense(dec_concat_input)

# Define the training model: (encoder tokens, decoder tokens) -> per-step
# word probabilities
model = Model([encoder_inputs, dec_inputs], dec_outputs) 
model.summary()

# Sparse categorical cross-entropy: targets are integer word ids
model.compile(
    optimizer = 'rmsprop',
    loss = 'sparse_categorical_crossentropy'
)
# Callback
# EarlyStopping: stop fitting when the validation loss stops improving
es = EarlyStopping(
    monitor= 'val_loss',
    mode = 'min',
    verbose = 1
)

In [None]:
# Train the model. The decoder is fed each summary without its last token
# and is trained to predict the same summary shifted left by one token; the
# targets get a trailing axis of size 1 before the shift.
with tf.device("/device:gpu:0"):
    history = model.fit(
        [x_train, y_train[:, :-1]],
        y_train.reshape(y_train.shape + (1,))[:, 1:],
        epochs=2,
        callbacks=[es],
        batch_size=128,
        validation_data=(
            [x_test, y_test[:, :-1]],
            y_test.reshape(y_test.shape + (1,))[:, 1:],
        ),
    )


# Index <-> word lookup tables used when decoding predictions.
# The source (review) lookup must come from x_tokenizer: review sequences
# were encoded with it, so decoding them with y_tokenizer's vocabulary
# would yield wrong words or a KeyError.
reverse_target_word_index = y_tokenizer.index_word
reverse_source_word_index = x_tokenizer.index_word
target_word_index = y_tokenizer.word_index

# Inference encoder: review tokens -> encoder output sequence + final states
enc_model = Model(inputs = encoder_inputs,outputs= [enc_outputs,state_h,state_c])

# Inference decoder inputs: the LSTM states from the previous step and the
# encoder output sequence that the attention layer attends over
dec_state_input_h = Input(shape=(latent_dim,))
dec_state_input_c = Input(shape=(latent_dim,))
dec_hid_state_input = Input(shape = (max_text_len, latent_dim))

# Reuse the embedding layer built for training
dec_emb2 = dec_emb_layer(dec_inputs)

# Reuse the decoder LSTM, seeded with the externally supplied states.
# NOTE: this rebinds state_h2/state_c2, previously the second encoder
# LSTM's states.
dec_output02,state_h2,state_c2 = dec_lstm(dec_emb2, initial_state = [dec_state_input_h,dec_state_input_c])

# Reuse the attention layer, then concatenate as in the training graph
attn_out_inf, attn_states_inf = attn_layer([dec_hid_state_input,dec_output02])
dec_inf_con = Concatenate(axis=-1 , name = 'dec_inf_con')([dec_output02,attn_out_inf])

# Reuse the softmax output layer
dec_output02 = dec_dense(dec_inf_con)

# Inference decoder: (tokens, encoder outputs, h, c) -> (probabilities, h, c)
dec_model = Model(
[dec_inputs] + [dec_hid_state_input,dec_state_input_h, dec_state_input_c],
[dec_output02] + [state_h2, state_c2])

def decode_sequence(input_seq):
    """Greedily decode a summary for one encoded review.

    Runs the inference encoder once, then repeatedly feeds the previously
    chosen word and LSTM states to the inference decoder, picking the most
    probable next word each step.

    Decoding stops when the 'end' token is produced, when the padding index
    (which has no entry in ``reverse_target_word_index``) is produced, or
    when ``max_summary_len - 1`` words have been generated. Previously the
    'end' check was nested inside ``if sampled_token != 'end'`` and could
    never fire, so a predicted 'end' could loop forever.

    Args:
        input_seq: Encoded review passed to ``enc_model.predict``; the
            caller in this file passes an array of shape
            ``(1, max_text_len)``.

    Returns:
        The decoded words, each preceded by a single space (empty string if
        nothing was decoded).
    """
    # Encode the input as state vectors.
    e_out, e_h, e_c = enc_model.predict(input_seq)

    # Start with a target sequence of length 1 holding the 'start' word.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['start']

    decoded_words = []
    while True:
        output_tokens, h, c = dec_model.predict([target_seq] + [e_out, e_h, e_c])

        # Pick the most probable next token.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        # .get() returns None for indices without a word (e.g. padding 0)
        # instead of raising KeyError.
        sampled_token = reverse_target_word_index.get(sampled_token_index)

        # Exit condition: end marker or an index with no word.
        if sampled_token is None or sampled_token == 'end':
            break

        decoded_words.append(sampled_token)

        # Exit condition: maximum summary length reached.
        if len(decoded_words) >= (max_summary_len - 1):
            break

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Carry the decoder states over to the next step.
        e_h, e_c = h, c

    return ''.join(' ' + word for word in decoded_words)

def seq2summary(input_seq):
    """Convert an encoded summary back to text.

    Index 0 and the 'start' / 'end' marker indices are skipped.

    Args:
        input_seq: Iterable of integer word indices from the target
            vocabulary.

    Returns:
        The decoded words, each followed by a single space.
    """
    pieces = []
    for idx in input_seq:
        if idx == 0:
            continue
        if idx == target_word_index['start'] or idx == target_word_index['end']:
            continue
        pieces.append(reverse_target_word_index[idx] + ' ')
    return ''.join(pieces)

def seq2text(input_seq):
    """Convert an encoded review back to text, skipping index 0.

    Args:
        input_seq: Iterable of integer word indices looked up in
            ``reverse_source_word_index``.

    Returns:
        The decoded words, each followed by a single space.
    """
    return ''.join(
        reverse_source_word_index[idx] + ' '
        for idx in input_seq
        if idx != 0
    )

# Print each test review with its reference and predicted summary
for i, review_seq in enumerate(x_test):
    print("Review:", seq2text(review_seq))
    print("Original summary:", seq2summary(y_test[i]))
    print("Predicted summary:", decode_sequence(review_seq.reshape(1, max_text_len)))
    print("\n")