In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install nltk



In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import urllib.request

# Seed NumPy's global RNG so the sample shuffle later in this cell is repeatable.
np.random.seed(0)
filename = '/content/drive/MyDrive/학습카드 자동 생성 프로젝트/test/Reviews.csv'
# Read the first 100,000 rows, keep only the review body and its summary,
# then discard rows that repeat an earlier review text and rows with nulls.
data = (
    pd.read_csv(filename, nrows=100000)[['Text', 'Summary']]
    .drop_duplicates(subset=['Text'])
    .dropna(axis=0)
)

contractions = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we 
have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
stop_words = set(stopwords.words('english'))

def preprocess_sentence(sentence, remove_stopwords = True):
    """Clean one raw review string into space-separated lowercase words.

    Steps, in order: lowercase, strip HTML markup, drop parenthesised
    spans, drop double quotes, expand contractions found in the
    module-level `contractions` table, drop possessive 's, turn every
    non-letter into a space, and shrink runs of two or more 'm' to 'mm'.
    One-character words are always discarded.

    Args:
        sentence: Raw text to clean (a str).
        remove_stopwords: When truthy, words present in the module-level
            `stop_words` set are discarded as well.

    Returns:
        The surviving words joined by single spaces ('' if none survive).
    """
    lowered = sentence.lower()
    # Parse as HTML and keep only the text, e.g. <br /> and <a href=...> vanish.
    plain = BeautifulSoup(lowered, "lxml").text
    # Remove "( ... )" spans, e.g. "my husband (and myself) for" -> "my husband  for".
    plain = re.sub(r'\([^)]*\)', '', plain)
    plain = re.sub('"','', plain)

    # Contractions are looked up per token after splitting on single spaces.
    expanded = []
    for piece in plain.split(" "):
        expanded.append(contractions.get(piece, piece))
    cleaned = ' '.join(expanded)

    cleaned = re.sub(r"'s\b","",cleaned)  # possessive: roland's -> roland
    cleaned = re.sub("[^a-zA-Z]", " ", cleaned)  # digits, punctuation, etc. -> space
    cleaned = re.sub('[m]{2,}', 'mm', cleaned)  # ummmmmmm yeah -> umm yeah

    kept = []
    for word in cleaned.split():
        if remove_stopwords and word in stop_words:
            continue
        if len(word) > 1:
            kept.append(word)
    return ' '.join(kept)

# Preprocessing. Review bodies have stop words removed; summaries keep theirs
# (remove_stopwords=False) so the short target sentences stay natural, which is
# the split preprocess_sentence() describes in its two branches.
clean_text = [preprocess_sentence(s) for s in data['Text']]
clean_summary = [preprocess_sentence(s, remove_stopwords=False) for s in data['Summary']]
data['Text'] = clean_text
data['Summary'] = clean_summary
# Cleaning can leave an empty string; turn those into NaN and drop the rows.
data.replace('', np.nan, inplace=True)
data.dropna(axis=0, inplace=True)

# Length limits: keep only samples whose text and summary fit the model.
text_max_len = 50
summary_max_len = 8
text_fits = data['Text'].map(lambda s: len(s.split())) <= text_max_len
summary_fits = data['Summary'].map(lambda s: len(s.split())) <= summary_max_len
data = data[text_fits & summary_fits]

# Start / end tokens: the decoder input is prefixed with 'sostoken' and the
# decoder target is suffixed with 'eostoken'.
data['decoder_input'] = 'sostoken ' + data['Summary']
data['decoder_target'] = data['Summary'] + ' eostoken'

# Model inputs and targets as NumPy arrays of strings.
encoder_input = data['Text'].to_numpy()
decoder_input = data['decoder_input'].to_numpy()
decoder_target = data['decoder_target'].to_numpy()

# One shuffled index sequence, applied to all three arrays so that every
# text stays aligned with its own summary.
indices = np.arange(len(encoder_input))
np.random.shuffle(indices)

encoder_input, decoder_input, decoder_target = (
    samples[indices] for samples in (encoder_input, decoder_input, decoder_target)
)

  ' that document to Beautiful Soup.' % decoded_markup


In [None]:
# Train / test split: the last 20% of the shuffled samples are held out.
n_of_val = int(len(encoder_input)*0.2)
# Slice on an explicit boundary. With negative slicing, n_of_val == 0 would make
# [:-0] an empty training set and [-0:] a test set containing every sample.
n_train = len(encoder_input) - n_of_val

encoder_input_train = encoder_input[:n_train]
decoder_input_train = decoder_input[:n_train]
decoder_target_train = decoder_target[:n_train]

encoder_input_test = encoder_input[n_train:]
decoder_input_test = decoder_input[n_train:]
decoder_target_test = decoder_target[n_train:]

# Integer encoding
# NOTE(review): src_tokenizer is rebuilt with num_words further down, so this
# first fit only serves the commented-out frequency analysis below.
src_tokenizer = Tokenizer()
src_tokenizer.fit_on_texts(encoder_input_train)

# Check what share of the vocabulary and of the total frequency rare words take up
# threshold = 7
# total_cnt = len(src_tokenizer.word_index) # number of words
# rare_cnt = 0 # number of words whose frequency is below threshold
# total_freq = 0 # sum of all word frequencies in the training data
# rare_freq = 0 # sum of the frequencies of words rarer than threshold
#
# # Iterate over (word, frequency) pairs as key and value.
# for key, value in src_tokenizer.word_counts.items():
#     total_freq = total_freq + value
#
#     # If the word's frequency is below threshold
#     if(value < threshold):
#         rare_cnt = rare_cnt + 1
#         rare_freq = rare_freq + value
# print('단어 집합(vocabulary)의 크기 :',total_cnt)
# print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
# print('단어 집합에서 희귀 단어를 제외시킬 경우의 단어 집합의 크기 %s'%(total_cnt - rare_cnt))
# print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
# print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

# Source (review text) tokenizer, capped by num_words = src_vocab.
src_vocab=8000
src_tokenizer = Tokenizer(num_words = src_vocab)
src_tokenizer.fit_on_texts(encoder_input_train)

# Convert text sequences to integer sequences
encoder_input_train = src_tokenizer.texts_to_sequences(encoder_input_train)
encoder_input_test = src_tokenizer.texts_to_sequences(encoder_input_test)

# Target (summary) tokenizer. It is fitted on both the decoder inputs and the
# decoder targets, so 'sostoken' (inputs only) and 'eostoken' (targets only)
# both enter the vocabulary.
tar_vocab=2000
tar_tokenizer = Tokenizer(num_words=tar_vocab)
tar_tokenizer.fit_on_texts(decoder_input_train)
tar_tokenizer.fit_on_texts(decoder_target_train)
# Convert text sequences to integer sequences
decoder_input_train = tar_tokenizer.texts_to_sequences(decoder_input_train)
decoder_target_train = tar_tokenizer.texts_to_sequences(decoder_target_train)
decoder_input_test = tar_tokenizer.texts_to_sequences(decoder_input_test)
decoder_target_test = tar_tokenizer.texts_to_sequences(decoder_target_test)

# Remove empty samples: a decoder input of length 1 holds only 'sostoken',
# i.e. every real summary word fell outside the target vocabulary.
drop_train = [index for index, sentence in enumerate(decoder_input_train) if len(sentence) == 1]
drop_test = [index for index, sentence in enumerate(decoder_input_test) if len(sentence) == 1]


def _drop_samples(sequences, drop_indices):
    """Return `sequences` as a list without the positions in `drop_indices`.

    The integer sequences have different lengths, so they are filtered as
    plain Python lists; np.delete would first have to build an array from
    ragged nested lists, which NumPy warns about and newer releases reject.
    """
    dropped = set(drop_indices)
    return [seq for position, seq in enumerate(sequences) if position not in dropped]


encoder_input_train = _drop_samples(encoder_input_train, drop_train)
decoder_input_train = _drop_samples(decoder_input_train, drop_train)
decoder_target_train = _drop_samples(decoder_target_train, drop_train)

encoder_input_test = _drop_samples(encoder_input_test, drop_test)
decoder_input_test = _drop_samples(decoder_input_test, drop_test)
decoder_target_test = _drop_samples(decoder_target_test, drop_test)

# Padding: every sequence is padded at the end ('post') up to the maximum
# length of its kind, text_max_len for texts and summary_max_len for summaries.
encoder_input_train, encoder_input_test = (
    pad_sequences(sequences, maxlen = text_max_len, padding='post')
    for sequences in (encoder_input_train, encoder_input_test)
)
decoder_input_train, decoder_target_train, decoder_input_test, decoder_target_test = (
    pad_sequences(sequences, maxlen = summary_max_len, padding='post')
    for sequences in (
        decoder_input_train,
        decoder_target_train,
        decoder_input_test,
        decoder_target_test,
    )
)


  return array(a, dtype, copy=False, order=order)


In [None]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Sizes shared by the encoder and the decoder.
embedding_dim = 128
hidden_size = 256

# Encoder: takes integer sequences of length text_max_len.
encoder_inputs = Input(shape=(text_max_len,))

# Encoder embedding layer
enc_emb = Embedding(src_vocab, embedding_dim)(encoder_inputs)

# Encoder LSTM 1 (returns the full output sequence plus its final states)
encoder_lstm1 = LSTM(hidden_size, return_sequences=True, return_state=True ,dropout = 0.4, recurrent_dropout = 0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

# Encoder LSTM 2, stacked on the output sequence of LSTM 1
encoder_lstm2 = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# Encoder LSTM 3; its output sequence and final states are what the decoder
# and the attention layer consume.
encoder_lstm3 = LSTM(hidden_size, return_state=True, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)

# Decoder: the time dimension is left open (None).
decoder_inputs = Input(shape=(None,))

# Decoder embedding layer (kept as a layer object so it can be reused later)
dec_emb_layer = Embedding(tar_vocab, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM, initialised with the final states of encoder LSTM 3.
# Its own returned states are discarded here.
decoder_lstm = LSTM(hidden_size, return_sequences = True, return_state = True, dropout = 0.4, recurrent_dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state = [state_h, state_c])




In [None]:
# Download the third-party AttentionLayer implementation next to the notebook
# as attention.py and import it.
# NOTE(review): the URL points at the `master` branch and the downloaded file
# is executed on import, so its content can change between runs; consider
# pinning the URL to a specific commit.
urllib.request.urlretrieve("https://raw.githubusercontent.com/thushv89/attention_keras/master/src/layers/attention.py", filename="attention.py")
from attention import AttentionLayer

In [None]:
# Attention layer (attention function) over the encoder outputs and decoder outputs
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

# Concatenate the attention result with the decoder hidden states
decoder_concat_input = Concatenate(axis = -1, name='concat_layer')([decoder_outputs, attn_out])

# Decoder output layer: softmax over tar_vocab classes
decoder_softmax_layer = Dense(tar_vocab, activation='softmax')
decoder_softmax_outputs = decoder_softmax_layer(decoder_concat_input)

# Model definition: (encoder input, decoder input) -> per-step word distribution
model = Model([encoder_inputs, decoder_inputs], decoder_softmax_outputs)
model.summary()

# Sparse loss: the targets are integer word indices, not one-hot vectors.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# Stop early when val_loss has not improved for 2 epochs.
# NOTE(review): the held-out test split doubles as the validation set here.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience = 2)
history = model.fit(x = [encoder_input_train, decoder_input_train], y = decoder_target_train, \
          validation_data = ([encoder_input_test, decoder_input_test], decoder_target_test),
          batch_size = 256, callbacks=[es], epochs = 3)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 50, 128)      1024000     input_3[0][0]                    
__________________________________________________________________________________________________
lstm_4 (LSTM)                   [(None, 50, 256), (N 394240      embedding_2[0][0]                
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
____________________________________________________________________________________________

In [None]:
# Show the first remaining cleaned review. Positional access is used because
# rows were dropped during cleaning, so the label 0 is not guaranteed to exist.
print(data['Text'].iloc[0])

bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better


In [None]:
from google.colab import files
myfile = files.upload()

Saving src_naver_test_his_1.txt to src_naver_test_his_1.txt


In [None]:
# Test input.
# Digits and special characters were converted to Hangul by hand beforehand.

# The file holds Korean text, so the encoding is stated explicitly instead of
# relying on the platform default (assumes the uploaded file is UTF-8).
with open('/content/src_naver_test_his_1.txt', 'r', encoding='utf-8') as f :
  text = f.read()

In [None]:
# Korean stop words, written as one space-separated string and split into a list.
stopwords_kor = "이번에 이거는 이렇게 그러니까 이게 바로 조금".split(' ')

import re

def preprocess_kor(input) :
  """Flatten raw Korean text into one space-separated string of kept words.

  Newlines, commas and periods are replaced by spaces; the text is then
  split on whitespace. A word is kept only if it is not listed in the
  module-level `stopwords_kor` and is longer than one character.

  Args:
    input: Raw text (a str).

  Returns:
    The kept words joined by single spaces ('' when nothing is kept).
  """
  flattened = re.sub("[,.]", " ", re.sub("\n", " ", input))
  kept = []
  for word in flattened.split():
    if word in stopwords_kor:
      continue
    if len(word) > 1:
      kept.append(word)
  return ' '.join(kept)

pre_kor = preprocess_kor(text)

In [None]:
def romanize(raw, fromEnc = 'utf8', toEnc = 'utf8'):
  """
    Romanize a string containing Korean text, one character at a time.

    Each character's code point (via gti) falls into one of three cases:
      * 12593..12686 -- a single, non-syllabic hangul letter: emitted as
        '<' + singles[offset] + '>'.
      * 44032..55203 -- a hangul syllable: split arithmetically into an
        initial consonant, a vowel and a final consonant, whose romanized
        forms come from nonpachim, moum and pachim respectively.
      * anything else -- appended unchanged except for upper-casing.
    A '.' is inserted before a syllable (or single letter) in the cases
    tested below, to mark the boundary with what was already emitted.

    Args:
      raw: The text to romanize (a str).
      fromEnc: Not used by this function body; defaults to 'utf8'.
      toEnc: Not used by this function body; defaults to 'utf8'.

    Returns:
      The romanized text as a str.
  """
  newString = ''
  for i in range(len(raw)):
    index = gti(raw[i])
    
    # If the index is a single (non-syllabic) hangul letter
    if index in range(12593, 12687):
      index = index - 12593  # offset into the singles table
      # Separate from preceding output unless that output ends with a space.
      # NOTE(review): this checks len(newString) > 1, whereas the syllable
      # branch below checks > 0 -- confirm the difference is intended.
      if singles[index] and len(newString) > 1 and newString[-1] != ' ':
        newString += '.'
      newString += '<' + singles[index] + '>'
   
    # If the index represents a hangul syllable
    elif index in range(44032, 55204):
      index = index - 44032
      # 588 code points per initial consonant, 28 per vowel within it.
      initial = int(index / 588)
      vowel = int((index % 588) / 28)
      final = (index % 588) % 28
      # Decide whether a '.' is needed between the previous output and this
      # syllable; the first matching rule wins.
      if len(newString) > 0:
        # NOTE(review): the nonpachim table defined in this file contains no
        # 'g', so this first rule cannot match as written.
        if nonpachim[initial] == 'g' and newString[-1] == 'n':
          newString += '.'
        # Syllable with an empty initial right after 'ng'.
        elif nonpachim[initial] == '' and newString[-2:len(newString)] == 'ng':
          newString += '.'
        # Previous output ends in a vowel romanization (1 or 2 letters).
        elif (newString[-1] in moum or newString[-2:len(newString)] in moum) and nonpachim[initial] in pachim + nonpachim:
          newString += '.'
        # Empty initial right after a consonant romanization.
        elif nonpachim[initial] == '' and newString[-1] in pachim + nonpachim: 
          newString += '.'
        # 'h' after a letter with which it would read as a digraph (e.g. 'th').
        elif nonpachim[initial] == 'h' and newString[-1] in ['t','k','p','c','n','l']: 
          newString += '.'
        # Last emitted letter plus this initial (or its first letter) would
        # itself spell another consonant romanization.
        elif newString[-1] + nonpachim[initial] in pachim + nonpachim or (len(nonpachim[initial]) > 1 and newString[-1] + nonpachim[initial][0] in pachim + nonpachim):
          newString += '.' 
      newString += nonpachim[int(initial)]
      newString += moum[int(vowel)]
      newString += pachim[int(final)]
    
    # Otherwise: keep the character itself, upper-cased
    else:
      newString += chr(index).upper()
  return newString


def gti(char):
  """
    Return the integer Unicode code point of a one-character string.
    Passing anything other than a single character makes ord() raise.
  """
  code_point = ord(char)
  return code_point


# Character lists
singles = ['k', 'kk', 'ks', 'n', 'nc', 'nh', 't', 'tt', 'l', 'lk', 'lm', 'lp', 'ls', 'lth', 'lph', 'lh', 'm', 'p', 'pp', 'ps', 's', 'ss', 'ng', 'c', 'cc', 'ch', 'kh', 'th', 'ph', 'h', 'a', 'ay', 'ya', 'yay', 'e', 'ey', 'ye', 'yey', 'o', 'wa', 'way', 'oy', 'yo', 'wu', 'we', 'wey', 'wi', 'yu', 'u', 'uy', 'i', 'NONE', 'NN', 'NT', 'NS', 'NZ', 'LKS', 'LT', 'LPS', 'LZ', 'LH', 'MP', 'MS', 'MZ', 'MNG', 'PK', 'PT', 'PSK', 'PST', 'PC', 'PTH', 'PNG', 'PPNG', 'SK', 'SL', 'ST', 'SP', 'SC', 'Z', 'NGNG', 'NG', 'NGS', 'NGZ', 'PHNG', 'HH', 'H', 'YOYA', 'YOYAY', 'YOI', 'YUE', 'YUEY', 'YUI', 'A', 'E']
moum = ['a', 'ay', 'ya', 'yay', 'e', 'ey', 'ye', 'yey', 'o', 'wa', 'way', 'oy', 'yo', 'wu', 'we', 'wey', 'wi', 'yu', 'u', 'uy', 'i']
pachim = ['', 'k', 'kk', 'ks', 'n', 'nc', 'nh', 't', 'l', 'lk', 'lm', 'lp', 'ls', 'lth', 'lph', 'lh', 'm', 'p', 'ps', 's', 'ss', 'ng', 'c', 'ch', 'kh', 'th', 'ph', 'h']
nonpachim = ['k', 'kk', 'n', 't', 'tt', 'l', 'm', 'p', 'pp', 's', 'ss', '', 'c', 'cc', 'ch', 'kh', 'th', 'ph', 'h']
letters = ['k', 'n', 't', 'l', 'm', 'p', 's', 'c', 'k', 'h', 'g', 'a', 'y', 'e', 'o', 'w', 'u', 'i']
vowelLetters = ['a', 'e', 'i', 'o', 'u', 'w', 'y']
consonantLetters = ['c', 'g', 'h', 'k', 'l', 'm', 'n', 'p', 's', 't', 'lt', 'lp']
singleLetters = ['k', 'kk', 'ks', 'ns', 'n', 'nc', 'nh', 't', 'tt', 'l', 'lk', 'lm', 'lp', 'ls', 'lth', 'lt', 'lph', 'lt', 'lh', 'm', 'p', 'pp', 'ps', 's', 'ss', 'ng', 'c', 'cc', 'ch', 'kh', 'th', 'ph', 'h', 'a', 'ay', 'ya', 'yay', 'e', 'ey', 'ye', 'yey', 'o', 'wa', 'way', 'oy', 'yo', 'wu', 'we', 'wey', 'wi', 'yu', 'u', 'uy', 'i', 'NONE', 'N', 'NO', 'NON', 'NN', 'NT', 'NS', 'NZ', 'LKS', 'LK', 'L', 'LT', 'LPS', 'LP', 'LZ', 'LH', 'MP', 'M', 'MS', 'MZ', 'MNG', 'MN', 'PK', 'P', 'PT', 'PSK', 'PS', 'PST', 'PC', 'PTH', 'PNG', 'PPNG', 'PP', 'PPN', 'SK', 'S', 'SL', 'ST', 'SP', 'SC', 'Z', 'NGNG', 'NGN', 'NG', 'NGS', 'NGZ', 'PHNG', 'PH', 'PHN', 'HH', 'H', 'YOYA', 'Y', 'YO', 'O', 'YOY', 'YOYAY', 'YOI', 'YUE', 'YU', 'YUEY', 'YUI', 'A', 'E']


# Character dictionaries
singlesDict = {'yey': 37, 'YUE': 89, 'YOYAY': 87, 'tt': 7, 'lm': 10, 'lk': 9, 'lh': 15, 'ls': 12, 'lp': 11, 'wey': 45, 'YUEY': 90, 'yo': 42, 'ya': 32, 'H': 85, 'LPS': 58, 'yu': 47, 'YUI': 91, 'h': 29, 'l': 8, 'p': 17, 't': 6, 'HH': 84, 'ey': 35, 'NGS': 81, 'n': 3, 'LKS': 56, 'NGZ': 82, 'NONE': 51, 'PT': 66, 'PTH': 70, 'PC': 69, 'PK': 65, 'MNG': 64, 'we': 44, 'wa': 39, 'PPNG': 72, 'wi': 46, 'wu': 43, 'PSK': 67, 'c': 23, 'k': 0, 'o': 38, 'PHNG': 83, 's': 20, 'MP': 61, 'MS': 62, 'YOYA': 86, 'lth': 13, 'PST': 68, 'MZ': 63, 'ch': 25, 'cc': 24, 'ps': 19, 'pp': 18, 'yay': 33, 'NN': 52, 'NG': 80, 'NZ': 55, 'way': 40, 'ph': 28, 'NS': 54, 'NT': 53, 'th': 27, 'Z': 78, 'uy': 49, 'SP': 76, 'ST': 75, 'SK': 73, 'ss': 21, 'SL': 74, 'SC': 77, 'ay': 31, 'NGNG': 79, 'nh': 5, 'nc': 4, 'LH': 60, 'ng': 22, 'LT': 57, 'ks': 2, 'LZ': 59, 'A': 92, 'E': 93, 'oy': 41, 'YOI': 88, 'ye': 36, 'kk': 1, 'a': 30, 'e': 34, 'i': 50, 'kh': 26, 'm': 16, 'u': 48, 'lph': 14, 'PNG': 71}
moumDict = {'a': 0, 'we': 14, 'uy': 19, 'yay': 3, 'oy': 11, 'wa': 9, 'ya': 2, 'yo': 12, 'ye': 6, 'o': 8, 'yey': 7, 'i': 20, 'wu': 13, 'ey': 5, 'wi': 16, 'way': 10, 'ay': 1, 'e': 4, 'wey': 15, 'yu': 17, 'u': 18}
pachimDict = {'': 0, 'nc': 5, 'ch': 23, 'nh': 6, 'ps': 18, 'p': 17, 'lm': 10, 'lk': 9, 'lh': 15, 'ng': 21, 'ls': 12, 'lp': 11, 'ph': 26, 'th': 25, 'c': 22, 'ss': 20, 'h': 27, 'k': 1, 'kh': 24, 'm': 16, 'l': 8, 'n': 4, 'ks': 3, 'kk': 2, 's': 19, 't': 7, 'lph': 14, 'lth': 13}
nonpachimDict = {'': 11, 'pp': 8, 'ch': 14, 'ss': 10, 'kk': 1, 'c': 12, 'k': 0, 'kh': 15, 'm': 6, 'l': 5, 'n': 2, 'p': 7, 's': 9, 't': 3, 'th': 16, 'h': 18, 'ph': 17, 'tt': 4, 'cc': 13}



In [None]:
# Romanize every preprocessed Korean word and strip the '.' separators that
# romanize() inserts between syllables.
pre_eng = [re.sub("[.]", "", romanize(w)) for w in pre_kor.split()]

# Tokenizer over the romanized words only.
plus_tokenizer = Tokenizer()
plus_tokenizer.fit_on_texts(pre_eng)

plus_index = plus_tokenizer.index_word

tar_word_to_index = tar_tokenizer.word_index # summary vocabulary: word -> integer
tar_index_to_word = tar_tokenizer.index_word # summary vocabulary: integer -> word

length = len(tar_index_to_word)

# Append the romanized words to the summary vocabulary.
# - The loop runs over plus_index itself, so this cell no longer needs
#   plus_input, which is only defined in a later cell.
# - New entries start at length + 1 so the existing last entry (index `length`)
#   is not overwritten.
# - Words already in the vocabulary keep their index, which also makes
#   re-running this cell harmless.
next_index = length + 1
for word in plus_index.values() :
  if word in tar_word_to_index :
    continue
  tar_index_to_word[next_index] = word
  tar_word_to_index[word] = next_index
  next_index += 1


In [None]:
# Inference encoder: maps an input sequence to the encoder output sequence and
# the final hidden / cell states of the last encoder LSTM.
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])

# Tensors that carry the states from the previous decoding step
decoder_state_input_h = Input(shape=(hidden_size,))
decoder_state_input_c = Input(shape=(hidden_size,))

# Reuse the trained decoder embedding layer.
dec_emb2 = dec_emb_layer(decoder_inputs)
# To predict the next word, the previous step's states are used as the initial
# state; this is driven by decode_sequence() below.
# Unlike in training, the hidden and cell states returned by the LSTM are kept.
# NOTE(review): state_h2 / state_c2 rebind names that earlier held the states
# of encoder LSTM 2.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# Attention function, fed with the encoder output sequence through its own input
decoder_hidden_state_input = Input(shape=(text_max_len, hidden_size))
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# Decoder output layer (decoder_outputs2 is rebound to the softmax output)
decoder_outputs2 = decoder_softmax_layer(decoder_inf_concat) 

# Final decoder model:
#   inputs  = previous token, encoder output sequence, previous h, previous c
#   outputs = word distribution, new h, new c
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])

def decode_sequence(input_seq):
    """Greedily decode a summary for one encoded input sequence.

    The encoder is run once; the decoder is then run one token at a time,
    starting from 'sostoken', always taking the most probable next token and
    feeding it back in together with the updated LSTM states.

    Decoding stops when 'eostoken' is produced, when the decoder produces an
    index that has no word (e.g. the padding index 0), or when the summary
    has reached summary_max_len - 1 words.

    Args:
        input_seq: Integer-encoded, padded input of shape (1, text_max_len).

    Returns:
        The decoded words, each preceded by a single space ('' if the first
        prediction already ends the sequence).
    """
    # Encoder outputs and final states for this input.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Length-1 target sequence holding the <SOS> token.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = tar_word_to_index['sostoken']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:

        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        # The tokenizer's index_word has no entry for index 0 (padding), so a
        # plain [] lookup would raise KeyError; a missing word ends decoding.
        sampled_token = tar_index_to_word.get(sampled_token_index)
        if sampled_token is None:
            break

        if(sampled_token!='eostoken'):
            decoded_sentence += ' '+sampled_token

        # Stop on <eos> or once the maximum summary length is reached.
        if (sampled_token == 'eostoken'  or len(decoded_sentence.split()) >= (summary_max_len-1)):
            stop_condition = True

        # The sampled token becomes the next decoder input.
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Carry the decoder states over to the next step.
        e_h, e_c = h, c

    return decoded_sentence

In [None]:
# Cut the romanized word list into consecutive chunks of at most text_max_len
# words. Stepping through the list in strides of text_max_len produces no empty
# trailing chunk when len(pre_eng) is an exact multiple of text_max_len.
plus_input = [
  pre_eng[start:start + text_max_len]
  for start in range(0, len(pre_eng), text_max_len)
]
num = len(plus_input) - 1  # index of the last chunk

plus_input = plus_tokenizer.texts_to_sequences(plus_input)

# Right-pad every chunk with 0 up to text_max_len.
for line in plus_input :
  line.extend([0] * (text_max_len - len(line)))
# Integer sequence data is ready.

# NOTE(review): these indices come from plus_tokenizer, while the encoder
# embedding was trained on src_tokenizer indices -- confirm this is intended.
for i, line in enumerate(plus_input) :
  print(i)
  print(decode_sequence(np.array(line).reshape(1, text_max_len)))



0
 great product
1
 great
2
 great product
3

4
 great tea
5
 great
