In [None]:
!pip install konlpy



In [None]:
data_path = 'keras2kor.txt'

In [None]:
from konlpy.tag import Okt
from nltk.tokenize import word_tokenize

In [None]:
input_texts = []
target_texts = []

with open(data_path, 'r', encoding='utf-8') as f:
    
    lines = f.read().split('\n')
    
for line in lines:
    target_text, input_text = line.split('\t') #     input_text, target_text = line.split('\t')
    
    input_texts.append(input_text)
    target_texts.append(target_text)

In [None]:
# KoNLPy 형태소분석기 설정
tagger = Okt()

texts_pos = []

# 모든 문장 반복
for text in input_texts:

    # 배열인 형태소분석의 출력을 띄어쓰기로 구분하여 붙임
    text = " ".join(tagger.morphs(text))
    texts_pos.append(text)

input_texts = texts_pos

In [None]:
input_tokens = []

# 단어들의 배열 생성
for text in input_texts:
    for token in text.split():
        input_tokens.append(token)

# 길이가 0인 단어는 삭제
input_tokens = [token for token in input_tokens if len(token) > 0]

# 중복된 단어 삭제
input_tokens = list(set(input_tokens))

In [None]:
from nltk.tokenize import TweetTokenizer

tokenizer_words = TweetTokenizer()

texts_pos = []

# 모든 문장 반복
for text in target_texts:

    # 배열인 형태소분석의 출력을 띄어쓰기로 구분하여 붙임
    text = " ".join(tokenizer_words.tokenize(text))
    texts_pos.append(text)

target_texts = texts_pos

In [None]:
target_tokens = []

# 단어들의 배열 생성
for text in target_texts:
    for token in text.split():
        target_tokens.append(token)

# 길이가 0인 단어는 삭제
target_tokens = [token for token in target_tokens if len(token) > 0]


# 중복된 단어 삭제
target_tokens = list(set(target_tokens))

target_tokens[:0] = ["<PAD>", "<START>", "<END>", "<UNKNOWN>"]

print(len(target_tokens))
print(target_tokens[:10])

96
['<PAD>', '<START>', '<END>', '<UNKNOWN>', '2D', 'sgd', '10', '200', '0.3', 'binary_crossentropy']


In [None]:
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

max_encoder_seq_length = max([len(text) for text in input_texts])
max_decoder_seq_length = max([len(text) for text in target_texts])

max_decoder_seq_length += 2

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 279
Number of unique input tokens: 216
Number of unique output tokens: 96
Max sequence length for inputs: 106
Max sequence length for outputs: 114


In [None]:
input_token_to_index = dict((token, index) for index, token in enumerate(input_tokens))
target_token_to_index = dict((token, index) for index, token in enumerate(target_tokens))

input_index_to_token = dict((index, token) for token, index in input_token_to_index.items())
target_index_to_token = dict((index, token) for token, index in target_token_to_index.items())

In [None]:
import numpy as np

encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length), dtype='int')
decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_length), dtype='int')
decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length), dtype='int')
decoder_target_data_onehot = np.zeros((len(input_texts), max_decoder_seq_length, len(target_tokens)), dtype='float32')

In [None]:
for i, text in enumerate(input_texts):
    for t, token in enumerate(text.split()):
        encoder_input_data[i, t] = input_token_to_index[token]

for i, text in enumerate(target_texts):

    text = "<START> " + text
    
    for t, token in enumerate(text.split()):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t] = target_token_to_index[token]
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1] = target_token_to_index[token]
            decoder_target_data_onehot[i, t - 1, target_token_to_index[token]] = 1.0
            
    decoder_target_data[i, t] = target_token_to_index["<END>"]
    decoder_target_data_onehot[i, t, target_token_to_index["<END>"]] = 1.0

In [None]:
for i, text in enumerate(input_texts):                 

    # input 
    for t, token in enumerate(text.split()):
        
        if t >= max_encoder_seq_length:
            break
        
        encoder_input_data[i, t] = input_token_to_index[token]
        
for i, text in enumerate(target_texts):

    text = '<START> ' + text

    # input 
    for t, token in enumerate(text.split()):
                
        if t < max_decoder_seq_length:
            decoder_input_data[i, t] = target_token_to_index[token]

        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1] = target_token_to_index[token]
        if t == max_decoder_seq_length:
            break
            
    if t == max_decoder_seq_length:
        decoder_target_data[i, max_decoder_seq_length-1] = target_token_to_index['<END>']            
    else:
        decoder_target_data[i, t] = target_token_to_index['<END>']

In [None]:
encoder_input_data[0]

array([ 69,  94, 161,  45,  38,   2,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0])

In [None]:
decoder_input_data[0]

array([ 1, 71, 72, 90, 10, 64,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [None]:
decoder_target_data[0]

array([71, 72, 90, 10, 64,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [None]:
decoder_target_data_onehot = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')

# 디코더 목표를 원핫인코딩으로 변환
# 학습시 입력은 인덱스이지만, 출력은 원핫인코딩 형식임
for i, text in enumerate(decoder_target_data):
    for j, index in enumerate(text):
        decoder_target_data_onehot[i, j, index] = 1.0

In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
import numpy as np

batch_size = 64  # Batch size for training.
epochs = 300  # Number of epochs to train for.
latent_dim = 512  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.
# Path to the data txt file on disk.

# 임베딩 벡터 차원
encoder_embedding_dim = 100
decoder_embedding_dim = 100

In [None]:
print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data_onehot.shape)

(279, 106)
(279, 114)
(279, 114, 96)


In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
import numpy as np

batch_size = 64  # Batch size for training.
epochs = 20*100  # Number of epochs to train for.
latent_dim = 128  # Latent dimensionality of the encoding space.
# Path to the data txt file on disk.

# 임베딩 벡터 차원
encoder_embedding_dim = 100
decoder_embedding_dim = 100

encoder_embedding_layer = Embedding(num_encoder_tokens, encoder_embedding_dim)
encoder_lstm_layer = LSTM(128, dropout=0.1, recurrent_dropout=0.5, return_state=True)

decoder_embedding_layer = Embedding(num_decoder_tokens, decoder_embedding_dim)
decoder_lstm_layer = LSTM(128, dropout=0.1, recurrent_dropout=0.5, return_sequences=True, return_state=True)
decoder_dense_layer = Dense(num_decoder_tokens, activation='softmax')

encoder_inputs = Input(shape=(None,))
encoder_embedded = encoder_embedding_layer(encoder_inputs)
_, state_h, state_c = encoder_lstm_layer(encoder_embedded)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))

decoder_embedded = decoder_embedding_layer(decoder_inputs)
decoder_lstm_outputs, _, _ = decoder_lstm_layer(decoder_embedded, initial_state=encoder_states)
decoder_outputs = decoder_dense_layer(decoder_lstm_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

hist = model.fit([encoder_input_data, decoder_input_data], decoder_target_data_onehot,
          batch_size=64,
          epochs=500)

print('accuracy :', hist.history['acc'][-1])
print('loss :', hist.history['loss'][-1])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

KeyError: ignored

In [None]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))

# 임베딩 레이어
encoder_embedding = Embedding(num_encoder_tokens, encoder_embedding_dim)
encoder_embedded = encoder_embedding(encoder_inputs)

encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embedded)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(num_decoder_tokens, decoder_embedding_dim)
decoder_embedded = decoder_embedding(decoder_inputs)

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedded,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data_onehot,
          batch_size=batch_size,
          epochs=epochs)
# Save model
model.save('s2s.h5')

In [None]:
# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

In [None]:
# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedded = decoder_embedding(decoder_inputs)
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedded, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [None]:

def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_to_index['<START>']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = target_index_to_token[sampled_token_index]
        decoded_sentence += sampled_token

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_token == '<END>' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print(seq_index)
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

In [None]:
input_seq = encoder_input_data[0]
print(input_seq)

In [None]:
input_texts = ["훈련셋으로 모델을 학습시킵니다.",]

# KoNLPy 형태소분석기 설정
tagger = Okt()

texts_pos = []

# 모든 문장 반복
for text in input_texts:

    # 배열인 형태소분석의 출력을 띄어쓰기로 구분하여 붙임
    text = " ".join(tagger.morphs(text))
    texts_pos.append(text)

input_texts = texts_pos

for text in input_texts:

    encoder_input_data = np.zeros((1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')

    for t, token in enumerate(input_text.split()):
        encoder_input_data[0, t, input_token_to_index[token]] = 1.

    input_seq = encoder_input_data
    decoded_text = decode_sequence(input_seq)
    print(decoded_text)