In [1]:
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Input, Dense, TimeDistributed

단어의 토큰화

In [2]:
paper = ["많은 것을 바꾸고 싶다면 많은 것을 받아들여라"]
tknz = Tokenizer()
tknz.fit_on_texts(paper)
tknz.word_index

{'많은': 1, '것을': 2, '바꾸고': 3, '싶다면': 4, '받아들여라': 5}

In [3]:
tknz.word_counts

OrderedDict([('많은', 2), ('것을', 2), ('바꾸고', 1), ('싶다면', 1), ('받아들여라', 1)])

단어를 벡터로 변환

In [4]:
idx_paper = tknz.texts_to_sequences(paper)
idx_paper

[[1, 2, 3, 4, 1, 2, 5]]

In [5]:
n = len(tknz.word_index) + 1
n

6

In [6]:
idx_onehot = to_categorical(idx_paper, num_classes=n)
idx_onehot

array([[[0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1.]]])

단어 임베딩

In [7]:
model = Sequential()
model.add(Embedding(input_dim=n, output_dim=3))
model.compile(optimizer="rmsprop", loss="mse")
embedding = model.predict(idx_onehot)
embedding

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step


array([[[[ 0.03332189, -0.04253969,  0.00703301],
         [ 0.04164994, -0.00609601,  0.00807042],
         [ 0.03332189, -0.04253969,  0.00703301],
         [ 0.03332189, -0.04253969,  0.00703301],
         [ 0.03332189, -0.04253969,  0.00703301],
         [ 0.03332189, -0.04253969,  0.00703301]],

        [[ 0.03332189, -0.04253969,  0.00703301],
         [ 0.03332189, -0.04253969,  0.00703301],
         [ 0.04164994, -0.00609601,  0.00807042],
         [ 0.03332189, -0.04253969,  0.00703301],
         [ 0.03332189, -0.04253969,  0.00703301],
         [ 0.03332189, -0.04253969,  0.00703301]],

        [[ 0.03332189, -0.04253969,  0.00703301],
         [ 0.03332189, -0.04253969,  0.00703301],
         [ 0.03332189, -0.04253969,  0.00703301],
         [ 0.04164994, -0.00609601,  0.00807042],
         [ 0.03332189, -0.04253969,  0.00703301],
         [ 0.03332189, -0.04253969,  0.00703301]],

        [[ 0.03332189, -0.04253969,  0.00703301],
         [ 0.03332189, -0.04253969,  0.00703

자연어 처리 실습

In [8]:
n_batch = 64
epochs = 100
latent_dim = 256
n_max_sample = 10000
data_path = "./data/eng-fra/fra.txt"

In [9]:
with open(data_path, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")

In [10]:
x_txts = []
y_txts = []
x_chars_uni = set()
y_chars_uni = set()
n_sample = min(n_max_sample, len(lines) - 1)

In [11]:
for line in lines[:n_sample]:
    x_txt, y_txt, _ = line.split("\t")
    y_txt = "\t" + y_txt + "\n"
    x_txts.append(x_txt)
    y_txts.append(y_txt)

    for char in x_txt:
        if char not in x_chars_uni:
            x_chars_uni.add(char)
    for char in y_txt:
        if char not in y_chars_uni:
            y_chars_uni.add(char)

In [12]:
x_txts[:5]

['Go.', 'Hi.', 'Hi.', 'Run!', 'Run!']

In [13]:
y_txts[:3]

['\tVa !\n', '\tSalut !\n', '\tSalut.\n']

In [14]:
x_chars_uni

{' ',
 '!',
 '$',
 '%',
 '&',
 "'",
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'Y',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'é'}

In [15]:
y_chars_uni

{'\t',
 '\n',
 ' ',
 '!',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '5',
 '8',
 '9',
 ':',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'Y',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'x',
 'y',
 'z',
 '\xa0',
 '«',
 '»',
 'À',
 'Ç',
 'É',
 'Ê',
 'à',
 'â',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'î',
 'ï',
 'ô',
 'ù',
 'û',
 'œ',
 '\u2009',
 '’',
 '\u202f'}

In [16]:
x_chars_uni = sorted(list(x_chars_uni))
y_chars_uni = sorted(list(y_chars_uni))
n_encoder_tokens=len(x_chars_uni)
n_decoder_tokens=len(y_chars_uni)

In [17]:
max_encoder_seq_len = 0
for txt in x_txts:
    txt_len = len(txt)
    max_encoder_seq_len = max(txt_len, max_encoder_seq_len)

In [18]:
max_decoder_seq_len = 0
for txt in y_txts:
    txt_len = len(txt)
    max_decoder_seq_len = max(txt_len, max_decoder_seq_len)

In [19]:
x_token_idx = {}
for idx, char in enumerate(x_chars_uni):
    x_token_idx[char] = idx

In [20]:
y_token_idx = {}
for idx, char in enumerate(y_chars_uni):
    y_token_idx[char] = idx

In [21]:
encoder_x_data = np.zeros(
    (len(x_txts), max_encoder_seq_len, n_encoder_tokens), dtype="float32"
)
decoder_x_data = np.zeros(
    (len(x_txts), max_decoder_seq_len, n_decoder_tokens), dtype="float32"
)
decoder_y_data = np.zeros(
    (len(x_txts), max_decoder_seq_len, n_decoder_tokens), dtype="float32"
) # into tensor

In [22]:
for i, x_txt in enumerate(x_txts):
    for t, char in enumerate(x_txt):
        encoder_x_data[i, t, x_token_idx[char]] = 1
    encoder_x_data[i, t + 1: x_token_idx[" "]] = 1

In [23]:
for i, y_txt in enumerate(y_txts):
    for t, char in enumerate(y_txt):
        decoder_x_data[i, t, y_token_idx[char]] = 1
        if t > 0:
            decoder_y_data[i, t - 1, y_token_idx[char]] = 1
    decoder_x_data[i, t + 1 : y_token_idx[" "]] = 1
    decoder_y_data[i, t : y_token_idx[" "]] = 1

In [24]:
encoder_inputs = Input(shape=(None, n_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

In [25]:
decoder_inputs = Input(shape=(None, n_decoder_tokens))
decoder = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outs, _, _ = decoder(decoder_inputs, initial_state=encoder_states)
decoder_dense = TimeDistributed(Dense(n_decoder_tokens, activation="softmax"))
decoder_outputs = decoder_dense(decoder_outs)

In [26]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

In [27]:
model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)

In [28]:
model.fit(
    [encoder_x_data, decoder_x_data],
    decoder_y_data,
    batch_size=n_batch,
    epochs=epochs,
    validation_split=0.2,
)

Epoch 1/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 169ms/step - accuracy: 0.0517 - loss: 0.9957 - val_accuracy: 0.0701 - val_loss: 1.0204
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 160ms/step - accuracy: 0.0644 - loss: 0.8982 - val_accuracy: 0.0974 - val_loss: 0.9697
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 159ms/step - accuracy: 0.0828 - loss: 0.8337 - val_accuracy: 0.1066 - val_loss: 0.8804
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 164ms/step - accuracy: 0.1018 - loss: 0.7543 - val_accuracy: 0.1186 - val_loss: 0.8193
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 161ms/step - accuracy: 0.1149 - loss: 0.7003 - val_accuracy: 0.1353 - val_loss: 0.7686
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 164ms/step - accuracy: 0.1222 - loss: 0.6666 - val_accuracy: 0.1381 - val_loss: 0.7385
Epoc

<keras.src.callbacks.history.History at 0x1221bbc2590>

In [29]:
encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder(
    decoder_inputs, initial_state=decoder_state_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states
)

In [30]:
reverse_x_char_idx = {}
for char, idx in x_token_idx.items():
    reverse_x_char_idx[idx] = char

In [31]:
reverse_y_char_idx = {}
for char, idx in y_token_idx.items():
    reverse_y_char_idx[idx] = char

In [33]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    y_seq = np.zeros((1, 1, n_decoder_tokens))
    y_seq[0, 0, y_token_idx["\t"]] = 1
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([y_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_y_char_idx[sampled_token_index]
        decoded_sentence += sampled_char

        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_len:
            stop_condition = True
        y_seq = np.zeros((1, 1, n_decoder_tokens))
        y_seq[0, 0, sampled_token_index] = 1
        states_value = [h, c]

    return decoded_sentence

In [34]:
for seq_idx in range(100):
    x_seq = encoder_x_data[seq_idx : seq_idx + 1]
    decoded_sentence = decode_sequence(x_seq)
    print('-')
    print('Input sentence:', x_txts[seq_idx])
    print('Decoded sentence:', decoded_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 