In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [5]:

# 1. 데이터 로드 및 전처리
def load_data(file_path, num_samples):
    """Read a tab-separated parallel corpus and collect its character sets.

    Only the first ``num_samples`` lines of the file are considered. Each line
    is expected to hold ``source<TAB>target``, optionally followed by further
    tab-separated fields (for example an attribution column), which are
    ignored. Lines with fewer than two fields (including blank lines) are
    skipped instead of aborting the whole load.

    Args:
        file_path: Path of the UTF-8 encoded corpus file.
        num_samples: Number of leading lines to read.

    Returns:
        A 4-tuple ``(input_texts, target_texts, input_characters,
        target_characters)``. Every target text is wrapped in a leading tab
        and a trailing newline, which serve as start and end markers. The two
        character collections are sorted lists of the distinct characters seen
        in the inputs and in the wrapped targets.
    """
    input_texts = []
    target_texts = []
    input_characters = set()
    target_characters = set()

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    for line in lines[:num_samples]:
        fields = line.split('\t')
        if len(fields) < 2:
            # Blank or malformed line: nothing usable to pair up.
            continue
        # Use only the first two columns so files with or without extra
        # trailing columns are both accepted.
        input_text, target_text = fields[0], fields[1]
        target_text = '\t' + target_text + '\n'
        input_texts.append(input_text)
        target_texts.append(target_text)

        input_characters.update(input_text)
        target_characters.update(target_text)

    return (
        input_texts,
        target_texts,
        sorted(input_characters),
        sorted(target_characters),
    )


In [7]:

file_path = 'fra.txt'  # path to the fra.txt corpus file
num_samples = 10000  # number of leading lines of the file to use
input_texts, target_texts, input_characters, target_characters = load_data(file_path, num_samples)

In [8]:

# Map every character to its position in the sorted character list.
input_token_index = {char: position for position, char in enumerate(input_characters)}
target_token_index = {char: position for position, char in enumerate(target_characters)}
input_token_index

{' ': 0,
 '!': 1,
 '"': 2,
 '$': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '0': 10,
 '1': 11,
 '2': 12,
 '3': 13,
 '5': 14,
 '7': 15,
 '8': 16,
 '9': 17,
 ':': 18,
 '?': 19,
 'A': 20,
 'B': 21,
 'C': 22,
 'D': 23,
 'E': 24,
 'F': 25,
 'G': 26,
 'H': 27,
 'I': 28,
 'J': 29,
 'K': 30,
 'L': 31,
 'M': 32,
 'N': 33,
 'O': 34,
 'P': 35,
 'Q': 36,
 'R': 37,
 'S': 38,
 'T': 39,
 'U': 40,
 'V': 41,
 'W': 42,
 'Y': 43,
 'a': 44,
 'b': 45,
 'c': 46,
 'd': 47,
 'e': 48,
 'f': 49,
 'g': 50,
 'h': 51,
 'i': 52,
 'j': 53,
 'k': 54,
 'l': 55,
 'm': 56,
 'n': 57,
 'o': 58,
 'p': 59,
 'q': 60,
 'r': 61,
 's': 62,
 't': 63,
 'u': 64,
 'v': 65,
 'w': 66,
 'x': 67,
 'y': 68,
 'z': 69,
 'é': 70}

In [None]:

# Longest text (in characters) on each side of the corpus.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))
# Size of the character vocabulary on each side.
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)


In [None]:

# 2. Data preprocessing: one-hot encode every character of every sentence
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype="float32"
)
decoder_input_data = np.zeros(
    (len(target_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
decoder_target_data = np.zeros(
    (len(target_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)

for row, (source, target) in enumerate(zip(input_texts, target_texts)):
    for step, ch in enumerate(source):
        encoder_input_data[row, step, input_token_index[ch]] = 1.0
    for step, ch in enumerate(target):
        decoder_input_data[row, step, target_token_index[ch]] = 1.0
    # The target is the decoder input shifted one step to the left, so it
    # starts at the second character of the wrapped target text.
    for step, ch in enumerate(target[1:]):
        decoder_target_data[row, step, target_token_index[ch]] = 1.0

# 3. Build the model
latent_dim = 256  # number of LSTM units in both encoder and decoder

# Encoder: only its final hidden and cell states are kept
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder states and emits one distribution per step
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Compile the training model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="categorical_crossentropy")

# 4. Train the model
batch_size = 64
epochs = 100

# 20% of the samples are held out for validation.
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

# 5. Set up the inference models
# Encoder model: input sequence -> [state_h, state_c]
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model: takes the states as explicit inputs and returns the updated
# states alongside its predictions. It reuses the decoder_lstm and
# decoder_dense layers created for the training model.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# 6. Define the inference (translation) function
# Reverse lookups: token index -> character
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

def decode_sequence(input_seq):
    """Greedily decode one one-hot encoded input sequence, character by character.

    The encoder states seed the decoder, which is then fed its own most
    probable character at every step. Decoding ends once the newline end
    marker is produced or the output grows longer than
    ``max_decoder_seq_length`` characters.

    Args:
        input_seq: Encoder input passed to ``encoder_model.predict``.

    Returns:
        The decoded string, including the final character that ended decoding.
    """
    # Encoder states become the decoder's initial states.
    states_value = encoder_model.predict(input_seq)

    # One-hot vector of the tab start marker is the first decoder input.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    decoded_sentence = ""
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Pick the most probable character for the latest step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the end marker or once the length cap is exceeded.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            return decoded_sentence

        # Feed the sampled character back in as the next decoder input.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Carry the decoder states over to the next step.
        states_value = [h, c]

# 7. Test: print sample translations for the first ten input sentences
for seq_index in range(10):
    # A one-element slice keeps the leading batch dimension.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    print("입력 문장:", input_texts[seq_index])
    print("번역된 문장:", decoded_sentence)


In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 1. 데이터 로드 및 전처리
def load_data(file_path, num_samples, random_state=None):
    """Load a random sample of sentence pairs from a tab-separated corpus.

    Only the first two columns are read (named 'eng' and 'fra'); any further
    columns, such as an attribution column, are ignored. Without restricting
    the columns, giving two names for a three-column file makes pandas use the
    first column as the index and shifts the labels.

    Args:
        file_path: Path of the corpus file.
        num_samples: Number of rows to sample; clamped to the number of rows
            available so small files do not raise.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for a
            reproducible sample. ``None`` keeps the sampling unseeded.

    Returns:
        ``(input_texts, target_texts)``: the 'fra' sentences, and the 'eng'
        sentences wrapped in a leading tab and a trailing newline.
    """
    import csv  # local import: only needed for the quoting constant

    df = pd.read_csv(
        file_path,
        delimiter='\t',
        header=None,
        names=['eng', 'fra'],
        usecols=[0, 1],
        # Sentences may contain double quotes; treat them as ordinary text.
        quoting=csv.QUOTE_NONE,
        # Keep every sentence as text (no numeric or NaN conversion).
        dtype=str,
        keep_default_na=False,
    )
    sample_size = min(num_samples, len(df))
    df = df.sample(n=sample_size, random_state=random_state).reset_index(drop=True)
    input_texts = df['fra'].tolist()
    target_texts = ['\t' + text + '\n' for text in df['eng'].tolist()]
    return input_texts, target_texts

file_path = 'fra.txt'  # path to the fra.txt corpus file
num_samples = 10000  # number of rows to sample
input_texts, target_texts = load_data(file_path, num_samples)

# 2. Word-level tokenization with Tokenizer
# NOTE(review): Tokenizer's default `filters` appear to include tab and
# newline, which would drop the markers load_data adds to the targets - confirm.
tokenizer_inputs = Tokenizer(char_level=False)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)
max_encoder_seq_length = max(map(len, input_sequences))

tokenizer_targets = Tokenizer(char_level=False)
tokenizer_targets.fit_on_texts(target_texts)
target_sequences = tokenizer_targets.texts_to_sequences(target_texts)
max_decoder_seq_length = max(map(len, target_sequences))

# One id more than the entries in word_index (presumably for padding id 0 - confirm).
num_encoder_tokens = 1 + len(tokenizer_inputs.word_index)
num_decoder_tokens = 1 + len(tokenizer_targets.word_index)

# 3. Pad the sequences at the end up to the longest length on each side
encoder_input_data = pad_sequences(input_sequences, padding='post', maxlen=max_encoder_seq_length)
decoder_input_data = pad_sequences(target_sequences, padding='post', maxlen=max_decoder_seq_length)

# The decoder target must be the decoder input shifted one step to the left
decoder_target_data = np.zeros(
    (len(target_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
for row, token_ids in enumerate(target_sequences):
    # Skip the first token so position `step` holds the token at `step + 1`.
    for step, token_id in enumerate(token_ids[1:]):
        decoder_target_data[row, step, token_id] = 1.0

# 4. Build the model
latent_dim = 256  # used for both the embedding size and the LSTM units

# Encoder: embed the token ids, keep only the final LSTM states
encoder_inputs = Input(shape=(None,))
encoder_embedding = tf.keras.layers.Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder states and emits one distribution per step
decoder_inputs = Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Compile the training model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="categorical_crossentropy")

# 5. Train the model
batch_size = 64
epochs = 100

# 20% of the samples are held out for validation.
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

# 6. Set up the inference models
# Encoder model: input sequence -> [state_h, state_c]
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model: takes the states as explicit inputs and returns the updated
# states alongside its predictions. It reuses decoder_embedding, decoder_lstm
# and decoder_dense from the training model.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# 7. Define the inference (translation) function
# Reverse lookups: token index -> word
reverse_input_word_index = dict((i, word) for word, i in tokenizer_inputs.word_index.items())
reverse_target_word_index = dict((i, word) for word, i in tokenizer_targets.word_index.items())

def decode_sequence(input_seq):
    """Greedily decode one padded input sequence into a string of words.

    The encoder states seed the decoder, which is then fed its own most
    probable token at every step. Decoding ends when the newline end marker is
    produced, when more than ``max_decoder_seq_length`` tokens have been
    generated, or when the predicted index has no entry in
    ``reverse_target_word_index``.

    Args:
        input_seq: Encoder input passed to ``encoder_model.predict``.

    Returns:
        The decoded words, each preceded by a single space.

    Raises:
        KeyError: If the tab start marker is not in the target vocabulary.
    """
    start_index = tokenizer_targets.word_index.get('\t')
    if start_index is None:
        # NOTE(review): Tokenizer's default filters strip tab and newline, so
        # this marker is likely missing unless the tokenizer is reconfigured.
        raise KeyError(
            "start token '\\t' is not in tokenizer_targets.word_index; "
            "check the Tokenizer filters"
        )

    # verbose=0 avoids printing a progress bar for every decoding step.
    states_value = encoder_model.predict(input_seq, verbose=0)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_index

    decoded_sentence = ""
    num_decoded_tokens = 0
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_target_word_index.get(sampled_token_index)
        if sampled_word is None:
            # No word is mapped to this index (e.g. 0, which word_index does
            # not assign); stop rather than fail with a KeyError.
            break
        decoded_sentence += ' ' + sampled_word
        num_decoded_tokens += 1

        # The cap counts tokens, matching how max_decoder_seq_length was
        # measured, not characters of the output string.
        if sampled_word == "\n" or num_decoded_tokens > max_decoder_seq_length:
            break

        # Feed the sampled token back in and carry the states forward.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

# 8. Test: print sample translations for the first ten input sentences
for seq_index in range(10):
    # A one-element slice keeps the leading batch dimension.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    print("입력 문장:", input_texts[seq_index])
    print("번역된 문장:", decoded_sentence)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

KeyboardInterrupt: 

In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


In [13]:

# 1. 데이터 로드 및 전처리
def load_data(file_path, num_samples, random_state=None):
    """Load a random sample of sentence pairs from a tab-separated corpus.

    Only the first two columns are read (named 'eng' and 'fra'); any further
    columns, such as the attribution column, are ignored.

    Args:
        file_path: Path of the corpus file.
        num_samples: Number of rows to sample; clamped to the number of rows
            available so small files do not raise.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for a
            reproducible sample. ``None`` keeps the sampling unseeded.

    Returns:
        ``(input_texts, target_texts)``: the 'fra' sentences, and the 'eng'
        sentences wrapped as ``'<sos> ' + text + ' <eos>'``.
    """
    import csv  # local import: only needed for the quoting constant

    df = pd.read_csv(
        file_path,
        delimiter='\t',
        header=None,
        names=['eng', 'fra'],
        usecols=[0, 1],
        # Sentences may contain double quotes; treat them as ordinary text.
        quoting=csv.QUOTE_NONE,
        # Keep every sentence as text (no numeric or NaN conversion).
        dtype=str,
        keep_default_na=False,
    )
    sample_size = min(num_samples, len(df))
    df = df.sample(n=sample_size, random_state=random_state).reset_index(drop=True)  # sampling
    input_texts = df['fra'].tolist()
    target_texts = ['<sos> ' + text + ' <eos>' for text in df['eng'].tolist()]
    return input_texts, target_texts

file_path = 'fra.txt'  # path to the fra.txt corpus file
num_samples = 10000  # number of rows to sample
input_texts, target_texts = load_data(file_path, num_samples)
target_texts

['<sos> We have to figure out whether we have enough money to do that. <eos>',
 "<sos> Don't cry, Tom. <eos>",
 '<sos> With great power comes great responsibility. <eos>',
 "<sos> I don't want to lose you. <eos>",
 "<sos> Tom isn't as good at French as he thinks he is. <eos>",
 '<sos> There are not enough doctors to give proper care to the children. <eos>',
 "<sos> He doesn't care much for baseball. <eos>",
 "<sos> I've never actually seen a real cow. <eos>",
 '<sos> Let me tell you what I think. <eos>',
 "<sos> We're running out of time. <eos>",
 "<sos> He's kind of handsome. <eos>",
 "<sos> Wouldn't you rather spend your time doing something you enjoy? <eos>",
 '<sos> Disconnect the plug. <eos>',
 "<sos> I don't have any money on me. <eos>",
 "<sos> I'm sorry. I forgot. <eos>",
 '<sos> You should ask your parents for permission. <eos>',
 "<sos> I'll be very happy if I can serve you. <eos>",
 '<sos> You have to sit somewhere else. <eos>',
 '<sos> Is it OK to drink beer on your univers

In [None]:

# 2. Word-level tokenization with Tokenizer
tokenizer_inputs = Tokenizer(char_level=False)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)
max_encoder_seq_length = max([len(seq) for seq in input_sequences])

# Tokenizer's default `filters` strip '<' and '>', which would turn the
# '<sos>'/'<eos>' markers into 'sos'/'eos' and break later lookups of
# '<sos>'/'<eos>'. Use the default filter set minus those two characters.
tokenizer_targets = Tokenizer(
    char_level=False, filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
)
tokenizer_targets.fit_on_texts(target_texts)
target_sequences = tokenizer_targets.texts_to_sequences(target_texts)
max_decoder_seq_length = max([len(seq) for seq in target_sequences])

# One id more than the entries in word_index (presumably for padding id 0 - confirm).
num_encoder_tokens = len(tokenizer_inputs.word_index) + 1
num_decoder_tokens = len(tokenizer_targets.word_index) + 1

# 3. Pad the sequences at the end up to the longest length on each side
encoder_input_data = pad_sequences(input_sequences, padding='post', maxlen=max_encoder_seq_length)
decoder_input_data = pad_sequences(target_sequences, padding='post', maxlen=max_decoder_seq_length)

# The decoder target must be the decoder input shifted one step to the left
decoder_target_data = np.zeros(
    (len(target_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
for row, token_ids in enumerate(target_sequences):
    # Skip the first token so position `step` holds the token at `step + 1`.
    for step, token_id in enumerate(token_ids[1:]):
        decoder_target_data[row, step, token_id] = 1.0

# 4. Build the model
latent_dim = 256  # used for both the embedding size and the LSTM units

# Encoder: embed the token ids, keep only the final LSTM states
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder states and emits one distribution per step
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Compile the training model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="categorical_crossentropy")

# 5. Configure the callbacks (all of them monitor val_loss)
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True),
    ModelCheckpoint('seq2seq_best_model.h5', save_best_only=True, monitor='val_loss', verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, verbose=1)
]

# 6. Train the model
batch_size = 64
epochs = 100

# 20% of the samples are held out for validation.
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=callbacks
)

# 7. Set up the inference models
# Encoder model: input sequence -> [state_h, state_c]
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model: takes the states as explicit inputs and returns the updated
# states alongside its predictions. It reuses decoder_embedding, decoder_lstm
# and decoder_dense from the training model.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# 8. Define the inference (translation) function
# Reverse lookups: token index -> word
reverse_input_word_index = dict((i, word) for word, i in tokenizer_inputs.word_index.items())
reverse_target_word_index = dict((i, word) for word, i in tokenizer_targets.word_index.items())

def decode_sequence(input_seq):
    """Greedily decode one padded input sequence into a string of words.

    The encoder states seed the decoder, which is then fed its own most
    probable token at every step. Decoding ends when the end marker is
    produced, when more than ``max_decoder_seq_length`` tokens have been
    generated, or when the predicted index has no entry in
    ``reverse_target_word_index``.

    Args:
        input_seq: Encoder input passed to ``encoder_model.predict``.

    Returns:
        The decoded words, each preceded by a single space.

    Raises:
        KeyError: If no start marker is found in the target vocabulary.
    """
    word_index = tokenizer_targets.word_index
    # Tokenizer's default filters strip '<' and '>', so the markers may have
    # been stored as 'sos'/'eos' instead of '<sos>'/'<eos>'. Accept both.
    start_index = word_index.get('<sos>', word_index.get('sos'))
    if start_index is None:
        raise KeyError("no '<sos>' or 'sos' token in tokenizer_targets.word_index")
    end_word = '<eos>' if '<eos>' in word_index else 'eos'

    # verbose=0 avoids printing a progress bar for every decoding step.
    states_value = encoder_model.predict(input_seq, verbose=0)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_index

    decoded_sentence = ""
    num_decoded_tokens = 0
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_target_word_index.get(sampled_token_index)
        if sampled_word is None:
            # No word is mapped to this index (e.g. 0, which word_index does
            # not assign); stop rather than fail with a KeyError.
            break
        decoded_sentence += ' ' + sampled_word
        num_decoded_tokens += 1

        # The cap counts tokens, matching how max_decoder_seq_length was
        # measured, not characters of the output string.
        if sampled_word == end_word or num_decoded_tokens > max_decoder_seq_length:
            break

        # Feed the sampled token back in and carry the states forward.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

# 9. Test: print sample translations for the first ten input sentences
for seq_index in range(10):
    # A one-element slice keeps the leading batch dimension.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    print("입력 문장:", input_texts[seq_index])
    print("번역된 문장:", decoded_sentence)


In [38]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import matplotlib.pyplot as plt


ModuleNotFoundError: No module named 'matplotlib'

In [17]:

# 1. 데이터 로드 및 전처리
def load_data(file_path, num_samples, random_state=None):
    """Load a random sample of sentence pairs from a tab-separated corpus.

    Only the first two columns are read (named 'eng' and 'fra'); any further
    columns, such as the attribution column, are ignored.

    Args:
        file_path: Path of the corpus file.
        num_samples: Number of rows to sample; clamped to the number of rows
            available so small files do not raise.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for a
            reproducible sample. ``None`` keeps the sampling unseeded.

    Returns:
        ``(input_texts, target_texts)``: the 'eng' sentences, and the 'fra'
        sentences wrapped as ``'<sos> ' + text + ' <eos>'``.
    """
    import csv  # local import: only needed for the quoting constant

    df = pd.read_csv(
        file_path,
        delimiter='\t',
        header=None,
        names=['eng', 'fra'],
        usecols=[0, 1],
        # Sentences may contain double quotes; treat them as ordinary text.
        quoting=csv.QUOTE_NONE,
        # Keep every sentence as text (no numeric or NaN conversion).
        dtype=str,
        keep_default_na=False,
    )
    sample_size = min(num_samples, len(df))
    df = df.sample(n=sample_size, random_state=random_state).reset_index(drop=True)  # sampling
    input_texts = df['eng'].tolist()
    target_texts = ['<sos> ' + text + ' <eos>' for text in df['fra'].tolist()]
    return input_texts, target_texts

file_path = 'fra.txt'  # path to the fra.txt corpus file
num_samples = 10000  # number of rows to sample
input_texts, target_texts = load_data(file_path, num_samples)
target_texts

["<sos> Tom gagne plus d'argent que Mary. <eos>",
 '<sos> Cela semble être une bonne affaire. <eos>',
 '<sos> Quel est votre emploi ? <eos>',
 '<sos> Il a perdu son chemin dans les bois. <eos>',
 '<sos> Mes voisins sont fêlés. <eos>',
 "<sos> Ont-ils dit ce qui s'est produit ? <eos>",
 '<sos> Je voulais simplement dire que je suis désolé. <eos>',
 '<sos> Je veux y retourner. <eos>',
 '<sos> Je vous ai déjà payé. <eos>',
 "<sos> Tom a l'air content. <eos>",
 '<sos> Tu ne veux probablement pas me parler. <eos>',
 "<sos> Il m'apparaît que nous l'avons mal compris. <eos>",
 '<sos> À qui est adressée la lettre\xa0? <eos>',
 "<sos> Il n'y a pas de papier hygiénique. <eos>",
 '<sos> Peut-être devrions-nous annuler la réunion. <eos>',
 "<sos> Je ne peux pas t'en dire plus, j'en ai déjà trop dit. <eos>",
 '<sos> Elle a commis un crime. <eos>',
 "<sos> Aucun décès n'a été déclaré. <eos>",
 '<sos> Tom et toi êtes faits pour être ensemble. <eos>',
 '<sos> Il dépend financièrement de son épouse. <e

In [18]:

# 2. Word-level tokenization with Tokenizer
tokenizer_inputs = Tokenizer(char_level=False)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)
max_encoder_seq_length = max([len(seq) for seq in input_sequences])

# Tokenizer's default `filters` strip '<' and '>', which would turn the
# '<sos>'/'<eos>' markers into 'sos'/'eos' and break later lookups of
# '<sos>'/'<eos>'. Use the default filter set minus those two characters.
tokenizer_targets = Tokenizer(
    char_level=False, filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
)
tokenizer_targets.fit_on_texts(target_texts)
target_sequences = tokenizer_targets.texts_to_sequences(target_texts)
max_decoder_seq_length = max([len(seq) for seq in target_sequences])

# One id more than the entries in word_index (presumably for padding id 0 - confirm).
num_encoder_tokens = len(tokenizer_inputs.word_index) + 1
num_decoder_tokens = len(tokenizer_targets.word_index) + 1

In [23]:
# Inspect the encoder side: longest sequence, first three id sequences, vocabulary size.
max_encoder_seq_length, input_sequences[:3], num_encoder_tokens

(25,
 [[6, 378, 91, 124, 102, 74], [8, 701, 28, 5, 72, 702], [120, 25, 142]],
 4811)

In [25]:
# Inspect the decoder side: longest sequence, first three id sequences, vocabulary size.
max_decoder_seq_length, target_sequences[:3], num_decoder_tokens

(31,
 [[1, 13, 992, 33, 239, 7, 99, 2],
  [1, 54, 316, 49, 21, 151, 683, 2],
  [1, 157, 15, 70, 993, 2]],
 7750)

In [26]:

# 3. Pad the sequences at the end up to the longest length on each side
encoder_input_data = pad_sequences(input_sequences, padding='post', maxlen=max_encoder_seq_length)
decoder_input_data = pad_sequences(target_sequences, padding='post', maxlen=max_decoder_seq_length)

# The decoder target must be the decoder input shifted one step to the left
decoder_target_data = np.zeros(
    (len(target_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
for row, token_ids in enumerate(target_sequences):
    # Skip the first token so position `step` holds the token at `step + 1`.
    for step, token_id in enumerate(token_ids[1:]):
        decoder_target_data[row, step, token_id] = 1.0

In [33]:
# Inspect the first three padded encoder rows and the array shape.
encoder_input_data[:3], encoder_input_data.shape

(array([[  6, 378,  91, 124, 102,  74,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  8, 701,  28,   5,  72, 702,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [120,  25, 142,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]]),
 (10000, 25))

In [34]:
# Inspect the first three padded decoder rows and the array shape.
decoder_input_data[:3], decoder_input_data.shape

(array([[  1,  13, 992,  33, 239,   7,  99,   2,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0],
        [  1,  54, 316,  49,  21, 151, 683,   2,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0],
        [  1, 157,  15,  70, 993,   2,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0]]),
 (10000, 31))

In [39]:
# 4. Build the model
latent_dim = 256 # shared by the embedding size and the LSTM cell vector size

# Encoder: embed the token ids, keep only the final LSTM states
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]  # context vectors (states) handed to the decoder

# Decoder: emits one distribution over the target vocabulary per step
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states) # context passed in from the encoder
decoder_dense = Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Compile the training model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="categorical_crossentropy")


In [41]:

# 5. Configure the callbacks (all of them monitor val_loss)
callbacks = [
    # Stop after 5 epochs without improvement and restore the best weights.
    EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True),
    # Save only the best model so far to seq2seq_best_model.h5.
    ModelCheckpoint('seq2seq_best_model.h5', save_best_only=True, monitor='val_loss', verbose=1),
    # Multiply the learning rate by 0.2 after 3 epochs without improvement.
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, verbose=1)
]


In [42]:
# 6. Train the model
batch_size = 64
epochs = 100

# 20% of the samples are held out for validation; the returned history is
# kept for plotting the loss curves.
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=callbacks
)

Epoch 1/100
Epoch 1: val_loss improved from inf to 1.53725, saving model to seq2seq_best_model.h5
Epoch 2/100
Epoch 2: val_loss improved from 1.53725 to 1.51347, saving model to seq2seq_best_model.h5
Epoch 3/100
Epoch 3: val_loss improved from 1.51347 to 1.49513, saving model to seq2seq_best_model.h5
Epoch 4/100
Epoch 4: val_loss improved from 1.49513 to 1.48551, saving model to seq2seq_best_model.h5
Epoch 5/100
Epoch 5: val_loss did not improve from 1.48551
Epoch 6/100
Epoch 6: val_loss improved from 1.48551 to 1.47029, saving model to seq2seq_best_model.h5
Epoch 7/100
Epoch 7: val_loss improved from 1.47029 to 1.46734, saving model to seq2seq_best_model.h5
Epoch 8/100
Epoch 8: val_loss improved from 1.46734 to 1.45432, saving model to seq2seq_best_model.h5
Epoch 9/100
Epoch 9: val_loss improved from 1.45432 to 1.44744, saving model to seq2seq_best_model.h5
Epoch 10/100
Epoch 10: val_loss improved from 1.44744 to 1.43780, saving model to seq2seq_best_model.h5
Epoch 11/100
Epoch 11: va

In [None]:
# 7. Visualize training progress
fig = plt.figure(figsize=(12, 4))

# Loss curves, drawn in the first slot of a 1x2 grid
ax = fig.add_subplot(1, 2, 1)
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_title('Model Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()

# Show the finished figure
plt.show()

In [None]:
# 8. Set up the inference models
# Encoder model: input sequence -> [state_h, state_c]
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model: takes the states as explicit inputs and returns the updated
# states alongside its predictions. It reuses decoder_embedding, decoder_lstm
# and decoder_dense from the training model.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

In [None]:
# 9. Define the inference (translation) function
# Reverse lookups: token index -> word
reverse_input_word_index = dict((i, word) for word, i in tokenizer_inputs.word_index.items())
reverse_target_word_index = dict((i, word) for word, i in tokenizer_targets.word_index.items())

def decode_sequence(input_seq):
    """Greedily decode one padded input sequence into a string of words.

    The encoder states seed the decoder, which is then fed its own most
    probable token at every step. Decoding ends when the end marker is
    produced, when more than ``max_decoder_seq_length`` tokens have been
    generated, or when the predicted index has no entry in
    ``reverse_target_word_index``.

    Args:
        input_seq: Encoder input passed to ``encoder_model.predict``.

    Returns:
        The decoded words, each preceded by a single space.

    Raises:
        KeyError: If no start marker is found in the target vocabulary.
    """
    word_index = tokenizer_targets.word_index
    # Tokenizer's default filters strip '<' and '>', so the markers may have
    # been stored as 'sos'/'eos' instead of '<sos>'/'<eos>'. Accept both.
    start_index = word_index.get('<sos>', word_index.get('sos'))
    if start_index is None:
        raise KeyError("no '<sos>' or 'sos' token in tokenizer_targets.word_index")
    end_word = '<eos>' if '<eos>' in word_index else 'eos'

    # verbose=0 avoids printing a progress bar for every decoding step.
    states_value = encoder_model.predict(input_seq, verbose=0)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_index

    decoded_sentence = ""
    num_decoded_tokens = 0
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_target_word_index.get(sampled_token_index)
        if sampled_word is None:
            # No word is mapped to this index (e.g. the 0 used for padding);
            # stop rather than fail with a KeyError.
            break
        decoded_sentence += ' ' + sampled_word
        num_decoded_tokens += 1

        # The cap counts tokens, matching how max_decoder_seq_length was
        # measured, not characters of the output string.
        if sampled_word == end_word or num_decoded_tokens > max_decoder_seq_length:
            break

        # Feed the sampled token back in and carry the states forward.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

In [None]:
# 10. Test: print sample translations for the first ten input sentences
for seq_index in range(10):
    # A one-element slice keeps the leading batch dimension.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    print("입력 문장:", input_texts[seq_index])
    print("번역된 문장:", decoded_sentence)