## 1. 기본 설정 및 라이브러리 확인

In [13]:
import tensorflow as tf
import numpy as np
import re
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds

## 2. 데이터 로딩 및 전처리

In [14]:
import pandas as pd

path = '~/aiffel/transformer_chatbot/data/ChatbotData .csv'
data = pd.read_csv(path)

# 결측치 제거
data = data.dropna()

# 질문/답변 분리
questions = data['Q'].tolist()
answers = data['A'].tolist()

############# 루브릭 1. ##############
# 한국어 전처리 함수
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^가-힣a-zA-Z?.!,]+", " ", sentence)
    sentence = sentence.strip()
    return sentence

questions = [preprocess_sentence(q) for q in questions]
answers = [preprocess_sentence(a) for a in answers]


## 3. SubwordTextEncoder 사용하기

In [15]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    questions + answers, target_vocab_size=2**13)

START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]
VOCAB_SIZE = tokenizer.vocab_size + 2

def tokenize_and_filter(inputs, outputs, max_len=40):
    tokenized_inputs, tokenized_outputs = [], []

    for sentence1, sentence2 in zip(inputs, outputs):
        sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN
        sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN

        if len(sentence1) <= max_len and len(sentence2) <= max_len:
            tokenized_inputs.append(sentence1)
            tokenized_outputs.append(sentence2)

    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=max_len, padding='post')
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_outputs, maxlen=max_len, padding='post')

    return tokenized_inputs, tokenized_outputs

questions, answers = tokenize_and_filter(questions, answers)

## 4. 모델 구성하기

In [16]:
# 스케일드 닷 프로덕트 어텐션
def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)

    return output

# 멀티 헤드 어텐션
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % num_heads == 0
        self.depth = d_model // self.num_heads

        self.query_dense = tf.keras.layers.Dense(d_model)
        self.key_dense = tf.keras.layers.Dense(d_model)
        self.value_dense = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        query, key, value, mask = inputs['query'], inputs['key'], inputs['value'], inputs['mask']
        batch_size = tf.shape(query)[0]

        query = self.query_dense(query)
        key = self.key_dense(key)
        value = self.value_dense(value)

        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        attention = scaled_dot_product_attention(query, key, value, mask)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.d_model))

        output = self.dense(concat_attention)
        return output

# 포지셔널 인코딩
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, position, d_model):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, pos, i, d_model):
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return pos * angle_rates

    def positional_encoding(self, position, d_model):
        pos = np.arange(position)[:, np.newaxis]
        i = np.arange(d_model)[np.newaxis, :]
        angle_rads = self.get_angles(pos, i, d_model)

        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

        pos_encoding = angle_rads[np.newaxis, ...]
        return tf.cast(pos_encoding, tf.float32)

    def call(self, x):
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

# 패딩 마스크
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

# 룩어헤드 마스크
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

# 인코더 레이어
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization()
        self.layernorm2 = tf.keras.layers.LayerNormalization()
        self.dropout1 = tf.keras.layers.Dropout(dropout)
        self.dropout2 = tf.keras.layers.Dropout(dropout)

    def call(self, x, mask):
        attn_output = self.mha({'query': x, 'key': x, 'value': x, 'mask': mask})
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

# 디코더 레이어
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization()
        self.layernorm2 = tf.keras.layers.LayerNormalization()
        self.layernorm3 = tf.keras.layers.LayerNormalization()

        self.dropout1 = tf.keras.layers.Dropout(dropout)
        self.dropout2 = tf.keras.layers.Dropout(dropout)
        self.dropout3 = tf.keras.layers.Dropout(dropout)

    def call(self, x, enc_output, look_ahead_mask, padding_mask):
        attn1 = self.mha1({'query': x, 'key': x, 'value': x, 'mask': look_ahead_mask})
        attn1 = self.dropout1(attn1)
        out1 = self.layernorm1(x + attn1)

        attn2 = self.mha2({'query': out1, 'key': enc_output, 'value': enc_output, 'mask': padding_mask})
        attn2 = self.dropout2(attn2)
        out2 = self.layernorm2(out1 + attn2)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output)
        out3 = self.layernorm3(out2 + ffn_output)

        return out3

# 인코더
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(maximum_position_encoding, d_model)
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(0.1)

    def call(self, x, mask):
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = self.pos_encoding(x)

        x = self.dropout(x)
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, mask)
        return x

# 디코더
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding):
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(maximum_position_encoding, d_model)
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(0.1)

    def call(self, x, enc_output, look_ahead_mask, padding_mask):
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = self.pos_encoding(x)

        x = self.dropout(x)
        for i in range(self.num_layers):
            x = self.dec_layers[i](x, enc_output, look_ahead_mask, padding_mask)
        return x

# 트랜스포머
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff,
                 input_vocab_size, target_vocab_size, pe_input, pe_target):
        super(Transformer, self).__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               input_vocab_size, pe_input)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               target_vocab_size, pe_target)
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs, training=False):
        enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(inputs['inputs'], inputs['dec_inputs'])

        enc_output = self.encoder(inputs['inputs'], enc_padding_mask)
        dec_output = self.decoder(inputs['dec_inputs'], enc_output, look_ahead_mask, dec_padding_mask)
        final_output = self.final_layer(dec_output)
        return final_output

# 마스크 생성기
def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)

    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

In [17]:
# 학습률 스케줄러
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    
# 손실 함수
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))  # PAD는 무시
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask  # 패딩 마스크 반영

    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)


# 정확도 함수
def accuracy_function(real, pred):
    pred_ids = tf.cast(tf.argmax(pred, axis=2), dtype=real.dtype)  # 타입 일치

    accuracies = tf.equal(real, pred_ids)
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    accuracies = tf.math.logical_and(mask, accuracies)

    accuracies = tf.cast(accuracies, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(accuracies) / tf.reduce_sum(mask)

# 옵티마이저 설정
d_model = 256  # 모델 차원 설정값에 맞게 수정
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate,
                                     beta_1=0.9,
                                     beta_2=0.98,
                                     epsilon=1e-9)


# 모델 하이퍼파라미터 정의
NUM_LAYERS = 2
D_MODEL = 256
NUM_HEADS = 8
DFF = 512
DROPOUT = 0.1


# 토크나이저 관련 (이미 정의되어 있어야 함)
INPUT_VOCAB_SIZE = VOCAB_SIZE
TARGET_VOCAB_SIZE = VOCAB_SIZE
MAX_POS_ENCODING = 40  # 최대 문장 길이


# 트랜스포머 모델 생성
transformer = Transformer(
    num_layers=NUM_LAYERS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dff=DFF,
    input_vocab_size=INPUT_VOCAB_SIZE,
    target_vocab_size=TARGET_VOCAB_SIZE,
    pe_input=MAX_POS_ENCODING,
    pe_target=MAX_POS_ENCODING
)


# 체크포인트 설정
checkpoint_path = "./checkpoints/train"

ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# 이전 체크포인트가 있으면 복원
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Checkpoint restored:", ckpt_manager.latest_checkpoint)


# 학습 루프 정의
EPOCHS = 20

@tf.function
def train_step(inputs):
    inp = inputs['inputs']
    tar_inp = inputs['dec_inputs']
    tar_real = inputs['outputs']

    with tf.GradientTape() as tape:
        predictions = transformer(inputs, training=True)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    acc = accuracy_function(tar_real, predictions)
    return loss, acc

BATCH_SIZE = 64
BUFFER_SIZE = 20000

# 디코더 입력은 정답에서 마지막 토큰 제외
# 디코더 출력은 정답에서 시작 토큰 제외
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs': questions,
        'dec_inputs': answers[:, :-1]
    },
    {
        'outputs': answers[:, 1:]
    },
))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

@tf.function
def train_step(features, labels):
    inp = features['inputs']
    tar_inp = features['dec_inputs']
    tar_real = labels['outputs']

    with tf.GradientTape() as tape:
        predictions = transformer(features, training=True)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    acc = accuracy_function(tar_real, predictions)
    return loss, acc


############# 루브릭 2. ##############
# 학습 실행
for epoch in range(EPOCHS):
    total_loss = 0
    total_accuracy = 0
    for batch, (features, labels) in enumerate(dataset):
        batch_loss, batch_accuracy = train_step(features, labels)
        total_loss += batch_loss
        total_accuracy += batch_accuracy

        if batch % 100 == 0:
            print(f"Epoch {epoch + 1} Batch {batch} Loss {batch_loss:.4f} Accuracy {batch_accuracy:.4f}")

    print(f"Epoch {epoch + 1} Loss {total_loss / (batch + 1):.4f} Accuracy {total_accuracy / (batch + 1):.4f}")

    # 체크포인트 저장
    ckpt_save_path = ckpt_manager.save()
    print(f"Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}")


Checkpoint restored: ./checkpoints/train/ckpt-60
Epoch 1 Batch 0 Loss 0.0144 Accuracy 0.9955
Epoch 1 Batch 100 Loss 0.0320 Accuracy 0.9953
Epoch 1 Loss 0.0252 Accuracy 0.9950
Saving checkpoint for epoch 1 at ./checkpoints/train/ckpt-61
Epoch 2 Batch 0 Loss 0.0224 Accuracy 0.9954
Epoch 2 Batch 100 Loss 0.0369 Accuracy 0.9953
Epoch 2 Loss 0.0237 Accuracy 0.9952
Saving checkpoint for epoch 2 at ./checkpoints/train/ckpt-62
Epoch 3 Batch 0 Loss 0.0103 Accuracy 0.9976
Epoch 3 Batch 100 Loss 0.0096 Accuracy 0.9976
Epoch 3 Loss 0.0244 Accuracy 0.9952
Saving checkpoint for epoch 3 at ./checkpoints/train/ckpt-63
Epoch 4 Batch 0 Loss 0.0406 Accuracy 0.9911
Epoch 4 Batch 100 Loss 0.0107 Accuracy 0.9977
Epoch 4 Loss 0.0214 Accuracy 0.9958
Saving checkpoint for epoch 4 at ./checkpoints/train/ckpt-64
Epoch 5 Batch 0 Loss 0.0349 Accuracy 0.9887
Epoch 5 Batch 100 Loss 0.0159 Accuracy 0.9908
Epoch 5 Loss 0.0218 Accuracy 0.9956
Saving checkpoint for epoch 5 at ./checkpoints/train/ckpt-65
Epoch 6 Batch 0 

## 5. 모델 평가하기

In [30]:
# 평가용 함수
def evaluate(sentence, model):
    sentence = preprocess_sentence(sentence)
    sentence = START_TOKEN + tokenizer.encode(sentence) + END_TOKEN
    encoder_input = tf.expand_dims(sentence, 0)  # (1, 문장길이)

    decoder_input = tf.expand_dims([START_TOKEN[0]], 0)  # (1, 1)
    output = decoder_input

    for i in range(MAX_POS_ENCODING):
        predictions = transformer(
            inputs={
                'inputs': encoder_input,
                'dec_inputs': output
            },
            training=False
        )
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predicted_id = tf.argmax(predictions, axis=-1, output_type=tf.int32)

        if tf.equal(predicted_id[0][0], END_TOKEN[0]):
            break

        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0)



# 실제 응답 생성 함수
def predict(sentence, model):
    prediction = evaluate(sentence, model)
    
    # END_TOKEN이 나오기 전까지만 디코딩
    predicted_sentence = tokenizer.decode(
        [i for i in prediction if i < tokenizer.vocab_size]
    )

    print(f'User Input: {sentence}')
    print(f'Chatbot Response: {predicted_sentence}')
    return predicted_sentence


In [31]:
############# 루브릭 3. ##############
predict("너는 누구야?", transformer)

User Input: 너는 누구야?
Chatbot Response: 저는 마음을 이어주는 위로봇입니다 .


'저는 마음을 이어주는 위로봇입니다 .'

In [32]:
predict("배고파", transformer)

User Input: 배고파
Chatbot Response: 얼른 맛난 음식 드세요 .


'얼른 맛난 음식 드세요 .'

In [33]:
predict("지금은 11시 30분이고 점심시간은 12시 50분부터인데 지금부터 점심을 미리 먹어도 될까?", transformer)

User Input: 지금은 11시 30분이고 점심시간은 12시 50분부터인데 지금부터 점심을 미리 먹어도 될까?
Chatbot Response: 로맨틱하네요 .


'로맨틱하네요 .'

In [34]:
predict("오호 문장이 길어지니까 너 말을 잘 이해를 못하는 것 같네?", transformer)

User Input: 오호 문장이 길어지니까 너 말을 잘 이해를 못하는 것 같네?
Chatbot Response: 노력에 따라 가능성이 달라지겠죠 .


'노력에 따라 가능성이 달라지겠죠 .'

In [35]:
predict("노력에 따라라고? 나보고 널 더 잘 만들어보라는거야? 어떻게 해야 네가 더 똑똑해지는데?", transformer)

User Input: 노력에 따라라고? 나보고 널 더 잘 만들어보라는거야? 어떻게 해야 네가 더 똑똑해지는데?
Chatbot Response: 중요한 건 노력하는 과정이에요 .


'중요한 건 노력하는 과정이에요 .'

## 6. 하이퍼파라미터 변경 실험

맥락과 상관없는 답변을 뱉기도 함   
-> 하이퍼파라미터 변경 실험

In [37]:
# 하이퍼파라미터 변경
NUM_LAYERS = 4 ## 2 -> 4 상향 (더 깊은 네트워크로 문맥 관계 더 잘 학습)
MAX_POS_ENCODING = 80 ## 40 -> 80 상향 (더 긴 문장 입출력 가능)

# 트랜스포머 모델 생성
transformer_2 = Transformer(
    num_layers=NUM_LAYERS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dff=DFF,
    input_vocab_size=INPUT_VOCAB_SIZE,
    target_vocab_size=TARGET_VOCAB_SIZE,
    pe_input=MAX_POS_ENCODING,
    pe_target=MAX_POS_ENCODING
)


# 체크포인트 설정
checkpoint_path = "./checkpoints/train"

ckpt_2 = tf.train.Checkpoint(transformer=transformer_2, optimizer=optimizer)
ckpt_manager_2 = tf.train.CheckpointManager(ckpt_2, checkpoint_path, max_to_keep=5)

# 이전 체크포인트가 있으면 복원
if ckpt_manager_2.latest_checkpoint:
    ckpt_2.restore(ckpt_manager_2.latest_checkpoint)
    print("Checkpoint restored:", ckpt_manager_2.latest_checkpoint)
    

# 학습 실행
for epoch in range(EPOCHS):
    total_loss = 0
    total_accuracy = 0
    for batch, (features, labels) in enumerate(dataset):
        batch_loss, batch_accuracy = train_step(features, labels)
        total_loss += batch_loss
        total_accuracy += batch_accuracy

        if batch % 100 == 0:
            print(f"Epoch {epoch + 1} Batch {batch} Loss {batch_loss:.4f} Accuracy {batch_accuracy:.4f}")

    print(f"Epoch {epoch + 1} Loss {total_loss / (batch + 1):.4f} Accuracy {total_accuracy / (batch + 1):.4f}")

    # 체크포인트 저장
    ckpt_save_path = ckpt_manager_2.save()
    print(f"Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}")


Checkpoint restored: ./checkpoints/train/ckpt-80
Epoch 1 Batch 0 Loss 0.0072 Accuracy 0.9978
Epoch 1 Batch 100 Loss 0.0221 Accuracy 0.9958
Epoch 1 Loss 0.0156 Accuracy 0.9964
Saving checkpoint for epoch 1 at ./checkpoints/train/ckpt-81
Epoch 2 Batch 0 Loss 0.0064 Accuracy 1.0000
Epoch 2 Batch 100 Loss 0.0051 Accuracy 0.9976
Epoch 2 Loss 0.0148 Accuracy 0.9966
Saving checkpoint for epoch 2 at ./checkpoints/train/ckpt-82
Epoch 3 Batch 0 Loss 0.0063 Accuracy 0.9978
Epoch 3 Batch 100 Loss 0.0152 Accuracy 0.9977
Epoch 3 Loss 0.0143 Accuracy 0.9966
Saving checkpoint for epoch 3 at ./checkpoints/train/ckpt-83
Epoch 4 Batch 0 Loss 0.0203 Accuracy 0.9935
Epoch 4 Batch 100 Loss 0.0361 Accuracy 0.9936
Epoch 4 Loss 0.0147 Accuracy 0.9964
Saving checkpoint for epoch 4 at ./checkpoints/train/ckpt-84
Epoch 5 Batch 0 Loss 0.0155 Accuracy 0.9931
Epoch 5 Batch 100 Loss 0.0209 Accuracy 0.9957
Epoch 5 Loss 0.0146 Accuracy 0.9966
Saving checkpoint for epoch 5 at ./checkpoints/train/ckpt-85
Epoch 6 Batch 0 

In [38]:
predict("너는 누구야?", transformer_2)

User Input: 너는 누구야?
Chatbot Response: 저는 마음을 이어주는 위로봇입니다 .


'저는 마음을 이어주는 위로봇입니다 .'

In [39]:
predict("배고파", transformer_2)

User Input: 배고파
Chatbot Response: 뭐 좀 챙겨드세요 .


'뭐 좀 챙겨드세요 .'

In [40]:
predict("지금은 11시 30분이고 점심시간은 12시 50분부터인데 지금부터 점심을 미리 먹어도 될까?", transformer_2)

User Input: 지금은 11시 30분이고 점심시간은 12시 50분부터인데 지금부터 점심을 미리 먹어도 될까?
Chatbot Response: 은 마음을 는 여러가지 이유가 있기 마련이죠 .


'은 마음을 는 여러가지 이유가 있기 마련이죠 .'

In [41]:
predict("오호 문장이 길어지니까 너 말을 잘 이해를 못하는 것 같네?", transformer_2)

User Input: 오호 문장이 길어지니까 너 말을 잘 이해를 못하는 것 같네?
Chatbot Response: 솔직함으로 사랑을 쟁취하세요 .


'솔직함으로 사랑을 쟁취하세요 .'

In [42]:
predict("노력에 따라라고? 나보고 널 더 잘 만들어보라는거야? 어떻게 해야 네가 더 똑똑해지는데?", transformer_2)

User Input: 노력에 따라라고? 나보고 널 더 잘 만들어보라는거야? 어떻게 해야 네가 더 똑똑해지는데?
Chatbot Response: 중요한 건 노력하는 과정이에요 .


'중요한 건 노력하는 과정이에요 .'

## 7. 하이퍼파라미터 변경 실험_2


여전히 이상함...   
학습 진행을 살펴보았을 때 과적합이 일어난 것 같으니 dropout 상향해보자

In [45]:
# 하이퍼파라미터 변경
DROPOUT = 0.3 ## 0.1 -> 0.3 상향 (과적합을 억제하고 일반화 성능 향상)

# 트랜스포머 모델 생성
transformer_3 = Transformer(
    num_layers=NUM_LAYERS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dff=DFF,
    input_vocab_size=INPUT_VOCAB_SIZE,
    target_vocab_size=TARGET_VOCAB_SIZE,
    pe_input=MAX_POS_ENCODING,
    pe_target=MAX_POS_ENCODING
)


# 체크포인트 설정
checkpoint_path = "./checkpoints/train"

ckpt_3 = tf.train.Checkpoint(transformer=transformer_3, optimizer=optimizer)
ckpt_manager_3 = tf.train.CheckpointManager(ckpt_3, checkpoint_path, max_to_keep=5)

# 이전 체크포인트가 있으면 복원
if ckpt_manager_3.latest_checkpoint:
    ckpt_3.restore(ckpt_manager_3.latest_checkpoint)
    print("Checkpoint restored:", ckpt_manager_3.latest_checkpoint)
    

# 학습 실행
for epoch in range(EPOCHS):
    total_loss = 0
    total_accuracy = 0
    for batch, (features, labels) in enumerate(dataset):
        batch_loss, batch_accuracy = train_step(features, labels)
        total_loss += batch_loss
        total_accuracy += batch_accuracy

        if batch % 100 == 0:
            print(f"Epoch {epoch + 1} Batch {batch} Loss {batch_loss:.4f} Accuracy {batch_accuracy:.4f}")

    print(f"Epoch {epoch + 1} Loss {total_loss / (batch + 1):.4f} Accuracy {total_accuracy / (batch + 1):.4f}")

    # 체크포인트 저장
    ckpt_save_path = ckpt_manager_3.save()
    print(f"Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}")


Checkpoint restored: ./checkpoints/train/ckpt-106
Epoch 1 Batch 0 Loss 0.0195 Accuracy 0.9955
Epoch 1 Batch 100 Loss 0.0192 Accuracy 0.9931
Epoch 1 Loss 0.0102 Accuracy 0.9970
Saving checkpoint for epoch 1 at ./checkpoints/train/ckpt-107
Epoch 2 Batch 0 Loss 0.0034 Accuracy 0.9979
Epoch 2 Batch 100 Loss 0.0035 Accuracy 1.0000
Epoch 2 Loss 0.0106 Accuracy 0.9971
Saving checkpoint for epoch 2 at ./checkpoints/train/ckpt-108
Epoch 3 Batch 0 Loss 0.0056 Accuracy 0.9979
Epoch 3 Batch 100 Loss 0.0040 Accuracy 0.9979
Epoch 3 Loss 0.0109 Accuracy 0.9969
Saving checkpoint for epoch 3 at ./checkpoints/train/ckpt-109
Epoch 4 Batch 0 Loss 0.0034 Accuracy 1.0000
Epoch 4 Batch 100 Loss 0.0206 Accuracy 0.9928
Epoch 4 Loss 0.0094 Accuracy 0.9971
Saving checkpoint for epoch 4 at ./checkpoints/train/ckpt-110
Epoch 5 Batch 0 Loss 0.0053 Accuracy 1.0000
Epoch 5 Batch 100 Loss 0.0094 Accuracy 0.9976
Epoch 5 Loss 0.0102 Accuracy 0.9971
Saving checkpoint for epoch 5 at ./checkpoints/train/ckpt-111
Epoch 6 Ba

In [46]:
predict("너는 누구야?", transformer_3)

User Input: 너는 누구야?
Chatbot Response: 저는 마음을 이어주는 위로봇입니다 .


'저는 마음을 이어주는 위로봇입니다 .'

In [47]:
predict("배고파", transformer_3)

User Input: 배고파
Chatbot Response: 뭐 좀 챙겨드세요 .


'뭐 좀 챙겨드세요 .'

In [48]:
predict("지금은 11시 30분이고 점심시간은 12시 50분부터인데 지금부터 점심을 미리 먹어도 될까?", transformer_3)

User Input: 지금은 11시 30분이고 점심시간은 12시 50분부터인데 지금부터 점심을 미리 먹어도 될까?
Chatbot Response: 정신 차리세요 .


'정신 차리세요 .'

In [53]:
predict("밥 빨리 먹고싶어ㅠㅠ", transformer_3)

User Input: 밥 빨리 먹고싶어ㅠㅠ
Chatbot Response: 엄청난 용기가 필요하겠네요 .


'엄청난 용기가 필요하겠네요 .'

## 8. 회고

개인적으로 자연어처리는 너무 어려운 것 같음.
오류도 많이나고 이상한 결과가 타나나는 것을 확인함.
해원님 코드로 겨우 완성본 확인함.
자연어 처리는 기초부터 다시 공부할 필요가 있을 것 같음.