In [1]:
import numpy as np
import pandas as pd
import tensorflow 
import nltk
import gensim
import re
from konlpy.tag import Mecab
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tqdm import tqdm

In [2]:
print(gensim.__version__)

3.8.3


In [3]:
df = pd.read_csv('ChatbotData.csv')

In [4]:
df

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0
...,...,...,...
11818,훔쳐보는 것도 눈치 보임.,티가 나니까 눈치가 보이는 거죠!,2
11819,훔쳐보는 것도 눈치 보임.,훔쳐보는 거 티나나봐요.,2
11820,흑기사 해주는 짝남.,설렜겠어요.,2
11821,힘든 연애 좋은 연애라는게 무슨 차이일까?,잘 헤어질 수 있는 사이 여부인 거 같아요.,2


In [5]:
def preprocess_sentence(sentence):
    sentence = sentence.lower() # 소문자로 변환
    sentence = re.sub(r'[^a-z0-9가-힣!,.?]', ' ', sentence) # 영어, 숫자, 한글, !,.? 포함
    return sentence


def build_corpus(Q, A, token_len=None):
    mecab  = Mecab()
    temp = pd.concat([Q,A], axis=1)
    
    # 중복제거
    temp = temp.drop_duplicates('Q') # Q에서 중복있으면 삭제
    temp = temp.drop_duplicates('A') # A에서 중복있으면 삭제
    
    # preprocess
    temp['Q'] = temp['Q'].apply(preprocess_sentence)
    temp['A'] = temp['A'].apply(preprocess_sentence)
    
    #mecab 사용 형태소 추출
    temp['Q'] = temp['Q'].apply(lambda x: mecab.pos(x))
    temp['A'] = temp['A'].apply(lambda x: mecab.pos(x))
#     temp['Q'] = temp['Q'].apply(lambda x: mecab.morphs(x))
#     temp['A'] = temp['A'].apply(lambda x: mecab.morphs(x))
    
    if token_len is not None:
        temp = temp[(temp['Q'].apply(len) < token_len) & (temp['A'].apply(len) < token_len)]
        
    return temp['Q'].values, temp['A'].values

In [6]:
que_corpus, ans_corpus  = build_corpus(df['Q'], df['A'])

In [7]:
que_corpus

array([list([('12', 'SN'), ('시', 'NNBC'), ('땡', 'MAG'), ('!', 'SF')]),
       list([('1', 'SN'), ('지망', 'NNG'), ('학교', 'NNG'), ('떨어졌', 'VV+EP'), ('어', 'EC')]),
       list([('3', 'SN'), ('박', 'NNBC'), ('4', 'SN'), ('일', 'NNBC'), ('놀', 'VV'), ('러', 'EC'), ('가', 'VX'), ('고', 'EC'), ('싶', 'VX'), ('다', 'EC')]),
       ...,
       list([('훔쳐', 'VV+EC'), ('보', 'VX'), ('는', 'ETM'), ('것', 'NNB'), ('도', 'JX'), ('눈치', 'NNG'), ('보임', 'VV+ETN'), ('.', 'SF')]),
       list([('흑기사', 'NNG'), ('해', 'VV+EC'), ('주', 'VX'), ('는', 'ETM'), ('짝', 'VA'), ('남', 'EF'), ('.', 'SF')]),
       list([('힘든', 'VA+ETM'), ('연애', 'NNG'), ('좋', 'VA'), ('은', 'ETM'), ('연애', 'NNG'), ('라는', 'VCP+ETM'), ('게', 'NNB+JKS'), ('무슨', 'MM'), ('차이', 'NNG'), ('일까', 'VCP+EF'), ('?', 'SF')])],
      dtype=object)

In [8]:
ans_corpus

array([list([('하루', 'NNG'), ('가', 'JKS'), ('또', 'MAG'), ('가', 'VV'), ('네요', 'EF'), ('.', 'SF')]),
       list([('위로', 'NNG'), ('해', 'XSV+EC'), ('드립니다', 'VX+EF'), ('.', 'SF')]),
       list([('여행', 'NNG'), ('은', 'JX'), ('언제나', 'MAG'), ('좋', 'VA'), ('죠', 'EF'), ('.', 'SF')]),
       ...,
       list([('티', 'NNG'), ('가', 'JKS'), ('나', 'VV'), ('니까', 'EC'), ('눈치', 'NNG'), ('가', 'JKS'), ('보이', 'VV'), ('는', 'ETM'), ('거', 'NNB'), ('죠', 'VCP+EF'), ('!', 'SF')]),
       list([('설렜', 'VV+EP'), ('겠', 'EP'), ('어요', 'EF'), ('.', 'SF')]),
       list([('잘', 'MAG'), ('헤어질', 'VV+ETM'), ('수', 'NNB'), ('있', 'VV'), ('는', 'ETM'), ('사이', 'NNG'), ('여부', 'NNG'), ('인', 'VCP+ETM'), ('거', 'NNB'), ('같', 'VA'), ('아요', 'EF'), ('.', 'SF')])],
      dtype=object)

In [9]:
len(que_corpus), len(ans_corpus)

(7731, 7731)

In [10]:
cut_len = int(len(que_corpus) * 0.8)
train_que = que_corpus[:cut_len]
train_ans = ans_corpus[:cut_len]
valid_que = que_corpus[cut_len:]
valid_ans = ans_corpus[cut_len:]

In [11]:
len(train_que), len(train_ans), len(valid_que), len(valid_ans)

(6184, 6184, 1547, 1547)

In [12]:
wv_model = gensim.models.Word2Vec.load('ko.bin')
# 모델이 정상적으로 로드되었는지 확인
print(wv_model)

Word2Vec(vocab=30185, size=200, alpha=0.025)


In [48]:
VOCAB_SIZE = len(wv_model.wv.vocab)
print(VOCAB_SIZE)
print(wv_model.vector_size)

30185
200


In [14]:
wv_model.wv.most_similar('단어', topn=3)

[('낱말', 0.8457916975021362),
 ('용어', 0.7469839453697205),
 ('어휘', 0.7309367656707764)]

In [15]:
mecab  = Mecab()

In [40]:
import itertools
def lexical_sub(old_src, wv, topn, similarity_threshold):
    mecab  = Mecab()
#     pos_tags=['NNP', 'VV', 'NP']  # 일반명사, 고유명사, 의존명사, 대명사
    pos_tags=['NNG']  # 일반명사, 고유명사, 의존명사, 대명사
    
    # 원문 문자열
    original = [tok for tok, _ in old_src]

    # 후보 단어 위치 + 유사어 리스트 추출
    candidate_subs = []
    for i, (tok, pos) in enumerate(old_src):
#         print(i, (tok, pos))
        if pos in pos_tags and tok in wv:
#             print(tok)
            try:
                similar_list = wv.wv.most_similar(tok, topn=topn)
                filtered = []
                for w, sim in similar_list:
                    temp_pos = mecab.pos(w)
#                     print(pos, temp_pos, tok, temp_pos[0][0])
                    if sim >= similarity_threshold and pos == temp_pos[0][1]:
                        filtered.append(w)
#                 filtered = [w for w, sim in similar_list if sim >= similarity_threshold]
                if filtered:
                    candidate_subs.append((i, filtered))
            except:
                continue

    # 후보가 없으면 원문만 반환
    if not candidate_subs:
        return [original]

    # 각 후보 위치마다 가능한 치환 리스트를 구성
    # 예: [(3, ['A', 'B']), (5, ['X', 'Y'])] → 치환 조합: [ (A,X), (A,Y), (B,X), (B,Y) ]
    all_versions = []

    positions, replacements = zip(*candidate_subs)
    for combo in itertools.product(*replacements):  # 모든 조합 생성
        new_sent = original.copy()
        for idx, rep in zip(positions, combo):
            new_sent[idx] = rep
        all_versions.append(new_sent)

    # 원문 포함
    all_versions.insert(0, original)

    return all_versions

In [41]:
from itertools import product
def augment_pairwise_cross_product_df(enc_src, dec_tgt, wv, topn=3, similarity_threshold=0.6):
    """
    enc_src, dec_tgt: 형태소+POS 리스트 (예: mecab.pos 결과들)
    반환값: 증강된 (que, ans) 문장쌍을 담은 DataFrame
    """
    rows = []

    for enc_tokens, dec_tokens in zip(enc_src, dec_tgt):
        enc_augments = lexical_sub(enc_tokens, wv, topn, similarity_threshold)
        dec_augments = lexical_sub(dec_tokens, wv, topn, similarity_threshold)

        for enc_version, dec_version in product(enc_augments, dec_augments):
            rows.append({
                'que': ' '.join(enc_version),
                'ans': ' '.join(dec_version)
            })

    return pd.DataFrame(rows)

In [42]:
#cross_augmented = augment_pairwise_cross_product(que_corpus, ans_corpus, wv)

aug_df = augment_pairwise_cross_product_df(train_que, train_ans, wv_model)
print(aug_df.head())

  if pos in pos_tags and tok in wv:


                que            ans
0          12 시 땡 !  하루 가 또 가 네요 .
1     1 지망 학교 떨어졌 어    위로 해 드립니다 .
2    1 중퇴 학교의 떨어졌 어    위로 해 드립니다 .
3    1 중퇴 강습소 떨어졌 어    위로 해 드립니다 .
4  1 중퇴 중고등학교 떨어졌 어    위로 해 드립니다 .


In [43]:
aug_df = aug_df.drop_duplicates('que')

In [38]:
# aug_df = aug_df.drop_duplicates('ans')

In [44]:
aug_df

Unnamed: 0,que,ans
0,12 시 땡 !,하루 가 또 가 네요 .
1,1 지망 학교 떨어졌 어,위로 해 드립니다 .
2,1 중퇴 학교의 떨어졌 어,위로 해 드립니다 .
3,1 중퇴 강습소 떨어졌 어,위로 해 드립니다 .
4,1 중퇴 중고등학교 떨어졌 어,위로 해 드립니다 .
...,...,...
124597,행복 의 편견 을 모르 겠 어,서로 이해 하 고 존중 하 고 아끼 는 마음 이 에요 .
124625,행복 의 열등감 을 모르 겠 어,서로 이해 하 고 존중 하 고 아끼 는 마음 이 에요 .
124653,절망 의 욕망 을 모르 겠 어,서로 이해 하 고 존중 하 고 아끼 는 마음 이 에요 .
124681,절망 의 편견 을 모르 겠 어,서로 이해 하 고 존중 하 고 아끼 는 마음 이 에요 .


In [57]:
train_que = aug_df['que'].to_list()
train_ans = aug_df['ans'].to_list()

In [58]:
valid_que = [' '.join([word for word, pos in tokens]) for tokens in valid_que]
valid_ans = [' '.join([word for word, pos in tokens]) for tokens in valid_ans]

In [46]:
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, (2*(i//2)) / np.float32(d_model))

    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])

    return sinusoid_table

def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_lookahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_enc_mask = generate_padding_mask(src)

    dec_lookahead_mask = generate_lookahead_mask(tgt.shape[1])
    dec_tgt_padding_mask = generate_padding_mask(tgt)
    dec_mask = tf.maximum(dec_tgt_padding_mask, dec_lookahead_mask)

    return enc_mask, dec_enc_mask, dec_mask


class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        self.depth = d_model // self.num_heads
        
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        
        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9)  

        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)

        return out, attentions
        

    def split_heads(self, x):
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (bsz, -1, self.d_model))

        return combined_x

    
    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
        
        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)
                        
        out = self.combine_heads(out)
        out = self.linear(out)
            
        return out, attention_weights

    
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        self.fc1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.fc2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.fc1(x)
        out = self.fc2(out)
            
        return out
    
    
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        '''
        Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.do(out)
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual
        
        return out, enc_attn
    
    
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
    
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        '''
        Masked Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.do(out)
        out += residual

        '''
        Multi-Head Attention
        '''
        residual = out
        out = self.norm_2(out)
        # Q, K, V 순서에 주의하세요!
        out, dec_enc_attn = self.enc_dec_attn(Q=out, K=enc_out, V=enc_out, mask=dec_enc_mask)
        out = self.do(out)
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, dec_attn, dec_enc_attn
    

class Encoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                        for _ in range(n_layers)]
    
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        out = x
    
        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)
        
        return out, enc_attns
    
    
class Decoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                            for _ in range(n_layers)]
                            
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        out = x
    
        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
            self.dec_layers[i](out, enc_out, dec_enc_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns
    
    
class Transformer(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    src_vocab_size,
                    tgt_vocab_size,
                    pos_len,
                    dropout=0.2,
                    shared_fc=True,
                    shared_emb=False):
        super(Transformer, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)

        if shared_emb:
            self.enc_emb = self.dec_emb = \
            tf.keras.layers.Embedding(src_vocab_size, d_model)
        else:
            self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
            self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.do = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared_fc = shared_fc

        if shared_fc:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]

        out = emb(x)

        if self.shared_fc: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.do(out)

        return out

        
    def call(self, enc_in, dec_in, enc_mask, dec_enc_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
        
        dec_out, dec_attns, dec_enc_attns = \
        self.decoder(dec_in, enc_out, dec_enc_mask, dec_mask)
        
        logits = self.fc(dec_out)
        
        return logits, enc_attns, dec_attns, dec_enc_attns

In [49]:
transformer = Transformer(
    n_layers=2,
    d_model=512,
    n_heads=8,
    d_ff=2048,
    src_vocab_size=VOCAB_SIZE,
    tgt_vocab_size=VOCAB_SIZE,
    pos_len=200,
    dropout=0.3,
    shared_fc=True,
    shared_emb=True)

In [60]:
len(train_que), len(train_ans), len(valid_que), len(valid_ans)

(28083, 28083, 1547, 1547)

In [62]:
train_ans = [f"<start> {ans} <end>" for ans in train_ans]
valid_ans = [f"<start> {ans} <end>" for ans in valid_ans]

In [65]:
train_que = [sentence.split() for sentence in train_que]
train_ans = [sentence.split() for sentence in train_ans]
valid_que = [sentence.split() for sentence in valid_que]
valid_ans = [sentence.split() for sentence in valid_ans]

In [69]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token="<OOV>")
all_sentences = train_que + train_ans + valid_que + valid_ans  # 학습할 문장 합치기
tokenizer.fit_on_texts(all_sentences)

In [71]:
word_index = tokenizer.word_index
print("단어 인덱스:", len(word_index))

단어 인덱스: 7699


In [72]:
train_que_tokens = tokenizer.texts_to_sequences(train_que)
train_ans_tokens = tokenizer.texts_to_sequences(train_ans)
valid_que_tokens = tokenizer.texts_to_sequences(valid_que)
valid_ans_tokens = tokenizer.texts_to_sequences(valid_ans)

In [74]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [78]:
BATCH_SIZE = 64
train_que_padded = pad_sequences(train_que_tokens, padding='post')
train_ans_padded = pad_sequences(train_ans_tokens, padding='post')
train_dataset = tf.data.Dataset.from_tensor_slices((train_que_padded, train_ans_padded)).batch(batch_size=BATCH_SIZE)

valid_que_padded = pad_sequences(valid_que_tokens, padding='post')
valid_ans_padded = pad_sequences(valid_ans_tokens, padding='post')
# tf.data.Dataset을 사용하여 배치 처리
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_que_padded, valid_ans_padded)).batch(batch_size=BATCH_SIZE)

In [79]:
train_dataset

<BatchDataset shapes: ((None, 28), (None, 37)), types: (tf.int32, tf.int32)>

In [80]:
valid_dataset

<BatchDataset shapes: ((None, 32), (None, 42)), types: (tf.int32, tf.int32)>

In [86]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

@tf.function
def train_step(src, tgt, model, optimizer):
    tgt_in = tgt[:, :-1]  # Decoder의 input
    gold = tgt[:, 1:]     # Decoder의 output과 비교하기 위해 right shift를 통해 생성한 최종 타겟

    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
            model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, enc_attns, dec_attns, dec_enc_attns

In [87]:
def train(model, train_dataset, valid_dataset, optimizer, num_epochs=10, patience=3):
    best_valid_loss = float('inf')  # 최저 검증 손실을 저장
    epochs_without_improvement = 0  # 연속적으로 검증 손실이 개선되지 않은 에폭 수
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        total_loss = 0
        for step, (src, tgt) in enumerate(train_dataset):
            # 훈련 step 수행
            loss, enc_attns, dec_attns, dec_enc_attns = train_step(src, tgt, model, optimizer)
            total_loss += loss

        avg_train_loss = total_loss / (step + 1)  # 평균 훈련 손실
        print(f"Train Loss: {avg_train_loss.numpy()}")

        # Validation loss 계산
        valid_loss = validate(model, valid_dataset)
        print(f"Validation Loss: {valid_loss.numpy()}")

        # Early stopping check
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs.")
            break

def validate(model, valid_dataset):
    total_valid_loss = 0
    for src, tgt in valid_dataset:
        tgt_in = tgt[:, :-1]  # Decoder의 input
        gold = tgt[:, 1:]     # Decoder의 output과 비교하기 위해 right shift를 통해 생성한 최종 타겟
        enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)

        predictions, enc_attns, dec_attns, dec_enc_attns = model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)
        valid_loss = loss_function(gold, predictions)
        total_valid_loss += valid_loss

    avg_valid_loss = total_valid_loss / len(valid_dataset)  # 평균 검증 손실
    return avg_valid_loss

In [88]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

In [89]:
train(transformer, train_dataset, valid_dataset, optimizer, num_epochs=10, patience=3)

Epoch 1/10
Train Loss: 4.631084442138672
Validation Loss: 4.357891082763672
Epoch 2/10
Train Loss: 3.2934648990631104
Validation Loss: 4.25074577331543
Epoch 3/10
Train Loss: 2.5585858821868896
Validation Loss: 4.278972625732422
Epoch 4/10
Train Loss: 1.9226468801498413
Validation Loss: 4.316685676574707
Epoch 5/10
Train Loss: 1.4241939783096313
Validation Loss: 4.511600971221924
Early stopping triggered after 5 epochs.
