# **[GD-12] 멋진 챗봇 만들기**

# **Step 0. 라이브러리 불러오기**
---

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import gensim

import re
import os
import random

# **Step 1. 데이터 불러오기**
---

In [2]:
path_to_data = '/content/drive/MyDrive/AIFFEL/[GD-12]/ChatbotData.csv'
data = pd.read_csv(path_to_data)
data.head(50)

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0
5,SD카드 망가졌어,다시 새로 사는 게 마음 편해요.,0
6,SD카드 안돼,다시 새로 사는 게 마음 편해요.,0
7,SNS 맞팔 왜 안하지ㅠㅠ,잘 모르고 있을 수도 있어요.,0
8,SNS 시간낭비인 거 아는데 매일 하는 중,시간을 정하고 해보세요.,0
9,SNS 시간낭비인데 자꾸 보게됨,시간을 정하고 해보세요.,0


In [3]:
print('전체 데이터 수 : ',  len(data))

전체 데이터 수 :  11823


In [4]:
questions = data['Q']
answers = data['A']

print('questions의 샘플 수 : ', len(questions))
print('answers의 샘플 수 : ', len(answers))

questions의 샘플 수 :  11823
answers의 샘플 수 :  11823


# **Step 2. 데이터 정제**
---

## **2.1 중복 제거**

In [5]:
data_zip = zip(questions, answers)
data_zip

<zip at 0x7f8f886adc80>

In [6]:
set_data = set(data_zip)

In [7]:
len(set_data)

11750

In [8]:
questions, answers = zip(*set_data)

In [9]:
questions[0]

'왔다갔다'

In [10]:
answers[0]

'허전함에 이랬다 저랬다 해요.'

## **2.2 정규식을 활용한 데이터 정제**

In [11]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()     # 영어를 모두 소문자로 변환
    sentence = re.sub(r"[^a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣0-9?.!,]+", " ", sentence)   # 영어, 한글, ?.!, 제외한 것들을 공백으로 변경
    sentence = sentence.strip()

    return sentence

In [12]:
print(preprocess_sentence(questions[0]))
print(preprocess_sentence(answers[0]))

왔다갔다
허전함에 이랬다 저랬다 해요.


# **Step 3. 데이터 토큰화**
---
토큰화에는 KoNLPy의 `mecab` 클래스를 사용해보자.

1. 소스 문장 데이터와 타겟 문장 데이터를 입력으로 받습니다.
2. 데이터를 앞서 정의한 preprocess_sentence() 함수로 정제하고, 토큰화합니다.
3. 토큰화는 전달받은 토크나이즈 함수를 사용합니다. 이번엔 mecab.morphs 함수를 전달하시면 됩니다.
4. 토큰의 개수가 일정 길이 이상인 문장은 데이터에서 제외합니다.
5. 중복되는 문장은 데이터에서 제외합니다. 소스 : 타겟 쌍을 비교하지 않고 소스는 소스대로 타겟은 타겟대로 검사합니다. 중복 쌍이 흐트러지지 않도록 유의하세요!

In [13]:
from pandas._libs.tslibs.timedeltas import parse_timedelta_unit
input_corpus = []
target_corpus = []

for i, t in zip(questions, answers):
    tmp_input = preprocess_sentence(i)
    tmp_target = preprocess_sentence(t)

    if len(tmp_input) <= 40 and len(tmp_target) <= 40:
        input_corpus.append(tmp_input)
        target_corpus.append(tmp_target)

print('소스 문장 데이터 수 : ', len(input_corpus))
print('타겟 문장 데이터 수 : ', len(target_corpus))

소스 문장 데이터 수 :  11644
타겟 문장 데이터 수 :  11644


In [14]:
print('정제된 소스 문장 : ', input_corpus[50])
print('정제된 타겟 문장 : ', target_corpus[50])

정제된 소스 문장 :  헤어진건가이거?
정제된 타겟 문장 :  애매하네요.


In [15]:
%%bash
apt-get update
apt-get install g++ openjdk-8-jdk python-dev python3-dev
pip3 install JPype1
pip3 install konlpy

Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [696 B]
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:12 http://security.ubuntu.com/ubuntu bionic-securi

In [16]:
%env JAVA_HOME "/usr/lib/jvm/java-8-openjdk-amd64"

env: JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64"


In [17]:
%%bash
bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
pip3 install /tmp/mecab-python-0.996

Installing automake (A dependency for mecab-ko)
Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:6 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:14 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:15 http://ppa.l

debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 5.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1381k  100 1381k    0     0  3370k      0 --:--:-- --:--:-- --:--:-- 3370k
./configure: line 7378: /usr/bin/file: No such file or directory
In file in

In [18]:
from konlpy.tag import Mecab

In [19]:
def tokenize(corpus, vocab_size=50000):
    mecab = Mecab()
    morph = [" ".join(mecab.morphs(sen)) for sen in corpus]
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='',
                                                     num_words=vocab_size)
    tokenizer.fit_on_texts(morph)

    tensor = tokenizer.texts_to_sequences(morph)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

        
    return tensor, tokenizer, morph

In [20]:
_, _, tok_input = tokenize(input_corpus)
_, _, tok_target = tokenize(target_corpus)

In [21]:
tok_input[:10]

['왔 다 갔 다',
 '학식 이제 물린다',
 '용기 내 서 새해 인사 했 네',
 '막말 쩔 어',
 '낭만 이 라고 는 없 어 가지 구',
 '대외 활동 하 다 만났 어',
 '이별 이야기 는 신중 해야 하 는 걸 아 니까 오늘 도 고민 합니다',
 '수능 어떻게 해',
 '썸 타 는 거 티 내 고 싶 진 않 아 .',
 '시어머니 가 차별 해']

In [22]:
tok_target[:10]

['허전 함 에 이랬 다 저랬 다 해요 .',
 '맛난 걸 드셔 보 세요 .',
 '이제 연락 하 지 마세요 .',
 '생각 없이 말 하 는 사람 이 옆 에 있 나 봐요 . 무시 하 세요 .',
 '낭만 적 인 거 좋 아 하 시 는구나 !',
 '꿩 먹 고 알 먹 고 네요 .',
 '저 는 비밀 을 보장 해요 . 답답 하 다면 저 에게 말 해 보 세요 .',
 '잘 볼 수 있 을 거 예요 .',
 '누구 에게 요 ?',
 '진실 되게 대하 면 시어머니 도 알아주 실 거 예요 .']

# **Step 4. Augmentation**
---
`Lexical Substitution`을 실제로 적용해보자.

한국어로 사전 훈련된 Embedding 모델을 다운로드한 후, 이 모델을 활용해 데이터를 Augmentation해보자.

In [23]:
w2v_path = '/content/drive/MyDrive/AIFFEL/[GD-12]/ko.bin'

In [24]:
from gensim.models.word2vec import Word2Vec

In [25]:
w2v_model = Word2Vec.load(w2v_path)

단어 벡터로 변환해보자.

In [26]:
w2v_model.wv['오늘']

array([-1.1059036 , -0.98584527,  1.8960359 , -0.34106794,  0.38498962,
       -0.56333584,  0.39262745, -0.34532717,  0.73293006,  1.2918564 ,
       -0.99007374, -0.23255022,  0.11140338,  0.9468523 ,  0.11374847,
       -0.3785802 ,  0.48704836,  2.5685015 ,  1.2415222 , -1.1356351 ,
       -1.0993221 , -0.3196903 ,  0.8298067 , -1.1164417 ,  0.19156536,
        0.3006387 , -0.6183675 , -0.762924  , -0.23708159, -1.502345  ,
        0.6342514 , -0.64432037,  0.8408901 , -0.67482626, -0.86358494,
        0.5716444 , -0.12138291,  1.6880897 ,  1.318415  ,  1.5844064 ,
       -1.3829565 , -0.1477219 ,  0.7206054 , -0.4495997 , -0.05504679,
       -0.21900962,  0.01089218, -1.5404348 ,  0.02255861,  0.49219918,
       -0.13918048,  0.40318212,  0.73132664, -0.24907936,  0.7307537 ,
        0.37259933,  1.0505751 , -0.6467893 , -0.0154875 , -1.9566497 ,
        0.19638617,  0.07844608,  1.2504076 ,  1.4382341 ,  0.16306905,
        1.0717745 ,  0.912932  , -1.1997408 , -0.9928817 , -0.77

유사한 단어를 출력해보자.

In [27]:
w2v_model.wv.similar_by_word("오늘")

[('아침', 0.5024495124816895),
 ('저물', 0.455463707447052),
 ('내일', 0.4449270963668823),
 ('지금', 0.4403955936431885),
 ('새벽', 0.4366644620895386),
 ('토요일', 0.40253746509552),
 ('저녁', 0.3899332880973816),
 ('어제', 0.38877636194229126),
 ('매주', 0.3823928236961365),
 ('주말', 0.3814442753791809)]

In [28]:
# Lexical Substitution 구현하기
def lexical_sub(sentence, word2vec):
    res = ""
    toks = sentence.split()

    try:
        _from = random.choice(toks)
        _to = word2vec.most_similar(_from)[0][0]
        
    except:   # 단어장에 없는 단어
        return None

    for tok in toks:
        if tok is _from: res += _to + " "
        else: res += tok + " "

    return res

In [29]:
from tqdm import tqdm_notebook

def augment_corpus(src_corpus, tgt_corpus, wv):
    new_src_corpus = []
    new_tgt_corpus = []
    corpus_size = len(src_corpus)
    
    for i in tqdm_notebook(range(corpus_size)):
        old_src = src_corpus[i]
        old_tgt = tgt_corpus[i]

        new_src = lexical_sub(old_src, wv)
        new_tgt = lexical_sub(old_tgt, wv)
        

        if new_src: 
            new_src_corpus.append(new_src)
            new_tgt_corpus.append(old_tgt)
            
        if new_tgt: 
            new_src_corpus.append(old_src)
            new_tgt_corpus.append(new_tgt)

    return new_src_corpus, new_tgt_corpus

In [30]:
new_tok_input, new_tok_target = augment_corpus(tok_input, tok_target, w2v_model)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/11644 [00:00<?, ?it/s]

  


In [31]:
print('augmentation된 소스 문장 수 : ', len(new_tok_input))
print('augmentation된 타겟 문장 수 : ', len(new_tok_target))

augmentation된 소스 문장 수 :  20135
augmentation된 타겟 문장 수 :  20135


In [32]:
tok_input_augmented = tok_input + new_tok_input
tok_target_augmented = tok_target + new_tok_target

In [33]:
print('augmentation하여 추가한 소스 문장 수 : ', len(tok_input_augmented))
print('augmentation하여 추가한 타겟 문장 수 : ', len(tok_target_augmented))

augmentation하여 추가한 소스 문장 수 :  31779
augmentation하여 추가한 타겟 문장 수 :  31779


In [34]:
def build_corpus(que_data, ans_data, max_len=40):
    questions = [preprocess_sentence(q) for q in que_data]
    answers = [preprocess_sentence(a) for a in ans_data]
    
    que_tensor, que_tokenizer, _ = tokenize(questions)
    ans_tensor, ans_tokenizer, _ = tokenize(answers)
    
    que_corpus = []
    ans_corpus = []

    for q, a in zip(que_tensor, ans_tensor):
        if (len(q) <= max_len) or (len(a) <= max_len):
            que_corpus.append(q)
            ans_corpus.append(a)
            
    return que_corpus, ans_corpus, que_tokenizer, ans_tokenizer

In [35]:
que_corpus, ans_corpus, que_tokenizer, ans_tokenizer = build_corpus(tok_input_augmented, tok_target_augmented)

In [36]:
print('questions tokeinzer size : ', len(que_corpus))
print('answers tokeinzer size : ', len(ans_corpus))

questions tokeinzer size :  31779
answers tokeinzer size :  31779


In [37]:
from sklearn.model_selection import train_test_split

enc_train, enc_val, dec_train, dec_val = \
train_test_split(que_corpus, ans_corpus, test_size=0.01)

print("enc_train :", len(enc_train), "enc_val :", len(enc_val))
print("dec_train :", len(dec_train), "dec_val :",len(dec_val))

enc_train : 31461 enc_val : 318
dec_train : 31461 dec_val : 318


# **Step 6. Transformer 모델 설계**
---

## **6.1 Positional Encoding**

In [38]:
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, (2*(i//2)) / np.float32(d_model))

    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])

    return sinusoid_table

## **6.2 마스크 생성**

In [39]:
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_lookahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_enc_mask = generate_padding_mask(src)

    dec_lookahead_mask = generate_lookahead_mask(tgt.shape[1])
    dec_tgt_padding_mask = generate_padding_mask(tgt)
    dec_mask = tf.maximum(dec_tgt_padding_mask, dec_lookahead_mask)

    return enc_mask, dec_enc_mask, dec_mask

## **6.3 Multi-head Attention**

In [40]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        self.depth = d_model // self.num_heads
        
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        
        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9)  

        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)

        return out, attentions
        

    def split_heads(self, x):
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (bsz, -1, self.d_model))

        return combined_x

    
    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
        
        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)
                        
        out = self.combine_heads(out)
        out = self.linear(out)
            
        return out, attention_weights

## **6.4 Position-wise Feed Forward Network**

In [41]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        self.fc1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.fc2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.fc1(x)
        out = self.fc2(out)
            
        return out

## **6.5 Encoder Layer**

In [42]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        '''
        Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.do(out)
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual
        
        return out, enc_attn

## **6.6 Decoder Layer**

In [43]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
    
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        '''
        Masked Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.do(out)
        out += residual

        '''
        Multi-Head Attention
        '''
        residual = out
        out = self.norm_2(out)
        # Q, K, V 순서에 주의하세요!
        out, dec_enc_attn = self.enc_dec_attn(Q=out, K=enc_out, V=enc_out, mask=dec_enc_mask)
        out = self.do(out)
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, dec_attn, dec_enc_attn

## **6.7 Encoder**

In [44]:
class Encoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                        for _ in range(n_layers)]
    
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        out = x
    
        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)
        
        return out, enc_attns

## **6.8 Decoder**

In [45]:
class Decoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                            for _ in range(n_layers)]
                            
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        out = x
    
        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
            self.dec_layers[i](out, enc_out, dec_enc_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns

### **6.9 Transformer 전체 모델 조립**

In [46]:
class Transformer(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    src_vocab_size,
                    tgt_vocab_size,
                    pos_len,
                    dropout=0.2,
                    shared_fc=True,
                    shared_emb=False):
        super(Transformer, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)

        if shared_emb:
            self.enc_emb = self.dec_emb = \
            tf.keras.layers.Embedding(src_vocab_size, d_model)
        else:
            self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
            self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.do = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared_fc = shared_fc

        if shared_fc:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]

        out = emb(x)

        if self.shared_fc: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.do(out)

        return out

        
    def call(self, enc_in, dec_in, enc_mask, dec_enc_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
        
        dec_out, dec_attns, dec_enc_attns = \
        self.decoder(dec_in, enc_out, dec_enc_mask, dec_mask)
        
        logits = self.fc(dec_out)
        
        return logits, enc_attns, dec_attns, dec_enc_attns

## **6.10 모델 인스턴스 생성**

In [47]:
VOCAB_SIZE = 20000

transformer = Transformer(
    n_layers=2,
    d_model=512,
    n_heads=8,
    d_ff=2048,
    src_vocab_size=VOCAB_SIZE,
    tgt_vocab_size=VOCAB_SIZE,
    pos_len=200,
    dropout=0.3,
    shared_fc=True,
    shared_emb=True)

d_model = 512

# **Step 7. 모델 학습하기**
---

## **7.1 Learning Rate Scheduler**

In [48]:
# Learning Rate Scheduler 구현
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()
        
        self.d_model = d_model
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

## **7.2 Learning Rage & Optimizer**

In [49]:
# Learning Rate 인스턴스 선언 & Optimizer 구현
learning_rate = LearningRateScheduler(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                        beta_1=0.9,
                                        beta_2=0.98, 
                                        epsilon=1e-9)

## **7.3 Loss Function 정의**

In [50]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

## **7.4 Train Step 정의**

In [51]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    tgt_in = tgt[:, :-1]  
    gold = tgt[:, 1:]     

    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
        model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, enc_attns, dec_attns, dec_enc_attns

## **7.5 모델 훈련하기**

In [None]:
# from tqdm import tqdm_notebook 

# BATCH_SIZE = 64
# EPOCHS = 30

# for epoch in range(EPOCHS):
#     total_loss = 0

#     idx_list = list(range(0, enc_train.shape[0], BATCH_SIZE))
#     random.shuffle(idx_list)
#     t = tqdm_notebook(idx_list)

#     for (batch, idx) in enumerate(t):
#         batch_loss, enc_attns, dec_attns, dec_enc_attns = \
#         train_step(enc_train[idx:idx+BATCH_SIZE],
#                     dec_train[idx:idx+BATCH_SIZE],
#                     transformer,
#                     optimizer)

#         total_loss += batch_loss

#         t.set_description_str('Epoch %2d' % (epoch + 1))
#         t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (batch + 1)))

# **🌞 회고**
---

## **1) 결과**
- 학습까지 시키고 싶었으나,,, 아쉽게도 학습을 시키지 못하였다.
- 어떤 코드가 문제인지, 다른 분의 코드를 참고했음에도 해결하지 못하였다ㅠ_ㅠ...

## **2) 후기**
- 이번 프로젝트를 통해 번역과 챗봇이 유사하다는 것을 알게 되었다.
- 번역을 구현시키는 코드로 챗봇의 결과까지 낼 수 있었으면 좋았을 텐데...너무 아쉽다..!
- 다시 한번 찬찬히 해서 결과물을 내봐야겠다.