In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import os
import json
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt

from preprocessing import *

In [2]:
'''전처리 결과'''
seed = 99
tf.random.set_seed(seed)

# 인코더의 입력값
index_inputs = np.load(open('data_in/train_inputs.npy','rb'), allow_pickle=True)
# 디코더의 입력값
index_outputs = np.load(open('data_in/train_outputs.npy','rb'), allow_pickle=True)
# 디코더의 타깃값
index_targets = np.load(open('data_in/train_targets.npy','rb'), allow_pickle=True)
# dictonary
prepro_configs = json.load(open('data_in/data_configs.json'))

'''
인코더 Input : 최대 길이만큼 <PAD>
디코더 Input : 시작을 알리는 <SOS>
디코더 타겟 : 끝을 알리는 <END>
'''



'\n인코더 Input : 최대 길이만큼 <PAD>\n디코더 Input : 시작을 알리는 <SOS>\n디코더 타겟 : 끝을 알리는 <END>\n'

In [3]:
BATCH_SIZE = 2  # set을 키워보자 -> NoneType 에러가 발생한다. - 메모리이슈
MAX_SEQUENCE =25
EPOCH =30
UNITS =1024
EMBEDDING_DIM = 256
VALIDATION_SPLIT = 0.1

char2idx = prepro_configs['char2idx']
idx2char = prepro_configs['idx2char']
std_index = prepro_configs['std_symbol']
end_index = prepro_configs['end_symbol']
vocab_size = prepro_configs['vocab_size']



In [4]:
'''
스케일 내적 어텐션
softmax(Q,k/크기)*value
Scaling을 해주는 이유는 query, value를 이용해 내적한 값이 벡터 차원이 커지면
학습이 잘 안될 수도 있기 때문에 벡터 크기에 따라 값이 반비례하도록 크기를 조정함


'''
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(size):
    mask= 1 - tf.linalg.band_part(tf.ones((size,size)),-1,0)
    return mask
def create_masks(inp, tar):
    # Encoder padding mask
    enc_padding_mask = create_padding_mask(inp)

    # Used in the 2nd attention block in the decoder.
    # This padding mask is used to mask the encoder outputs.
    dec_padding_mask = create_padding_mask(inp)

    # Used in the 1st attention block in the decoder.
    # It is used to pad and mask future tokens in the input received by 
    # the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

def scaled_dot_product_attention(q,k,v,mask=None):
    matmul_qk = tf.matmul(q,k,transpose_b = True)
    dk = tf.cast(tf.shape(k)[-1],tf.float32) # Type을 변환함
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    
    if mask is not None:
#         print(scaled_attention_logits)
#         print(mask)
        # scaled_attention_logits 이 상삼각행렬이 0인가?
        scaled_attention_logits += (mask * -1e9) # 마스킹 대상에  모두 작은 음수값을 넣는 것  매우 작아지는구나
#         print(scaled_attention_logits)
    
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1)
    output = tf.matmul(attention_weights,v)
    
    return output, attention_weights

    

In [5]:
# test
x=[[1.,2.,3.],[3.,4.,5],[5.,6.,7.]]
mask = create_look_ahead_mask(3)
scaled_dot_product_attention(x,x,x,mask)

(<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
 array([[1.      , 2.      , 3.      ],
        [2.999998, 3.999998, 4.999998],
        [5.      , 6.      , 7.      ]], dtype=float32)>,
 <tf.Tensor: shape=(3, 3), dtype=float32, numpy=
 array([[1.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [9.5992834e-07, 9.9999905e-01, 0.0000000e+00],
        [8.8453794e-19, 9.4049879e-10, 1.0000000e+00]], dtype=float32)>)

In [6]:
'''
멀티 헤드 어텐션 
'''
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self,**kargs):
        super(MultiHeadAttention,self).__init__()
        self.num_heads = kargs['num_heads']
        self.d_model = kargs['d_model']
        
        # d_model : Q,K,V 차원을 결정하는 Parameter
        # num_heads : 어텐션 head 수를 결정하는 parameter
        # assert는 아래 조건에 해당하지 않으면 에러를 발생시킴
        assert self.d_model % self.num_heads ==0 # 나머지가 없어야한다.
        
        self.depth = self.d_model // self.num_heads #각 Head에 입력될 벡터 차원 수
        
        self.wq = tf.keras.layers.Dense(kargs['d_model'])
        self.wk = tf.keras.layers.Dense(kargs['d_model'])
        self.wv = tf.keras.layers.Dense(kargs['d_model'])
        
        self.dense = tf.keras.layers.Dense(kargs['d_model'])
        
    def split_heads(self, x, batch_size): # 학습 중에 배치 크기가 바뀔 수 있음.
        # (batch, sequence, feature) → (batch, head, sequence, feature)
        x = tf.reshape(x, (batch_size,-1,self.num_heads,self.depth))
        return tf.transpose(x, perm = [0,2,1,3]) # Sequence, head 차원을 바꿈
    
    def call(self, v, k, q, mask=None):
        batch_size = tf.shape(q)[0]
        
        q= self.wq(q)
        k= self.wk(k)
        v = self.wv(v)
        
        q = self.split_heads(q,batch_size)
        k = self.split_heads(k,batch_size)
        v = self.split_heads(v,batch_size)
        
        scaled_attention, attention_weights = scaled_dot_product_attention(q,k,v,mask)
        
        scaled_attention = tf.transpose(scaled_attention, perm = [0,2,1,3]) # (batch, seq, feature) 차원
        # feature 
        concat_attention = tf.reshape(scaled_attention,(batch_size,-1, self.d_model))
        
        output = self.dense(concat_attention) # 멀티헤드 어텐션 벡터 
        
        return output, attention_weights
        



In [7]:
'''
Position-wise FFNN
'''
def point_wise_feed_forward_network(**kargs):
    return tf.keras.Sequential([
      tf.keras.layers.Dense(kargs['dff'], activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(kargs['d_model'])  # (batch_size, seq_len, d_model)
    ])



In [8]:
'''
Positional Encoding
'''

def get_angles(pos,i,d_model):
    angle_rates = 1 / np.power(10000, (2*i) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    # 포지션과 차원별로 각기 다른 값을 순차적으로 할당
    angle_rads = get_angles(np.arange(position)[:,np.newaxis],
                           np.arange(d_model)[np.newaxis,:],
                            d_model
                           )
    
    # 짝수차원에는 사인 함수, 홀수 차원에는 코사인 함수를 적용 
    angle_rads[:,0::2] = np.sin(angle_rads[:,0::2])
    angle_rads[:,1::2] = np.cos(angle_rads[:,1::2])

    pos_encoding = angle_rads[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype= tf.float32)

In [9]:
'''
인코더
'''
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, **kargs):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(**kargs)
        
        self.ffn = point_wise_feed_forward_network(**kargs)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)

        self.dropout1 = tf.keras.layers.Dropout(kargs['rate'])
        self.dropout2 = tf.keras.layers.Dropout(kargs['rate'])
        
    def call(self, x, mask=None):
        attn_output, _ = self.mha(x,x,x,mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)  # x를 그대로 더해줌 Residual Connection

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.layernorm2(out1 + ffn_output) 
        
        return out2
    
        



In [10]:

class Encoder(tf.keras.layers.Layer):
    def __init__(self, **kargs):
        super(Encoder, self).__init__()
        
        self.d_model = kargs['d_model']
        self.num_layers = kargs['num_layers']
        
        self.embedding = tf.keras.layers.Embedding(kargs['input_vocab_size'], self.d_model)
        self.pos_encoding = positional_encoding(kargs['maximum_position_encoding'], self.d_model)
        
        self.enc_layers = [EncoderLayer(**kargs) for _ in range(self.num_layers)]
        self.dropout = tf.keras.layers.Dropout(kargs['rate'])
        
    def call(self,x,mask=None):
        seq_len = tf.shape(x)[1] # 포지션 임베딩을 위함
        # word embedding은  입력 길이가 가변적이고 포지션 임베딩인 고정이기 때문
        
        x= self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model,tf.float32)) # 임베딩에 대한 스케일을 맞추는 것
        # 임베딩 차원의 제곱근 만큼 가중치 곱함
        x += self.pos_encoding[:,:seq_len,:]
        
        x = self.dropout(x)
        
        for i in range(self.num_layers):
            x = self.enc_layers[i](x,mask)
            
        return x
    
        

In [11]:
'''
Decoder
'''
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, **kargs):
        super(DecoderLayer,self).__init__()
        
        self.mha1 = MultiHeadAttention(**kargs)
        self.mha2 = MultiHeadAttention(**kargs)
        
        self.ffn = point_wise_feed_forward_network(**kargs)
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = tf.keras.layers.Dropout(kargs['rate'])
        self.dropout2 = tf.keras.layers.Dropout(kargs['rate'])
        self.dropout3 = tf.keras.layers.Dropout(kargs['rate'])
        
    def call(self, x, enc_output, look_ahead_mask, padding_mask):
        # 순방향 어텐션 마스크 look ahead mask 추가
        attn1, attn_weights_block1 = self.mha1(x,x,x,look_ahead_mask)
        attn1 = self.dropout1(attn1)
        out1 = self.layernorm1(attn1 + x) # residual connection
        
        # encoder-decoder attention  : v, k , q -: Value, Key, Query 순 
        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
        # decoder- self attention 결과 : Query, encoder  결과값 : value, key
        # softmax(query*key)* value : Encoder에서 집중해야하는 포인트에 가중치를 줘서 가져오는 것
        attn2 = self.dropout2(attn2)
        out2 = self.layernorm2(attn2 + out1)
        
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output)
        
        out3 = self.layernorm3(ffn_output + out2)
        return out3, attn_weights_block1, attn_weights_block2
    
        
        

In [12]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, **kargs):
        super(Decoder, self).__init__()
        
        self.d_model = kargs['d_model']
        self.num_layers = kargs['num_layers']
        
        self.embedding = tf.keras.layers.Embedding(kargs['target_vocab_size'], self.d_model)
        self.pos_encoding = positional_encoding(kargs['maximum_position_encoding'], self.d_model)
        
        self.dec_layers = [DecoderLayer(**kargs) for _ in range(self.num_layers)]
        self.dropout = tf.keras.layers.Dropout(kargs['rate'])
        
    def call(self,  x, enc_output, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}
        
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:,:seq_len,:]
        
        x = self.dropout(x)
        
        for i in range(self.num_layers):
            # look_ahead_mask에 이미 상삼각행렬이 -로 처리 되어 있음 -- Matrix 형태
            x, block1, block2 = self.dec_layers[i](x, enc_output, look_ahead_mask, padding_mask)
            attention_weights['decoder_layer{}_block1'.format(i+1)]= block1
            attention_weights['decoder_layer{}_block2'.format(i+1)]= block2
            
        return x, attention_weights
    
        
        

In [13]:
'''
Transformer Model
'''
class Transformer(tf.keras.Model):
    def __init__(self, **kargs):
        super(Transformer, self).__init__()
        self.end_token_idx = kargs['end_token_idx']
        
        self.encoder = Encoder(**kargs)
        self.decoder = Decoder(**kargs)
        
        self.final_layer = tf.keras.layers.Dense(kargs['target_vocab_size'])
        
    def call(self, x):
        inp, tar = x
        enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(inp,tar)
        enc_output = self.encoder(inp, enc_padding_mask)
        dec_output, _ = self.decoder(
                            tar, enc_output, look_ahead_mask, dec_padding_mask)
        
        final_output = self.final_layer(dec_output)
        return final_output
    
    def inference(self,x):
        inp = x
        tar = tf.expand_dims([STD_INDEX],0) # start
        
        enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(inp,tar)
        enc_output = self.encoder(inp, enc_padding_mask)
        
        predict_tokens = list()
        # 하나씩 반복해가면서 Inference
        for t in range(0,MAX_SEQUENCE):
            dec_output, _ = self.decoder(tar, enc_output, look_ahead_mask, dec_padding_mask)
            final_output = self.final_layer(dec_output)
            outputs = tf.argmax(final_output, -1).numpy()
            pred_token = outputs[0][-1]
            
            if pred_token== self.end_token_idx:
                break
            predict_tokens.append(pred_token)
            
            tar = tf.expand_dims([STD_INDEX] + predict_tokens,0)
            _, look_ahead_mask, dec_padding_mask = create_masks(inp, tar)
            
        return predict_tokens
        

In [14]:
word2idx = prepro_configs['char2idx']
end_index = prepro_configs['end_symbol']
model_name = 'transformer'
vocab_size = prepro_configs['vocab_size']
BATCH_SIZE = 256
MAX_SEQUENCE = 25
EPOCHS = 200
VALID_SPLIT = 0.1

kargs = {#'model_name': model_name,
        'num_layers': 2,
        'd_model': 512,
         'num_heads' : 8,
         'dff' :2048,
         'input_vocab_size': vocab_size,
         'target_vocab_size' : vocab_size,
         'maximum_position_encoding': MAX_SEQUENCE,
         'end_token_idx' : word2idx[end_index],
         'rate' : 0.1 #dropout        
        }

In [15]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

def loss(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

def accuracy(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    mask = tf.expand_dims(tf.cast(mask, dtype=pred.dtype), axis=-1)
    pred *= mask    
    acc = train_accuracy(real, pred)

    return tf.reduce_mean(acc)

In [16]:
model = Transformer(**kargs)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss=loss,
              metrics=[accuracy])

In [17]:
# overfitting을 막기 위한 ealrystop 추가
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=10)
# min_delta: the threshold that triggers the termination (acc should at least improve 0.0001)
# patience: no improvment epochs (patience = 1, 1번 이상 상승이 없으면 종료)

checkpoint_path = 'data_out/' + model_name + '/weights.h5'
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create path if exists
if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))
    

cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

data_out/transformer -- Folder already exists 



In [None]:
history = model.fit([index_inputs, index_outputs], index_targets, 
                    batch_size=BATCH_SIZE, epochs=EPOCHS,
                    validation_split=VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])

Train on 10640 samples, validate on 1183 samples
Epoch 1/200
Epoch 00001: val_accuracy improved from -inf to 0.81197, saving model to data_out/transformer/weights.h5
Epoch 2/200
Epoch 00002: val_accuracy improved from 0.81197 to 0.81986, saving model to data_out/transformer/weights.h5
Epoch 3/200
Epoch 00003: val_accuracy improved from 0.81986 to 0.82721, saving model to data_out/transformer/weights.h5
Epoch 4/200
Epoch 00004: val_accuracy improved from 0.82721 to 0.83150, saving model to data_out/transformer/weights.h5
Epoch 5/200
Epoch 00005: val_accuracy improved from 0.83150 to 0.83434, saving model to data_out/transformer/weights.h5
Epoch 6/200
Epoch 00006: val_accuracy improved from 0.83434 to 0.83633, saving model to data_out/transformer/weights.h5
Epoch 7/200
Epoch 00007: val_accuracy improved from 0.83633 to 0.83809, saving model to data_out/transformer/weights.h5
Epoch 8/200

In [None]:
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string], '')
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()
plot_graphs(history, 'accuracy')

In [None]:
DATA_OUT_PATH = './data_out/'
SAVE_FILE_NM = 'weights.h5'

model.load_weights(os.path.join(DATA_OUT_PATH, model_name, SAVE_FILE_NM))

In [None]:
char2idx = prepro_configs['char2idx']
idx2char = prepro_configs['idx2char']

In [None]:
text = "남자친구 승진 선물로 뭐가 좋을까?"
test_index_inputs, _ = enc_processing([text], char2idx)
outputs = model.inference(test_index_inputs)

print(' '.join([idx2char[str(o)] for o in outputs]))