# Chapter 6 게이트가 추가된 RNN
## 6.1 RNN의 문제점
### 6.1.3 기울기 소실과 기울기 폭발의 원인

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
import numpy as np
import matplotlib.pyplot as plt

N = 2 # 미니배치 크기
H = 3 # 은닉 상태 벡터의 차원 수
T = 20 # 시계열 데이터의 길이

dh = np.ones((N, H))
np.random.seed(3) # 재현할 수 있도록 난수의 시드 고정
Wh = np.random.randn(H, H)

norm_list = []
for t in range(T):
    dh = np.matmul(dh, Wh.T)
    norm = np.sqrt(np.sum(dh**2)) / N  #L2 Norm 
    norm_list.append(norm)

print(norm_list) # 기울기 폭발발


[np.float64(2.4684068094579303), np.float64(3.335704974161037), np.float64(4.783279375373183), np.float64(6.2795873320876145), np.float64(8.080776465019055), np.float64(10.25116303229294), np.float64(12.9360635066099), np.float64(16.276861327786712), np.float64(20.454829618345983), np.float64(25.688972842084684), np.float64(32.25315718048336), np.float64(40.48895641683869), np.float64(50.824407307019094), np.float64(63.79612654485427), np.float64(80.07737014308985), np.float64(100.51298922051251), np.float64(126.16331847536827), np.float64(158.3592064825883), np.float64(198.77107967611957), np.float64(249.495615421267)]


In [7]:
Wh = np.random.randn(H, H) * 0.5

norm_list = []
for t in range(T):
    dh = np.matmul(dh, Wh.T)
    norm = np.sqrt(np.sum(dh**2)) / N  #L2 Norm 
    norm_list.append(norm)

print(norm_list) # 기울기 소실

[np.float64(0.037021198376299184), np.float64(0.03811468532884431), np.float64(0.039845054893327424), np.float64(0.041623713695508026), np.float64(0.04347909376653374), np.float64(0.04539180055793864), np.float64(0.04736759915825979), np.float64(0.04941320832816529), np.float64(0.051535758553624605), np.float64(0.053741719239181376), np.float64(0.056036926830665926), np.float64(0.058426744788985935), np.float64(0.06091624712765684), np.float64(0.0635103678514805), np.float64(0.0662140121922354), np.float64(0.06903213620054491), np.float64(0.07196980322724954), np.float64(0.07503222445828138), np.float64(0.07822478881528006), np.float64(0.08155308593400877)]


### 6.1.4 기울기 폭발 대책

In [11]:
# 기울기 클리핑

dW1 = np.random.rand(3, 3) * 10
dW2 = np.random.rand(3, 3) * 10
grads = [dW1, dW2]
max_norm = 5.0

print("Before: ",grads)
def clip_grads(grads, max_norm):
    total_norm = 0
    for grad in grads:
        total_norm += np.sum(grad ** 2)
    total_norm = np.sqrt(total_norm)

    rate = max_norm / (total_norm + 1e-6) # max_norm = threshold(문턱값)
    if rate < 1: # total_norm > max_norm
        for grad in grads:
            grad *= rate

print("After: ",grads)

Before:  [array([[7.46746223, 6.91092922, 6.89180414],
       [3.73600124, 6.68134805, 3.39848664],
       [5.7279387 , 3.25807158, 4.45145049]]), array([[0.61528931, 2.42675422, 9.71602606],
       [2.30584204, 6.91477511, 6.50476858],
       [7.23939139, 4.75088611, 5.96663775]])]
After:  [array([[7.46746223, 6.91092922, 6.89180414],
       [3.73600124, 6.68134805, 3.39848664],
       [5.7279387 , 3.25807158, 4.45145049]]), array([[0.61528931, 2.42675422, 9.71602606],
       [2.30584204, 6.91477511, 6.50476858],
       [7.23939139, 4.75088611, 5.96663775]])]


## 6.3 LSTM 구현

In [12]:
from common.functions import sigmoid 
class LSTM:
    def __init__(self, Wx, Wh, b):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.cache = None
    
    def forward(self, x, h_prev, c_prev):
        Wx, Wh, b = self.params
        N, H = h_prev.shape

        A = np.matmul(x, Wx) + np.matmul(h_prev, Wh) + b

        # slice
        f = A[:, :H]
        g = A[:, H:2*H]
        i = A[:, 2*H:3*H]
        o = A[:, 3*H:]

        f = sigmoid(f)
        g = np.tanh(g)
        i = sigmoid(i)
        o = sigmoid(i)

        c_next = f * c_prev + g*i
        h_next = o * np.tanh(c_next)

        self.cache = (x, h_prev, c_prev, i, f, g, o, c_next)
        return h_next, c_next
    
    def backward(self, dh_next, dc_next):
        Wx, Wh, b = self.params
        x, h_prev, c_prev, i, f, g, o, c_next = self.cache

        tanh_c_next = np.tanh(c_next)

        ds = dc_next + (dh_next * o) * (1 - tanh_c_next ** 2)

        dc_prev = ds * f

        di = ds * g
        df = ds * c_prev
        do = dh_next * tanh_c_next
        dg = ds * i

        di *= i * (1 - i)
        df *= f * (1 - f)
        do *= o * (1 - o)
        dg *= (1 - g ** 2)

        dA = np.hstack((df, dg, di, do))

        dWh = np.dot(h_prev.T, dA)
        dWx = np.dot(x.T, dA)
        db = dA.sum(axis=0)

        self.grads[0][...] = dWx
        self.grads[1][...] = dWh
        self.grads[2][...] = db

        dx = np.dot(dA, Wx.T)
        dh_prev = np.dot(dA, Wh.T)

        return dx, dh_prev, dc_prev

### 6.3.1 Time LSTM 구현

In [13]:
class TimeLSTM:
    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.layers = None
        self.h, self.c = None, None
        self.dh = None
        self.stateful = stateful

    def forward(self, xs):
        Wx, Wh, b = self.params
        N, T, D = xs.shape
        H = Wh.shape[0]

        self.layers = []
        hs = np.empty((N, T, H), dtype='f')

        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='f')
        if not self.stateful or self.c is None:
            self.c = np.zeros((N, H), dypte='f')
        
        for t in range(T):
            layer = LSTM(*self.params)
            self.h, self.c = layer.forward(xs[:, t, :], self.h, self.c)
            hs[:, t, :] = self.h

            self.layers.append(layer)

        return hs
    
    def backward(self, dhs):
        Wx, Wh, b = self.params
        N, T, H = dhs.shape
        D = Wx.shape[0]

        dxs = np.empty((N, T, D), dtype='f')
        dh, dc = 0,0

        grads = [0,0,0]
        for t in reversed(range(T)):
            layer = self.layers[t]
            dx, dh, dc = layer.backward(dhs[:,t,:]+ dh, dc)
            dxs[:,t,:] = dx
            for i, grad in enumerate(layer.grads):
                grads[i] += grad
            
            for i, grad in enumerate(grads):
                self.grads[i][...] = grad
                self.dh = dh
                return dxs
            
    def set_state(self, h, c = None):
        self.h, self.c = h, c
    
    def reset_state(self):
        self.h, self.c = None, None


## 6.4 LSTM을 사용한 언어 모델

In [14]:
from common.time_layers import *
from common.base_model import BaseModel


class Rnnlm(BaseModel):
    def __init__(self, vocab_size=10000, wordvec_size=100, hidden_size=100):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        # 가중치 초기화
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        # 계층 생성
        self.layers = [
            TimeEmbedding(embed_W),
            TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True),
            TimeAffine(affine_W, affine_b)
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layer = self.layers[1]

        # 모든 가중치와 기울기를 리스트에 모은다.
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs):
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts):
        score = self.predict(xs)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout=1):
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        self.lstm_layer.reset_state()


In [15]:
# coding: utf-8
import sys
sys.path.append('..')
from common.optimizer import SGD
from common.trainer import RnnlmTrainer
from common.util import eval_perplexity
from dataset import ptb


# 하이퍼파라미터 설정
batch_size = 20
wordvec_size = 100
hidden_size = 100  # RNN의 은닉 상태 벡터의 원소 수
time_size = 35     # RNN을 펼치는 크기
lr = 20.0
max_epoch = 4
max_grad = 0.25

# 학습 데이터 읽기
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_test, _, _ = ptb.load_data('test')
vocab_size = len(word_to_id)
xs = corpus[:-1]
ts = corpus[1:]

# 모델 생성
model = Rnnlm(vocab_size, wordvec_size, hidden_size)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)

# 기울기 클리핑을 적용하여 학습
trainer.fit(xs, ts, max_epoch, batch_size, time_size, max_grad,
            eval_interval=20)
trainer.plot(ylim=(0, 500))

# 테스트 데이터로 평가
model.reset_state()
ppl_test = eval_perplexity(model, corpus_test)
print('테스트 퍼플렉서티: ', ppl_test)

# 매개변수 저장
model.save_params()


Downloading ptb.test.txt ... 
Done
| 에폭 1 |  반복 1 / 1327 | 시간 0[s] | 퍼플렉서티 10000.20
| 에폭 1 |  반복 21 / 1327 | 시간 4[s] | 퍼플렉서티 3117.37
| 에폭 1 |  반복 41 / 1327 | 시간 9[s] | 퍼플렉서티 1240.47
| 에폭 1 |  반복 61 / 1327 | 시간 14[s] | 퍼플렉서티 961.41
| 에폭 1 |  반복 81 / 1327 | 시간 19[s] | 퍼플렉서티 817.72
| 에폭 1 |  반복 101 / 1327 | 시간 24[s] | 퍼플렉서티 675.88
| 에폭 1 |  반복 121 / 1327 | 시간 30[s] | 퍼플렉서티 655.41
| 에폭 1 |  반복 141 / 1327 | 시간 36[s] | 퍼플렉서티 617.38
| 에폭 1 |  반복 161 / 1327 | 시간 42[s] | 퍼플렉서티 590.19
| 에폭 1 |  반복 181 / 1327 | 시간 48[s] | 퍼플렉서티 589.11
| 에폭 1 |  반복 201 / 1327 | 시간 53[s] | 퍼플렉서티 518.65
| 에폭 1 |  반복 221 / 1327 | 시간 59[s] | 퍼플렉서티 494.49
| 에폭 1 |  반복 241 / 1327 | 시간 65[s] | 퍼플렉서티 441.97
| 에폭 1 |  반복 261 / 1327 | 시간 71[s] | 퍼플렉서티 458.49
| 에폭 1 |  반복 281 / 1327 | 시간 76[s] | 퍼플렉서티 460.74


KeyboardInterrupt: 

## 6.5 RNNLM 추가 개선
### 6.5.4 개선된 RNNLM 구현

In [16]:
# coding: utf-8
import sys
sys.path.append('..')
from common.time_layers import *
from common.np import *  # import numpy as np
from common.base_model import BaseModel


class BetterRnnlm(BaseModel):
    '''
     LSTM 계층을 2개 사용하고 각 층에 드롭아웃을 적용한 모델이다.
     아래 [1]에서 제안한 모델을 기초로 하였고, [2]와 [3]의 가중치 공유(weight tying)를 적용했다.

     [1] Recurrent Neural Network Regularization (https://arxiv.org/abs/1409.2329)
     [2] Using the Output Embedding to Improve Language Models (https://arxiv.org/abs/1608.05859)
     [3] Tying Word Vectors and Word Classifiers (https://arxiv.org/pdf/1611.01462.pdf)
    '''
    def __init__(self, vocab_size=10000, wordvec_size=650,
                 hidden_size=650, dropout_ratio=0.5):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx1 = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh1 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b1 = np.zeros(4 * H).astype('f')
        lstm_Wx2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_Wh2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b2 = np.zeros(4 * H).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.layers = [
            TimeEmbedding(embed_W),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True),
            TimeDropout(dropout_ratio),
            TimeAffine(embed_W.T, affine_b)  # weight tying!!
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layers = [self.layers[2], self.layers[4]]
        self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs, train_flg=False):
        for layer in self.drop_layers:
            layer.train_flg = train_flg

        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts, train_flg=True):
        score = self.predict(xs, train_flg)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout=1):
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        for layer in self.lstm_layers:
            layer.reset_state()
