In [1]:
import numpy as np
import matplotlib.pyplot as plt
# 乱数シードを指定
np.random.seed(seed=0)

In [2]:
class Affine:
    """Fully connected layer computing ``np.dot(x, W) + b`` for 2-D input.

    Attributes:
        params: ``[W, b]``, the weight matrix and bias used by the layer.
        grads: Gradient buffers shaped like ``W`` and ``b``; ``backward``
            overwrites them in place.
        x: Input stored by the most recent ``forward`` call.
    """

    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(p) for p in (W, b)]
        self.x = None

    def forward(self, x):
        """Return ``np.dot(x, W) + b`` and remember ``x`` for ``backward``."""
        # Unpacking into two names rejects any input that is not 2-D.
        _rows, _cols = x.shape
        weight, bias = self.params

        result = np.dot(x, weight) + bias
        self.x = x
        return result

    def backward(self, dout):
        """Store the gradients for ``W`` and ``b``; return the gradient for ``x``.

        Args:
            dout: Upstream gradient with respect to the output of ``forward``.
        """
        cached_x = self.x
        _rows, _cols = cached_x.shape
        weight, _bias = self.params

        grad_b = np.sum(dout, axis=0)
        grad_W = np.dot(cached_x.T, dout)
        grad_x = np.dot(dout, weight.T)

        # Write into the existing buffers so outside references stay valid.
        self.grads[0][...] = grad_W
        self.grads[1][...] = grad_b

        return grad_x

In [3]:
class SGD:
    """Plain stochastic gradient descent: ``param -= lr * grad``."""

    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        """Apply one descent step to every entry of ``params``.

        Args:
            params: List of parameters, updated through ``-=``.
            grads: List of gradients, index-aligned with ``params``.
        """
        for idx, _ in enumerate(params):
            params[idx] -= self.lr * grads[idx]
            
class Adam:
    """Adam optimizer keeping running averages of gradients and squared gradients.

    Attributes:
        lr: Base learning rate.
        beta1: Decay factor for the running average of gradients (``m``).
        beta2: Decay factor for the running average of squared gradients (``v``).
        iter: Number of ``update`` calls made so far.
        m, v: Per-parameter running averages, created on the first ``update``.
    """

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None

    def update(self, params, grads):
        """Apply one Adam step to every entry of ``params``.

        Args:
            params: List of parameter arrays, updated through ``-=``.
            grads: List of gradients, index-aligned with ``params``.
        """
        # Lazily allocate the moment buffers once the parameter shapes are known.
        if self.m is None:
            self.m = [np.zeros_like(p) for p in params]
            self.v = [np.zeros_like(p) for p in params]

        self.iter += 1
        # Bias correction for both moments is folded into the step size.
        correction1 = 1.0 - self.beta1**self.iter
        correction2 = 1.0 - self.beta2**self.iter
        lr_t = self.lr * np.sqrt(correction2) / correction1

        for idx in range(len(params)):
            grad = grads[idx]
            first, second = self.m[idx], self.v[idx]
            # In-place moving-average updates of the two moment buffers.
            first += (1 - self.beta1) * (grad - first)
            second += (1 - self.beta2) * (grad**2 - second)

            # The small constant keeps the division finite when `second` is zero.
            params[idx] -= lr_t * first / (np.sqrt(second) + 1e-7)

In [4]:
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, applied element-wise by NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def softmax(x):
    """Numerically stable softmax over the last axis of ``x``.

    The previous version normalized only 1-D and 2-D inputs and silently
    returned any other input unchanged (i.e. not a probability distribution).
    This version normalizes over the last axis for every input with at least
    one dimension; results for 1-D and 2-D inputs are the same as before.

    Args:
        x: NumPy array of scores. The input array is not modified.

    Returns:
        A new array of the same shape whose entries along the last axis are
        non-negative and sum to 1. A 0-d input is returned unchanged, as it
        was before.
    """
    if x.ndim == 0:
        # A scalar has no axis to normalize over; keep the old pass-through.
        return x

    # Subtracting the per-row maximum keeps exp() from overflowing and does
    # not change the result of the normalization.
    shifted = x - x.max(axis=-1, keepdims=True)
    exps = np.exp(shifted)
    exps /= exps.sum(axis=-1, keepdims=True)
    return exps

In [5]:
def clip_grads(grads, max_norm):
    """Rescale ``grads`` in place so their combined L2 norm is at most ``max_norm``.

    The norm is taken over all arrays in ``grads`` together. Nothing is
    changed when the norm is already small enough.

    Args:
        grads: Iterable of gradient arrays; scaled in place when clipping applies.
        max_norm: Upper bound for the combined L2 norm.
    """
    squared_total = sum(np.sum(g ** 2) for g in grads)
    total_norm = np.sqrt(squared_total)

    # The small constant avoids dividing by zero when every gradient is zero.
    scale = max_norm / (total_norm + 1e-6)
    if scale < 1:
        for g in grads:
            g *= scale

In [6]:
class Embedding:
    """Lookup layer that returns the rows of a weight table selected by index.

    Attributes:
        params: ``[W]``, the lookup table.
        grads: ``[dW]``, a gradient buffer shaped like ``W``.
        idx: Indices passed to the most recent ``forward`` call.
    """

    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        """Return ``W[idx]`` and remember ``idx`` for ``backward``."""
        (table,) = self.params
        self.idx = idx
        return table[idx]

    def backward(self, dout):
        """Accumulate ``dout`` into the rows of ``dW`` chosen in ``forward``.

        The gradient buffer is cleared first. Always returns ``None``.
        """
        (grad_table,) = self.grads
        grad_table[...] = 0
        # np.add.at sums contributions when the same index appears repeatedly;
        # a plain fancy-index assignment would keep only one of them.
        np.add.at(grad_table, self.idx, dout)
        return None

In [7]:
# Toy lookup table with five rows of two values each.
w = np.array([
    [0.5, 1.2],
    [-1.4, 0.7],
    [2.5, -0.9],
    [5.6, 9.8],
    [-2.3, -0.8],
])
embedder_example = Embedding(w)

# Looking up indices 3, 0, 4, 1 returns the matching rows of `w` in that order.
minibatch_example = np.array([3, 0, 4, 1])
embedder_example.forward(minibatch_example)

array([[ 5.6,  9.8],
       [ 0.5,  1.2],
       [-2.3, -0.8],
       [-1.4,  0.7]])

In [8]:
class TimeEmbedding:
    """Runs one ``Embedding`` lookup per column of a 2-D index array.

    Attributes:
        params: ``[W]``, the shared lookup table.
        grads: ``[dW]``, a gradient buffer shaped like ``W``.
        layers: The per-step ``Embedding`` layers built by the last ``forward``.
        W: The same table that is stored in ``params``.
    """

    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.layers = None
        self.W = W

    def forward(self, xs):
        """Look up every index of ``xs`` (shape ``(N, T)``).

        Returns:
            A float32 array of shape ``(N, T, D)``, where ``D`` is the second
            dimension of ``W``.
        """
        batch, steps = xs.shape
        _rows, dim = self.W.shape

        out = np.empty((batch, steps, dim), dtype='f')
        self.layers = []

        for step in range(steps):
            embed = Embedding(self.W)
            out[:, step, :] = embed.forward(xs[:, step])
            self.layers.append(embed)

        return out

    def backward(self, dout):
        """Sum the per-step table gradients into ``grads[0]``; returns ``None``.

        Args:
            dout: Upstream gradient of shape ``(N, T, D)``.
        """
        _batch, steps, _dim = dout.shape

        summed = 0
        for step in range(steps):
            embed = self.layers[step]
            embed.backward(dout[:, step, :])
            # Each step layer owns its own buffer, so the sum is built here.
            summed += embed.grads[0]

        self.grads[0][...] = summed
        return None

In [9]:
class TimeAffine:
    """Affine layer applied to 3-D input by flattening the first two axes.

    Attributes:
        params: ``[W, b]``, the weight matrix and bias.
        grads: Gradient buffers shaped like ``W`` and ``b``.
        x: Input stored by the most recent ``forward`` call.
    """

    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(p) for p in (W, b)]
        self.x = None

    def forward(self, x):
        """Apply ``np.dot(., W) + b`` to every ``(n, t)`` slice of ``x``.

        Args:
            x: Array of shape ``(N, T, D)``.

        Returns:
            Array of shape ``(N, T, -1)``; the last axis follows ``W``.
        """
        batch, steps, _dim = x.shape
        weight, bias = self.params

        # Collapse the first two axes so one matrix product handles all slices.
        flat_x = x.reshape(batch * steps, -1)
        flat_out = np.dot(flat_x, weight) + bias
        self.x = x
        return flat_out.reshape(batch, steps, -1)

    def backward(self, dout):
        """Store the gradients for ``W`` and ``b``; return the gradient for ``x``.

        Args:
            dout: Upstream gradient for the output of ``forward``.

        Returns:
            Gradient with the same shape as the cached input.
        """
        cached_x = self.x
        batch, steps, _dim = cached_x.shape
        weight, _bias = self.params

        flat_dout = dout.reshape(batch * steps, -1)
        flat_x = cached_x.reshape(batch * steps, -1)

        grad_b = np.sum(flat_dout, axis=0)
        grad_W = np.dot(flat_x.T, flat_dout)
        grad_x = np.dot(flat_dout, weight.T).reshape(*cached_x.shape)

        self.grads[0][...] = grad_W
        self.grads[1][...] = grad_b

        return grad_x

In [10]:
class TimeSoftmaxWithLoss:
    """Softmax followed by mean cross-entropy over every (batch, time) position.

    Attributes:
        params, grads: Empty lists; the layer has no trainable parameters.
        cache: ``(ts, ys, (N, T, V))`` stored by ``forward`` for ``backward``,
            where ``ts`` are the flattened labels and ``ys`` the flattened
            softmax output.
    """

    def __init__(self):
        self.params, self.grads = [], []
        self.cache = None

    def forward(self, xs, ts):
        """Return the mean cross-entropy loss.

        Args:
            xs: Scores of shape ``(N, T, V)``.
            ts: Labels, either class indices of shape ``(N, T)`` or one-hot
                vectors of shape ``(N, T, V)``.

        Returns:
            The loss averaged over all ``N * T`` positions.
        """
        N, T, V = xs.shape

        if ts.ndim == 3:  # labels were given as one-hot vectors
            ts = ts.argmax(axis=2)

        # Merge the batch and time axes into one.
        xs = xs.reshape(N * T, V)
        ts = ts.reshape(N * T)

        ys = softmax(xs)
        # Pick the probability assigned to the correct class at each position.
        ls = np.log(ys[np.arange(N * T), ts])
        loss = -np.sum(ls)
        loss /= len(ts)

        self.cache = (ts, ys, (N, T, V))
        return loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to ``xs``.

        Args:
            dout: Upstream gradient applied as a multiplier (default 1).

        Returns:
            Array of shape ``(N, T, V)``.
        """
        ts, ys, (N, T, V) = self.cache

        # Work on a copy: modifying ``ys`` in place would corrupt the cached
        # softmax output, so a second ``backward`` call (or any later read of
        # the cache) would see wrong values.
        dx = ys.copy()
        dx[np.arange(N * T), ts] -= 1
        dx *= dout
        dx /= len(ys)

        dx = dx.reshape((N, T, V))

        return dx

In [11]:
import sys
import os
import urllib.request
import pickle

# Base URL that the dataset text files are downloaded from.
url_base = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/'

# Remote text file name per data split (only 'train' is registered here).
key_file = {'train': 'ptb.train.txt'}

# Local .npy file name used to cache the id-encoded corpus of each split.
save_file = {'train': 'ptb.train.npy'}

# Local pickle file that caches the (word_to_id, id_to_word) pair.
vocab_file = 'ptb.vocab.pkl'

def _download(file_name):
    """Download ``url_base + file_name`` to ``./<file_name>`` unless it exists.

    The data is first written to a temporary ``.part`` file and only renamed
    to its final name after the transfer finished, so an interrupted download
    can never be mistaken for a complete file on the next call.

    Args:
        file_name: Name of the file, both on the server and locally.

    Raises:
        urllib.error.URLError: If the fallback download attempt fails as well.
    """
    import shutil
    import ssl
    from urllib.error import URLError

    file_path = './' + file_name
    if os.path.exists(file_path):
        return

    print('Downloading ' + file_name + ' ... ')

    url = url_base + file_name
    tmp_path = file_path + '.part'
    try:
        try:
            urllib.request.urlretrieve(url, tmp_path)
        except URLError:
            # NOTE(security): best-effort fallback that skips TLS certificate
            # verification. The unverified context is used for this single
            # request only, instead of replacing the process-wide default
            # HTTPS context as the previous version did.
            insecure_context = ssl._create_unverified_context()
            with urllib.request.urlopen(url, context=insecure_context) as response, \
                    open(tmp_path, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)
        # Atomic rename: the final path appears only with complete content.
        os.replace(tmp_path, file_path)
    finally:
        # Remove a leftover partial file after a failed transfer.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print('Done')

def load_vocab():
    """Return the vocabulary mappings, building and caching them if needed.

    If the pickle cache ``./<vocab_file>`` exists it is loaded. Otherwise the
    'train' text file is downloaded (if missing), every newline is replaced
    by ``<eos>``, the text is split on whitespace, and ids are assigned in
    order of first appearance. The result is then written to the cache.

    Returns:
        ``(word_to_id, id_to_word)``: two dicts mapping word -> id and
        id -> word.
    """
    vocab_path = './' + vocab_file

    if os.path.exists(vocab_path):
        # NOTE(security): pickle.load can execute arbitrary code. This is
        # acceptable only because the file is a cache written by this same
        # function; do not point vocab_file at untrusted data.
        with open(vocab_path, 'rb') as f:
            word_to_id, id_to_word = pickle.load(f)
        return word_to_id, id_to_word

    word_to_id = {}
    id_to_word = {}
    data_type = 'train'
    file_name = key_file[data_type]
    file_path = './' + file_name

    _download(file_name)

    # Read through a context manager so the file handle is always closed.
    with open(file_path) as f:
        words = f.read().replace('\n', '<eos>').strip().split()

    for word in words:
        if word not in word_to_id:
            # The next free id equals the number of words seen so far.
            tmp_id = len(word_to_id)
            word_to_id[word] = tmp_id
            id_to_word[tmp_id] = word

    with open(vocab_path, 'wb') as f:
        pickle.dump((word_to_id, id_to_word), f)

    return word_to_id, id_to_word

In [12]:
def load_ptb(data_type='train'):
    """Load one data split as an array of word ids.

    If the ``.npy`` cache for the split exists it is loaded. Otherwise the
    text file is downloaded (if missing), tokenized the same way as in
    ``load_vocab`` (newline -> ``<eos>``, whitespace split), converted to ids
    and saved to the cache.

    Args:
        data_type: Key into ``save_file`` / ``key_file``.

    Returns:
        ``(corpus, word_to_id, id_to_word)`` where ``corpus`` is a NumPy
        array of word ids.

    Raises:
        KeyError: If ``data_type`` is not registered in ``save_file``.
    """
    save_path = './' + save_file[data_type]
    word_to_id, id_to_word = load_vocab()

    if os.path.exists(save_path):
        corpus = np.load(save_path)
        return corpus, word_to_id, id_to_word

    file_name = key_file[data_type]
    file_path = './' + file_name
    _download(file_name)

    # Read through a context manager so the file handle is always closed.
    with open(file_path) as f:
        words = f.read().replace('\n', '<eos>').strip().split()
    corpus = np.array([word_to_id[w] for w in words])

    np.save(save_path, corpus)
    return corpus, word_to_id, id_to_word

In [13]:
# Load the training data and keep only a small prefix of it.
corpus, word_to_id, id_to_word = load_ptb('train')
corpus_size = 1000
corpus = corpus[:corpus_size]
# Ids start at 0, so the largest id present plus one bounds the vocabulary.
vocab_size = int(corpus.max() + 1)

# Inputs and teacher labels are the same sequence shifted by one position.
xs, ts = corpus[:-1], corpus[1:]
data_size = len(xs)
print('corpus size: %d, vocabulary size: %d' % (corpus_size, vocab_size))

Downloading ptb.train.txt ... 
Done
corpus size: 1000, vocabulary size: 418


In [14]:
class RNN:
    """Single recurrent step: ``h_next = tanh(h_prev @ Wh + x @ Wx + b)``.

    Attributes:
        params: ``[Wx, Wh, b]``.
        grads: Gradient buffers shaped like the three parameters.
        cache: ``(x, h_prev, h_next)`` stored by the last ``forward`` call.
    """

    def __init__(self, Wx, Wh, b):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(p) for p in (Wx, Wh, b)]
        self.cache = None

    def forward(self, x, h_prev):
        """Compute the next hidden state from input ``x`` and state ``h_prev``."""
        Wx, Wh, b = self.params
        pre_activation = np.dot(h_prev, Wh) + np.dot(x, Wx) + b
        h_next = np.tanh(pre_activation)

        self.cache = (x, h_prev, h_next)
        return h_next

    def backward(self, dh_next):
        """Backpropagate through one step.

        Args:
            dh_next: Gradient with respect to the output of ``forward``.

        Returns:
            ``(dx, dh_prev)``: gradients for the input and the previous state.
        """
        Wx, Wh, _b = self.params
        x, h_prev, h_next = self.cache

        # Derivative of tanh written in terms of its output: 1 - tanh(t)^2.
        grad_pre = dh_next * (1 - h_next**2)
        grad_b = np.sum(grad_pre, axis=0)
        grad_Wh = np.dot(h_prev.T, grad_pre)
        grad_h_prev = np.dot(grad_pre, Wh.T)
        grad_Wx = np.dot(x.T, grad_pre)
        grad_x = np.dot(grad_pre, Wx.T)

        for buffer, value in zip(self.grads, (grad_Wx, grad_Wh, grad_b)):
            buffer[...] = value

        return grad_x, grad_h_prev

In [15]:
class TimeRNN:
    """Unrolls ``RNN`` steps over the second axis of a 3-D input.

    Attributes:
        params: ``[Wx, Wh, b]``, shared by every step.
        grads: Gradient buffers shaped like the three parameters.
        layers: The per-step ``RNN`` layers built by the last ``forward``.
        h: Hidden state after the last processed step (``None`` when reset).
        dh: Gradient flowing out of the first step after ``backward``.
        stateful: When true, ``h`` is carried over between ``forward`` calls.
        input_shapes: ``[N, T, D]`` recorded by ``forward``.
    """

    def __init__(self, input_size, output_size, stateful=False):
        in_dim, hid_dim = input_size, output_size
        randn = np.random.randn
        # Reseeds NumPy's global generator, so every instance starts from the
        # same weights (and the global random stream is reset as a side effect).
        np.random.seed(0)

        # Weight initialization
        Wx = (randn(in_dim, hid_dim) / np.sqrt(in_dim)).astype('f')
        Wh = (randn(hid_dim, hid_dim) / np.sqrt(hid_dim)).astype('f')
        b = np.zeros(hid_dim).astype('f')

        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(p) for p in self.params]
        self.layers = None

        self.h, self.dh = None, None
        self.stateful = stateful
        self.input_shapes = None

    def forward(self, xs):
        """Run the recurrence over ``xs`` of shape ``(N, T, D)``.

        Returns:
            A float32 array of shape ``(N, T, H)`` with the hidden state
            after each step.
        """
        Wx, _Wh, _b = self.params
        n_batch, n_steps, _ = xs.shape
        in_dim, hid_dim = Wx.shape
        self.input_shapes = [n_batch, n_steps, in_dim]

        self.layers = []
        hs = np.empty((n_batch, n_steps, hid_dim), dtype='f')

        # Start from zeros unless a previous state is being carried over.
        if not self.stateful or self.h is None:
            self.h = np.zeros((n_batch, hid_dim), dtype='f')

        for step in range(n_steps):
            cell = RNN(*self.params)
            self.h = cell.forward(xs[:, step, :], self.h)
            hs[:, step, :] = self.h
            self.layers.append(cell)

        return hs

    def backward(self, dhs):
        """Backpropagate through all steps in reverse order.

        Args:
            dhs: Gradient for every step, shape ``(N, T, H)``, or a 2-D
                ``(N, H)`` gradient that applies to the last step only.

        Returns:
            A float32 array of shape ``(N, T, D)`` with the input gradients.
        """
        Wx, _Wh, _b = self.params
        n_batch, n_steps, _ = self.input_shapes
        in_dim, hid_dim = Wx.shape

        if dhs.ndim == 2:
            # Place the single gradient at the last step; earlier steps get 0.
            expanded = np.zeros((n_batch, n_steps, hid_dim))
            expanded[:, -1, :] = dhs
            dhs = expanded

        n_batch, n_steps, hid_dim = dhs.shape

        dxs = np.empty((n_batch, n_steps, in_dim), dtype='f')
        dh = 0
        totals = [0, 0, 0]
        for step in range(n_steps - 1, -1, -1):
            cell = self.layers[step]
            # A step receives the gradient from its output plus the one
            # flowing back from the following step.
            dx, dh = cell.backward(dhs[:, step, :] + dh)
            dxs[:, step, :] = dx

            for k, step_grad in enumerate(cell.grads):
                totals[k] += step_grad

        for k, total in enumerate(totals):
            self.grads[k][...] = total
        self.dh = dh

        return dxs

    def set_state(self, h):
        """Replace the carried hidden state with ``h``."""
        self.h = h

    def reset_state(self):
        """Forget the carried hidden state."""
        self.h = None

In [16]:
class SimpleRnnNetwork:
    """Model stacking TimeEmbedding -> TimeRNN -> TimeAffine with a softmax loss.

    Attributes:
        layers: The three layers in forward order.
        loss_layer: The ``TimeSoftmaxWithLoss`` instance.
        rnn_layer: Shortcut to the ``TimeRNN`` layer (``layers[1]``).
        params, grads: Flat lists gathering every layer's parameters and
            gradient buffers, in layer order.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        randn = np.random.randn
        # Fixed seed so the initial weights are the same on every construction.
        np.random.seed(0)

        # Weight initialization (the order of the random draws is significant)
        embed_W = (randn(V, D) / 100).astype('f')
        affine_W = (randn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        # Layer construction
        embedding = TimeEmbedding(embed_W)
        recurrent = TimeRNN(D, H, stateful=True)
        affine = TimeAffine(affine_W, affine_b)
        self.layers = [embedding, recurrent, affine]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.rnn_layer = self.layers[1]

        # Gather all weights and gradients into flat lists
        self.params = [p for layer in self.layers for p in layer.params]
        self.grads = [g for layer in self.layers for g in layer.grads]

    def forward(self, xs, ts):
        """Run every layer on ``xs`` and return the loss against ``ts``."""
        activations = xs
        for layer in self.layers:
            activations = layer.forward(activations)
        return self.loss_layer.forward(activations, ts)

    def backward(self, dout=1):
        """Backpropagate from the loss through the layers in reverse order.

        Returns:
            Whatever the first layer's ``backward`` returns.
        """
        grad = self.loss_layer.backward(dout)
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)
        return grad

    def reset_state(self):
        """Clear the hidden state carried by the recurrent layer."""
        self.rnn_layer.reset_state()


In [17]:
# Hyperparameter settings
batch_size = 20  # alternative value noted by the author: 10
wordvec_size = 100
hidden_size = 100
time_size = 5
lr = 10  # alternative value noted by the author: 0.1
max_epoch = 40
max_grad = 0.25

# Number of parameter updates per epoch.
max_iters = data_size // (batch_size * time_size)

optimizer = SGD(lr)

# Each row of a mini-batch starts reading the corpus at its own offset,
# with the offsets spaced `jump` positions apart.
jump = (corpus_size - 1) // batch_size
offsets = [row * jump for row in range(batch_size)]

In [18]:
model = SimpleRnnNetwork(vocab_size, wordvec_size, hidden_size)
#model = LSTMNetwork(vocab_size, wordvec_size, hidden_size)

time_idx = 0
total_loss = 0
loss_count = 0
rnn_ppl_list = []

for epoch in range(max_epoch):
    # The loop variable is deliberately not called `iter`: at notebook top
    # level that would rebind the builtin iter() for every later cell.
    for iteration in range(max_iters):
        # Build the mini-batch: row i reads the corpus starting at offsets[i];
        # positions wrap around at the end of the data.
        batch_x = np.empty((batch_size, time_size), dtype='i')
        batch_t = np.empty((batch_size, time_size), dtype='i')
        for t in range(time_size):
            for i, offset in enumerate(offsets):
                batch_x[i, t] = xs[(offset + time_idx) % data_size]
                batch_t[i, t] = ts[(offset + time_idx) % data_size]
            time_idx += 1

        # Compute the gradients and update the parameters
        loss = model.forward(batch_x, batch_t)
        model.backward()
        if max_grad is not None:
            clip_grads(model.grads, max_grad)

        optimizer.update(model.params, model.grads)
        total_loss += loss
        loss_count += 1

    # Evaluate perplexity (exp of the mean loss) once per epoch
    ppl = np.exp(total_loss / loss_count)
    print('| epoch %d | perplexity %.2f'
          % (epoch+1, ppl))
    rnn_ppl_list.append(float(ppl))
    total_loss, loss_count = 0, 0


| epoch 1 | perplexity 383.29
| epoch 2 | perplexity 404.66
| epoch 3 | perplexity 476.16
| epoch 4 | perplexity 497.84
| epoch 5 | perplexity 485.39
| epoch 6 | perplexity 481.43
| epoch 7 | perplexity 369.13
| epoch 8 | perplexity 520.18
| epoch 9 | perplexity 594.56
| epoch 10 | perplexity 528.29
| epoch 11 | perplexity 557.51
| epoch 12 | perplexity 414.89
| epoch 13 | perplexity 358.61
| epoch 14 | perplexity 347.29
| epoch 15 | perplexity 354.37
| epoch 16 | perplexity 369.20
| epoch 17 | perplexity 305.21
| epoch 18 | perplexity 256.95
| epoch 19 | perplexity 216.35
| epoch 20 | perplexity 184.68
| epoch 21 | perplexity 160.95
| epoch 22 | perplexity 131.39
| epoch 23 | perplexity 99.73
| epoch 24 | perplexity 78.21
| epoch 25 | perplexity 61.50
| epoch 26 | perplexity 50.10
| epoch 27 | perplexity 37.52
| epoch 28 | perplexity 28.31
| epoch 29 | perplexity 24.55
| epoch 30 | perplexity 18.69
| epoch 31 | perplexity 15.70
| epoch 32 | perplexity 13.12
| epoch 33 | perplexity 10.