In [1]:
import sys
import numpy as np
import impl.RNN as rnn
import impl.solver as solver

In [2]:
# Read the whole corpus; the file handle is only needed for the read itself,
# so everything else happens after the `with` block has closed it.
with open('data/text_data/japan.txt', 'r') as f:
    txt = f.read()

# Build the vocabulary once and sort it. Iteration order of a set of strings
# can differ between interpreter runs (string hash randomization), so sorting
# makes the char <-> index assignment reproducible, and deriving both dicts
# from the same sequence guarantees they are exact inverses of each other.
vocab = sorted(set(txt))
char_to_idx = {char: i for i, char in enumerate(vocab)}
idx_to_char = {i: char for i, char in enumerate(vocab)}

# Inputs: every character of the text encoded as its vocabulary index.
X = np.array([char_to_idx[x] for x in txt])

# Targets: the inputs shifted left by one character. The final position has no
# successor in the text, so it is padded with the index of '.'.
# NOTE: this raises KeyError if the corpus contains no '.' character.
y = np.array([char_to_idx[x] for x in txt[1:]] + [char_to_idx['.']])

In [7]:
# Number of distinct characters in the corpus.
vocab_size = len(char_to_idx)

# --- Hyperparameters -------------------------------------------------------
H = 64               # hidden-state width handed to the RNN constructor
time_step = 100      # characters per minibatch (passed as mb_size)
alpha = 1e-3         # optimizer step size
n_iter = 2 * 13000   # number of training iterations (one minibatch each)
print_after = 1000   # report loss and a text sample every this many iterations

In [8]:
import numpy as np
import impl.loss as loss_fun
import impl.layer as l
import impl.regularization as reg
import impl.utils as util
import impl.NN as nn

class RNN(nn.NN):
    """Stack of recurrent layers for character-level sequence modelling.

    Every layer owns its own parameter dict (``Wxh``, ``Whh``, ``Why``, ``bh``,
    ``by``) stored in the list ``self.model``. At each time step the output
    ``y`` of one layer is fed as the input of the next layer, and the output of
    the last layer is what the loss is computed on.

    Parameters
    ----------
    D : int
        Size of the one-hot input vector (and of every layer's output).
    H : int
        Width of each layer's hidden state.
    char2idx, idx2char : dict
        Character -> index and index -> character lookup tables.
    n_layers : int, optional
        Number of stacked recurrent layers. Defaults to 4, which is the depth
        that used to be hard-coded throughout this class.
    """

    def __init__(self, D, H, char2idx, idx2char, n_layers=4):
        if n_layers < 1:
            raise ValueError('n_layers must be >= 1, got {}'.format(n_layers))
        self.D = D
        self.H = H
        self.char2idx = char2idx
        self.idx2char = idx2char
        self.vocab_size = len(char2idx)
        # Must be set before the base constructor runs, because
        # _init_model() reads it.
        # NOTE(review): assumes nn.NN.__init__ is what calls _init_model - confirm.
        self.n_layers = n_layers
        super().__init__(D, D, H, None, None, loss='cross_ent', nonlin='relu')

    def initial_state(self):
        """Return an all-zero hidden state of shape (1, H)."""
        return np.zeros((1, self.H))

    def _one_hot(self, idx):
        """Return a (1, D) row vector that is 1.0 at position `idx`, else 0."""
        vec = np.zeros(self.D)
        vec[idx] = 1.
        return vec.reshape(1, -1)

    def forward(self, X, h, m): # m = one layer's parameter dict
        """Run a single layer for a single time step.

        Parameters
        ----------
        X : input row for this step.
        h : hidden state from the previous step (not modified; it is copied).
        m : parameter dict of the layer (keys Wxh, Whh, Why, bh, by).

        Returns
        -------
        (y, h, cache) : the layer output, the new hidden state, and the tuple
        of intermediates that `backward` needs.
        """
        Wxh, Whh, Why = m['Wxh'], m['Whh'], m['Why']
        bh, by = m['bh'], m['by']

        hprev = h.copy()

        # New hidden state from the input and the previous hidden state
        # (l.tanh_forward presumably applies an elementwise tanh - confirm).
        h, h_cache = l.tanh_forward(X @ Wxh + hprev @ Whh + bh)
        # Hidden state -> layer output.
        y, y_cache = l.fc_forward(h, Why, by)

        cache = X, Whh, h, hprev, y, h_cache, y_cache, Wxh

        return y, h, cache

    def backward(self, dy, dh, cache):
        """Back-propagate through a single layer for a single time step.

        Parameters
        ----------
        dy : gradient arriving at this layer's output `y`.
        dh : gradient arriving at this layer's hidden state from the next
             time step.
        cache : the tuple produced by `forward` for this layer and step.

        Returns
        -------
        (dX, dh, grad) : gradient w.r.t. the layer input, gradient w.r.t. the
        previous hidden state, and a dict of parameter gradients keyed like
        the layer's parameter dict.
        """
        X, Whh, h, hprev, y, h_cache, y_cache, Wxh = cache

        dh_next = dh.copy()

        # Hidden to output gradient
        dh, dWhy, dby = l.fc_backward(dy, y_cache)
        # The hidden state receives gradient from the output *and* from the
        # following time step.
        dh += dh_next
        dby = dby.reshape((1, -1))

        # tanh
        dh = l.tanh_backward(dh, h_cache)

        # Hidden gradient
        dbh = dh
        dWhh = hprev.T @ dh
        dWxh = X.T @ dh

        # Both products create new arrays, so `dbh` above is left untouched.
        dX = dh @ Wxh.T
        dh = dh @ Whh.T

        grad = dict(Wxh=dWxh, Whh=dWhh, Why=dWhy, bh=dbh, by=dby)

        return dX, dh, grad

    def _init_model(self, D, C, H):
        """Create `self.model`: one randomly initialised parameter dict per layer."""
        self.model = []
        for _ in range(getattr(self, 'n_layers', 4)):
            # Keep the Wxh -> Whh -> Why draw order: it determines which random
            # numbers each matrix receives for a given seed.
            # NOTE(review): Why is (H, D) but is scaled by sqrt(C / 2), not
            # sqrt(H / 2) - confirm this is intended.
            m = dict(
                Wxh=np.random.randn(D, H) / np.sqrt(D / 2.),
                Whh=np.random.randn(H, H) / np.sqrt(H / 2.),
                Why=np.random.randn(H, D) / np.sqrt(C / 2.),
                bh=np.zeros((1, H)),
                by=np.zeros((1, D))
                )
            self.model.append(m)

    def train_step_fwd(self, X_train, h):
        """Forward pass over a whole sequence through all layers.

        Parameters
        ----------
        X_train : iterable of character indices.
        h : initial hidden state; every layer starts from its own copy.

        Returns
        -------
        (ys, caches) : `ys` is the list of last-layer outputs, one per step;
        `caches[k][t]` is the cache of layer `k` at step `t`.
        """
        states = [h.copy() for _ in self.model]
        caches = [[] for _ in self.model]
        ys = []

        for X in X_train:
            y = self._one_hot(X)
            # Feed the signal upwards through the stack.
            for k, params in enumerate(self.model):
                y, states[k], cache = self.forward(y, states[k], params)
                caches[k].append(cache)
            ys.append(y)

        return ys, caches

    def train_step_bwd(self, y_train, ys, caches):
        """Compute the sequence loss and back-propagate through time.

        Parameters
        ----------
        y_train : array of target character indices.
        ys : per-step outputs returned by `train_step_fwd`.
        caches : per-layer, per-step caches returned by `train_step_fwd`.

        Returns
        -------
        (loss, grads) : the loss averaged over `y_train.shape[0]`, and a list
        with one gradient dict per layer (same order as `self.model`).
        """
        loss, dys = 0.0, []

        for y_pred, y in zip(ys, y_train):
            # lam=0: no regularisation term is requested from the loss helper.
            loss += loss_fun.cross_entropy(self.model[0], y_pred, y, lam=0)/ y_train.shape[0]
            dys.append(loss_fun.dcross_entropy(y_pred, y))

        # Per-layer hidden-state gradient carried backwards in time, and
        # per-layer accumulators for the parameter gradients.
        n_layers = len(self.model)
        dhs = [np.zeros((1, self.H)) for _ in range(n_layers)]
        grads = [{key: np.zeros_like(val) for key, val in params.items()}
                 for params in self.model]

        for t in reversed(range(len(dys))):
            dX = dys[t]
            # Walk the stack from the top layer down to the first one.
            for k in reversed(range(n_layers)):
                dX, dhs[k], grad = self.backward(dX, dhs[k], caches[k][t])
                for key, val in grad.items():
                    grads[k][key] += val

        return loss, grads

    def test(self, X_seed, h, size=100):
        """Sample a string of `size` characters from the model.

        Parameters
        ----------
        X_seed : index of the first character.
        h : initial hidden state; every layer starts from its own copy.
        size : total number of characters in the returned string.

        Returns
        -------
        str : the seed character followed by `size - 1` sampled characters.
        """
        chars = [self.idx2char[X_seed]]
        idx_list = list(range(self.vocab_size))
        states = [h.copy() for _ in self.model]
        X = X_seed

        for _ in range(size - 1):
            y = self._one_hot(X)
            for k, params in enumerate(self.model):
                y, states[k], _cache = self.forward(y, states[k], params)
            # Draw the next character from the distribution over the vocabulary.
            prob = util.softmax(y)
            X = np.random.choice(idx_list, p=prob.ravel())
            chars.append(self.idx2char[X])

        return ''.join(chars)

In [9]:
# Build the network over the corpus vocabulary with hidden width H.
net = RNN(
    D=vocab_size,
    H=H,
    char2idx=char_to_idx,
    idx2char=idx_to_char,
)

In [11]:
import numpy as np
import impl.utils as util
import impl.constant as c
import copy
from sklearn.utils import shuffle as skshuffle

def get_minibatch(X, y, minibatch_size, shuffle=True):
    """Split `X` and `y` into consecutive chunks of `minibatch_size` rows.

    When `shuffle` is true, `X` and `y` are first shuffled together with
    `skshuffle`. The last chunk is shorter when the number of rows is not a
    multiple of `minibatch_size`.

    Returns a list of `(X_chunk, y_chunk)` tuples.
    """
    if shuffle:
        X, y = skshuffle(X, y)

    starts = range(0, X.shape[0], minibatch_size)
    return [
        (X[start:start + minibatch_size], y[start:start + minibatch_size])
        for start in starts
    ]

def adam_rnn(nn, X_train, y_train, alpha=0.001, mb_size=256, n_iter=2000, print_after=100):
    """Train a multi-layer RNN in place with per-layer moment-based updates.

    Parameters
    ----------
    nn : network object exposing `model` (a list with one parameter dict per
         layer), `initial_state()`, `train_step_fwd()`, `train_step_bwd()`
         and `test()`.
    X_train, y_train : arrays of input / target character indices.
    alpha : step size.
    mb_size : number of consecutive characters per minibatch.
    n_iter : number of iterations; each one processes a single minibatch.
    print_after : print the smoothed loss and a sample every this many
                  iterations.

    Returns
    -------
    The same `nn` object, with its parameters updated in place.
    """
    # Sequence order matters for a language model, hence no shuffling.
    minibatches = get_minibatch(X_train, y_train, mb_size, shuffle=False)

    idx = 0
    # `state` is only ever assigned from initial_state(); it is not updated
    # from the forward pass, because train_step_fwd does not return a state.
    state = nn.initial_state()
    # Starting value for the running loss: -log(1 / number of distinct symbols).
    smooth_loss = -np.log(1.0 / len(set(X_train)))

    # One pair of moment accumulators per layer, shaped like that layer's
    # own parameters (works for any number of layers).
    M = [{k: np.zeros_like(v) for k, v in params.items()} for params in nn.model]
    R = [{k: np.zeros_like(v) for k, v in params.items()} for params in nn.model]
    beta1 = .9
    beta2 = .999

    for t in range(1, n_iter + 1):
        # Wrap around to the first minibatch once the data is exhausted.
        if idx >= len(minibatches):
            idx = 0
            state = nn.initial_state()

        X_mini, y_mini = minibatches[idx]
        idx += 1

        # Print loss and test sample
        if t % print_after == 0:
            print('Iter-{} loss: {:.4f}'.format(t, smooth_loss))
            sample = nn.test(X_mini[0], state)
            print(sample)

        ys, caches = nn.train_step_fwd(X_mini, state)
        loss, grads = nn.train_step_bwd(y_mini, ys, caches)
        smooth_loss = 0.999 * smooth_loss + 0.001 * loss

        # Same update rule for every layer, each with its own accumulators.
        for params, grad, m, r in zip(nn.model, grads, M, R):
            for k in grad.keys():
                m[k] = util.exp_running_avg(m[k], grad[k], beta1)
                r[k] = util.exp_running_avg(r[k], grad[k]**2, beta2)

                # Bias correction for the zero-initialised accumulators.
                m_k_hat = m[k] / (1. - beta1**(t))
                r_k_hat = r[k] / (1. - beta2**(t))

                params[k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + c.eps)

    return nn

In [12]:
# Train `net` on the encoded corpus; the returned network is the cell's output.
adam_rnn(
    nn=net,
    X_train=X,
    y_train=y,
    alpha=alpha,
    mb_size=time_step,
    n_iter=n_iter,
    print_after=print_after,
)

Iter-1000 loss: 3.1522
Jy6ann' fo6 thy Gn0e tr thets uture th Onthn Eost ictd thet, ligo ic tir the wirty bes intd the J pe
Iter-2000 loss: 2.1541
 ofo anten birnolidh4scofnotan on Nta-tarta. Thitgl whe7 oce op r 6iza7out niridhe West meatitro the
Iter-3000 loss: 1.3868
 Japan Sigh himstgres. Wor and Go,d a misaloled the Gtita in the Global of aed in the toty the "itit
Iter-4000 loss: 0.8159
ed and caas of lalalita. The an foly Innth and .cpolld ma and NaNdory Grs lerititane the East A9ia-.
Iter-5000 loss: 0.4586
 Sea of Japan, the East China Sea, ,y,ititer colekrolasongurat meleaseng Itato itarion of Otunlates 
Iter-6000 loss: 0.2160
 importer. Although Japan has om4h r-raroky  far rn movens cenetiale Ifte monse to the West Pmproc t
Iter-7000 loss: 0.2673
nety-seven percent of Japan's land area. The country is divided into 47 prefectures in eight re, Nev
Iter-8000 loss: 0.1159
. The population of 126 million is the world's tenth largest. Japanese people make up 98.5% of Japan
Iter-900

<__main__.RNN at 0x10d082f98>