In [1]:
import sys
import numpy as np
import impl.RNN as rnn
import impl.solver as solver

In [2]:
# Load the training corpus and encode it as integer character ids.
with open('data/text_data/japan.txt', 'r') as f:
    txt = f.read()

# Build the vocabulary exactly once, in sorted order. Enumerating two separate
# set(txt) objects only yields matching tables by accident of set iteration
# order, and that order changes between kernel restarts because str hashes are
# randomised. A single sorted list makes both tables exact inverses and keeps
# the ids stable from run to run.
vocab = sorted(set(txt))
char_to_idx = {char: i for i, char in enumerate(vocab)}
idx_to_char = {i: char for i, char in enumerate(vocab)}

# Inputs: the id of every character in the corpus.
X = np.array([char_to_idx[ch] for ch in txt])

# Targets: the id of the following character. The last position has no
# successor, so it is paired with '.' (raises KeyError if the corpus contains
# no '.').
y = [char_to_idx[ch] for ch in txt[1:]]
y.append(char_to_idx['.'])
y = np.array(y)

In [3]:
# Number of distinct characters; passed to the network as D further down.
vocab_size = len(char_to_idx)

# --- hyper parameters ---

# Model
H = 64             # width of the hidden state

# Optimisation
alpha = 1e-3       # Adam step size
time_step = 1      # characters per minibatch (handed to adam_rnn as mb_size)
n_iter = 3000      # number of minibatch updates (one minibatch per iteration)

# Logging
print_after = 1000  # print the loss and a text sample every this many iterations

In [62]:
import numpy as np
import impl.loss as loss_fun
import impl.layer as l
import impl.regularization as reg
import impl.utils as util
import impl.NN as nn

class RNN(nn.NN):
    """Character-level recurrent network with one recurrent hidden layer.

    At every step a single character id is one-hot encoded, combined with the
    previous hidden state through ``l.tanh_forward`` and projected to one score
    per character with ``l.fc_forward``.

    Parameters live in ``self.model`` under the keys ``Wxh``, ``Whh``, ``Why``,
    ``bh`` and ``by`` (see ``_init_model``).

    NOTE(review): ``self.model`` is presumably filled in by the base-class
    constructor calling ``_init_model`` — confirm against ``impl.NN``.
    """

    def __init__(self, D, H, char2idx, idx2char):
        """Store the sizes and lookup tables, then delegate to the base class.

        Args:
            D: width of the one-hot input (and of the output scores).
            H: width of the hidden state.
            char2idx: mapping character -> integer id.
            idx2char: mapping integer id -> character.
        """
        self.D = D
        self.H = H
        self.char2idx = char2idx
        self.idx2char = idx2char
        self.vocab_size = len(char2idx)
        # Input and output widths are both D.
        super().__init__(D, D, H, None, None, loss='cross_ent', nonlin='relu')

    def initial_state(self):
        """Return a fresh all-zero hidden state of shape (1, H)."""
        return np.zeros((1, self.H))

    def cell_forward(self, X, h, train=True):
        """Advance the network by one character.

        Args:
            X: integer id of the input character (used as a one-hot index).
            h: previous hidden state; it is copied, not modified.
            train: when False the output is passed through ``util.softmax``
                before being returned; when True the raw output of
                ``l.fc_forward`` is returned.

        Returns:
            ``(y, h, cache)``: the output for this step, the new hidden state
            and the values ``cell_backward`` needs.
        """
        Wxh, Whh, Why = self.model['Wxh'], self.model['Whh'], self.model['Why']
        bh, by = self.model['bh'], self.model['by']

        # One-hot encode the character id as a (1, D) row vector.
        X_one_hot = np.zeros(self.D)
        X_one_hot[X] = 1.
        X_one_hot = X_one_hot.reshape(1, -1)

        hprev = h.copy()

        h, h_cache = l.tanh_forward(X_one_hot @ Wxh + hprev @ Whh + bh)
        y, y_cache = l.fc_forward(h, Why, by)

        cache = (X_one_hot, Whh, h, hprev, y, h_cache, y_cache)

        if not train:
            y = util.softmax(y)

        return y, h, cache

    def cell_backward(self, y_pred, y_train, dh_next, cache):
        """Back-propagate through one step produced by ``cell_forward``.

        Args:
            y_pred: output of ``cell_forward`` for this step.
            y_train: target for this step.
            dh_next: hidden-state gradient arriving from the following step.
            cache: the cache returned by ``cell_forward`` for this step.

        Returns:
            ``(grad, dh_next)``: a dict of gradients keyed like ``self.model``
            and the hidden-state gradient to hand to the preceding step.
        """
        X, Whh, h, hprev, y, h_cache, y_cache = cache

        # Gradient of the loss w.r.t. the output of this step.
        dy = loss_fun.dcross_entropy(y_pred, y_train)

        # Hidden to output gradient
        dh, dWhy, dby = l.fc_backward(dy, y_cache)
        # Add the contribution that flows back from the following time step.
        dh += dh_next
        dby = dby.reshape((1, -1))

        # Back through the hidden non-linearity.
        dh = l.tanh_backward(dh, h_cache)

        # Gradients of the recurrent pre-activation.
        dbh = dh
        dWhh = hprev.T @ dh
        dWxh = X.T @ dh
        dh_next = dh @ Whh.T

        grad = dict(Wxh=dWxh, Whh=dWhh, Why=dWhy, bh=dbh, by=dby)

        return grad, dh_next

    def train_step_fwd(self, X_train, y_train, h):
        """Run ``cell_forward`` over a sequence, threading the hidden state.

        ``y_train`` is only used to pair up with ``X_train`` (iteration stops
        at the shorter of the two).

        Returns:
            ``(ys, caches, h)``: per-step outputs, per-step caches and the
            final hidden state.
        """
        ys = []
        caches = []

        # Forward
        for x, y in zip(X_train, y_train):
            y_pred, h, cache = self.cell_forward(x, h, train=True)
            ys.append(y_pred)
            caches.append(cache)

        return ys, caches, h

    def layer_fwd(self, X_train, y_train, h):
        """Forward pass over a sequence plus its mean per-step loss.

        Returns:
            ``(ys, caches, loss, h)`` where ``loss`` is the sum of
            ``loss_fun.cross_entropy`` over the steps divided by
            ``X_train.shape[0]``.
        """
        # 1st layer
        ys, caches, h = self.train_step_fwd(X_train, y_train, h)

        # the final loss/error
        loss = 0.
        for y_pred, y in zip(ys, y_train):
            loss += loss_fun.cross_entropy(self.model, y_pred, y, lam=0)
        loss /= X_train.shape[0]

        return ys, caches, loss, h

    # Backward
    def train_step_bwd(self, X_train, y_train, ys, caches):
        """Back-propagate through the sequence, last step first.

        Returns:
            A dict keyed like ``self.model`` holding the per-step gradients
            summed over all steps.
        """
        dh_next = np.zeros((1, self.H))
        grads = {k: np.zeros_like(v) for k, v in self.model.items()}

        for t in reversed(range(len(X_train))):
            grad, dh_next = self.cell_backward(ys[t], y_train[t], dh_next, caches[t])

            for k in grads.keys():
                grads[k] += grad[k]

        return grads

    # layer Backward
    def layer_bwd(self, X_train, y_train, ys, caches):
        """Summed gradients for a sequence, clipped element-wise to [-5, 5]."""
        # 1st layer
        grads = self.train_step_bwd(X_train, y_train, ys, caches)

        # Clipping the gradients for exploding grads problem
        for k, v in grads.items():
            grads[k] = np.clip(v, -5., 5.)

        return grads

    def test_step_fwd(self, X_seed, h, size=100):
        """Sample a string of ``size`` characters starting from ``X_seed``.

        Args:
            X_seed: integer id of the first character.
            h: hidden state to start from (``cell_forward`` copies it).
            size: total number of characters in the result, seed included.

        Returns:
            The generated text as a ``str``.
        """
        chars = [self.idx2char[X_seed]]
        idx_list = list(range(self.vocab_size))
        X = X_seed

        for _ in range(size - 1):
            # Feed the most recently emitted character back in. This must be
            # the running id ``X``: the previous code passed an undefined
            # lowercase ``x``, which either raised NameError or silently picked
            # up an unrelated module-level variable.
            y_pred, h, _ = self.cell_forward(X, h, train=False)
            idx = np.random.choice(idx_list, p=y_pred.ravel())
            chars.append(self.idx2char[idx])
            X = idx

        return ''.join(chars)

    def _init_model(self, D, C, H):
        """Create ``self.model`` with scaled Gaussian weights and zero biases.

        Args:
            D: input width (also used as the output width of ``Why``/``by``).
            C: only used in the scale of ``Why``.
            H: hidden width.
        """
        self.model = dict(
            Wxh=np.random.randn(D, H) / np.sqrt(D / 2.),
            Whh=np.random.randn(H, H) / np.sqrt(H / 2.),
            Why=np.random.randn(H, D) / np.sqrt(C / 2.),
            bh=np.zeros((1, H)),
            by=np.zeros((1, D))
        )

In [63]:
# Build the network: one-hot width equals the vocabulary size, hidden width H.
net = RNN(
    D=vocab_size,
    H=H,
    char2idx=char_to_idx,
    idx2char=idx_to_char,
)

In [64]:
import numpy as np
import impl.utils as util
import impl.constant as c
import copy
from sklearn.utils import shuffle as skshuffle

def get_minibatch(X, y, minibatch_size, shuffle=True):
    """Cut ``X`` and ``y`` into consecutive, aligned chunks.

    Args:
        X: array of inputs; its first dimension is the sample axis.
        y: targets, sliced with the same bounds as ``X``.
        minibatch_size: maximum number of samples per chunk; the last chunk
            may be shorter.
        shuffle: when true, ``X`` and ``y`` are shuffled together with
            ``skshuffle`` before being cut.

    Returns:
        A list of ``(X_chunk, y_chunk)`` tuples in order.
    """
    if shuffle:
        X, y = skshuffle(X, y)

    starts = range(0, X.shape[0], minibatch_size)
    return [
        (X[start:start + minibatch_size], y[start:start + minibatch_size])
        for start in starts
    ]

def adam_rnn(nn, X_train, y_train, alpha=0.001, mb_size=256, n_iter=2000, print_after=100):
    """Train ``nn`` in place with Adam over consecutive, unshuffled minibatches.

    The hidden state is carried from one minibatch to the next and reset with
    ``nn.initial_state()`` whenever the data has been walked through once.

    Args:
        nn: network exposing ``model``, ``initial_state``, ``layer_fwd``,
            ``layer_bwd`` and ``test_step_fwd``.
        X_train: input sequence.
        y_train: target sequence aligned with ``X_train``.
        alpha: step size.
        mb_size: number of consecutive samples per minibatch.
        n_iter: number of updates (one minibatch each).
        print_after: every this many iterations the smoothed loss and a
            100-character sample are printed.

    Returns:
        The same ``nn`` object, with its parameters updated.
    """
    beta1, beta2 = .9, .999
    rule = "========================================================================="

    # Running first and second moments, one array per parameter.
    M = {name: np.zeros_like(param) for name, param in nn.model.items()}
    R = {name: np.zeros_like(param) for name, param in nn.model.items()}

    minibatches = get_minibatch(X_train, y_train, mb_size, shuffle=False)
    n_batches = len(minibatches)

    cursor = 0
    state = nn.initial_state()
    # Starting value for the smoothed loss, derived from the number of distinct
    # ids present in X_train.
    smooth_loss = -np.log(1.0 / len(set(X_train)))

    for step in range(1, n_iter + 1):
        # Wrap around at the end of the data and start from a clean state.
        if cursor >= n_batches:
            cursor = 0
            state = nn.initial_state()

        X_mini, y_mini = minibatches[cursor]
        cursor += 1

        if step % print_after == 0:
            print(rule)
            print('Iter-{} loss: {:.4f}'.format(step, smooth_loss))
            print(rule)

            print(nn.test_step_fwd(X_mini[0], state, 100))

            print(rule)
            print()
            print()

        ys, caches, loss, state = nn.layer_fwd(X_mini, y_mini, state)
        smooth_loss = 0.999 * smooth_loss + 0.001 * loss

        grad = nn.layer_bwd(X_mini, y_mini, ys, caches)

        for name in grad:
            M[name] = util.exp_running_avg(M[name], grad[name], beta1)
            R[name] = util.exp_running_avg(R[name], grad[name]**2, beta2)

            # Bias-corrected moment estimates for the current step count.
            m_hat = M[name] / (1. - beta1**(step))
            r_hat = R[name] / (1. - beta2**(step))

            nn.model[name] -= alpha * m_hat / (np.sqrt(r_hat) + c.eps)

    return nn

In [65]:
# Train the network; time_step is used as the minibatch length.
adam_rnn(
    nn=net,
    X_train=X,
    y_train=y,
    alpha=alpha,
    mb_size=time_step,
    n_iter=n_iter,
    print_after=print_after,
)

Iter-1000 loss: 3.6885
iPte,iocue  jtc td p pJnCnh itt t leJanigity5gmthw ntf oa-a  ililmfnec in ty  cl tltetaanai i oooiio


Iter-2000 loss: 3.2286
Ja,ces deiuaneeadmvypwdnataen rmld h6 reenwt i t hJ nw'efsa e8vtcCteh8 fdmeitEo6mCreoiet fc wnderrrS


Iter-3000 loss: 2.9918
mar.f ustG re afii rt7usade,gnetuto  s dd ao1ycalant mai row n  a ao7ii t  ai t g.w nseb ant.H duc t




<__main__.RNN at 0x7f00f7b81550>

In [5]:
# Scratch cell: a nested 5 x 3 loop over a freshly created list.
# NOTE(review): `x.append` is only referenced, never called, so nothing is
# added and `x` is still [] when the loops finish. The intended argument is
# not evident from this cell — confirm what was meant or delete the cell.
# The cell also leaves a module-level `x` behind in the kernel namespace.
x= []
for i in range(5):
    for j in range(3):
        x.append

In [9]:
class initTest:
    """Holds ``depth`` independently initialised parameter dictionaries.

    Each dictionary has the keys ``Wxh``, ``Whh``, ``Why``, ``bh`` and ``by``;
    the weights are scaled Gaussian draws and the biases are zero.
    """

    def __init__(self, D, C, H, depth):
        """Fill ``self.models`` with one parameter dict per layer.

        Args:
            D: input width (also the output width of ``Why`` / ``by``).
            C: only used in the scale of ``Why``.
            H: hidden width.
            depth: number of parameter dicts to create.
        """
        def fresh_params():
            # Values are evaluated top to bottom, so the random draws happen
            # in the order Wxh, Whh, Why.
            return {
                'Wxh': np.random.randn(D, H) / np.sqrt(D / 2.),
                'Whh': np.random.randn(H, H) / np.sqrt(H / 2.),
                'Why': np.random.randn(H, D) / np.sqrt(C / 2.),
                'bh': np.zeros((1, H)),
                'by': np.zeros((1, D)),
            }

        self.models = [fresh_params() for _ in range(depth)]

In [10]:
# Smoke test: build two layers' worth of parameters.
test = initTest(D=vocab_size, C=vocab_size, H=H, depth=2)

# Not the last expression of the cell, so its value is discarded.
test.models[0].items()

# Displayed as the cell output: the output bias of the first layer.
test.models[0]['by']

# Disabled: list the parameter names of the second layer.
# for item in test.models[1]:
#     print(item)

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.]])