In [71]:
# Data
import numpy as np

with open('data/text_data/japan.txt', 'r') as f:
# with open('data/text_data/anna.txt', 'r') as f:

    txt = f.read()

    X = []
    y = []

    char_to_idx = {char: i for i, char in enumerate(set(txt))}
    idx_to_char = {i: char for i, char in enumerate(set(txt))}

    X = np.array([char_to_idx[x] for x in txt])
    y = [char_to_idx[x] for x in txt[1:]]
    y.append(char_to_idx['.'])
    y = np.array(y)

# Looking at the X, y
X.shape, y.shape, X[:10], y[:10]

((3629,),
 (3629,),
 array([ 7, 12, 41, 12,  2, 11, 67,  7, 12, 41]),
 array([12, 41, 12,  2, 11, 67,  7, 12, 41, 12]))

In [73]:
# Model or Network
import impl.layer as l
from impl.loss import *

class GRU:
    def __init__(self, D, H, L, char2idx, idx2char):
        self.D = D
        self.H = H
        self.L = L
        self.char2idx = char2idx
        self.idx2char = idx2char
        self.vocab_size = len(char2idx)
        self.losses = {'train':[], 'smooth train':[]}
        
        # Model params
        Z = H + D
        m = dict(
            Wz=np.random.randn(Z, H) / np.sqrt(Z / 2.),
            Wh=np.random.randn(Z, H) / np.sqrt(Z / 2.),
            Wy=np.random.randn(H, D) / np.sqrt(H / 2.), 
            bz=np.zeros((1, H)),
            bh=np.zeros((1, H)),
            by=np.zeros((1, D))
        )
        self.model = []
        for _ in range(self.L):
            self.model.append(m)
            
        # Conv model parameters
        kernel_size = 9
        # y = X @ W + b # X_txn.T @ W_tx1 + b_nx1 = y_nx1
        m = dict(
            W = np.random.randn(kernel_size, 1) / np.sqrt(kernel_size / 2.),
            b = np.random.randn(1, 1) / np.sqrt(1 / 2.)
        )

    def initial_state(self):
        return np.zeros((1, self.H))

    def forward(self, X, h, m):
        Wz, Wh, Wy = m['Wz'], m['Wh'], m['Wy']
        bz, bh, by = m['bz'], m['bh'], m['by']

        X_in = X.copy()
        h_in = h.copy()

        X = np.column_stack((h_in, X_in))

        hz, hz_cache = l.fc_forward(X, Wz, bz)
        hz, hz_sigm_cache = l.sigmoid_forward(hz)
        
        hh, hh_cache = l.fc_forward(X, Wh, bh)
        hh, hh_tanh_cache = l.tanh_forward(hh)

        # h = (1. - hz) * h_old + hz * hh
        # or
        # h = ((1. - hz) * h_in) + (hz * hh)
        # or
        h = h_in + (hz * (hh - h_in))

        y, y_cache = l.fc_forward(h, Wy, by)

        cache = (h_in, hz, hz_cache, hz_sigm_cache, hh, hh_cache, hh_tanh_cache, y_cache)

        return y, h, cache

    def backward(self, dy, dh, cache):
        h_in, hz, hz_cache, hz_sigm_cache, hh, hh_cache, hh_tanh_cache, y_cache = cache
        
        dh_out = dh.copy()

        dh, dWy, dby = l.fc_backward(dy, y_cache)
        dh += dh_out

        dh_in1 = (1. - hz) * dh
        
        dhh = hz * dh
        dhh = l.tanh_backward(dhh, hh_tanh_cache)
        dXh, dWh, dbh = l.fc_backward(dhh, hh_cache)

        # dhz = (hh * dh) - (h_in * dh)
        # or
        dhz = (hh - h_in) * dh
        dhz = l.sigmoid_backward(dhz, hz_sigm_cache)
        dXz, dWz, dbz = l.fc_backward(dhz, hz_cache)

        dX = dXh + dXz
        dh_in2 = dX[:, :self.H]
        dX_in = dX[:, self.H:]

        dh = dh_in1 + dh_in2
        dX = dX_in

        grad = dict(Wz=dWz, Wh=dWh, Wy=dWy, bz=dbz, bh=dbh, by=dby)
        
        return dX, dh, grad
        
    def train_forward(self, X_train, h):
        ys, caches, caches_conv = [], [], []
        h_init = h.copy()
        h = []
        for _ in range(self.L):
            h.append(h_init.copy())
            caches.append([])
            caches_conv.append([])
        
        # Embedding, Input layer, 1st layer
        Xs = []
        for X in X_train:
            X_one_hot = np.zeros(self.D)
            X_one_hot[X] = 1.
            X = X_one_hot.reshape(1, -1)
            Xs.append(X)
        
        for layer in range(self.L):
            # Convolution RNN forward
            kernel_size = 9
            pad_const = 0.
            X_pad = np.pad(X, (kernel_size//2, kernel_size//2), 'constant', (pad_left_const, pad_right_const))
            for idx in range(0, len(X_pad) - kernel_size + 1, 1):
                X = X_pad[idx: idx + kernel_size] # X_txn
                # y = X @ W + b # X_txn.T @ W_tx1 + b_nx1 = y_nx1
                y, cache = l.fc_forward(X, self.model_conv[layer]) # 2b defined
                ys.append(y)
                caches_conv[layer].append(cache) # 2b defined

            # for layer in range(self.L): # from 0 to L-1
            Xs = ys.copy()
            ys = []
            for X in Xs:
                X = X.reshape(1, -1)
                y, h[layer], cache = self.forward(X, h[layer], self.model[layer])
                caches[layer].append(cache)
                ys.append(y)

            ys_caches = caches, caches_conv

        return ys, caches
    
    def loss_function(self, y_train, ys):
        loss, dys = 0.0, []

        for y_pred, y in zip(ys, y_train):
            loss += cross_entropy(y_pred, y)
            dy = dcross_entropy(y_pred, y)
            dys.append(dy)
            
        return loss, dys
    
    def train_backward(self, dys, caches):
        dh, grad, grads, grads_conv = [], [], [], []
        for layer in range(self.L):
            dh.append(np.zeros((1, self.H)))
            grad.append({key: np.zeros_like(val) for key, val in self.model[layer].items()})
            grads.append({key: np.zeros_like(val) for key, val in self.model[layer].items()})
            grads_conv.append({key: np.zeros_like(val) for key, val in self.model_conv[layer].items()})
        
        # Convolution RNN forward
        kernel_size = 9
        pad_const = 0.
        dXs = np.zeros_like(y_train)
        dXs_pad = np.pad(dXs, (kernel_size//2, kernel_size//2), 'constant', (pad_const, pad_const))
        # dXs_pad = np.zeros((len(dys) + kernel_size, dys[0].shape(1)))

        dXs = dys.copy()
        for layer in reversed(range(self.L)):
            dys = dXs.copy()
            for t in reversed(range(len(dys))):
                dy = dys[t]
                dX, dh[layer], grad[layer] = self.backward(dy, dh[layer], caches[layer][t])
                for k in grad[layer].keys():
                    grads[layer][k] += grad[layer][k]
                dXs[t] = dX
                
            dys = dXs.copy()
            dXs = []
            for t in reversed(range(len(dys))):
                dy = dys[t]
                dX, dW, db = l.fc_backward(dy, caches_conv[layer][t])
                grads[layer]['W'] += dW
                grads[layer]['b'] += db
                for idx in range(t - (kernel_size//2), t + (kernel_size//2), 1):
                    # X = X_pad[idx: idx + kernel_size] # X_txn
                    # y = X @ W + b # X_txn.T @ W_tx1 + b_nx1 = y_nx1
                    # y, cache = l.fc_forward(X, self.model_conv[layer]) # 2b defined
                    np.add.at(dXs_pad, [idx], dX[idx-(t-(kernel_size//2))])
                dXs = dXs_pad[kernel_size//2:-kernel_size//2]
            dys = dXs.copy()
                
        return dXs, grads

In [3]:
def get_minibatch(X, y, minibatch_size, shuffle):
    minibatches = []

    #for i in range(0, X.shape[0], minibatch_size):
    for i in range(0, X.shape[0] - minibatch_size + 1, 1):
        X_mini = X[i:i + minibatch_size]
        y_mini = y[i:i + minibatch_size]
        minibatches.append((X_mini, y_mini))

    return minibatches

def adam_rnn(nn, X_train, y_train, alpha, mb_size, n_iter, print_after):

    M, R = [], []
    for layer in range(nn.L):
        M.append({k: np.zeros_like(v) for k, v in nn.model[layer].items()})
        R.append({k: np.zeros_like(v) for k, v in nn.model[layer].items()})
        
    beta1 = .99
    beta2 = .999
    state = nn.initial_state()
    smooth_loss = 1.
    minibatches = get_minibatch(X_train, y_train, mb_size, shuffle=False)
    
    for iter in range(1, n_iter + 1):
        for idx in range(len(minibatches)):
            X_mini, y_mini = minibatches[idx]
            ys, caches = nn.train_forward(X_mini, state)
            loss, dys = nn.loss_function(y_mini, ys)
            _, grads = nn.train_backward(dys, caches)
            nn.losses['train'].append(loss)
            smooth_loss = (0.999 * smooth_loss) + (0.001 * loss)
            nn.losses['smooth train'].append(smooth_loss)

            for layer in range(nn.L):
                for k in grads[layer].keys(): #key, value: items
                    M[layer][k] = l.exp_running_avg(M[layer][k], grads[layer][k], beta1)
                    R[layer][k] = l.exp_running_avg(R[layer][k], grads[layer][k]**2, beta2)

                    m_k_hat = M[layer][k] / (1. - (beta1**(iter)))
                    r_k_hat = R[layer][k] / (1. - (beta2**(iter)))

                    nn.model[layer][k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + l.eps)

        # Print loss and test sample
        if iter % print_after == 0:
`            print('Iter-{} loss: {:.4f}'.format(iter, loss))
            #             sample = nn.test(X_mini[0], state, 100)
            #             print(sample)

    return nn

In [None]:
# Hyper-parameters
time_step = 100 # width, minibatch size and test sample size as well
num_layers = 2 # depth
n_iter = 30 # epochs
alpha = 1e-4 # learning_rate
print_after = 1 # n_iter//10 # print training loss, valid, and test
num_hidden_units = 64 # num_hidden_units in hidden layer
num_input_units = len(char_to_idx) # vocab_size = len(char_to_idx)

# Build the network and learning it or optimizing it using SGD
net = GRU(D=num_input_units, H=num_hidden_units, L=num_layers, char2idx=char_to_idx, idx2char=idx_to_char)

# Start learning using BP-SGD-ADAM
adam_rnn(nn=net, X_train=X, y_train=y, alpha=alpha, mb_size=time_step, n_iter=n_iter, print_after=print_after)

# # Display the learning curve and losses for training, validation, and testing
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

plt.plot(net.losses['train'], label='Train loss')
plt.plot(net.losses['smooth train'], label='Train smooth loss')
plt.legend()
plt.show()

Iter-1 loss: 217.5480
 Ue6 the Npamagt Wranhd IndFf6 ArranS ustirsin indes ree,Gtef in the Gors hethe estax,essy fech d Pra
Iter-2 loss: 197.5714
 the doonndekth Rsithe rofy eun C1k. Anichess lapan fivext on6t rLan Whor oullat 3uxJan Indd farkked 
Iter-3 loss: 191.1596
 pancp liut exJar ith onde Jaunwh, hingy ex4 t7 anditre n mi8 C8 Gnunrthe 7af%ges mtint nNortles, fom
Iter-4 loss: 205.4693
 is rhes Eusser ran useapend expinnD poppunord and fir5ld, is the G4 tere aI bontar Ned Japand nanwJa
Iter-5 loss: 214.9694
 wan wus, in the hidsteuntt in the Gter e8iJar Gvele1nomteanloming and napan isorees of the 8uper ala
Iter-6 loss: 194.1631
 man morrekyy eand Japan its of of, Ind the cinst in the G2glytAJapan intan's the the sevrinment. Jap
Iter-7 loss: 195.4187
 panse suuntry omG Nnal of wake. Japan hafin sivint exploreate Japan is laniny unnenst in6pest powesn
Iter-8 loss: 200.8147
 ron segens poknemt into opNyG at a pevild inlalitiand in the Glebounowhen and and Rantassedeppeuntry
Iter-9 l

In [34]:
X.shape, X[:2]
X_sample = X[:10]
X_samples = X[:40]
X_sample.shape, X_sample, X_samples.shape
# X_sample.add.at([0:10], X_sample)
X_samples, X_sample
X_ones = np.ones_like(X_sample)
X_ones.shape, X_ones
# X_samples[-1-10:], X_samples[-1]
# X_samples[-1-10:] += X_sample
# X_samples = X_samples.reshape(1, -1) 
# X_sample = X_sample.reshape(1, -1)
X_sample.shape, X_samples.shape, X_sample, X_samples
np.add.at(X_samples, [0:10], X_sample)

SyntaxError: invalid syntax (<ipython-input-34-b5c9a01b6d9c>, line 14)

In [69]:
a = np.array([[1, 2, 3, 4], [1, 2, 3, 4]]).T
b = np.array([[1, 2, 3], [1, 2, 3]]).T
a, b, a.shape, b.shape
# np.add.at(a, [1, 2, 3], b)
np.add.at(a, [1], b[0])
# a.add.at([1, 2], b)
print(a)

[[1 1]
 [3 3]
 [3 3]
 [4 4]]
