In [2]:
import numpy as np
#-----------------------------------------------------------------------------------------
class RNN:
    def __init__(self, text='hello'):
        self.text = text
        self.vocabulary = sorted(set(list(self.text)))
        self.vocab_size = len(self.vocabulary)
        
        self.W_x_to_hidden = None
        self.W_hidden_to_hidden = None
        self.W_hidden_to_y = None
    
        self.bias_hidden = None
        self.bias_output = None
    
        self.reset()
    #-----------------------------------------------------------------------------------------
    def reset(self):
        self.h = np.array([[0.]])
        return self.h[0][0]
    #-----------------------------------------------------------------------------------------
    def load_weights(self):
        self.W_x_to_hidden = np.array([[ 3.6, -4.8,  0.35, -0.26]])
        self.W_hidden_to_hidden = np.array([[ 4.1]])
        self.W_hidden_to_y = np.array([[-12.], [ -0.67], [ -0.85], [ 14.]])
        self.bias_hidden = np.array([[ 0.41]])
        self.bias_output = np.array([[-0.2], [-2.9], [ 6.1], [-3.4]])
        
        self.reset()
    #-----------------------------------------------------------------------------------------
    def step(self, x_char):
        assert isinstance(x_char, str)
        x_index = self.vocabulary.index(x_char)
        x_one_hot = np.zeros((self.vocab_size, 1))
        x_one_hot[x_index] = 1

        hidden_state = np.tanh(\
              np.dot(self.W_x_to_hidden, x_one_hot) \
            + np.dot(self.W_hidden_to_hidden, self.h)\
            + self.bias_hidden)
        
        y_output = np.dot(self.W_hidden_to_y, hidden_state) + self.bias_output

        y_prob_after_softmax = np.exp(y_output) / np.sum(np.exp(y_output))
        index = np.argmax(y_prob_after_softmax)
        y_char = self.vocabulary[index]
        
        self.h = hidden_state # important to not forget to update the hidden state
        
        return y_char, self.h[0][0]
    #-----------------------------------------------------------------------------------------
    def loss(self, x_train, y_train, h_prev):
        xs, hs, ps = {}, {}, {}
        hs[-1] = h_prev.copy()
        loss_value = 0.

        # Forward propagation
        for t, x_char in enumerate(x_train):
            x_index = self.vocabulary.index(x_char)
            x_one_hot = np.zeros((self.vocab_size, 1))
            x_one_hot[x_index] = 1
        
            y_prob_target = np.zeros((self.vocab_size, 1))
            y_index = self.vocabulary.index(y_train[t])
            y_prob_target[y_index] = 1
        
            hidden_state = np.tanh(\
                  np.dot(self.W_x_to_hidden, x_one_hot) \
                + np.dot(self.W_hidden_to_hidden, hs[t - 1])\
                + self.bias_hidden)
        
            y_output = np.dot(self.W_hidden_to_y, hidden_state) + self.bias_output
        
            y_prob_after_softmax = np.exp(y_output) / np.sum(np.exp(y_output))

            xs[t] = x_one_hot
            hs[t] = hidden_state
            ps[t] = y_prob_after_softmax

            loss_value += -np.log(y_prob_after_softmax[y_index, 0])
        
        dWhy = np.zeros_like(self.W_hidden_to_y)
        dWxh = np.zeros_like(self.W_x_to_hidden)
        dWhh = np.zeros_like(self.W_hidden_to_hidden)
        
        dbh = np.zeros_like(self.bias_hidden)
        dby = np.zeros_like(self.bias_output)
        dh_next = np.zeros_like(hidden_state)

        for t, x_char in reversed(list(enumerate(x_train))):
            x_index = self.vocabulary.index(x_char)
            x_one_hot = np.zeros((self.vocab_size, 1))
            x_one_hot[x_index] = 1
            
            y_prob_target = np.zeros((self.vocab_size, 1))
            y_index = self.vocabulary.index(y_train[t])
            y_prob_target[y_index] = 1
        
            y_prob_after_softmax = ps[t]
            hidden_state = hs[t]

            dy = np.copy(y_prob_after_softmax)
            dy -= y_prob_target
        
            dWhy += np.dot(dy, hidden_state.T)
            dby += dy
        
            dh = np.dot(self.W_hidden_to_y.T, dy) + dh_next
        
            # tanh derivative and backprop
            dh_raw = (1 - hidden_state ** 2) * dh
        
            dbh += dh_raw
            dWxh += np.dot(dh_raw, x_one_hot.T)
            dWhh += np.dot(dh_raw, hs[t - 1].T)
        
            dh_next = np.dot(self.W_hidden_to_hidden.T, dh_raw)
        
        for dp in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dp, -5, 5, out=dp) # mitigate exploding gradients
        
        return loss_value, dWxh, dWhh, dWhy, dbh, dby, hidden_state
    #-----------------------------------------------------------------------------------------
    def print_weights(self):
        print(self.W_x_to_hidden)
        print(self.W_hidden_to_hidden)
        print(self.W_hidden_to_y)

        print(self.bias_hidden)
        print(self.bias_output)
    #-----------------------------------------------------------------------------------------
    def train(self, show=False, iters=10000000):
        self.W_x_to_hidden = np.random.randn(1, self.vocab_size)
        self.W_hidden_to_hidden = np.array([[0.]])
        self.W_hidden_to_y = np.random.randn(self.vocab_size, 1)
    
        self.bias_hidden = np.random.randn(1, 1)
        self.bias_output = np.random.randn(self.vocab_size, 1)

        text = list(self.text)
        inputs = text[:len(text) - 1]
        targets = text[1:]
        
        learning_rate = 0.1
        
        mWxh, mWhh, mWhy =  np.zeros_like(self.W_x_to_hidden), \
                            np.zeros_like(self.W_hidden_to_hidden), \
                            np.zeros_like(self.W_hidden_to_y)
        mbh, mby = np.zeros_like(self.bias_hidden), np.zeros_like(self.bias_output) # memory variables for Adagrad
        
        last_n = 0
        prev_loss = None
        h_prev = self.h.copy()
        
        for n in range(iters):
            last_n = n
            loss, dWxh, dWhh, dWhy, dbh, dby, h_prev = self.loss(inputs, targets, h_prev)
            
            if n % 10000 == 0: 
                print('iter {}, loss: {}'.format(n, loss))
                if prev_loss is not None and np.abs(prev_loss - loss) < 0.01:
                    break
                prev_loss = loss
            
            for param, dparam, mem in zip(
                    [self.W_x_to_hidden, 
                        self.W_hidden_to_hidden, 
                        self.W_hidden_to_y, 
                        self.bias_hidden, 
                        self.bias_output],
                    [dWxh, dWhh, dWhy, dbh, dby],
                    [mWxh, mWhh, mWhy, mbh, mby]
            ):
                mem += dparam ** 2
                param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad
                
        print('iter {}, loss: {}'.format(last_n, loss))
        
        if show:
            self.print_weights()

In [3]:
n = RNN('it was the best')
n.train(iters=300000)


n.reset()
print((n.text[0], n.h))


for c in list(n.text[:-1]):
    next = n.step(c)
    print(next)

iter 0, loss: 41.49437051279101
iter 10000, loss: 14.988488146942387
iter 20000, loss: 14.290996060902039
iter 29999, loss: 11.17479173022433
('i', array([[ 0.]]))
(' ', -0.9999882602923269)
(' ', -0.98406058506244398)
('w', 0.4771155482050799)
(' ', -0.99999647163065986)
('s', 0.99995421584185984)
(' ', -0.86167016020888954)
('t', -0.19006165864219066)
(' ', -0.99999869546271736)
('t', -0.49068048332567965)
(' ', -0.92270134247109592)
('b', 0.16105263348730725)
(' ', -0.95883912182149478)
('s', 0.80590078258747466)
('t', -0.169801751476059)
