In [12]:
import numpy as np
#-----------------------------------------------------------------------------------------
class RNN:
    """Character-level recurrent network with a single hidden unit.

    The vocabulary is derived from the fixed training text ``'hello'``.
    Weights are ``None`` until they are either learned with ``train`` or
    installed with ``load_weights``.

    Attributes:
        text: the training string.
        vocabulary: sorted list of the distinct characters of ``text``.
        vocab_size: number of entries in ``vocabulary``.
        W_x_to_hidden: input-to-hidden weights, shape (1, vocab_size).
        W_hidden_to_hidden: recurrent weight, shape (1, 1).
        W_hidden_to_y: hidden-to-output weights, shape (vocab_size, 1).
        bias_hidden: hidden bias, shape (1, 1).
        bias_output: output bias, shape (vocab_size, 1).
        h: current hidden state; the nested list ``[[0.]]`` after a reset,
            a (1, 1) array after a call to ``step``.
    """

    def __init__(self):
        self.text = 'hello'
        self.vocabulary = sorted(set(self.text))
        self.vocab_size = len(self.vocabulary)

        self.W_x_to_hidden = None
        self.W_hidden_to_hidden = None
        self.W_hidden_to_y = None

        self.bias_hidden = None
        self.bias_output = None

        self.h = [[0.]]
    #-----------------------------------------------------------------------------------------
    def clone(self):
        """Return a new RNN with the same vocabulary and weights.

        The weight and bias arrays are shared by reference, not copied.
        The clone starts with a zero hidden state; the hidden state of the
        network being cloned is left untouched.
        """
        ret = RNN()
        ret.text = self.text
        ret.vocabulary = self.vocabulary
        ret.vocab_size = self.vocab_size

        ret.W_x_to_hidden = self.W_x_to_hidden
        ret.W_hidden_to_hidden = self.W_hidden_to_hidden
        ret.W_hidden_to_y = self.W_hidden_to_y

        ret.bias_hidden = self.bias_hidden
        ret.bias_output = self.bias_output

        # Reset the clone, not the source (the original code assigned self.h here).
        ret.h = [[0.]]
        return ret
    #-----------------------------------------------------------------------------------------
    def reset(self):
        """Zero the hidden state and return its scalar value (0.0)."""
        self.h = [[0.]]
        return self.h[0][0]
    #-----------------------------------------------------------------------------------------
    def load_weights(self):
        """Install a fixed, hard-coded set of weights and zero the hidden state."""
        self.W_x_to_hidden = np.array([[ 3.6, -4.8,  0.35, -0.26]])
        self.W_hidden_to_hidden = np.array([[ 4.1]])
        self.W_hidden_to_y = np.array([[-12.], [ -0.67], [ -0.85], [ 14.]])
        self.bias_hidden = np.array([[ 0.41]])
        self.bias_output = np.array([[-0.2], [-2.9], [ 6.1], [-3.4]])

        self.h = [[0.]]
    #-----------------------------------------------------------------------------------------
    def _one_hot(self, char):
        """Return the (vocab_size, 1) one-hot column for ``char``.

        Raises ValueError when ``char`` is not in the vocabulary.
        """
        column = np.zeros((self.vocab_size, 1))
        column[self.vocabulary.index(char)] = 1
        return column
    #-----------------------------------------------------------------------------------------
    @staticmethod
    def _softmax(logits):
        """Softmax over all entries of ``logits``.

        The maximum is subtracted first so that large logits cannot overflow
        ``exp``; the result is mathematically unchanged.
        """
        exps = np.exp(logits - np.max(logits))
        return exps / np.sum(exps)
    #-----------------------------------------------------------------------------------------
    def step(self, x_char):
        """Feed one character, update the hidden state and predict the next character.

        Args:
            x_char: a single character contained in the vocabulary.

        Returns:
            Tuple ``(y_char, h)``: the most probable next character and the
            scalar value of the updated hidden state.

        Raises:
            AssertionError: if ``x_char`` is not a ``str``.
            ValueError: if ``x_char`` is not in the vocabulary.
        """
        assert isinstance(x_char, str)
        x_one_hot = self._one_hot(x_char)

        hidden_state = np.tanh(
              np.dot(self.W_x_to_hidden, x_one_hot)
            + np.dot(self.W_hidden_to_hidden, self.h)
            + self.bias_hidden)

        y_output = np.dot(self.W_hidden_to_y, hidden_state) + self.bias_output
        y_prob_after_softmax = self._softmax(y_output)
        y_char = self.vocabulary[int(np.argmax(y_prob_after_softmax))]

        self.h = hidden_state # important to not forget to update the hidden state

        return y_char, self.h[0][0]
    #-----------------------------------------------------------------------------------------
    def loss(self, x_train, y_train, h_prev):
        """Cross-entropy loss and clipped gradients for one sequence.

        Runs a forward pass over the sequence, then backpropagates through
        time. ``self.h`` is neither read nor modified.

        Args:
            x_train: sequence of input characters.
            y_train: sequence of target characters, at least as long as ``x_train``.
            h_prev: hidden state preceding the first input, shape (1, 1).

        Returns:
            Tuple ``(loss_value, dWxh, dWhh, dWhy, dbh, dby, h_last)``.
            Each gradient has the shape of its parameter and is clipped to
            [-5, 5]. ``h_last`` is the hidden state after the final input,
            or a copy of ``h_prev`` when the sequence is empty.
        """
        steps = len(x_train)
        xs, hs, ps, target_index = {}, {}, {}, {}
        hs[-1] = np.array(h_prev, dtype=float)  # copy; the caller's array is never mutated
        loss_value = 0.

        # Forward propagation
        for t, x_char in enumerate(x_train):
            xs[t] = self._one_hot(x_char)
            target_index[t] = self.vocabulary.index(y_train[t])

            hs[t] = np.tanh(
                  np.dot(self.W_x_to_hidden, xs[t])
                + np.dot(self.W_hidden_to_hidden, hs[t - 1])
                + self.bias_hidden)

            ps[t] = self._softmax(np.dot(self.W_hidden_to_y, hs[t]) + self.bias_output)

            loss_value += -np.log(ps[t][target_index[t], 0])

        dWhy = np.zeros_like(self.W_hidden_to_y)
        dWxh = np.zeros_like(self.W_x_to_hidden)
        dWhh = np.zeros_like(self.W_hidden_to_hidden)

        dbh = np.zeros_like(self.bias_hidden)
        dby = np.zeros_like(self.bias_output)
        # Shaped from hs[-1] so that an empty sequence does not hit an unbound name.
        dh_next = np.zeros_like(hs[-1])

        # Backpropagation through time
        for t in reversed(range(steps)):
            # Gradient of softmax + cross-entropy w.r.t. the logits: p - one_hot(target)
            dy = np.copy(ps[t])
            dy[target_index[t]] -= 1

            dWhy += np.dot(dy, hs[t].T)
            dby += dy

            dh = np.dot(self.W_hidden_to_y.T, dy) + dh_next

            # tanh derivative and backprop
            dh_raw = (1 - hs[t] ** 2) * dh

            dbh += dh_raw
            dWxh += np.dot(dh_raw, xs[t].T)
            dWhh += np.dot(dh_raw, hs[t - 1].T)

            dh_next = np.dot(self.W_hidden_to_hidden.T, dh_raw)

        for dp in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dp, -5, 5, out=dp) # mitigate exploding gradients

        # Return the LAST hidden state; the backward loop ends at t == 0, so
        # reusing its loop variable would hand back the first one instead.
        return loss_value, dWxh, dWhh, dWhy, dbh, dby, hs[steps - 1]
    #-----------------------------------------------------------------------------------------
    def print_weights(self):
        """Print the three weight matrices followed by the two bias vectors."""
        print(self.W_x_to_hidden)
        print(self.W_hidden_to_hidden)
        print(self.W_hidden_to_y)

        print(self.bias_hidden)
        print(self.bias_output)
    #-----------------------------------------------------------------------------------------
    def train(self, show=False, learning_rate=0.01, max_iterations=10000000,
              report_every=10000, tolerance=0.01, seed=None):
        """Re-initialise the weights and fit them to ``self.text`` by gradient descent.

        Every iteration processes the whole text starting from a zero hidden
        state. Every ``report_every`` iterations the loss is printed and
        compared with the previously reported loss; training stops once the
        two differ by less than ``tolerance``.

        Args:
            show: print the final weights when true.
            learning_rate: gradient-descent step size.
            max_iterations: upper bound on the number of iterations.
            report_every: reporting / convergence-check interval (>= 1).
            tolerance: convergence threshold on the reported loss.
            seed: optional seed for the weight initialisation. ``None`` draws
                from NumPy's global random state.

        Raises:
            ValueError: if ``report_every`` is smaller than 1.
        """
        if report_every < 1:
            raise ValueError('report_every must be >= 1')

        # Both the numpy.random module and RandomState expose randn().
        rng = np.random if seed is None else np.random.RandomState(seed)

        self.W_x_to_hidden = rng.randn(1, self.vocab_size)
        self.W_hidden_to_hidden = np.array([[0.]])
        self.W_hidden_to_y = rng.randn(self.vocab_size, 1)

        self.bias_hidden = rng.randn(1, 1)
        self.bias_output = rng.randn(self.vocab_size, 1)

        text = list(self.text)
        inputs = text[:len(text) - 1]
        targets = text[1:]

        # The arrays are updated in place, so this list stays valid for the whole run.
        params = [
            self.W_x_to_hidden,
            self.W_hidden_to_hidden,
            self.W_hidden_to_y,
            self.bias_hidden,
            self.bias_output]

        loss_value = None
        prev_loss = None

        for n in range(max_iterations):
            h_prev = np.zeros((1, 1))
            loss_value, dWxh, dWhh, dWhy, dbh, dby, _ = self.loss(inputs, targets, h_prev)

            if n % report_every == 0:
                print('iter {}, loss: {}, prev_loss: {}'.format(n, loss_value, prev_loss))
                if prev_loss is not None and np.abs(prev_loss - loss_value) < tolerance:
                    break
                prev_loss = loss_value

            for param, dparam in zip(params, [dWxh, dWhh, dWhy, dbh, dby]):
                param += -learning_rate * dparam

        print('loss: {}'.format(loss_value))

        if show:
            self.print_weights()

In [13]:
# Train a fresh network on its built-in text.
n = RNN()
n.train()

# Start decoding from a zero hidden state and show the seed character.
n.reset()
print((n.text[0], n.h))

# Feed every character except the last and print (predicted next char, hidden state).
# The result is called `prediction` so the builtin `next` is not shadowed
# for the rest of the kernel session.
for c in n.text[:-1]:
    prediction = n.step(c)
    print(prediction)

iter 0, loss: 6.15267754818036, prev_loss: None
iter 10000, loss: 0.15620661114649903, prev_loss: 6.15267754818036
iter 20000, loss: 0.06610570506554893, prev_loss: 0.15620661114649903
iter 30000, loss: 0.04055294504242273, prev_loss: 0.06610570506554893
iter 40000, loss: 0.3456545078401454, prev_loss: 0.04055294504242273
iter 50000, loss: 0.26247759258899683, prev_loss: 0.3456545078401454
iter 60000, loss: 0.016521952254129908, prev_loss: 0.26247759258899683
iter 70000, loss: 0.012272806561888484, prev_loss: 0.016521952254129908
loss: 0.012272806561888484
('h', [[0.0]])
('e', -0.99978868376414132)
('l', -0.056773616904606111)
('l', 0.27364530407626697)
('o', 0.94466414990500724)


In [None]:
# Network with the hard-coded weights instead of trained ones.
er = RNN()
er.load_weights()

er.reset()

# Seed with the first character of the training text. The vocabulary only
# contains the letters of er.text, so seeding with a character outside it
# (such as 'i') makes step() raise ValueError.
seed_char = er.text[0]
print((seed_char, er.h))

# Feed each prediction back in as the next input, four steps in total.
prediction = er.step(seed_char)
print(prediction)
for _ in range(3):
    prediction = er.step(prediction[0])
    print(prediction)


## Calculations for the presentation

In [7]:
# Fresh network for the step-by-step calculations below.
n = RNN()
# Install the hard-coded weights (this also zeroes the hidden state).
n.load_weights()
# Explicitly start from a zero hidden state.
n.reset()

### Input 'h'

In [10]:
# One-hot column for 'h' (index 1 of the sorted vocabulary e, h, l, o).
x = np.array([[0], [1], [0], [0]])

# Pre-activation: recurrent term + input term + bias, then squash with tanh.
h_input = n.W_hidden_to_hidden.dot(n.h) + n.W_x_to_hidden.dot(x) + n.bias_hidden
n.h = np.tanh(h_input)
n.h

array([[-0.99969249]])

In [12]:
# Output logits: project the hidden state and add the output bias.
y = n.W_hidden_to_y.dot(n.h) + n.bias_output
y

array([[ 11.79630989],
       [ -2.23020603],
       [  6.94973862],
       [-17.39569488]])

In [14]:
# Softmax: exponentiate the logits and normalise them to sum to one.
p = np.exp(y) / np.exp(y).sum()
p

array([[  9.92205162e-01],
       [  8.03457698e-07],
       [  7.79403445e-03],
       [  2.08293106e-13]])

### Input 'e'

In [15]:
# One-hot column for 'e' (index 0 of the sorted vocabulary e, h, l, o).
x = np.array([[1], [0], [0], [0]])

# Pre-activation: recurrent term + input term + bias, then squash with tanh.
h_input = n.W_hidden_to_hidden.dot(n.h) + n.W_x_to_hidden.dot(x) + n.bias_hidden
n.h = np.tanh(h_input)
n.h

array([[-0.08850701]])

In [16]:
# Output logits: project the hidden state and add the output bias.
y = n.W_hidden_to_y.dot(n.h) + n.bias_output
y

array([[ 0.86208418],
       [-2.8407003 ],
       [ 6.17523096],
       [-4.63909821]])

In [17]:
# Softmax: exponentiate the logits and normalise them to sum to one.
p = np.exp(y) / np.exp(y).sum()
p

array([[  4.90155904e-03],
       [  1.20846860e-04],
       [  9.94957586e-01],
       [  2.00078804e-05]])

### Input 'l'

In [18]:
# One-hot column for 'l' (index 2 of the sorted vocabulary e, h, l, o).
x = np.array([[0], [0], [1], [0]])

# Pre-activation: recurrent term + input term + bias, then squash with tanh.
h_input = n.W_hidden_to_hidden.dot(n.h) + n.W_x_to_hidden.dot(x) + n.bias_hidden
n.h = np.tanh(h_input)
n.h

array([[ 0.37748309]])

In [19]:
# Output logits: project the hidden state and add the output bias.
y = n.W_hidden_to_y.dot(n.h) + n.bias_output
y

array([[-4.72979711],
       [-3.15291367],
       [ 5.77913937],
       [ 1.88476329]])

In [20]:
# Softmax: exponentiate the logits and normalise them to sum to one.
p = np.exp(y) / np.exp(y).sum()
p

array([[  2.67428257e-05],
       [  1.29431227e-04],
       [  9.79896974e-01],
       [  1.99468524e-02]])

### Input 'l' (second occurrence)

In [21]:
# One-hot column for 'l' (index 2 of the sorted vocabulary e, h, l, o).
x = np.array([[0], [0], [1], [0]])

# Same input as the previous step; only the carried-over hidden state differs.
h_input = n.W_hidden_to_hidden.dot(n.h) + n.W_x_to_hidden.dot(x) + n.bias_hidden
n.h = np.tanh(h_input)
n.h

array([[ 0.98039683]])

In [22]:
# Output logits: project the hidden state and add the output bias.
y = n.W_hidden_to_y.dot(n.h) + n.bias_output
y

array([[-11.96476199],
       [ -3.55686588],
       [  5.26666269],
       [ 10.32555565]])

In [23]:
# Softmax: exponentiate the logits and normalise them to sum to one.
p = np.exp(y) / np.exp(y).sum()
p

array([[  2.07342092e-10],
       [  9.29373665e-07],
       [  6.31248169e-03],
       [  9.93686589e-01]])

In [11]:
# Sorted distinct characters of the phrase (set() accepts the string directly).
sorted(set('hello world'))

[' ', 'd', 'e', 'h', 'l', 'o', 'r', 'w']

In [15]:
# Number of distinct characters (including the space) in the phrase.
len(set('it was the best'))

9

In [16]:
# Number of distinct characters (including the space) in the phrase.
len(set('hello world'))

8