LSTM course from https://machinelearningmastery.com/long-short-term-memory-recurrent-neural-networks-mini-course
The Vanilla LSTM is defined as:
Input layer.
Fully connected LSTM hidden layer.
Fully connected output layer.
model = Sequential()
model.add(LSTM(..., input_shape=(...)))
model.add(Dense(...))

RNN Examples from http://karpathy.github.io/2015/05/21/rnn-effectiveness/


In [8]:
import tensorflow as tf
import numpy as np
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.models import Sequential

In [13]:
# The RNN class has some internal state that it gets to update every 
# time step() is called. In the simplest case this state consists of a 
# single hidden vector, h 
# The RNN's parameters are three matrices: W_hh, W_xh and W_hy
# The hidden state is self.h
# Activations are forced to the range (-1, 1) by tanh
# Input is x
class RNN:
    """Minimal recurrent cell with a single hidden vector as its state.

    The instance is expected to carry ``W_hh``, ``W_xh``, ``W_hy`` and ``h``
    as attributes before ``step`` is called; this class does not create them.
    """

    def step(self, x):
        """Advance the hidden state by one time step and return the output.

        ``self.h`` is overwritten with the new hidden state.
        """
        recurrent_term = np.dot(self.W_hh, self.h)
        input_term = np.dot(self.W_xh, x)
        # tanh squashes the pre-activation into (-1, 1)
        self.h = np.tanh(recurrent_term + input_term)
        # project the fresh hidden state to the output vector
        return np.dot(self.W_hy, self.h)
# use example
# rnn = RNN()
# x = np.array([0,1,2,3,4])
# y = rnn.step(x)

In [28]:
# Vanilla RNN example from https://gist.github.com/karpathy/d4dee566867f8291f086
# My conversion to python 3
# data i/o
# Read the whole training corpus into memory; the context manager makes sure
# the file handle is closed again instead of being left to the garbage collector.
with open('anh-crawl.txt', 'r') as corpus_file:
    data = corpus_file.read()
# Unique characters of the corpus. Sorting makes the character <-> index
# mapping reproducible between runs (plain set iteration order is not).
chars = sorted(set(data))
# corpus length and vocabulary size
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
# lookup tables: character -> integer index and integer index -> character
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}


data has 555 characters, 37 unique.


In [41]:
# set parameters 
# --- hyperparameters ---
hidden_size = 100     # number of neurons in the hidden layer
seq_length = 25       # number of time steps the RNN is unrolled for
learning_rate = 1e-1

# --- weights: Gaussian draws scaled by 0.01 (drawn in the order Wxh, Whh, Why) ---
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)    # input  -> hidden
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)   # hidden -> hidden
Why = 0.01 * np.random.randn(vocab_size, hidden_size)    # hidden -> output

# --- biases: column vectors initialised to zero ---
bh = np.zeros((hidden_size, 1))   # hidden bias
by = np.zeros((vocab_size, 1))    # output bias

In [48]:
def lossFun(inputs, targets, hprev):
    """
    inputs, targets are both lists of integers
    inputs = encoding of character as input
    targets = encoding of character which should be the output
    hprev is a H x 1 array of initial hidden state
    function returns the loss, gradients on model parameters
    and last hidden state
    """
    # create empty dictionaries for states
    xs, hs, ys, ps = {}, {}, {}, {}
    # copy contents of initial hidden state to last item in 
    # dictionary
    hs[-1] = np.copy(hprev)
    loss = 0
    # forward pass
    # loop over input sequence (time)
    for t in range(len(inputs)):
        # encode in 1 of k representation
        xs[t] = np.zeros((vocab_size, 1)) # initialise to 0
        xs[t][inputs[t]] = 1 # set value for this character to 1
        # hidden state
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
        # unnormalized log probabilities for next chars
        ys[t] = np.dot(Why, hs[t]) + by 
        # probabilities for next chars
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))
        # softmax (cross-entropy loss)
        loss += -np.log(ps[t][targets[t], 0])
        # backward pass: compute gradients going backwards
        dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
        dbh, dby = np.zeros_like(bh), np.zeros_like(by)
        dhnext = np.zeros_like(hs[0])
        for t in reversed(xrange(len(inputs))):
            dy = np.copy(ps[t])
            dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
            dWhy += np.dot(dy, hs[t].T)
            dby += dy
            dh = np.dot(Why.T, dy) + dhnext # backprop into h
            dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
            dbh += dhraw
            dWxh += np.dot(dhraw, xs[t].T)
            dWhh += np.dot(dhraw, hs[t-1].T)
            dhnext = np.dot(Whh.T, dhraw)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [47]:
def sample(h, seed_ix, n):
    """
    sample a sequence of integers from the model

    h is memory state, seed_ix is seed letter for first time step,
    n is the number of characters to generate.

    The model parameters Wxh, Whh, Why, bh, by and vocab_size are read
    from module scope. Returns a list of n sampled character indices.
    """
    # one-hot encode the seed character
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for _ in range(n):
        # same recurrence as the forward pass in training
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        # softmax over the output scores
        p = np.exp(y) / np.sum(np.exp(y))
        # draw the next character index from the predicted distribution
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        # feed the sampled character back in as the next input
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

0
1
2
