 ## RNN Equations

1. Predict State

$ H_t = \phi(X_t W_{xh} + H_{t-1} W_{hh} + b_h) $

2. Predict Output from State

$ O_t = H_t W_{hy} + b_y $

## Points
* The state, H changes for every step

* The weight matrix is always same across time steps during prediction -> drastic parameter reduction

* 3 W, 2 b are the GD updateable parameters
* Inputs is usually 3D -> num_examples, time_steps, dimension
* dimension above is 1 -> univariate

* At any step t, computations are:
    1. concatenate $X_t, H_{t_1}$
    2. feed this to FC layer with weights $ W_{xh}, W_{hh}$ -> perform activation
    3. feed this to another FC output layer -> $O_t$
     

## Dimensions

### State Prediction Step
* $X_t$ = n * d (number of minibatch, input_size)

* $W_{xh}$ = d * h (input_size, hidden_state_size)
* $H_t$ = n * h (number of minibatch, hidden_state_size)

* $W_{hh}$ = h * h (hidden_state_size, hidden_state_size)
* $b_h$ = 1 * h (hidden_state_size) 


### Output Prediction Step
* $W_{yh}$ = h * q (hidden_state_size, output_size)
* $b_y$ = 1 * q  (output_size) 

## Forward Prop


In [1]:
from rnn_helper import *
import tensorflow as tf

In [2]:
n, d, h, q = 1, 3, 4, 1
x, Wxh = tf.random.normal((n, d), 0, 1), tf.random.normal((d, h), 0, 1)
h, Whh = tf.random.normal((n, h), 0, 1), tf.random.normal((h, h), 0, 1)
tf.matmul(x, Wxh) + tf.matmul(h, Whh)

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-2.1495504 ,  1.3647544 , -2.4028628 ,  0.20764503]],
      dtype=float32)>

In [3]:
# Concat x, h and Wxh, Whh
tf.matmul(tf.concat((x, h), 1), tf.concat((Wxh, Whh), 0))

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-2.1495504 ,  1.3647543 , -2.4028628 ,  0.20764506]],
      dtype=float32)>

In [4]:
batch_size, num_steps = 32, 35
train_iter, vocab = load_data_time_machine(batch_size, num_steps)

train_random_iter, vocab_random_iter = load_data_time_machine(
    batch_size, num_steps, use_random_iter=True)

In [7]:
def get_params(vocab_size, num_hidden):
    num_inputs = num_outputs = vocab_size

    def normal(shape):
        return tf.random.normal(shape = shape, mean = 0, stddev = 0.01, dtype=tf.float32)

    # Hidden Layers
    Wxh = tf.Variable(normal((num_inputs, num_hidden)), dtype = tf.float32)
    Whh = tf.Variable(normal((num_hidden, num_hidden)), dtype = tf.float32)
    Wyh = tf.Variable(normal((num_hidden, num_outputs)), dtype = tf.float32)

    bh = tf.Variable(tf.zeros(num_hidden), dtype = tf.float32)
    by = tf.Variable(tf.zeros(num_outputs), dtype = tf.float32)
    params = [Wxh, Whh, Wyh, bh, by]
    return params

In [8]:
def init_rnn_state(batch_size, num_hidden):
    return (tf.zeros(shape = (batch_size, num_hidden)))

In [9]:
def rnn(inputs, state, params):
    ''' inputs shape - (num_steps, batch_size, vocab_size)
    '''
    Wxh, Whh, Wyh, bh, by = params
    H = state
    outputs = []
    for X in inputs:
        # X.shape : (batch_size, vocab_size)
        X = tf.reshape(X, [-1, Wxh.shape[0]])
        H = tf.tanh(tf.matmul(X, Wxh) + tf.matmul(H, Whh) + bh)
        Y = tf.matmul(H, Wyh) + by
        outputs.append(Y)
    return tf.concat(outputs, axis = 0), (H, )

In [10]:
class RNNModelScratch:
    def __init__(self, vocab_size, num_hidden, init_state, forward_fn):
        self.vocab_size, self.num_hidden = vocab_size, num_hidden
        self.init_state, self.forward_fn = init_state, forward_fn

    def __call__(self, X, state, params):
        X = tf.one_hot(tf.transpose(X), self.vocab_size)
        X = tf.cast(X, tf.float32)
        return self.forward_fn(X, state, params)

    def begin_state(self, batch_size):
        return self.init_state(batch_size, self.num_hidden)

In [12]:
X = tf.reshape(tf.range(10), (2, 5))
num_hiddens = 512
model = RNNModelScratch(len(vocab), num_hiddens,
                        init_rnn_state, rnn)
state = model.begin_state(X.shape[0])
params = get_params(len(vocab), num_hiddens)
Y, new_state = model(X, state, params)
Y.shape, len(new_state), new_state[0].shape

(TensorShape([10, 28]), 1, TensorShape([2, 512]))

In [23]:
def predict_ch8(prefix, num_preds, model, vocab, params):  #@save
    """Generate new characters following the `prefix`."""
    state = model.begin_state(batch_size=1)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: tf.reshape(tf.constant([outputs[-1]]), (1, 1)).numpy()
    for y in prefix[1:]:  # Warm-up period
        _, state = model(get_input(), state, params)
        outputs.append(vocab[y])
    for _ in range(num_preds):  # Predict `num_preds` steps
        y, state = model(get_input(), state, params)
        print(y.numpy())
        outputs.append(int(y.numpy().argmax(axis=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])

In [24]:
predict_ch8('time traveller ', 10, model, vocab, params)

[[[[[[[[[[[[[[[[-1.0938584e-03 -8.3082268e-04
                 5.7275733e-03 -4.1611833e-03
                 7.3284667e-05  1.2659185e-03
                 5.7526413e-05  1.1367769e-03
                 1.8775004e-03  3.4535432e-03
                 1.6019269e-03  4.9708481e-04
                 2.2623630e-03 -3.1120356e-03
                 9.6344424e-04  2.0565109e-03
                -2.6395558e-03 -3.0460595e-03
                -3.9546560e-03  5.4238751e-03
                 1.1826421e-03 -1.5820157e-03
                -7.2282576e-04  4.4794474e-04
                -1.4377866e-04  3.9684842e-03
                -2.3874268e-03 -1.6274324e-03]]]]]]]]]]]]]]]]


ValueError: cannot reshape array of size 28 into shape (1,)