https://gist.github.com/karpathy/d4dee566867f8291f086
https://www.youtube.com/watch?v=cO0a0QYmFm8

Annotating and working through Karpathy's toy implementation of a character-level vanilla RNN (gist and video linked above).

In [49]:
"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np

# data I/O
# Read the whole training text into memory; the `with` block closes the file
# handle as soon as the text has been read.
with open('/users/momori/data/input.txt', 'r') as input_file: # should be simple plain text file
  data = input_file.read()
chars = list(set(data)) # the distinct characters of the text form the vocabulary
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))

# Create an ordering of the characters: every character gets an integer index
# (its position in `chars`), and ix_to_char is the inverse lookup.
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

# hyperparameters
hidden_size = 5 # size of hidden layer of neurons (previously 100)

# seq_length is the chunk size, in characters. If the input is very long
# (say a million time steps) the whole sequence cannot be kept in memory and
# backpropagated through at once, so it is handled in chunks instead.
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters (small random weights, zero biases)
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

def lossFun(inputs, targets, hprev):
  """
  Run one forward and backward pass of the RNN over a sequence.

  inputs, targets are both lists of integers (character indices) of equal
  length; targets[t] is the index the model is scored on at step t.
  hprev is the Hx1 array of the initial hidden state.

  Returns the loss (cross-entropy summed over all steps), the gradients
  dWxh, dWhh, dWhy, dbh, dby (each clipped element-wise to [-5, 5]) and the
  last hidden state. For empty inputs the loss is 0, the gradients are all
  zero and the returned hidden state is a copy of hprev.

  The model parameters Wxh, Whh, Why, bh, by and vocab_size are read from
  module scope.
  """
  xs, hs, ps = {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size, 1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    y = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    # Softmax is invariant to subtracting a constant from every logit, so
    # shift by the max: the largest exponent becomes exp(0) and np.exp can no
    # longer overflow to inf (which would turn the probabilities into NaN).
    shifted = y - np.max(y)
    exp_shifted = np.exp(shifted)
    exp_sum = np.sum(exp_shifted)
    ps[t] = exp_shifted / exp_sum # probabilities for next chars
    # cross-entropy -log(ps[t][target]) written in log-sum-exp form, which
    # stays finite even when the target probability underflows to 0
    loss += np.log(exp_sum) - shifted[targets[t], 0]
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[-1]) # hs[-1] always exists, even for empty inputs
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw) # gradient flowing into the previous step's hidden state
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

def sample(h, seed_ix, n):
  """
  Sample a sequence of integers from the model.

  h is the memory (hidden) state to start from, seed_ix is the index of the
  seed letter fed in at the first time step, and n is the number of indices
  to draw. Returns the list of the n sampled indices; every sampled index is
  fed back in as the input of the next step.

  The model parameters Wxh, Whh, Why, bh, by and vocab_size are read from
  module scope.
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for _ in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    # Subtract the max logit before exponentiating: the softmax value is
    # unchanged, but np.exp can no longer overflow and produce NaN
    # probabilities (which np.random.choice rejects).
    exp_y = np.exp(y - np.max(y))
    p = exp_y / np.sum(exp_y)
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1)) # one-hot encode the sampled index as the next input
    x[ix] = 1
    ixes.append(ix)
  return ixes

# Training driver: repeatedly pick a window of the text, compute loss and
# gradients with lossFun, and apply an Adagrad update to the parameters.
n, p = 0, 0 # iteration counter and data pointer
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
max_iterations = 500 # stop after this many parameter updates
while n < max_iterations:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p+seq_length+1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size,1)) # reset RNN memory
    p = 0 # go from start of data
  # Take a window of at most seq_length characters starting at p. The end is
  # clamped to len(data)-1 so that every input character still has a "next
  # character" target; this keeps inputs and targets the same length even
  # when the text is shorter than seq_length+1 characters.
  chunk_end = min(p + seq_length, len(data) - 1)
  inputs = [char_to_ix[ch] for ch in data[p:chunk_end]]
  targets = [char_to_ix[ch] for ch in data[p+1:chunk_end+1]]

  # sample from the model once, before any training has happened
  if n == 0:
    sample_ix = sample(hprev, inputs[0], 20)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt, ))

  # forward the window through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  print('hprev: %s' % (hprev,)) # trace how the hidden state evolves
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 100 == 0: print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam # accumulate squared gradients in place
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

  p += seq_length # move data pointer
  n += 1 # iteration counter

data has 5 characters, 4 unique.
----
 looeeeohlhhheeolhoee 
----
hprev: [[-0.00597143]
 [ 0.00227201]
 [ 0.01172229]
 [ 0.00349197]
 [-0.01454518]]
iter 0, loss: 34.628247
hprev: [[ 0.16555391]
 [-0.14672674]
 [ 0.12212079]
 [ 0.15643707]
 [-0.21012226]]
hprev: [[-0.17870843]
 [-0.36584117]
 [ 0.42624092]
 [ 0.14598626]
 [-0.5286047 ]]
hprev: [[-0.42464381]
 [-0.63340512]
 [ 0.61916431]
 [ 0.43895141]
 [-0.76824202]]
hprev: [[-0.6697973 ]
 [-0.73580534]
 [ 0.54755189]
 [ 0.71688776]
 [-0.8568557 ]]
hprev: [[-0.78615463]
 [-0.73059473]
 [ 0.30856502]
 [ 0.82730047]
 [-0.87479539]]
hprev: [[-0.84402058]
 [-0.67830635]
 [-0.09753681]
 [ 0.88035437]
 [-0.86813453]]
hprev: [[-0.87747224]
 [-0.64867313]
 [-0.32977771]
 [ 0.89594425]
 [-0.86694206]]
hprev: [[-0.89429714]
 [-0.53616212]
 [-0.54137864]
 [ 0.91242022]
 [-0.85035702]]
hprev: [[-0.90760038]
 [-0.54949223]
 [-0.46581168]
 [ 0.89172334]
 [-0.86904599]]
hprev: [[-0.91809924]
 [-0.22607949]
 [-0.7258855 ]
 [ 0.93278834]
 [-0.81972531

hprev: [[-0.97537992]
 [ 0.7992227 ]
 [-0.79848531]
 [ 0.90451064]
 [-0.93595816]]
hprev: [[-0.97545044]
 [ 0.80021457]
 [-0.79883789]
 [ 0.90462716]
 [-0.93613337]]
hprev: [[-0.97551987]
 [ 0.80118942]
 [-0.79918615]
 [ 0.90474291]
 [-0.93630596]]
hprev: [[-0.97558823]
 [ 0.80214774]
 [-0.79953018]
 [ 0.90485791]
 [-0.93647599]]
hprev: [[-0.97565554]
 [ 0.80308998]
 [-0.79987007]
 [ 0.90497215]
 [-0.93664352]]
hprev: [[-0.97572183]
 [ 0.80401659]
 [-0.80020593]
 [ 0.90508564]
 [-0.93680863]]
hprev: [[-0.97578714]
 [ 0.80492797]
 [-0.80053782]
 [ 0.90519839]
 [-0.93697136]]
hprev: [[-0.97585148]
 [ 0.80582455]
 [-0.80086583]
 [ 0.9053104 ]
 [-0.93713178]]
hprev: [[-0.97591487]
 [ 0.80670672]
 [-0.80119006]
 [ 0.90542168]
 [-0.93728995]]
hprev: [[-0.97597734]
 [ 0.80757486]
 [-0.80151056]
 [ 0.90553223]
 [-0.93744593]]
hprev: [[-0.97603892]
 [ 0.80842932]
 [-0.80182743]
 [ 0.90564207]
 [-0.93759975]]
hprev: [[-0.97609962]
 [ 0.80927047]
 [-0.80214072]
 [ 0.90575119]
 [-0.93775148]]
hpre

hprev: [[-0.98095652]
 [ 0.87308654]
 [-0.8308253 ]
 [ 0.91771468]
 [-0.95037359]]
hprev: [[-0.98097109]
 [ 0.87326793]
 [-0.83092419]
 [ 0.91776182]
 [-0.95041352]]
hprev: [[-0.98098558]
 [ 0.8734483 ]
 [-0.83102265]
 [ 0.91780878]
 [-0.95045326]]
hprev: [[-0.981     ]
 [ 0.87362767]
 [-0.83112068]
 [ 0.91785558]
 [-0.9504928 ]]
hprev: [[-0.98101434]
 [ 0.87380603]
 [-0.83121828]
 [ 0.9179022 ]
 [-0.95053216]]
hprev: [[-0.9810286 ]
 [ 0.8739834 ]
 [-0.83131546]
 [ 0.91794866]
 [-0.95057132]]
hprev: [[-0.98104279]
 [ 0.87415979]
 [-0.83141223]
 [ 0.91799495]
 [-0.9506103 ]]
hprev: [[-0.98105691]
 [ 0.87433521]
 [-0.83150858]
 [ 0.91804107]
 [-0.95064909]]
hprev: [[-0.98107095]
 [ 0.87450967]
 [-0.83160452]
 [ 0.91808703]
 [-0.9506877 ]]
hprev: [[-0.98108492]
 [ 0.87468317]
 [-0.83170005]
 [ 0.91813282]
 [-0.95072612]]
hprev: [[-0.98109882]
 [ 0.87485572]
 [-0.83179517]
 [ 0.91817845]
 [-0.95076437]]
hprev: [[-0.98111265]
 [ 0.87502734]
 [-0.8318899 ]
 [ 0.91822392]
 [-0.95080243]]
hpre

In [52]:
def sample(h, seed_ix, n):
  """
  Sample a sequence of integers from the model, printing intermediate values.

  h is the memory (hidden) state to start from, seed_ix is the index of the
  seed letter fed in at the first time step, and n is the number of indices
  to draw. Returns the list of the n sampled indices.

  Debug variant: prints the one-hot seed vector, then Wxh, Whh and h, and at
  every step the probability column vector p together with its flattened
  form. The model parameters Wxh, Whh, Why, bh, by and vocab_size are read
  from module scope.
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  print(x)
  ixes = []
  print('%s %s %s' % (Wxh, Whh, h))
  for _ in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh) # input to hidden layer calculation
    y = np.dot(Why, h) + by # hidden to output layer calculation
    # Subtract the max logit before exponentiating: the softmax value is
    # unchanged, but np.exp can no longer overflow and produce NaN
    # probabilities (which np.random.choice rejects).
    exp_y = np.exp(y - np.max(y))
    p = exp_y / np.sum(exp_y) # probability of each next character
    print('p %s %s' % (p, p.ravel()))
    # draw the next index at random, weighted by the predicted probabilities
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1)) # one-hot encode the sampled index as the next input
    x[ix] = 1
    ixes.append(ix)
  return ixes

# Draw two characters from the model, seeded with the first input character
# and the current hidden state, then show both the decoded text and the raw
# sampled indices.
samp = sample(hprev, inputs[0], 2)
txt = ''.join(ix_to_char[ix] for ix in samp)
print('%s %s' % (txt, samp))

[[ 1.]
 [ 0.]
 [ 0.]
 [ 0.]]
[[ 0.94270838  0.14435514 -0.53741642 -0.00264746]
 [ 1.062819   -0.64454494 -0.17670526 -0.01455252]
 [ 0.91767769  0.49805766 -0.28840531  0.00692118]
 [-0.66304976 -0.28711977  0.4631909   0.00535495]
 [ 1.18267922 -0.86737607 -0.64371182  0.00529882]] [[ 0.10268089  0.93785705 -0.68285824 -0.06944124  0.93291303]
 [-1.56040255 -0.26203712 -0.50552195  0.85895503 -0.34925457]
 [ 1.29239624  0.43814079  0.21233099 -0.59212146  0.49738124]
 [-1.49932358 -0.407987   -0.14157471  0.54926778 -0.42464609]
 [-0.63243438  0.16566009 -0.96595032  0.98870619  0.35625808]] [[-0.98300957]
 [ 0.89805046]
 [-0.84576079]
 [ 0.92517865]
 [-0.95619604]]
p [[  4.64984569e-02]
 [  8.55271538e-01]
 [  3.45743186e-04]
 [  9.78842616e-02]] [  4.64984569e-02   8.55271538e-01   3.45743186e-04   9.78842616e-02]
p [[  1.41376330e-03]
 [  9.91853243e-01]
 [  6.63993307e-03]
 [  9.30604032e-05]] [  1.41376330e-03   9.91853243e-01   6.63993307e-03   9.30604032e-05]
he [0, 1]


In [22]:
# Scratch cell: map the characters in data[p+1 : p+seq_length] to their
# vocabulary indices; the resulting list is displayed as the cell output.
[char_to_ix[ch] for ch in data[p+1:p+seq_length]]

[1, 2, 2, 3]




In [62]:
# Scratch cell: draw one value from 0..3 using an explicit probability
# vector. Entries with probability 0 (values 0 and 1) can never be drawn;
# value 3 carries probability 0.9.
np.random.choice(range(4), p= [0, 0, 0.1, 0.9])

3