#Long Short Term Memory (LSTM) - Overcoming limitations of RNN

##What you'll learn
1.  What an LSTM is
2.  How the mechanisms of an LSTM cure vanishing gradients
3.  How to code an LSTM in Theano
4.  Performance improvements and extensions

##Readings

http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf - Hochreiter's original paper on LSTM

http://arxiv.org/pdf/1410.4615.pdf - Sutskever SOTA LSTM - Feb 2015

http://arxiv.org/pdf/1506.00019v4.pdf - Lipton's excellent survey of RNN

https://www.youtube.com/watch?v=izGl1YSH_JA - Hinton video on LSTM



##LSTM background

http://arxiv.org/pdf/1506.00019v4.pdf - architecture of LSTM

https://www.youtube.com/watch?v=izGl1YSH_JA - Hinton's video gives a clear animation of a signal entering the LSTM, waiting in the LSTM until it is useful, the gradient calculation, and the signal leaving the LSTM.  Anyone bring popcorn?

The LSTM was invented by Hochreiter and Schmidhuber to overcome some of the issues with conventional RNNs.  The forget gate was introduced later by Gers et al.  Code for an LSTM is below.

In [1]:
__author__ = 'mike.bowles'
#based on  code from https://gist.github.com/tmramalho/5e8fda10f99233b2370f
import theano
import theano.tensor as T
import numpy as np
import cPickle as pickle
import random
import matplotlib.pyplot as plt

class RNN(object):
    """LSTM network with a linear read-out, trained one sequence at a time.

    Despite the class name, the recurrence in ``recurrent_fn`` is an LSTM
    cell: a tanh candidate ``g``, sigmoid input/forget/output gates
    ``i``/``f``/``o``, a cell state ``s`` and a hidden state ``h``.  The
    prediction is a linear map of the hidden state at the last time step,
    and the cost is the squared error against a scalar target.

    After construction the instance exposes:
        activ1     -- gate non-linearity (sigmoid)
        activ2     -- candidate / cell-output non-linearity (tanh)
        train_step -- compiled function ``train_step(u, t, lr)``; ``u`` is a
                      matrix whose rows are time steps, ``t`` the scalar
                      target and ``lr`` the step size.  It returns the cost
                      and applies one gradient-descent update.
    """

    def __init__(self, nin, n_hidden, nout):
        """Create the shared parameters and compile ``train_step``.

        nin      -- number of input features per time step
        n_hidden -- length of the hidden and cell state vectors
        nout     -- number of units in the linear output layer
        """
        float_x = theano.config.floatX
        rng = np.random.RandomState(1234)

        def small_gaussian(rows, cols):
            # zero-mean Gaussian initial weights, standard deviation 0.01
            return np.asarray(rng.normal(size=(rows, cols), scale=.01, loc=0.0),
                              dtype=float_x)

        def zero_vector(length):
            return np.zeros((length,), dtype=float_x)

        # The seeded generator is consumed in a fixed order so every run
        # starts from the same weights: for each of g (cell input),
        # i (input gate), f (forget gate) and o (output gate) the input
        # weights are drawn before the recurrent weights; the output layer
        # comes last.  Biases start at zero and consume no random numbers.
        initial = []
        for gate in 'gifo':
            initial.append(('W_u' + gate, small_gaussian(nin, n_hidden)))
            initial.append(('W_h' + gate, small_gaussian(n_hidden, n_hidden)))
            initial.append(('b_' + gate, zero_vector(n_hidden)))
        initial.append(('W_hy', small_gaussian(n_hidden, nout)))
        initial.append(('b_hy', zero_vector(nout)))

        # One named shared variable per parameter, in the order expected by
        # recurrent_fn's positional arguments.
        params = [theano.shared(value, name=label) for label, value in initial]
        W_hy, b_hy = params[-2:]

        self.activ1 = T.nnet.sigmoid
        self.activ2 = T.tanh

        lr = T.scalar()
        u = T.matrix()
        t = T.scalar()

        # hidden and cell state both start from zero for every sequence
        h0_tm1 = theano.shared(zero_vector(n_hidden))
        s0_tm1 = theano.shared(zero_vector(n_hidden))

        # scan steps through the rows of u, threading (h, s) from one step
        # to the next; the parameters are handed over as non-sequences.
        states, _ = theano.scan(self.recurrent_fn,
                                sequences=u,
                                outputs_info=[h0_tm1, s0_tm1],
                                non_sequences=params)
        hidden_seq = states[0]

        # only the hidden state of the final time step feeds the output layer
        y = T.dot(hidden_seq[-1], W_hy) + b_hy
        cost = ((t - y) ** 2).mean(axis=0).sum()

        # plain gradient descent: every parameter moves against its gradient
        gradients = T.grad(cost, params)
        update = [(p, p - lr * g) for p, g in zip(params, gradients)]

        self.train_step = theano.function([u, t, lr], cost,
                                          on_unused_input='warn',
                                          updates=update,
                                          allow_input_downcast=True)

    def recurrent_fn(self, u_t, h_tm1, s_tm1, W_ug, W_hg, b_g, W_ui, W_hi,
                     b_i, W_uf, W_hf, b_f, W_uo, W_ho, b_o, W_hy, b_hy):
        """Advance the LSTM cell by one time step.

        u_t          -- input at the current step
        h_tm1, s_tm1 -- hidden state and cell state from the previous step
        W_u*, W_h*, b_* -- input weights, recurrent weights and bias of the
                        cell input (g) and the input (i), forget (f) and
                        output (o) gates
        W_hy, b_hy   -- output-layer parameters; accepted because scan passes
                        every non-sequence, but not used in the recurrence

        Returns ``[h_t, s_t]``, the new hidden state and cell state.
        """
        def affine(W_u, W_h, b):
            # shared pre-activation: input term + recurrent term + bias
            return T.dot(u_t, W_u) + T.dot(h_tm1, W_h) + b

        candidate = self.activ2(affine(W_ug, W_hg, b_g))
        write_gate = self.activ1(affine(W_ui, W_hi, b_i))
        keep_gate = self.activ1(affine(W_uf, W_hf, b_f))
        read_gate = self.activ1(affine(W_uo, W_ho, b_o))

        # new cell state: gated candidate plus the gated previous cell state
        s_t = candidate * write_gate + s_tm1 * keep_gate
        # hidden state: squashed cell state scaled by the output gate
        h_t = self.activ2(s_t) * read_gate
        return [h_t, s_t]

if __name__ == '__main__':
    # Train the network on the pickled (inputs, targets) pair and plot a
    # smoothed error curve.
    #
    # NOTE: unpickling can execute arbitrary code; only load a stockTT.bin
    # that comes from a trusted source.
    with open('stockTT.bin', 'rb') as data_file:
        (xlist, ylist) = pickle.load(data_file)

    nInputs = len(xlist[0])
    x = np.array(xlist, dtype=theano.config.floatX)
    y = np.array(ylist, dtype=theano.config.floatX)
    if len(x) != len(y):
        raise ValueError("stockTT.bin holds {0} inputs but {1} targets".format(len(x), len(y)))

    nHidden = 20
    nOutputs = 1
    rnn = RNN(nInputs, nHidden, nOutputs)
    lr = 0.01
    e = 1.0          # running (exponentially smoothed) root of the cost
    nPasses = 1
    vals = []
    for _ in range(nPasses):
        for j, (row, t) in enumerate(zip(x, y)):
            # each sample is presented as a sequence of a single time step
            u = row.reshape((1, nInputs))

            c = rnn.train_step(u, t, lr)
            if j % 10 == 0:
                # single-argument call form prints identically on Python 2 and 3
                print("iteration {0}: {1}".format(j, np.sqrt(c)))
            e = 0.1 * np.sqrt(c) + 0.9 * e
            vals.append(e)
    plt.plot(vals)
    plt.show()

iteration 0: 1.27395105362
iteration 10: 1.62759017944
iteration 20: 12.0258903503
iteration 30: 0.965957164764
iteration 40: 4.66629219055
iteration 50: 2.24896335602
iteration 60: 17.6596603394
iteration 70: 33.0507698059
iteration 80: 1.89690697193
iteration 90: 3.80808353424
iteration 100: 5.77995252609
iteration 110: 3.99296855927
iteration 120: 17.7919807434
iteration 130: 12.9250450134
iteration 140: 4.25853300095
iteration 150: 1.78161656857
iteration 160: 8.44487190247
iteration 170: 2.07122850418
iteration 180: 8.16769695282
iteration 190: 0.601239681244
iteration 200: 5.48171377182
iteration 210: 3.97003507614
iteration 220: 11.6794128418
iteration 230: 2.60186719894
iteration 240: 2.22204780579
iteration 250: 3.94717645645
iteration 260: 0.741184353828
iteration 270: 8.20735359192
iteration 280: 1.59923231602
iteration 290: 1.69198179245
iteration 300: 2.44786214828
iteration 310: 5.33109617233
iteration 320: 7.50127792358
iteration 330: 1.03071403503
iteration 340: 1.96485

  from scan_perform.scan_perform import *


##Q's
1.  Make parametric changes to the LSTM and see how its performance changes.
2.  Add the "peephole" connections mentioned in Lipton's survey.

##HW
Build a multi-layer LSTM using the stacking method from the Pascanu et al. paper on deep RNNs.