In [1]:
import theano
import theano.tensor as T
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use('fivethirtyeight')

# Training Algorithms

In [2]:
def adam_loves_theano(inp_list,cost,param_list,alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-7):
    """
    Build the two Theano functions that together perform one Adam
    (adaptive moment estimation) training step.

    Parameters
    ----------
    inp_list: List of Theano variables
        Whatever non-parameter things are needed to do a training step
    cost: Theano variable
        Objective function to minimize
    param_list: List of Theano shared variables
        The variables that are changed for optimization
    [alpha]: {0.001}
        Training parameter: learning rate
    [beta1]: {0.9}
        Training parameter: decay rate of the running mean of the gradient (m)
    [beta2]: {0.999}
        Training parameter: decay rate of the running mean of the squared gradient (v)
    [epsilon]: {1e-7}
        Training parameter: small constant added to sqrt(vHat) so the
        parameter update never divides by zero

    Outputs
    -------
    2 functions, which take the same inputs and must be called sequentially:
        f_adam_helpers (updates helpers; returns the cost)
        f_adam_train (uses updated helpers to update parameters in param_list; returns the cost)

    Notes
    -----
    See Kingma & Ba, *Adam: A Method for Stochastic Optimization*,
    arXiv:1412.6980.
    """
    # Two theano functions are created and must be called one after the other:
    # the first refreshes the shared helper variables (t, m, v), the second
    # combines the refreshed helpers into the actual parameter update.

    # Build each gradient expression once and reuse it for both m and v.
    grads = [T.grad(cost, p) for p in param_list]

    # Helper variables, one set per parameter (created once, at build time).
    # They are created with the parameter's own dtype: a float64 helper mixed
    # into the update of a float32 parameter would upcast the update, and
    # theano.function rejects updates whose type differs from the shared variable.
    Ts = [theano.shared(np.asarray(0., dtype=p.dtype)) for p in param_list] # t term in adam
    Ms = [theano.shared(np.zeros_like(p.get_value()))  for p in param_list] # m term in adam
    Vs = [theano.shared(np.zeros_like(p.get_value()))  for p in param_list] # v term in adam

    # Update rules of the helpers
    up_t = [(T_, T_ + 1) for T_ in Ts]
    up_m = [(M, beta1*M + (1-beta1)*g)      for M, g in zip(Ms, grads)]
    up_v = [(V, beta2*V + (1-beta2)*(g**2)) for V, g in zip(Vs, grads)]
    up_h = up_t + up_m + up_v
    # First function: advance t, m and v
    f_adam_helpers = theano.function(inp_list,cost,updates=up_h)

    # Second function (called right after the first during training):
    # bias-corrected moments, computed from the already-updated helpers.
    # t is at least 1 by the time this runs, so 1 - beta**t is non-zero
    # for 0 <= beta < 1.
    mHat = [m / (1-(beta1**t)) for m, t in zip(Ms,Ts)]
    vHat = [v / (1-(beta2**t)) for v, t in zip(Vs,Ts)]
    # Parameter update built from the corrected moments
    up_p = [(p, p - (alpha*mH / (T.sqrt(vH)+epsilon))) for p, mH, vH in zip(param_list,mHat,vHat)]
    f_adam_train = theano.function(inp_list,cost,updates=up_p)

    return f_adam_helpers, f_adam_train

In [3]:
def adadelta_fears_committment(inp_list,cost,param_list,rho=.95, epsilon=1e-6):
    """
    Build the two Theano functions that together perform one ADADELTA
    (adaptive learning rate) training step.

    Parameters
    ----------
    inp_list: List of Theano variables
        Whatever non-parameter things are needed to do a training step
    cost: Theano variable
        Objective function to minimize
    param_list: List of Theano shared variables
        The variables that are changed for optimization
    [rho]: {0.95}
        Training parameter: decay rate of both running averages
        (squared gradients and squared updates)
    [epsilon]: {1e-6}
        Training parameter: small constant added inside both square roots;
        it keeps the denominator non-zero and lets the very first update
        (when the running averages are still zero) be non-zero

    Outputs
    -------
    2 functions, which take the same inputs and must be called sequentially:
        f_adadelta_helpers (updates helpers; returns the cost)
        f_adadelta_train (uses updated helpers to update parameters in param_list; returns the cost)

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """

    ### = DESCRIPTION FROM LITERATURE

    # Build each gradient expression once; it feeds both helper update rules.
    grads = [T.grad(cost, p) for p in param_list]

    # Helper variables, one per parameter (created once, at build time).
    # zeros_like keeps the exact shape and dtype of each parameter.
    # Standard gradients: g_t
    zipped_grads   = [theano.shared(np.zeros_like(p.get_value())) for p in param_list]
    # Running expectation of squared update: E[ d[x]**2 ]_t
    running_up2    = [theano.shared(np.zeros_like(p.get_value())) for p in param_list]
    # Running expectation of squared gradient: E[g**2]_t
    running_grads2 = [theano.shared(np.zeros_like(p.get_value())) for p in param_list]

    ### Compute Gradient: g_t
    # Store the current gradient in the matching zipped_grads variable
    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]

    ### Accumulate Gradient: E[g**2]_t = rho * E[g**2]_t-1  +  (1-rho) * (g_t)**2
    rg2up = [(rg2, rho * rg2 + (1. - rho) * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    # Function that, when called, applies the two above update rules
    # (during training, this is called first, then f_adadelta_train)
    f_adadelta_helpers = theano.function(inp_list,cost,updates=zgup+rg2up)

    ### Compute Update: d[x]_t = - [ RMS(d[x])_t-1 / RMS(g)_t ] * g_t
    # Expressed in terms of the shared helpers, so it sees the gradient
    # statistics stored by f_adadelta_helpers and the previous step's running_up2
    updir = [-T.sqrt(ru2 + epsilon) / T.sqrt(rg2 + epsilon) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]

    ### Accumulate Update: E[ d[x]**2 ]_t = rho * E[ d[x]**2 ]_t-1  +  (1-rho) * (d[x]_t)**2
    ru2up = [(ru2, rho * ru2 + (1. - rho) * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]

    ### Apply Update: x_t+1 = x_t + d[x]_t
    param_up = [(p, p + ud) for p, ud in zip(param_list, updir)]

    # Function that actually moves the parameters (and refreshes running_up2)
    f_adadelta_train = theano.function(inp_list,cost, updates=ru2up + param_up)

    return f_adadelta_helpers, f_adadelta_train

In [4]:
def i_hate_SGD(inp_list,cost,param_list,alpha=0.01):
    """
    Plain stochastic gradient descent.

    Parameters
    ----------
    inp_list: List of Theano variables
        Whatever non-parameter things are needed to do a training step
    cost: Theano variable
        Objective function to minimize
    param_list: List of Theano variables
        The variables that are changed for optimization
    [alpha]: {0.01}
        Training parameter: learning rate

    Outputs
    -------
    f_SGD_train: function
        Takes the inputs in inp_list, returns the cost, and moves every
        parameter in param_list by -alpha times its gradient

    """
    # One (parameter, new value) pair per parameter: p <- p - alpha * d(cost)/d(p)
    update_rules = []
    for p in param_list:
        scaled_gradient = T.grad(cost, p) * alpha
        update_rules.append((p, p - scaled_gradient))
    # A single compiled function both evaluates the cost and applies the updates
    return theano.function(inp_list, cost, updates=update_rules)

# Network Component Builders

In [5]:
class LSTM_layer:
    """A layer of an LSTM network.

    Holds the weights and biases of the input gate (i), forget gate (f),
    cell candidate (c), output gate (o) and of a linear readout (y), and
    provides `step`, which advances the layer by one time step.
    """
    def __init__(self,n_inp,n_hidden,n_out):
        self.n_inp    = n_inp
        self.n_hidden = n_hidden
        self.n_out    = n_out

        # Weight matrices are (rows, fan_in) and drawn uniformly from
        # [-1/sqrt(fan_in), 1/sqrt(fan_in)]; rows defaults to n_hidden.
        def uniform_weights(fan_in, rows=n_hidden):
            bound = 1. / np.sqrt(fan_in)
            values = np.random.uniform(low=-bound, high=bound, size=(rows, fan_in))
            return theano.shared(values.astype(theano.config.floatX))

        # Biases start at zero; length defaults to n_hidden.
        def zero_bias(length=n_hidden):
            return theano.shared(np.zeros(length).astype(theano.config.floatX))

        # Input gate sees: inp, prev_y, prev_h, prev_c
        self.w_i = uniform_weights(n_inp + n_out + 2 * n_hidden)
        self.b_i = zero_bias()
        # Forget gate sees: inp, prev_h, prev_c (no prev_y, unlike the other gates)
        self.w_f = uniform_weights(n_inp + 2 * n_hidden)
        self.b_f = zero_bias()
        # Cell candidate sees: inp, prev_y, prev_h
        self.w_c = uniform_weights(n_inp + n_out + n_hidden)
        self.b_c = zero_bias()
        # Output gate sees: inp, prev_y, prev_h and the CURRENT c
        self.w_o = uniform_weights(n_inp + n_out + 2 * n_hidden)
        self.b_o = zero_bias()
        # Readout y (the real output of the layer) sees the current h only
        self.w_y = uniform_weights(n_hidden, n_out)
        self.b_y = zero_bias(n_out)

    def list_params(self):
        """Return every trainable shared variable, each weight followed by its bias."""
        weights = (self.w_i, self.w_f, self.w_c, self.w_o, self.w_y)
        biases  = (self.b_i, self.b_f, self.b_c, self.b_o, self.b_y)
        params = []
        for weight, bias in zip(weights, biases):
            params.extend([weight, bias])
        return params

    # One method per quantity computed at a time step
    def calc_i(self,combined_inputs):
        """Input gate: sigmoid of an affine map of combined_inputs."""
        pre_activation = T.dot(self.w_i, combined_inputs) + self.b_i
        return T.nnet.sigmoid(pre_activation)

    def calc_f(self,combined_inputs):
        """Forget gate: sigmoid of an affine map of combined_inputs."""
        pre_activation = T.dot(self.w_f, combined_inputs) + self.b_f
        return T.nnet.sigmoid(pre_activation)

    def calc_c(self,prev_c,curr_f,curr_i,combined_inputs):
        """New cell value: forget-gated old cell plus input-gated tanh candidate."""
        candidate = T.tanh(T.dot(self.w_c, combined_inputs) + self.b_c)
        return curr_f * prev_c + curr_i * candidate

    def calc_o(self,combined_inputs):
        """Output gate: sigmoid of an affine map of combined_inputs."""
        pre_activation = T.dot(self.w_o, combined_inputs) + self.b_o
        return T.nnet.sigmoid(pre_activation)

    def calc_h(self,curr_o,curr_c):
        """Hidden value: output gate times tanh of the current cell."""
        squashed_cell = T.tanh(curr_c)
        return curr_o * squashed_cell

    def calc_y(self,curr_h):
        """Layer output: affine readout of the current hidden value."""
        return T.dot(self.w_y, curr_h) + self.b_y

    def step(self, inp, prev_c, prev_h, prev_y):
        """Advance one time step and return the new (c, h, y)."""
        gate_i_input = T.concatenate((inp, prev_y, prev_h, prev_c))
        i = self.calc_i(gate_i_input)

        gate_f_input = T.concatenate((inp, prev_h, prev_c))
        f = self.calc_f(gate_f_input)

        candidate_input = T.concatenate((inp, prev_y, prev_h))
        c = self.calc_c(prev_c, f, i, candidate_input)

        # The output gate looks at the cell value that was just computed
        gate_o_input = T.concatenate((inp, prev_y, prev_h, c))
        o = self.calc_o(gate_o_input)

        h = self.calc_h(o, c)
        y = self.calc_y(h)
        return c, h, y

In [6]:
class LSTM_stack:
    """A stack of LSTM layers, each one reading the output sequence of the one below."""
    def __init__(self,inp_dim,layer_spec_list):
        # Each spec is indexed as (n_hidden, n_out). The first layer reads
        # inp_dim values per step; every later layer reads the n_out of the
        # layer directly below it.
        self.layers = []
        fan_in = inp_dim
        for spec in layer_spec_list:
            self.layers.append(LSTM_layer(fan_in, spec[0], spec[1]))
            fan_in = spec[1]

    def list_params(self):
        """Return the parameters of all layers, bottom layer first."""
        collected = []
        for layer in self.layers:
            collected.extend(layer.list_params())
        return collected

    def process(self,inp_sequence):
        """Run the whole sequence through the stack.

        Returns the concatenation of every layer's y at the last time step.
        """
        def zeros(length):
            return T.alloc(np.zeros(1).astype(theano.config.floatX), length)

        last_step_outputs = []
        curr_seq = inp_sequence
        for layer in self.layers:
            # Initial c, h and y, in the order layer.step takes them after inp
            out_init = [zeros(layer.n_hidden), zeros(layer.n_hidden), zeros(layer.n_out)]
            (cell_seq, hidden_seq, y_seq), _ = theano.scan(fn = layer.step,
                                                           sequences = curr_seq,
                                                           outputs_info = out_init)
            last_step_outputs.append(y_seq[-1])
            # The next layer up consumes this layer's full output sequence
            curr_seq = y_seq
        return T.concatenate(tuple(last_step_outputs))

In [7]:
class soft_reader:
    """A softmax readout layer: one weight matrix, no bias."""
    def __init__(self,n_in,n_out):
        # (n_out, n_in) weights drawn uniformly from [-1/sqrt(n_in), 1/sqrt(n_in)]
        limit = 1. / np.sqrt(n_in)
        initial = np.random.uniform(low=-limit, high=limit, size=(n_out, n_in))
        self.w = theano.shared(initial.astype(theano.config.floatX))

    def list_params(self):
        """Return the trainable parameters (just the weight matrix)."""
        return [self.w]

    def process(self,inp):
        """Apply the weights to inp and pass the result through a softmax."""
        scores = T.dot(self.w, inp)
        return T.nnet.softmax(scores)

# The Network Kingpin

In [8]:
class full_net:
    """The full input to output network: an LSTM stack followed by a softmax readout.

    Public attributes
    -----------------
    LSTM_stack, soft_reader: the two network components
    process: compiled function, input sequence -> vector of output probabilities
    cost: compiled function, (input sequence, target index) -> cost
    adam_step / adadelta_step: one training step on a single example
    """
    def __init__(self,inp_dim,LSTM_spec_list,final_out_size):
        # Build the LSTM stack
        self.LSTM_stack = LSTM_stack(inp_dim,LSTM_spec_list)
        # The stack hands over the last-step output of every layer, concatenated,
        # so the readout's input size is the sum of the layers' output sizes
        LSTM_out_size = sum(spec[1] for spec in LSTM_spec_list)
        # Build the softmax readout
        self.soft_reader = soft_reader(LSTM_out_size,final_out_size)

        ### ARTICULATE THE NETWORK GRAPH ###
        # Input is a sequence represented by a matrix. T.matrix follows
        # theano.config.floatX, the dtype used for the weights and for the
        # scan initial states; a hard-coded float64 input would change the
        # dtype of the recurrent states whenever floatX is float32.
        inpSeq = T.matrix('inp')
        # Output is a scalar indicating the correct answer
        target = T.iscalar('target')

        # Through the LSTM stack, then softmax; [0] takes the first row of
        # the softmax result
        y = self.LSTM_stack.process(inpSeq)
        p = self.soft_reader.process(y)[0]

        # Give this class a process function
        self.process = theano.function([inpSeq],p)

        # Cost is the negative log of the probability given to the correct answer
        # (this is like cross-entropy; because of the softmax it still depends
        # on the whole soft_reader weight matrix)
        cost = -T.log(p[target])
        ###

        ### For creating easy functions ###
        self.__p = p
        self.__cost = cost
        self.__inp_list = [inpSeq,target]
        self.__param_list = self.LSTM_stack.list_params() + self.soft_reader.list_params()
        # For just getting your cost on a training example
        self.cost = theano.function(self.__inp_list, self.__cost)
        # For making training functions
        self.__f_adam_helpers, self.__f_adam_train =\
            adam_loves_theano(self.__inp_list, self.__cost, self.__param_list) #adam
        self.__f_adadelta_helpers, self.__f_adadelta_train =\
            adadelta_fears_committment(self.__inp_list, self.__cost, self.__param_list) #adadelta

    # These methods wrap the required sequential calls into one training step.
    # NOTE: the parameter named T shadows the module-level theano.tensor alias
    # inside these two methods only; the names are kept for keyword callers.
    def adam_step(self,S,T):
        """One Adam step on input sequence S with target T; returns the train function's output."""
        self.__f_adam_helpers(S,T)
        return self.__f_adam_train(S,T)
    def adadelta_step(self,S,T):
        """One ADADELTA step on input sequence S with target T; returns the train function's output."""
        self.__f_adadelta_helpers(S,T)
        return self.__f_adadelta_train(S,T)

In [9]:
# One row per LSTM layer, read by LSTM_stack as (n_hidden, n_out): two layers, all sizes 5
spec_list = np.full((2, 2), 5, dtype='int')

In [10]:
# 5-dimensional inputs, the LSTM layers described by spec_list, 5 output classes
network = full_net(inp_dim=5, LSTM_spec_list=spec_list, final_out_size=5)



In [12]:
# List the attributes of the network; the double-underscore attributes set in
# full_net.__init__ show up name-mangled with a `_full_net` prefix
dir(network)

['LSTM_stack',
 '__doc__',
 '__init__',
 '__module__',
 '_full_net__cost',
 '_full_net__f_adadelta_helpers',
 '_full_net__f_adadelta_train',
 '_full_net__f_adam_helpers',
 '_full_net__f_adam_train',
 '_full_net__inp_list',
 '_full_net__p',
 '_full_net__param_list',
 'adadelta_step',
 'adam_step',
 'cost',
 'process',
 'soft_reader']

In [33]:
# A (5, 5) matrix of standard-normal draws used as a test input sequence.
# NOTE(review): the name `input` shadows the Python builtin for the rest of the
# session, and no random seed is set, so the values differ on every run.
input = np.random.normal(size=(5,5))
input.shape

(5, 5)

In [34]:
# Forward pass only (no training): returns the network's softmax output for this input
network.process(input)

array([ 0.20543645,  0.19191193,  0.19097494,  0.20620995,  0.20546673])