In [647]:
import numpy as np

In [415]:
def relu(x,derivative=False):
    """Rectified linear unit, applied element-wise.

    Parameters
    ----------
    x : numpy.ndarray or scalar
        Pre-activation values.
    derivative : bool, optional
        When True, return the derivative of ReLU evaluated at ``x``
        (1 where ``x > 0``, 0 elsewhere) instead of the activation.

    Returns
    -------
    numpy.ndarray
        ``x`` where ``x > 0`` and 0 elsewhere, or the derivative mask.
    """
    if derivative:
        # This mask must be returned explicitly; without the ``return`` the
        # function falls through and hands back the forward activation.
        return np.where(x>0,1.,0.)
    return np.where(x>0,x,0.)

def sigmoid(x,derivative=False):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, applied element-wise.

    Parameters
    ----------
    x : numpy.ndarray or scalar
        Pre-activation values.
    derivative : bool, optional
        When True, return ``s * (1 - s)`` where ``s`` is the sigmoid of
        ``x``; note that ``x`` is still the pre-activation in that case.

    Returns
    -------
    numpy.ndarray or scalar
        The sigmoid of ``x`` or its derivative with respect to ``x``.
    """
    denominator = 1+np.exp(-x)
    activation = 1/denominator
    if not derivative:
        return activation
    return activation*(1-activation)

def tanh(x,derivative=False):
    """Hyperbolic tangent, applied element-wise.

    Parameters
    ----------
    x : numpy.ndarray or scalar
        Pre-activation values.
    derivative : bool, optional
        When True, return ``1 - tanh(x)**2`` instead of ``tanh(x)``; note
        that ``x`` is still the pre-activation in that case.

    Returns
    -------
    numpy.ndarray or scalar
        ``tanh(x)`` or its derivative with respect to ``x``.
    """
    # np.tanh saturates cleanly at +/-1. The hand-written form
    # (exp(2x)-1)/(exp(2x)+1) evaluates to inf/inf = nan for large positive x.
    f = np.tanh(x)
    if derivative:
        return 1-f**2
    return f

def elu(x,derivative=False):
    """Exponential linear unit (alpha fixed at 1), applied element-wise.

    Parameters
    ----------
    x : numpy.ndarray or scalar
        Pre-activation values.
    derivative : bool, optional
        When True, return the derivative at ``x``: 1 where ``x > 0`` and
        ``exp(x)`` elsewhere.

    Returns
    -------
    numpy.ndarray
        ``x`` where ``x > 0`` and ``exp(x) - 1`` elsewhere, or the derivative.
    """
    if derivative:
        # This must be returned explicitly; without the ``return`` the
        # function falls through and hands back the forward activation.
        return np.where(x>0,1,np.exp(x))
    return np.where(x>0,x,np.exp(x)-1)

def linear(x,derivative=False):
    """Identity activation.

    Parameters
    ----------
    x : numpy.ndarray or scalar
        Pre-activation values.
    derivative : bool, optional
        When True, return the derivative of the identity, i.e. an array of
        ones shaped like ``x``.

    Returns
    -------
    numpy.ndarray or scalar
        ``x`` itself, or a float array of ones with the shape of ``x``.
    """
    if derivative:
        # d/dx of x is 1 everywhere (the name ``sign`` was never defined, so
        # the previous ``sign(x)`` call raised NameError).
        return np.ones_like(x,dtype=float)
    return x

def softmax(x,axis=-1):
    """Softmax of ``x`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating, which
    leaves the result unchanged but keeps the exponentials from overflowing.

    Parameters
    ----------
    x : numpy.ndarray
        Input scores.
    axis : int, optional
        Axis along which the outputs sum to one (default: last axis).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x``.
    """
    exponentials = np.exp(x - np.max(x,axis,keepdims=True))
    normaliser = np.sum(exponentials,axis,keepdims=True)
    return exponentials/normaliser

def cross_entropy_loss(outputs,ground_truth,epsilon=1e-8):
    """Mean cross-entropy between ``outputs`` and ``ground_truth``.

    Parameters
    ----------
    outputs : numpy.ndarray
        Predicted values; ``epsilon`` is added before taking the log.
    ground_truth : numpy.ndarray
        Target weights multiplied element-wise with the log predictions.
    epsilon : float, optional
        Small constant added inside the logarithm to avoid ``log(0)``.

    Returns
    -------
    float
        Negative mean, over axis 0, of the axis-1 sums of
        ``ground_truth * log(outputs + epsilon)``.
    """
    log_likelihood = ground_truth*np.log(outputs+epsilon)
    per_example = np.sum(log_likelihood,1)
    return -np.mean(per_example)

def mse(outputs,ground_truth):
    """Half squared error summed over axis 1 and averaged over the rest.

    Parameters
    ----------
    outputs : numpy.ndarray
        Predicted values.
    ground_truth : numpy.ndarray
        Target values, broadcastable against ``outputs``.

    Returns
    -------
    float
        Mean of ``0.5 * sum((outputs - ground_truth)**2, axis=1)``.
    """
    residual = outputs-ground_truth
    per_example = 0.5*np.sum(residual**2,1)
    return np.mean(per_example)

def clip_by_global_norm(param_gradients,clip_value):
    """Rescale a list of gradients so their joint L2 norm is at most ``clip_value``.

    Parameters
    ----------
    param_gradients : sequence of numpy.ndarray
        Gradients to clip together.
    clip_value : float
        Maximum allowed global norm.

    Returns
    -------
    list of numpy.ndarray
        The gradients multiplied by ``clip_value / max(clip_value, global_norm)``;
        they are returned unscaled when the global norm is already within
        ``clip_value``.
    """
    # The global norm is the L2 norm of all entries taken together, i.e. the
    # L2 norm of the vector of per-tensor norms. Summing the per-tensor norms
    # instead overestimates it and clips more than intended.
    global_norm = np.linalg.norm([np.linalg.norm(grad) for grad in param_gradients])
    scale = clip_value/max(clip_value,global_norm)
    return [grad*scale for grad in param_gradients]
    

In [416]:
class Adam:
    """Adam update rule with the bias correction folded into the step size.

    Parameters
    ----------
    eta : float
        Base learning rate.
    beta1 : float, optional
        Decay rate of the first-moment (mean) estimate.
    beta2 : float, optional
        Decay rate of the second-moment (uncentred variance) estimate.
    epsilon : float, optional
        Constant added to the denominator. Note the default literal
        ``10e-8`` equals ``1e-7``.
    """
    def __init__(self,eta,beta1=0.9,beta2=0.999,epsilon=10e-8):
        self.eta, self.beta1 = eta, beta1
        self.beta2, self.epsilon = beta2, epsilon
    def __call__(self,iteration,last_m,last_v,gradient):
        """Compute one parameter step and the updated moment estimates.

        ``iteration`` is used as the exponent in the bias-correction terms;
        ``last_m`` / ``last_v`` are the previous first and second moments.
        Returns ``(step, new_m, new_v)`` where ``step`` is meant to be added
        to the parameter (it already carries the minus sign).
        """
        b1, b2 = self.beta1, self.beta2
        first_moment = b1*last_m+(1-b1)*gradient
        second_moment = b2*last_v+(1-b2)*gradient**2
        # Step size with both bias corrections applied.
        corrected_eta = self.eta*np.sqrt(1-b2**iteration)/(1-b1**iteration)
        step = -corrected_eta*first_moment/(np.sqrt(second_moment)+self.epsilon)
        return step,first_moment,second_moment

In [594]:
class GRUCell:
    """Gated recurrent unit cell followed by a dense output layer.

    Parameters live in ``self.params[label]`` for ``label`` in ``'update'``,
    ``'reset'``, ``'hidden'`` and ``'output'``; each entry is a dict with
    ``'weights'`` and ``'bias'`` arrays. ``self.moments`` mirrors that layout
    and holds, per parameter, a two-element list of zero-initialised arrays.

    Parameters
    ----------
    num_features : int
        Width of one input row.
    state_size : int
        Width of the recurrent state.
    output_size : int
        Width of the output layer.
    random_seed : int or None
        Seed handed to ``np.random.seed`` before the weights are drawn.
    """
    def __init__(self,num_features,state_size,output_size,random_seed):
        self._num_features = num_features
        self.state_size = state_size
        self.output_size = output_size
        self.params,self.moments = self.initialize_parameters(random_seed)
    def initialize_parameters(self,random_seed):
        """Draw the initial weights and build zeroed moment buffers.

        Gate weights have shape ``(num_features + state_size, state_size)``
        and the output weights ``(state_size, output_size)``; all weights are
        standard-normal draws scaled by 0.1 and all biases start at zero.
        The global NumPy RNG is seeded with ``random_seed`` for the draws and
        re-seeded with ``None`` afterwards.

        Returns
        -------
        tuple of dict
            ``(params, moments)``.
        """
        np.random.seed(random_seed)
        params = dict()
        moments = dict()
        gate_shape = (self._num_features+self.state_size,self.state_size)
        layers = [(label,gate_shape,self.state_size) for label in ['update','reset','hidden']]
        layers.append(('output',(self.state_size,self.output_size),self.output_size))
        for label,weight_shape,width in layers:
            params[label] = {'weights':np.random.randn(*weight_shape)*0.1,
                             'bias' : np.zeros((1,width))}
            # Two distinct arrays per parameter: ``[arr]*2`` would make both
            # list slots alias one buffer.
            moments[label] = {'weights':[np.zeros(weight_shape),np.zeros(weight_shape)],
                              'bias' : [np.zeros((1,width)),np.zeros((1,width))]}
        np.random.seed(None)
        return params,moments
    def __call__(self,features,last_state,output_activation):
        """Run one time step.

        Parameters
        ----------
        features : numpy.ndarray
            Inputs, batch x features.
        last_state : numpy.ndarray
            Previous state, batch x state_size.
        output_activation : callable
            Applied to the output layer's pre-activation.

        Returns
        -------
        tuple
            ``(update, reset, hidden, state, output)`` where the first three
            are the activated gate / candidate values and
            ``state = (1-update)*last_state + update*hidden``.
        """
        # features and last_state come in as batch x features
        concat = np.hstack((features,last_state))
        update = sigmoid(np.dot(concat,self.params['update']['weights'])+self.params['update']['bias'])
        reset = sigmoid(np.dot(concat,self.params['reset']['weights'])+self.params['reset']['bias'])
        # The candidate sees the previous state gated by ``reset``.
        gated_concat = np.hstack((features,last_state*reset))
        hidden = tanh(np.dot(gated_concat,self.params['hidden']['weights'])+self.params['hidden']['bias'])
        state = (1-update)*last_state+update*hidden
        output = output_activation(np.dot(state,self.params['output']['weights'])+self.params['output']['bias'])
        return update,reset,hidden,state,output
    def backpropogate(self,ds_next,inpt,previous_state,hidden,reset,update):
        """Back-propagate through one time step of ``__call__``.

        Parameters
        ----------
        ds_next : numpy.ndarray
            Gradient of the loss with respect to this step's new state.
        inpt : numpy.ndarray
            The ``features`` passed to the forward step.
        previous_state : numpy.ndarray
            The ``last_state`` passed to the forward step.
        hidden, reset, update : numpy.ndarray
            The activated values returned by the forward step.

        Returns
        -------
        tuple
            ``(ds_prev, dWh, dbh, dWr, dbr, dWz, dbz)``: the gradient with
            respect to ``previous_state`` followed by weight and bias
            gradients of the hidden, reset and update layers. Bias gradients
            are summed over axis 0.
        """
        concat = np.hstack((inpt,previous_state))
        n_in = self._num_features

        ds = ds_next

        # ``hidden``, ``reset`` and ``update`` are already activations, so the
        # local derivatives are formed from them directly (1 - h**2 for tanh,
        # g*(1-g) for sigmoid) rather than by re-applying the activation.

        # Candidate state: state depends on hidden through ``update*hidden``.
        dh = ds * update * (1-hidden**2)

        dWh = np.dot(np.hstack((inpt,previous_state*reset)).T,dh)
        dbh = np.sum(dh,0)

        # Gradient with respect to the gated state ``previous_state*reset``.
        dri = np.dot(dh,self.params['hidden']['weights'][n_in:].T)
        dr = previous_state * dri * reset*(1-reset)

        dWr = np.dot(concat.T,dr)
        dbr = np.sum(dr,0)

        # Update gate: d state / d update = hidden - previous_state, scaled by
        # the upstream gradient.
        dz = ds * (hidden - previous_state) * update*(1-update)

        dWz = np.dot(concat.T,dz)
        dbz = np.sum(dz,0)

        # previous_state feeds the update gate, the direct carry path, the
        # gated candidate input and the reset gate.
        ds_prev = np.dot(dz,self.params['update']['weights'][n_in:].T) +\
                    (1-update)*ds + dri*reset + np.dot(dr,self.params['reset']['weights'][n_in:].T)
        return ds_prev, dWh, dbh, dWr, dbr, dWz, dbz
               

In [637]:
class RNN:
    """Recurrent network that unrolls a single ``GRUCell`` over a sequence.

    Parameters
    ----------
    num_features : int
        Width of one input row.
    state_size : int
        Width of the recurrent state.
    output_size : int
        Width of the output layer.
    random_seed : int or None, optional
        Forwarded to ``GRUCell`` for weight initialisation.
    """
    def __init__(self,num_features,state_size,output_size,random_seed=None):
        self.cell = GRUCell(num_features,state_size,output_size,random_seed)
    def predict(self,features,initial_state=None):
        """Run the cell over every time step of ``features``.

        Parameters
        ----------
        features : numpy.ndarray
            Inputs, batch x seqlen x features.
        initial_state : numpy.ndarray or None, optional
            State before the first step; zeros when omitted.

        Returns
        -------
        tuple
            ``(outputs, states, hiddens, resets, updates)``. ``states`` has
            ``seqlen + 1`` entries along axis 1 (index 0 is the initial
            state); the others have ``seqlen``. Outputs use the ``linear``
            activation.
        """
        # features come in as batch x seqlen x features
        batch_size, seqlen = features.shape[:2]

        resets = np.zeros((batch_size,seqlen,self.cell.state_size))
        updates = np.zeros((batch_size,seqlen,self.cell.state_size))
        hiddens = np.zeros((batch_size,seqlen,self.cell.state_size))
        states = np.zeros((batch_size,seqlen+1,self.cell.state_size))
        outputs = np.zeros((batch_size,seqlen,self.cell.output_size))

        if initial_state is not None:
            states[:,0,:] = initial_state

        for t in range(1,seqlen+1):
            updates[:,t-1,:],resets[:,t-1,:],hiddens[:,t-1,:],states[:,t,:],outputs[:,t-1,:] = self.cell(features[:,t-1,:],states[:,t-1,:],linear)
        return outputs,states,hiddens,resets,updates

    def _apply_adam(self,adam,iteration,label,kind,gradient):
        """Take one Adam step on ``params[label][kind]`` and store the new moments."""
        m, v = self.cell.moments[label][kind]
        step, m, v = adam(iteration,m,v,gradient)
        self.cell.moments[label][kind] = [m,v]
        self.cell.params[label][kind] = self.cell.params[label][kind]+step

    def backpropogate(self,iteration,adam,cost_function,outputs,labels,features,states,hiddens,resets,updates):
        """Back-propagate through time from the last step and update the cell.

        Only the output at the final time step is compared with ``labels``.
        ``cost_function`` is used solely to compute the returned loss value;
        the output gradient is hard-coded as ``(output - labels)/batch_size``,
        which is the gradient of ``mse`` for a linear output layer.

        Parameters
        ----------
        iteration : int
            Step counter forwarded to ``adam``.
        adam : callable
            Called as ``adam(iteration, m, v, gradient)`` and expected to
            return ``(step, m, v)``.
        cost_function : callable
            Called as ``cost_function(last_outputs, labels)``.
        outputs, states, hiddens, resets, updates : numpy.ndarray
            Values returned by ``predict`` for ``features``.
        labels : numpy.ndarray
            Targets for the final time step.
        features : numpy.ndarray
            Inputs, batch x seqlen x features.

        Returns
        -------
        The value of ``cost_function`` on the final-step outputs.
        """
        batch_size,seqlen,_ = features.shape

        loss = cost_function(outputs[:,seqlen-1,:],labels)
        dloss = (outputs[:,seqlen-1,:] - labels)/batch_size

        # The output layer is only used at the last step, so its gradients
        # are computed once and not accumulated over time.
        dWo = np.dot(states[:,seqlen,:].T,dloss)
        dbo = np.sum(dloss,0)

        ds_next = np.dot(dloss,self.cell.params['output']['weights'].T)
        # Running sums, in the order returned by the cell:
        # dWh, dbh, dWr, dbr, dWz, dbz.
        accumulated = [0, 0, 0, 0, 0, 0]
        for t in reversed(range(1,seqlen+1)):
            step_result = self.cell.backpropogate(
                                ds_next,features[:,t-1,:],states[:,t-1,:],hiddens[:,t-1,:],
                                resets[:,t-1,:],updates[:,t-1,:])
            ds_next = step_result[0]
            accumulated = [total+grad for total,grad in zip(accumulated,step_result[1:])]

        targets = [('output','weights'),('output','bias'),
                   ('hidden','weights'),('hidden','bias'),
                   ('reset','weights'),('reset','bias'),
                   ('update','weights'),('update','bias')]
        gradients = clip_by_global_norm([dWo,dbo]+accumulated,10)
        for (label,kind),gradient in zip(targets,gradients):
            self._apply_adam(adam,iteration,label,kind,gradient)

        return loss

    def train(self,train_inputs,seqlen,batch_size,epochs,lr,cost_function):
        """Train on next-row prediction over sliding windows of ``train_inputs``.

        Every window of ``seqlen`` consecutive rows is an input sequence and
        the row that follows it is the target.

        Parameters
        ----------
        train_inputs : numpy.ndarray
            2-D array indexed as time x features.
        seqlen : int
            Window length.
        batch_size : int
            Number of windows per optimisation step; must be at least 1.
            A final batch that would be shorter than ``batch_size`` is merged
            into the one before it.
        epochs : int
            Number of passes over the shuffled windows.
        lr : float
            Learning rate for ``Adam``.
        cost_function : callable
            NOTE(review): currently unused; ``mse`` is always passed to
            ``backpropogate``, whose gradient also assumes ``mse``. Confirm
            the intended behaviour before wiring this through.

        Returns
        -------
        list
            Loss of every optimisation step, in order.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1 (the batching loop could not
            advance).
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        adam = Adam(lr)

        sinputs = np.array([train_inputs[t-seqlen+1:t+1,:] for t in range(seqlen-1,train_inputs.shape[0]-1)])
        stargets = train_inputs[seqlen:,:]

        num_datum = sinputs.shape[0]

        iteration = 1

        losses = []
        for _ in range(epochs):
            batch_num = 0
            random_indices = np.random.permutation(num_datum)
            while batch_num < num_datum:
                if batch_num + 2*batch_size > num_datum:
                    # Last batch of the epoch: take everything that is left
                    # and stop, so the tail is not trained on a second time.
                    current_batch = random_indices[batch_num:]
                    batch_num = num_datum
                else:
                    current_batch = random_indices[batch_num:(batch_num+batch_size)]
                    batch_num += batch_size
                x_batch, y_batch = sinputs[current_batch,:,:],stargets[current_batch,:]

                #forward_pass
                outputs,states,hiddens,resets,updates = self.predict(x_batch)

                #backprop
                losses.append(self.backpropogate(iteration,adam,mse,outputs,y_batch,x_batch,states,hiddens,resets,updates))

                iteration+=1
        return losses
        
    