In [9]:
import numpy as np
from ConvNetwork import IdentityActivator, element_wise_op



class RecurrentLayer(object):
    def __init__(self, input_width, state_width,
                 activator, learning_rate):
        self.input_width = input_width
        self.state_width = state_width
        self.activator = activator
        self.learning_rate = learning_rate
        self.times = 0       # 当前时刻初始化为t0
        self.state_list = [] # 保存各个时刻的state
        self.state_list.append(np.zeros(
            (state_width, 1)))           # 初始化s0
        self.U = np.random.uniform(-1e-4, 1e-4,
            (state_width, input_width))  # 初始化U
        self.W = np.random.uniform(-1e-4, 1e-4,
            (state_width, state_width))  # 初始化W
        self.gradient = np.zeros((state_width,state_width))
        
    def forward(self, input_array):
        '''
        根据『式2』进行前向计算
        '''
        self.times += 1
        
        state = (np.dot(self.U, input_array) +
                 np.dot(self.W, self.state_list[-1]))
        element_wise_op(state, self.activator.forward)
        self.state_list.append(state)
        

    def backward(self, sensitivity_array, 
                 activator):
        '''
        实现BPTT算法
        '''
        self.calc_delta(sensitivity_array, activator)
        self.calc_gradient()
        
    def calc_delta(self, sensitivity_array, activator):
        self.delta_list = []  # 用来保存各个时刻的误差项
        for i in range(self.times):
            self.delta_list.append(np.zeros(
                (self.state_width, 1)))
        print("start backward")
        self.delta_list.append(sensitivity_array)
        print(self.delta_list)
        # 迭代计算每个时刻的误差项
        for k in range(self.times - 1, 0, -1):
            self.calc_delta_k(k, activator)
        print("backward delta") 
        print(self.delta_list)
            
    def calc_delta_k(self, k, activator):
        '''
        根据k+1时刻的delta计算k时刻的delta
        '''
        print("calc_delta_k")
        print(k)
        state = self.state_list[k+1].copy()
        element_wise_op(self.state_list[k+1],
                    activator.backward)
        self.delta_list[k] = np.dot(
            np.dot(self.delta_list[k+1].T, self.W),
            np.diag(state[:,0])).T
        print(self.delta_list[k])
        
    def calc_gradient(self):
        self.gradient_list = [] # 保存各个时刻的权重梯度
        for t in range(self.times + 1):
            self.gradient_list.append(np.zeros(
                (self.state_width, self.state_width)))
        print("orginal grad")
        print(self.gradient_list)
        
        for t in range(self.times, 0, -1):
            self.calc_gradient_t(t)
        # 实际的梯度是各个时刻梯度之和
        #self.gradient = reduce(
        #    lambda a, b: a + b, self.gradient_list,
        #    self.gradient_list[0]) # [0]被初始化为0且没有被修改过
        for i in range(len(self.gradient_list)):   
            self.gradient = self.gradient + self.gradient_list[i]
        print("backward grad")    
        print(self.gradient)
        
    def calc_gradient_t(self, t):
        '''
        计算每个时刻t权重的梯度
        '''
        gradient = np.dot(self.delta_list[t],
            self.state_list[t-1].T)
        self.gradient_list[t] = gradient
        
        
        
    def update(self):
        '''
        按照梯度下降，更新权重
        '''
        self.W -= self.learning_rate * self.gradient
        
        
    
    
    def reset_state(self):
        self.times = 0       # 当前时刻初始化为t0
        self.state_list = [] # 保存各个时刻的state
        self.state_list.append(np.zeros(
            (self.state_width, 1)))      # 初始化s0

def data_set():
    x = [np.array([[1], [2], [3]]),
         np.array([[2], [3], [4]])]
    d = np.array([[1], [2]])
    return x, d

        
def gradient_check():
    '''
    梯度检查
    '''
    # 设计一个误差函数，取所有节点输出项之和
    error_function = lambda o: o.sum()
    rl = RecurrentLayer(3, 2, IdentityActivator(), 1e-3)
    # 计算forward值
    x, d = data_set()

    rl.forward(x[0])
    rl.forward(x[1])
    print("carl after forward statelist")
    print(rl.state_list)

    # 求取sensitivity map
    sensitivity_array = np.ones(rl.state_list[-1].shape,
                                dtype=np.float64)
    
    print(sensitivity_array)
 
    # 计算梯度
    rl.backward(sensitivity_array, IdentityActivator())
    print("after backward:")
    # 检查梯度
    epsilon = 10e-4
    for i in range(rl.W.shape[0]):
        for j in range(rl.W.shape[1]):
            rl.W[i,j] += epsilon
            rl.reset_state()
            rl.forward(x[0])
            rl.forward(x[1])
            err1 = error_function(rl.state_list[-1])
            rl.W[i,j] -= 2*epsilon
            rl.reset_state()
            rl.forward(x[0])
            rl.forward(x[1])
            err2 = error_function(rl.state_list[-1])
            expect_grad = (err1 - err2) / (2 * epsilon)
            rl.W[i,j] += epsilon
            print ('weights(%d,%d): expected - actural %f - %f' % (
                i, j, expect_grad, rl.gradient[i,j]))
            
        

if __name__ == '__main__':
    #train_and_evaluate()
    gradient_check()

carl after forward statelist
[array([[ 0.],
       [ 0.]]), array([[ -1.97692043e-06],
       [  1.11474772e-04]]), array([[  3.51361288e-05],
       [  1.75012993e-04]])]
[[ 1.]
 [ 1.]]
start backward
[array([[ 0.],
       [ 0.]]), array([[ 0.],
       [ 0.]]), array([[ 1.],
       [ 1.]])]
calc_delta_k
1
[[  3.83899458e-09]
 [  1.01772478e-09]]
backward delta
[array([[ 0.],
       [ 0.]]), array([[  3.83899458e-09],
       [  1.01772478e-09]]), array([[ 1.],
       [ 1.]])]
orginal grad
[array([[ 0.,  0.],
       [ 0.,  0.]]), array([[ 0.,  0.],
       [ 0.,  0.]]), array([[ 0.,  0.],
       [ 0.,  0.]])]
backward grad
[[ -1.97692043e-06   1.11474772e-04]
 [ -1.97692043e-06   1.11474772e-04]]
after backward:
weights(0,0): expected - actural -0.000002 - -0.000002
weights(0,1): expected - actural 0.000111 - 0.000111
weights(1,0): expected - actural -0.000002 - -0.000002
weights(1,1): expected - actural 0.000111 - 0.000111
