[toc]

# Numpy LSTM

In [25]:
import tensorflow as tf
import numpy as np

np.random.seed(123)
n_samples = 2
n_sequences = 3
n_features = 5
x = np.random.randn(n_samples, n_sequences, n_features)
inputs = tf.constant(x, dtype=tf.float64)

lstm = tf.keras.layers.LSTM(4, return_sequences=True, return_state=True)
whole_seq_output, final_memory_state, final_carry_state = lstm(inputs)

print(whole_seq_output.shape)
print(final_memory_state.shape)
print(final_carry_state.shape)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

(2, 3, 4)
(2, 4)
(2, 4)


In [29]:
tf.keras.layers.LSTM?

In [26]:
def sigmoid(x):
    return 1 / (1 + exp(-x + 10e-5))

def softmax(x):
    c = np.max(x)
    exp_x = np.exp(x-c)
    sum_exp_x = np.sum(exp_x)
    return exp_x / (sum_exp_x + 10e-5)

In [31]:
x.shape

(2, 3, 5)

In [35]:
def lstm_forward(x, a0, parameters):    
    """
    Arguments:
    x -- Input data for every time-step, of shape (n_x, m, T_x).
    a0 -- Initial hidden state, of shape (n_a, m)
    parameters -- python dictionary containing:
                 Wf -- Weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                 bf -- Bias of the forget gate, numpy array of shape (n_a, 1)
                 Wi -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                 bi -- Bias of the update gate, numpy array of shape (n_a, 1)
                 Wc -- Weight matrix of the first "tanh", numpy array of shape (n_a, n_a + n_x)
                 bc -- Bias of the first "tanh", numpy array of shape (n_a, 1)
                 Wo -- Weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
                 bo -- Bias of the output gate, numpy array of shape (n_a, 1)
                 Wy -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                 by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    Returns:
    a -- Hidden states for every time-step, numpy array of shape (n_a, m, T_x)
    y -- Predictions for every time-step, numpy array of shape (n_y, m, T_x)
    caches -- tuple of values needed for the backward pass, contains (list of all the caches, x)
    """
    # 初始化缓存列表
    caches = []    
    # 获取 x 和 参数 Wy 的维度大小
    n_x, m, T_x = x.shape
    n_y, n_a = parameters['Wy'].shape    
    # 初始化 a, c 和 y 的值
    a = np.zeros((n_a, m, T_x))
    c = np.zeros((n_a, m, T_x))
    y = np.zeros((n_y, m, T_x))    
    # 初始化 a_next 和 c_next
    a_next = a0
    c_next = np.zeros(a_next.shape)    
    # 循环所有时间步
    for t in range(T_x):        
    # 更新下一时间步隐状态值、记忆值并计算预测 
        a_next, c_next, yt, cache = lstm_cell_forward(x[:,:,t], a_next, c_next, parameters)        
        # 在 a 中保存新的激活值 
        a[:,:,t] = a_next        
        # 在 a 中保存预测值
        y[:,:,t] = yt        
        # 在 c 中保存记忆值
        c[:,:,t]  = c_next        
        # 添加到缓存列表
        caches.append(cache)    
        # 保存各计算值供反向传播调用
    caches = (caches, x)    
    return a, y, c, caches

In [36]:
def lstm_cell_forward(xt, a_prev, c_prev, parameters):    
    """
    Implement a single forward step of the LSTM-cell as described in Figure (4)

    Arguments:
    xt -- your input data at timestep "t", numpy array of shape (n_x, m).
    a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
    c_prev -- Memory state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
    Wf -- Weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
    bf -- Bias of the forget gate, numpy array of shape (n_a, 1)
    Wi -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
    bi -- Bias of the update gate, numpy array of shape (n_a, 1)
    Wc -- Weight matrix of the first "tanh", numpy array of shape (n_a, n_a + n_x)
    bc --  Bias of the first "tanh", numpy array of shape (n_a, 1)
    Wo -- Weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
    bo --  Bias of the output gate, numpy array of shape (n_a, 1)Wy -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
    by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)

    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    c_next -- next memory state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple of values needed for the backward pass, contains (a_next, c_next, a_prev, c_prev, xt, parameters)
    """

    # 获取参数字典中各个参数
    Wf = parameters["Wf"]
    bf = parameters["bf"]
    Wi = parameters["Wi"]
    bi = parameters["bi"]
    Wc = parameters["Wc"]
    bc = parameters["bc"]
    Wo = parameters["Wo"]
    bo = parameters["bo"]
    Wy = parameters["Wy"]
    by = parameters["by"]    
    # 获取 xt 和 Wy 的维度参数
    n_x, m = xt.shape
    n_y, n_a = Wy.shape    
    # 拼接 a_prev 和 xt
    concat = np.zeros((n_a + n_x, m))
    concat[: n_a, :] = a_prev
    concat[n_a :, :] = xt    
    # 计算遗忘门、更新门、记忆细胞候选值、下一时间步的记忆细胞、输出门和下一时间步的隐状态值
    ft = sigmoid(np.matmul(Wf, concat) + bf)
    it = sigmoid(np.matmul(Wi, concat) + bi)
    cct = np.tanh(np.matmul(Wc, concat) + bc)
    c_next = ft*c_prev + it*cct
    ot = sigmoid(np.matmul(Wo, concat) + bo)
    a_next = ot*np.tanh(c_next)    
    # 计算 LSTM 的预测输出
    yt_pred = softmax(np.matmul(Wy, a_next) + by)    
    # 保存各计算结果值
    cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)    
    return a_next, c_next, yt_pred, cache

In [37]:
def lstm_cell_backward(da_next, dc_next, cache):    
    """
    Arguments:
    da_next -- Gradients of next hidden state, of shape (n_a, m)
    dc_next -- Gradients of next cell state, of shape (n_a, m)
    cache -- cache storing information from the forward pass

    Returns:
    gradients -- python dictionary containing:
     dxt -- Gradient of input data at time-step t, of shape (n_x, m)
     da_prev -- Gradient w.r.t. the previous hidden state, numpy array of shape (n_a, m)
     dc_prev -- Gradient w.r.t. the previous memory state, of shape (n_a, m, T_x)
     dWf -- Gradient w.r.t. the weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
     dWi -- Gradient w.r.t. the weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
     dWc -- Gradient w.r.t. the weight matrix of the memory gate, numpy array of shape (n_a, n_a + n_x)
     dWo -- Gradient w.r.t. the weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
     dbf -- Gradient w.r.t. biases of the forget gate, of shape (n_a, 1)
     dbi -- Gradient w.r.t. biases of the update gate, of shape (n_a, 1)
     dbc -- Gradient w.r.t. biases of the memory gate, of shape (n_a, 1)
     dbo -- Gradient w.r.t. biases of the output gate, of shape (n_a, 1)
    """

    # 获取缓存值
    (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters) = cache    # 获取 xt 和 a_next 的维度大小
    n_x, m = xt.shape
    n_a, m = a_next.shape    
    # 计算各种门的梯度
    dot = da_next * np.tanh(c_next) * ot * (1 - ot)
    dcct = dc_next * it + ot * (1 - np.tanh(c_next) ** 2) * it * da_next * cct * (1 - np.tanh(cct) ** 2)
    dit = dc_next * cct + ot * (1 - np.tanh(c_next) ** 2) * cct * da_next * it * (1 - it)
    dft = dc_next * c_prev + ot * (1 - np.tanh(c_next) ** 2) * c_prev * da_next * ft * (1 - ft)    # 计算各参数的梯度 
    dWf = np.dot(dft, np.concatenate((a_prev, xt), axis=0).T)
    dWi = np.dot(dit, np.concatenate((a_prev, xt), axis=0).T)
    dWc = np.dot(dcct, np.concatenate((a_prev, xt), axis=0).T)
    dWo = np.dot(dot, np.concatenate((a_prev, xt), axis=0).T)
    dbf = np.sum(dft, axis=1, keepdims=True)
    dbi = np.sum(dit, axis=1, keepdims=True)
    dbc = np.sum(dcct, axis=1, keepdims=True)
    dbo = np.sum(dot, axis=1, keepdims=True)

    da_prev = np.dot(parameters['Wf'][:,:n_a].T, dft) + np.dot(parameters['Wi'][:,:n_a].T, dit) + np.dot(parameters['Wc'][:,:n_a].T, dcct) + np.dot(parameters['Wo'][:,:n_a].T, dot)
    dc_prev = dc_next*ft + ot*(1-np.square(np.tanh(c_next)))*ft*da_next
    dxt = np.dot(parameters['Wf'][:,n_a:].T,dft)+np.dot(parameters['Wi'][:,n_a:].T,dit)+np.dot(parameters['Wc'][:,n_a:].T,dcct)+np.dot(parameters['Wo'][:,n_a:].T,dot) 

    # 将各梯度保存至字典
    gradients = {"dxt": dxt, "da_prev": da_prev, "dc_prev": dc_prev, "dWf": dWf,"dbf": dbf, "dWi": dWi,"dbi": dbi, 
                   "dWc": dWc,"dbc": dbc, "dWo": dWo,"dbo": dbo}    
    return gradients

In [38]:
def lstm_backward(da, caches):    
    """
    Arguments:
    da -- Gradients w.r.t the hidden states, numpy-array of shape (n_a, m, T_x)
    dc -- Gradients w.r.t the memory states, numpy-array of shape (n_a, m, T_x)
    caches -- cache storing information from the forward pass (lstm_forward)

    Returns:
    gradients -- python dictionary containing:
           dx -- Gradient of inputs, of shape (n_x, m, T_x)
           da0 -- Gradient w.r.t. the previous hidden state, numpy array of shape (n_a, m)
           dWf -- Gradient w.r.t. the weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
           dWi -- Gradient w.r.t. the weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
           dWc -- Gradient w.r.t. the weight matrix of the memory gate, numpy array of shape (n_a, n_a + n_x)
           dWo -- Gradient w.r.t. the weight matrix of the save gate, numpy array of shape (n_a, n_a + n_x)
           dbf -- Gradient w.r.t. biases of the forget gate, of shape (n_a, 1)
           dbi -- Gradient w.r.t. biases of the update gate, of shape (n_a, 1)
           dbc -- Gradient w.r.t. biases of the memory gate, of shape (n_a, 1)
           dbo -- Gradient w.r.t. biases of the save gate, of shape (n_a, 1)
    """

    # 获取第一个缓存值
    (caches, x) = caches
    (a1, c1, a0, c0, f1, i1, cc1, o1, x1, parameters) = caches[0]    # 获取 da 和 x1 的形状大小
    n_a, m, T_x = da.shape
    n_x, m = x1.shape    
    # 初始化各梯度值
    dx = np.zeros((n_x, m, T_x))
    da0 = np.zeros((n_a, m))
    da_prevt = np.zeros((n_a, m))
    dc_prevt = np.zeros((n_a, m))
    dWf = np.zeros((n_a, n_a+n_x))
    dWi = np.zeros((n_a, n_a+n_x))
    dWc = np.zeros((n_a, n_a+n_x))
    dWo = np.zeros((n_a, n_a+n_x))
    dbf = np.zeros((n_a, 1))
    dbi = np.zeros((n_a, 1))
    dbc = np.zeros((n_a, 1))
    dbo = np.zeros((n_a, 1))    
    # 循环各时间步
    for t in reversed(range(T_x)):        
        # 使用 lstm 单元反向传播计算各梯度值
        gradients = lstm_cell_backward(da[:, :, t] + da_prevt, dc_prevt, caches[t])        
        # 保存各梯度值
        dx[:,:,t] = gradients['dxt']
        dWf = dWf + gradients['dWf']
        dWi = dWi + gradients['dWi']
        dWc = dWc + gradients['dWc']
        dWo = dWo + gradients['dWo']
        dbf = dbf + gradients['dbf']
        dbi = dbi + gradients['dbi']
        dbc = dbc + gradients['dbc']
        dbo = dbo + gradients['dbo']

    da0 = gradients['da_prev']

    gradients = {"dx": dx, "da0": da0, "dWf": dWf,"dbf": dbf, "dWi": dWi,"dbi": dbi,                
    "dWc": dWc,"dbc": dbc, "dWo": dWo,"dbo": dbo}    
    return gradients

In [39]:
def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    """
    xt shape: (n_samples, n_features)
    a_prev shape: (n_samples, n_features_a)
    c_prev shape: (n_samples, n_features_a)
    """
    x = np.concate([xt, a_prev])
    # Wf, bf
    # Wi, bi
    # Wo, bo
    # Wc, bc
    new_x  = np.concate([xt, a_prev])
    Wf, bf = parameters['Wf'], parameters['bf']
    Wi, bi = parameters['Wi'], parameters['bi']
    Wo, bo = parameters['Wo'], parameters['bo']
    Wc, bc = parameters['Wc'], parameters['bcc']
    
    ft = sigmoid(np.matmul(Wf, new_x) + bf)
    it = sigmoid(np.matmul(Wi, new_x) + bi)
    ot = sigmoid(np.matmul(Wo, new_x) + bo)
    ct = np.tanh(np.matmul(Wc, new_x) + bc)
    c_next = ft * c_prev + it * ct
    a_next = ot * np.tanh(c_next)
    y = softmax(a_next)
    return a_next, c_next

In [None]:
lstm_cell_forward(x)