In [90]:
import numpy as np
from io import StringIO
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


In [207]:
def init_orthogonal(param):
    """
    Initializes weight parameters orthogonally.
    This is a common initiailization for recurrent neural networks.
    
    Refer to this paper for an explanation of this initialization:
    https://arxiv.org/abs/1312.6120
    """
    if param.ndim < 2:
        raise ValueError("Only parameters with 2 or more dimensions are supported.")

    rows, cols = param.shape
    
    new_param = np.random.randn(rows, cols)
    
    if rows < cols:
        new_param = new_param.T
    
    # Compute QR factorization
    q, r = np.linalg.qr(new_param)
    
    # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf
    d = np.diag(r, 0)
    ph = np.sign(d)
    q *= ph

    if rows < cols:
        q = q.T
    
    new_param = q
    
    return new_param

def sigmoid(x, derivative=False):
    """
    Computes the element-wise sigmoid activation function for an array x.

    Args:
     `x`: the array where the function is applied
     `derivative`: if set to True will return the derivative instead of the forward pass
    """
    x_safe = x + 1e-12
    f = 1 / (1 + np.exp(-x_safe))
    
    if derivative: # Return the derivative of the function evaluated at x
        return f * (1 - f)
    else: # Return the forward pass of the function at x
        return f
    
def tanh(x, derivative=False):
    """
    Computes the element-wise tanh activation function for an array x.

    Args:
     `x`: the array where the function is applied
     `derivative`: if set to True will return the derivative instead of the forward pass
    """
    x_safe = x + 1e-12
    f = (np.exp(x_safe)-np.exp(-x_safe))/(np.exp(x_safe)+np.exp(-x_safe))
    
    if derivative: # Return the derivative of the function evaluated at x
        return 1-f**2
    else: # Return the forward pass of the function at x
        return f
    
    
def softmax(x, derivative=False):
    """
    Computes the softmax for an array x.
    
    Args:
     `x`: the array where the function is applied
     `derivative`: if set to True will return the derivative instead of the forward pass
    """
    x_safe = x + 1e-12
    f = np.exp(x_safe) / np.sum(np.exp(x_safe))
    
    if derivative: # Return the derivative of the function evaluated at x
        pass # We will not need this one
    else: # Return the forward pass of the function at x
        return f
    
def clip_gradient_norm(grads, max_norm=0.25):
    """
    Clips gradients to have a maximum norm of `max_norm`.
    This is to prevent the exploding gradients problem.
    """ 
    # Set the maximum of the norm to be of type float
    max_norm = float(max_norm)
    total_norm = 0
    
    # Calculate the L2 norm squared for each gradient and add them to the total norm
    for grad in grads:
        grad_norm = np.sum(np.power(grad, 2))
        total_norm += grad_norm
    
    total_norm = np.sqrt(total_norm)
    
    # Calculate clipping coeficient
    clip_coef = max_norm / (total_norm + 1e-6)
    
    # If the total norm is larger than the maximum allowable norm, then clip the gradient
    if clip_coef < 1:
        for grad in grads:
            grad *= clip_coef
    
    return grads

def update_parameters(params, grads, lr=1e-3):
    # Take a step
    for param, grad in zip(params, grads):
        param -= lr * grad
    
    return params

def mean_squared_error(actual, predicted):
    sum_square_error = 0.0
    for i in range(len(actual)):
        sum_square_error += (actual[i] - predicted[i])**2.0
    mean_square_error = 1.0 / len(actual) * sum_square_error
    return mean_square_error


In [155]:
dataset=StringIO("""Date,Open,High,Low,Close,Volume,Trade_count,Vwap
2015-12-01 09:00:00+00:00,118.88,118.94,118.88,118.94,1145,5,118.902052
2015-12-01 09:15:00+00:00,118.77,118.77,118.77,118.77,200,1,118.77
2015-12-01 09:30:00+00:00,118.69,118.69,118.6,118.6,900,4,118.61
2015-12-01 09:45:00+00:00,118.64,118.65,118.64,118.65,3580,5,118.648883
2015-12-01 10:00:00+00:00,118.65,118.65,118.55,118.55,1820,4,118.611538
2015-12-01 10:15:00+00:00,118.55,118.6,118.55,118.6,880,5,118.5625
2015-12-01 10:30:00+00:00,118.55,118.55,118.5,118.5,1878,5,118.513312
2015-12-01 10:45:00+00:00,118.59,118.72,118.59,118.72,2499,10,118.628431
2015-12-01 11:00:00+00:00,118.71,118.9,118.71,118.9,2842,11,118.86064
2015-12-01 11:15:00+00:00,118.87,118.87,118.87,118.87,300,2,118.87
2015-12-01 11:30:00+00:00,118.78,118.8,118.76,118.8,3914,22,118.785876
2015-12-01 11:45:00+00:00,118.8,118.99,118.77,118.9,7900,37,118.893542
2015-12-01 12:00:00+00:00,118.88,118.98,118.84,118.84,6540,34,118.922648
2015-12-01 12:15:00+00:00,118.82,118.84,118.77,118.77,5603,28,118.804962
2015-12-01 12:30:00+00:00,118.77,118.89,118.76,118.88,7612,31,118.824002
""")
df = pd.read_table(dataset, sep=",")

#ip = np.array([ [1,2,3],[6,8,9],[3,4,5],[4,7,8],[4,2,5],[5,7,4] ])
#op = np.array([[2,8,4,7,2,4]])
#op = op.reshape(6,1)
ip = np.array([ [1],[2],[0],[2],[0],[1],[2],[1] ])
op = np.array([ [10],[11],[12],[11],[12],[10],[11],[10] ])
num_steps = 3
num_features = 3
#ip_shaped = np.reshape(ip, newshape=(-1, num_steps, num_features))

#X = np.array([ [1,2,3, 4, 5, 6] ])
#Y = np.array([[2,3,4,5,6,7]])
print(op)
print(op[3])

[[10]
 [11]
 [12]
 [11]
 [12]
 [10]
 [11]
 [10]]
[11]


In [156]:
def lstm_data_transform(x_data, y_data, num_steps=2):
    """ Changes data to the format for LSTM training 
for sliding window approach """
    # Prepare the list for the transformed data
    X, y = list(), list()
    # Loop of the entire data set
    #print(x_data.shape[0])
    for i in range(x_data.shape[0]):
        # compute a new (sliding window) index
        end_ix = i + num_steps

        # if index is larger than the size of the dataset, we stop
        #print(end_ix)
        if end_ix >= x_data.shape[0]:
            break
        # Get a sequence of data for x
        seq_X = x_data[i:end_ix]
        #print(x_data[i:end_ix])
        # Get only the last element of the sequency for y
        #print(y_data[end_ix])
        seq_y = y_data[end_ix-1]
        # Append the list with sequencies
        X.append(seq_X)
        y.append(seq_y)
        #print(X,y)
    # Make final arrays
    x_array = np.array(X)
    y_array = np.array(y)
    return x_array, y_array

In [157]:
ipt,opt=lstm_data_transform(ip,op)
print(ipt)
print("====")
print(opt)

[[[1]
  [2]]

 [[2]
  [0]]

 [[0]
  [2]]

 [[2]
  [0]]

 [[0]
  [1]]

 [[1]
  [2]]]
====
[[11]
 [12]
 [11]
 [12]
 [10]
 [11]]


In [215]:
# Size of concatenated hidden + input vector
hd = 50
id = 1
z = hd + id

def init_lstm(hd, id, z):
    """
    Initializes our LSTM network.
    
    Args:
     `hidden_size`: the dimensions of the hidden state
     `vocab_size`: the dimensions of our vocabulary
     `z_size`: the dimensions of the concatenated input 
    """
    # Weight matrix (forget gate)
    W_f = np.zeros((hd, z))
   
    # Bias for forget gate
    b_f = np.zeros((hd, 1))

    # Weight matrix (input gate)
    W_i = np.zeros((hd, z))
    
    # Bias for input gate
    b_i = np.zeros((hd, 1))

    # Weight matrix (candidate)
    W_g = np.zeros((hd, z))

    # Bias for candidate
    b_g = np.zeros((hd, 1))

    # Weight matrix of the output gate
    W_o = np.zeros((hd, z))
    
    # Bias for output gate
    b_o = np.zeros((hd, 1))

    # Weight matrix relating the hidden-state to the output
    W_v = np.zeros((id, hd))
    
    # Bias for logits
    b_v = np.zeros((id, 1))
    
    # Initialize weights according to https://arxiv.org/abs/1312.6120
    W_f = init_orthogonal(W_f)
    W_i = init_orthogonal(W_i)
    W_g = init_orthogonal(W_g)
    W_o = init_orthogonal(W_o)
    W_v = init_orthogonal(W_v)

    return W_f, W_i, W_g, W_o, W_v, b_f, b_i, b_g, b_o, b_v


params = init_lstm(hd, id, z)
print('W_f:', params[0].shape)
print('W_i:', params[1].shape)
print('W_g:', params[2].shape)
print('W_o:', params[3].shape)
print('W_v:', params[4].shape)
print('b_i:', params[5].shape)
print('b_g:', params[6].shape)
print('b_o:', params[7].shape)
print('b_v:', params[8].shape)

for param in params:
    assert param.ndim == 2, \
        'all parameters should be 2-dimensional '\
        '(hint: a dimension can simply have size 1)'

W_f: (50, 51)
W_i: (50, 51)
W_g: (50, 51)
W_o: (50, 51)
W_v: (1, 50)
b_i: (50, 1)
b_g: (50, 1)
b_o: (50, 1)
b_v: (50, 1)


In [216]:
hidden_size=hd
def forward(inputs, h_prev, C_prev, p):
    """
    Arguments:
    x -- your input data at timestep "t", numpy array of shape (n_x, m).
    h_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
    C_prev -- Memory state at timestep "t-1", numpy array of shape (n_a, m)
    p -- python list containing:
                        W_f -- Weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                        b_f -- Bias of the forget gate, numpy array of shape (n_a, 1)
                        W_i -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                        b_i -- Bias of the update gate, numpy array of shape (n_a, 1)
                        W_g -- Weight matrix of the first "tanh", numpy array of shape (n_a, n_a + n_x)
                        b_g --  Bias of the first "tanh", numpy array of shape (n_a, 1)
                        W_o -- Weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
                        b_o --  Bias of the output gate, numpy array of shape (n_a, 1)
                        W_v -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_v, n_a)
                        b_v -- Bias relating the hidden-state to the output, numpy array of shape (n_v, 1)
    Returns:
    z_s, f_s, i_s, g_s, C_s, o_s, h_s, v_s -- lists of size m containing the computations in each forward pass
    outputs -- prediction at timestep "t", numpy array of shape (n_v, m)
    """
    assert h_prev.shape == (hidden_size, 1)
    assert C_prev.shape == (hidden_size, 1)

    # First we unpack our parameters
    W_f, W_i, W_g, W_o, W_v, b_f, b_i, b_g, b_o, b_v = p
    
    # Save a list of computations for each of the components in the LSTM
    x_s, z_s, f_s, i_s,  = [], [] ,[], []
    g_s, C_s, o_s, h_s = [], [] ,[], []
    v_s, output_s =  [], [] 
    
    # Append the initial cell and hidden state to their respective lists
    h_s.append(h_prev)
    C_s.append(C_prev)
    
    for x in inputs:
        
        # Concatenate input and hidden state
        z = np.row_stack((h_prev, x))
        z_s.append(z)
        
        # Calculate forget gate
        f = sigmoid(np.dot(W_f, z) + b_f)
        f_s.append(f)
        
        # Calculate input gate
        i = sigmoid(np.dot(W_i, z) + b_i)
        i_s.append(i)
        
        # Calculate candidate
        g = tanh(np.dot(W_g, z) + b_g)
        g_s.append(g)
        
        # Calculate memory state
        C_prev = C_prev * f + g * i
        C_s.append(C_prev)
        
        # Calculate output gate
        o = sigmoid(np.dot(W_o, z) + b_o)
        o_s.append(o)
        
        # Calculate hidden state
        h_prev = o * tanh(C_prev)
        h_s.append(h_prev)

        # Calculate logits
        v = np.dot(W_v, h_prev) + b_v


    return z_s, f_s, i_s, g_s, C_s, o_s, h_s, v


# Get first sentence in test set
inputs, targets = ipt[0],opt[0]

# One-hot encode input and target sequence
#inputs_one_hot = one_hot_encode_sequence(inputs, vocab_size)
#targets_one_hot = one_hot_encode_sequence(targets, vocab_size)

# Initialize hidden state as zeros
h = np.zeros((hidden_size, 1))
c = np.zeros((hidden_size, 1))

# Forward pass
z_s, f_s, i_s, g_s, C_s, o_s, h_s, v_s = forward(inputs, h, c, params)

#output_sentence = [idx_to_word[np.argmax(output)] for output in outputs]
print('Input sentence:')
print(inputs)

print('\nTarget sequence:')
print(targets)

print('\nPredicted sequence:')
print(v_s)
outputs = v_s

Input sentence:
[[1]
 [2]]

Target sequence:
[11]

Predicted sequence:
[[0.00872794]]


In [181]:
display(h_s)
display(C_s)
print(outputs[0])

[array([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]]),
 array([[ 0.01142451],
        [-0.00536775],
        [ 0.0018322 ],
        [ 0.05113451],
        [ 0.00326823],
        [ 0.0248653 ],
        [ 0.05708893],
        [ 0.01105601],
        [ 0.00979913],
        [-0.03440035],
        [ 0.03356718],
        [ 0.01798711],
        [ 0.04213724],

[array([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]]),
 array([[ 0.02222424],
        [-0.01068647],
        [ 0.00331574],
        [ 0.10007769],
        [ 0.0069369 ],
        [ 0.04942714],
        [ 0.106544  ],
        [ 0.02157843],
        [ 0.02084724],
        [-0.07039985],
        [ 0.07584543],
        [ 0.03905357],
        [ 0.09048596],

[-0.04689127]


In [217]:
def backward(z, f, i, g, C, o, h, v, outputs, targets, p = params):
    """
    Arguments:
    z -- your concatenated input data  as a list of size m.
    f -- your forget gate computations as a list of size m.
    i -- your input gate computations as a list of size m.
    g -- your candidate computations as a list of size m.
    C -- your Cell states as a list of size m+1.
    o -- your output gate computations as a list of size m.
    h -- your Hidden state computations as a list of size m+1.
    v -- your logit computations as a list of size m.
    outputs -- your outputs as a list of size m.
    targets -- your targets as a list of size m.
    p -- python list containing:
                        W_f -- Weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                        b_f -- Bias of the forget gate, numpy array of shape (n_a, 1)
                        W_i -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                        b_i -- Bias of the update gate, numpy array of shape (n_a, 1)
                        W_g -- Weight matrix of the first "tanh", numpy array of shape (n_a, n_a + n_x)
                        b_g --  Bias of the first "tanh", numpy array of shape (n_a, 1)
                        W_o -- Weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
                        b_o --  Bias of the output gate, numpy array of shape (n_a, 1)
                        W_v -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_v, n_a)
                        b_v -- Bias relating the hidden-state to the output, numpy array of shape (n_v, 1)
    Returns:
    loss -- crossentropy loss for all elements in output
    grads -- lists of gradients of every element in p
    """

    # Unpack parameters
    W_f, W_i, W_g, W_o, W_v, b_f, b_i, b_g, b_o, b_v = p

    # Initialize gradients as zero
    W_f_d = np.zeros_like(W_f)
    b_f_d = np.zeros_like(b_f)

    W_i_d = np.zeros_like(W_i)
    b_i_d = np.zeros_like(b_i)

    W_g_d = np.zeros_like(W_g)
    b_g_d = np.zeros_like(b_g)

    W_o_d = np.zeros_like(W_o)
    b_o_d = np.zeros_like(b_o)

    W_v_d = np.zeros_like(W_v)
    b_v_d = np.zeros_like(b_v)
    
    # Set the next cell and hidden state equal to zero
    dh_next = np.zeros_like(h[0])
    dC_next = np.zeros_like(C[0])
        
    # Track loss
    loss = 0
    
    for t in reversed(range(len(outputs))):
        
        # Compute the cross entropy
        #print(outputs[t][0], targets[t], -np.mean(np.log(outputs[t][0]) * targets[t]))
        loss += (np.square(outputs[t] - targets[t])).mean(axis=None)
        #loss += -np.mean(np.log(outputs[t]) * targets[t])
        # Get the previous hidden cell state
        C_prev= C[t-1]
        
        # Compute the derivative of the relation of the hidden-state to the output gate
        dv = np.copy(outputs[t])
        dv[np.argmax(targets[t])] -= 1

        # Update the gradient of the relation of the hidden-state to the output gate
        W_v_d += np.dot(dv, h[t].T)
        b_v_d += dv

        # Compute the derivative of the hidden state and output gate
        dh = np.dot(W_v.T, dv)
        dh = dh.reshape(hd,1) ##NEWLYADDED
        dh += dh_next
        do = dh * tanh(C[t])
        do = sigmoid(o[t], derivative=True)*do
        
        # Update the gradients with respect to the output gate
        W_o_d += np.dot(do, z[t].T)
        b_o_d += do

        # Compute the derivative of the cell state and candidate g
        dC = np.copy(dC_next)
        dC += dh * o[t] * tanh(tanh(C[t]), derivative=True)
        dg = dC * i[t]
        dg = tanh(g[t], derivative=True) * dg
        
        # Update the gradients with respect to the candidate
        W_g_d += np.dot(dg, z[t].T)
        b_g_d += dg

        # Compute the derivative of the input gate and update its gradients
        di = dC * g[t]
        di = sigmoid(i[t], True) * di
        W_i_d += np.dot(di, z[t].T)
        b_i_d += di

        # Compute the derivative of the forget gate and update its gradients
        df = dC * C_prev
        df = sigmoid(f[t]) * df
        W_f_d += np.dot(df, z[t].T)
        b_f_d += df

        # Compute the derivative of the input and update the gradients of the previous hidden and cell state
        dz = (np.dot(W_f.T, df)
             + np.dot(W_i.T, di)
             + np.dot(W_g.T, dg)
             + np.dot(W_o.T, do))
        dh_prev = dz[:hidden_size, :]
        dC_prev = f[t] * dC
        
    grads= W_f_d, W_i_d, W_g_d, W_o_d, W_v_d, b_f_d, b_i_d, b_g_d, b_o_d, b_v_d
    
    # Clip gradients
    grads = clip_gradient_norm(grads)
    
    return loss, grads


# Perform a backward pass
loss, grads = backward(z_s, f_s, i_s, g_s, C_s, o_s, h_s, v_s, outputs, targets, params)

print('We get a loss of:')
print(loss)

We get a loss of:
120.8080614520397


In [219]:
# Hyper-parameters
num_epochs = 80

# Initialize a new network
z_size = hd + id # Size of concatenated hidden + input vector
params = init_lstm(hd, id, z_size)

# Initialize hidden state as zeros
hidden_state = np.zeros((hidden_size, 1))

# Track loss
training_loss, validation_loss = [], []

# For each epoch
for i in range(num_epochs):
    
    # Track loss
    epoch_training_loss = 0
    epoch_validation_loss = 0
    count = 0
    
    # For each sentence in validation set
    for inputs in ipt:
        #print(inputs)
        target = opt[count]
        #print(targets)
        # Initialize hidden state and cell state as zeros
        h = np.zeros((hidden_size, 1))
        c = np.zeros((hidden_size, 1))

        # Forward pass
        z_s, f_s, i_s, g_s, C_s, o_s, h_s, outputs = forward(inputs, h, c, params)
        
        # Backward pass
        loss, _ = backward(z_s, f_s, i_s, g_s, C_s, o_s, h_s, v_s, outputs, target, params)

        # Update parameters
        params = update_parameters(params, grads, lr=1e-1)

        # Update loss
        #print(loss)
        epoch_validation_loss += loss
        count+=1
    
#     # For each sentence in training set
#     for inputs, targets in training_set:
        
#         # One-hot encode input and target sequence
#         inputs_one_hot = one_hot_encode_sequence(inputs, vocab_size)
#         targets_one_hot = one_hot_encode_sequence(targets, vocab_size)

#         # Initialize hidden state and cell state as zeros
#         h = np.zeros((hidden_size, 1))
#         c = np.zeros((hidden_size, 1))

#         # Forward pass
#         z_s, f_s, i_s, g_s, C_s, o_s, h_s, v_s, outputs = forward(inputs_one_hot, h, c, params)
        
#         # Backward pass
#         loss, grads = backward(z_s, f_s, i_s, g_s, C_s, o_s, h_s, v_s, outputs, targets_one_hot, params)
        
#         # Update parameters
#         params = update_parameters(params, grads, lr=1e-1)
        
#         # Update loss
#         epoch_training_loss += loss
                
#     # Save loss for plot
#     training_loss.append(epoch_training_loss/len(training_set))
    validation_loss.append(epoch_validation_loss/len(ipt))
    #print(epoch_validation_loss)
    # Print loss every 10 epochs
    if i % 10 == 0:
        print(f'Epoch {i}, validation loss: {validation_loss[-1]}')
        print(outputs)
        #print(len(ipt))
#        print(targets)

    
# # Get first sentence in test set
# inputs, targets = ipt[1],opt[1]

# # Initialize hidden state as zeros
# h = np.zeros((hidden_size, 1))
# c = np.zeros((hidden_size, 1))

# # Forward pass
# z_s, f_s, i_s, g_s, C_s, o_s, h_s, outputs = forward(inputs, h, c, params)

# # Print example
# print('Input sentence:')
# print(inputs)

# print('\nTarget sequence:')
# print(targets)

# print('\nPredicted sequence:')
# print(outputs)

# Plot training and validation loss
# epoch = np.arange(len(training_loss))
# plt.figure()
# #plt.plot(epoch, training_loss, 'r', label='Training loss',)
# plt.plot(epoch, validation_loss, 'b', label='Validation loss')
# plt.legend()
# plt.xlabel('Epoch'), plt.ylabel('NLL')
# plt.show()

Epoch 0, validation loss: 125.42283594606177
[[0.00516114]]
Epoch 10, validation loss: 95.82671764544098
[[1.417194]]
Epoch 20, validation loss: 70.16013977797385
[[2.83620672]]
Epoch 30, validation loss: 48.499181421052896
[[4.25680571]]
Epoch 40, validation loss: 30.881054030361497
[[5.67420627]]
Epoch 50, validation loss: 17.297059774955283
[[7.08678128]]
Epoch 60, validation loss: 7.72009782878229
[[8.49532337]]
Epoch 70, validation loss: 2.126248937828942
[[9.90141267]]


In [227]:
inp = ipt[5]
print(inp)
print("====")
z_s, f_s, i_s, g_s, C_s, o_s, h_s, outputs = forward(inp, h, c, params)
print(outputs)
print("$$$$$")

[[1]
 [2]]
====
[[11.18936986]]
$$$$$
