In [3]:
import numpy as np

def sigmoid(v):
    """Logistic sigmoid ``1 / (1 + exp(-v))`` for a scalar or array-like ``v``."""
    # The exponent must be negated: 1/(1+exp(+v)) is the mirrored curve
    # 1 - sigmoid(v), which would invert every gate activation.
    # np.negative (rather than unary minus) keeps plain lists working as input.
    return 1.0/(1.0 + np.exp(np.negative(v)))

# forward(t) returns nothing: it stores the step-t gate, candidate, cell-state
# and hidden values in the module-level per-timestep lists
# params should be object with 1 dim for each h_t, x_t, cin_t and
#   for those values each parameters for the input, forget, candidate and output gates
# candidate gate is special - uses tanh, others sigmoid
# Then we update the candidate state and output the hidden values
# c_t = f_t * c_t-1 + i_t * cin_t
def forward(t):
    """Run one LSTM forward step for timestep ``t``.

    Reads the module-level parameters (``wx*``, ``wh*``, ``b*``), the input
    ``x[t]`` and the previous state ``h[t-1]`` / ``c[t-1]``, and writes
    ``inp[t]``, ``f[t]``, ``o[t]``, ``cin[t]``, ``c[t]`` and ``h[t]`` in place.
    Nothing is returned.

    Note: for ``t == 0`` the index ``t-1`` is ``-1``, i.e. the last element of
    ``h`` / ``c`` is used as the previous state.
    """
    h_prev = h[t-1]
    # gates (sigmoid squashes each to (0, 1))
    inp[t] = sigmoid(np.dot(wxi, x[t]) + whi*h_prev + bi)
    f[t] = sigmoid(np.dot(wxf, x[t]) + whf*h_prev + bf)
    o[t] = sigmoid(np.dot(wxo, x[t]) + who*h_prev + bo)
    # input transform (candidate value, tanh instead of sigmoid)
    cin[t] = np.tanh(np.dot(wxcin, x[t]) + whcin*h_prev + bcin)
    # state update: c_t = f_t * c_{t-1} + i_t * cin_t
    # The forget gate scales the PREVIOUS cell state, not the candidate.
    c[t] = f[t]*c[t-1] + inp[t]*cin[t]
    h[t] = o[t]*np.tanh(c[t])
    
# initiate i, f, o, cin, c, h for t steps
# they should all be scalar values?

# calculate for all timesteps
# for t in range(1, T):
#     forward(t)
    
def backward(t):
    """Accumulate the weight gradients for timestep ``t``.

    Reads the module-level forward-pass values (``h``, ``c``, ``o``, ``inp``,
    ``f``, ``cin``), the inputs ``x`` and targets ``y``, and adds this step's
    contribution to the module-level accumulators ``wi_deriv``, ``wf_deriv``,
    ``wo_deriv`` and ``wcin_deriv``. Those accumulators must already exist
    before this is called; their expected shape is not fixed here (each update
    is ``np.outer(gate_gradient, xc)``).
    """
    # Augmented assignment would otherwise make these names function-local and
    # fail with UnboundLocalError on first use.
    global wi_deriv, wf_deriv, wo_deriv, wcin_deriv

    # calculate loss at some time t, whatever loss function is used
    # calculate the derivative of the loss using h[t] as the predicted value and y[t] as true value
    # this is dL/dh (dLdh)
    dLdh = h[t] - y[t] # FIXME e.g. derivative of the loss function

    # h[t] = o[t]*tanh(c[t])  =>  dh/do = tanh(c[t]),  dh/dc = o[t]*(1 - tanh(c[t])**2)
    tanh_c = np.tanh(c[t])
    dc = o[t]*(1 - tanh_c**2)*dLdh
    do = tanh_c*dLdh
    # gate output derivatives, from c_t = f_t * c_{t-1} + i_t * cin_t
    dcin = inp[t]*dc
    di = cin[t]*dc
    df = c[t-1]*dc
    # gate input derivatives (through sigmoid: s*(1-s); through tanh: 1 - tanh**2)
    di_input = (1-inp[t])*inp[t]*di
    df_input = (1-f[t])*f[t]*df
    do_input = (1-o[t])*o[t]*do
    dcin_input = (1-cin[t]**2)*dcin
    # derivatives wrt inputs
    xc = np.append(x[t], h[t-1]) # xc = (x[t], h[t-1]) as one flat vector
    wi_deriv += np.outer(di_input, xc)
    wf_deriv += np.outer(df_input, xc)
    wo_deriv += np.outer(do_input, xc)
    wcin_deriv += np.outer(dcin_input, xc)
    
# x will be a series of letters; y is the same series shifted by one,
# so y[t] is the character that follows x[t]
word = list("hello")
xchars = word[:-1]
ychars = word[1:]

# represent a series of letters as 1 hot vectors
# The vocabulary is sorted so that the character <-> index mapping is the same
# on every run; iterating a bare set gives no ordering guarantee.
char_to_idx = {char: idx for idx, char in enumerate(sorted(set(word)))}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

def word_to_one_hot_matrix(char_arr):
    """Encode each character of ``char_arr`` as a one-hot row.

    Returns a list with one row per character; every row is a list of
    ``len(char_to_idx)`` integers that is 1 at the character's index in
    ``char_to_idx`` and 0 elsewhere. A character missing from ``char_to_idx``
    raises ``KeyError``.
    """
    rows = []
    for symbol in char_arr:
        one_hot = [0] * len(char_to_idx)
        one_hot[char_to_idx[symbol]] = 1
        rows.append(one_hot)
    return rows

# one-hot encode the input characters (x) and their next-character targets (y)
x, y = (word_to_one_hot_matrix(chars) for chars in (xchars, ychars))

def random_params(size=None):
    """Return a list of ``size`` weights drawn uniformly from [0, 1).

    ``size`` defaults to the vocabulary size ``len(char_to_idx)``: these
    vectors are dotted with, or compared against, one-hot character vectors,
    so they must have one entry per distinct character rather than one per
    timestep.
    """
    if size is None:
        size = len(char_to_idx)
    return [np.random.rand() for _ in range(size)]

def random_bias():
    """Draw a single scalar from the uniform distribution on [0, 1)."""
    bias = np.random.rand()
    return bias

def zero_vector():
    """Return a fresh list of integer zeros, one entry per element of ``xchars``."""
    return [0 for _ in xchars]

In [106]:
# randomly initialise the weights and biases, zero the per-timestep state lists
wxi, wxf, wxo, wxcin, why = [random_params() for i in range(5)]
bi, bf, bo, bcin, whi, whf, who, whcin = [random_bias() for i in range(8)]
inp, f, o, cin, c, h =  [zero_vector() for i in range(6)]
t = 1
forward(t)

# predictions for next letter: softmax over the per-character output scores
out_score = np.dot(h[t], why)
y_preds = np.exp(out_score)/np.sum(np.exp(out_score))
print(y_preds)
# FIXME - but this should be sigmoid?

# loss is the cross entropy -log(p[true class]); y_preds - y[t] below is the
# derivative of that softmax + cross-entropy loss with respect to out_score
# y[t] is one-hot, so argmax is the index of the true character
yidx = int(np.argmax(y[t]))
loss = -np.log(y_preds[yidx])
print(loss)
df = y_preds - y[t]
print(df)

[ 0.25058273  0.25353739  0.23634927  0.25953061]
1.44244459591
[ 0.25058273  0.25353739 -0.76365073  0.25953061]


In [140]:
# gradient of the loss wrt `why` (out_score = h[t]*why) plus the regularization term
reg = 1e-3 # regularization strength
step_size = 1e-0
dW = h[t]*df + reg*np.asarray(why, dtype=float)
# `why` may be a plain Python list, and `list += array` EXTENDS the list
# instead of adding elementwise, so do the update on an array explicitly.
why = np.asarray(why, dtype=float) - step_size * dW
print(why)

# re-score with the updated weights; the loss on the same example should drop
new_out_score = np.dot(h[t], why)
new_y_preds = np.exp(new_out_score)/np.sum(np.exp(new_out_score))
loss = -np.log(new_y_preds[yidx])
print(loss)

[-0.08242533  0.01134818  2.32191714  0.19794776]
1.20137988934
