In [24]:
import numpy as np

In [25]:
# data I/O
# Read the whole training corpus into memory; the context manager makes sure
# the file handle is closed even if reading fails.
with open('input.txt', 'r') as corpus_file:  # should be simple plain text file
    data = corpus_file.read()

# Sort the vocabulary so the character <-> index mapping is identical on every
# run; iterating a bare set of strings yields an order that changes between
# interpreter sessions because of string hash randomisation.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))

# Lookup tables between characters and their integer ids.
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

data has 1115394 characters, 65 unique.


In [26]:

# --- hyperparameters -------------------------------------------------------
hidden_size = 100      # width of the LSTM hidden and cell state
seq_length = 25        # characters per training chunk
learning_rate = 1e-1

# Gaussian initialisation settings for the weight matrices.
weight_std = 0.01
weight_mean = 0.0

# --- gate weights ----------------------------------------------------------
# One matrix per gate (forget, input, output, candidate). Each acts on the
# stacked [hidden state; one-hot input] column, hence vocab_size + hidden_size
# columns. The generator is consumed in order, so the random draws happen in
# the sequence Wf, Wi, Wo, Wg.
Wf, Wi, Wo, Wg = (
    np.random.randn(hidden_size, vocab_size + hidden_size) * weight_std + weight_mean
    for _ in range(4)
)

# Projection from the hidden state to per-character logits.
Wy = np.random.randn(vocab_size, hidden_size) * weight_std

# --- biases (all start at zero) --------------------------------------------
bf, bi, bo, bg = (np.zeros((hidden_size, 1)) for _ in range(4))
by = np.zeros((vocab_size, 1))

# Everything the model learns lives in a single dict keyed by name.
params = dict(
    Wf=Wf, Wi=Wi, Wo=Wo, Wg=Wg, Wy=Wy,
    bf=bf, bi=bi, bo=bo, bg=bg, by=by,
)

# Gradient buffers: one zero array per parameter, stored under 'd' + name.
params.update({
    'd' + name: np.zeros_like(value)
    for name, value in list(params.items())
})

# Per-parameter memory buffers, stored under 'm_' + name; the gradient
# entries added just above (keys starting with 'd') are skipped.
params.update({
    'm_' + name: np.zeros_like(value)
    for name, value in list(params.items())
    if name[0] != 'd'
})



In [27]:

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def zero_grad(params):
    """Reset every gradient entry of ``params`` to zeros.

    An entry counts as a gradient when its key starts with 'd'. Each one is
    rebound to a freshly allocated zero array of the same shape and dtype;
    the previous array object is left untouched.
    """
    gradient_names = [name for name in params if name[0] == 'd']
    for name in gradient_names:
        params[name] = np.zeros_like(params[name])


def clip_grad(params):
    """Clamp every gradient entry of ``params`` to the range [-1, 1] in place.

    An entry counts as a gradient when its key starts with 'd'. The arrays are
    modified in place, so existing references to them see the clipped values.
    """
    for name, value in params.items():
        if name[0] != 'd':
            continue
        np.clip(value, -1, 1, out=value)


def forward(x, hprev, cprev, params):
    """Run one LSTM time step followed by a softmax over characters.

    Args:
        x: input column vector for this step; callers in this file pass a
            one-hot vector of shape (vocab_size, 1).
        hprev: hidden state from the previous step, a column vector.
        cprev: cell state from the previous step, same shape as ``hprev``.
        params: dict holding the gate weights 'Wf', 'Wi', 'Wo', 'Wg', the
            output projection 'Wy' and the biases 'bf', 'bi', 'bo', 'bg', 'by'.

    Returns:
        ``(cache, params)`` where ``cache`` is the tuple
        ``(z, f, i, g, o, c, h, y, p)`` of intermediate values needed by the
        backward pass, and ``params`` is the same dict that was passed in.
    """
    # Stack hidden state on top of the input so each gate needs one matmul.
    # np.vstack is used directly; np.row_stack is only a deprecated alias.
    z = np.vstack((hprev, x))

    # Gate pre-activations.
    zf = params['Wf'] @ z + params['bf']
    zi = params['Wi'] @ z + params['bi']
    zo = params['Wo'] @ z + params['bo']
    zg = params['Wg'] @ z + params['bg']

    f = sigmoid(zf)   # forget gate
    i = sigmoid(zi)   # input gate
    o = sigmoid(zo)   # output gate
    g = np.tanh(zg)   # candidate cell values

    c = f * cprev + i * g
    h = o * np.tanh(c)

    # Unnormalised scores per character, then softmax. Subtracting the max
    # leaves the probabilities unchanged mathematically but keeps np.exp from
    # overflowing to inf (and the division from producing nan) on large scores.
    y = params['Wy'] @ h + params['by']
    exp_y = np.exp(y - np.max(y))
    p = exp_y / np.sum(exp_y)

    cache = z, f, i, g, o, c, h, y, p

    return cache, params


def backward(target, dhnext, dcnext, cprev, cache, params):
    """Backpropagate through one LSTM time step and accumulate gradients.

    Args:
        target: integer index of the correct next character for this step.
        dhnext: gradient w.r.t. this step's hidden state coming from the
            following time step.
        dcnext: gradient w.r.t. this step's cell state coming from the
            following time step.
        cprev: cell state that was fed into this step's forward pass.
        cache: the ``(z, f, i, g, o, c, h, y, p)`` tuple produced by the
            forward pass for this step.
        params: parameter dict; the 'dW*' / 'db*' entries are incremented in
            place with this step's contribution.

    Returns:
        ``(dhprev, dcprev, params)``: gradients to hand to the previous time
        step for its hidden and cell state, plus the same ``params`` dict.
    """
    z, f, i, g, o, c, h, y, p = cache

    # Gradient of softmax + cross-entropy w.r.t. the scores: p - one_hot(target).
    dy = np.copy(p)
    dy[target] -= 1

    params['dWy'] += dy @ h.T
    params['dby'] += dy

    # Hidden state receives gradient from the output layer and from the future.
    dh = params['Wy'].T @ dy + dhnext

    tanh_c = np.tanh(c)  # reused for the output gate and the cell gradient

    # Output gate.
    do = dh * tanh_c
    dzo = o * (1 - o) * do
    params['dWo'] += dzo @ z.T
    params['dbo'] += dzo

    # Cell state: contribution from the future plus the path through h.
    dc = np.copy(dcnext)
    dc += dh * o * (1 - tanh_c * tanh_c)

    # Candidate values.
    dg = dc * i
    dzg = dg * (1 - g ** 2)
    params['dWg'] += dzg @ z.T
    params['dbg'] += dzg

    # Input gate.
    di = dc * g
    dzi = i * (1 - i) * di
    params['dWi'] += dzi @ z.T
    params['dbi'] += dzi

    # Forget gate.
    df = dc * cprev
    dzf = f * (1 - f) * df
    params['dWf'] += dzf @ z.T
    params['dbf'] += dzf

    # Gradient w.r.t. the stacked [h; x] input. The forward pass stacks the
    # hidden state first, so its rows are the leading ones. The row count is
    # read from the cached hidden state rather than from a module-level
    # global, which keeps this function usable with any hidden size.
    dz = params['Wf'].T @ dzf + params['Wi'].T @ dzi + params['Wg'].T @ dzg + params['Wo'].T @ dzo
    n_hidden = h.shape[0]
    dhprev = dz[:n_hidden, :]
    dcprev = f * dc
    return dhprev, dcprev, params


def lossFun(inputs, targets, hprev, cprev, params):
    """Compute the loss over one character sequence and fill in the gradients.

    Runs the forward pass over every input character, sums the cross-entropy
    against ``targets``, then backpropagates through time. Gradients are
    written into the 'd*' entries of ``params`` (zeroed first, clipped to
    [-1, 1] at the end).

    Args:
        inputs: sequence of integer character indices fed to the network.
        targets: sequence of integer indices of the expected next characters;
            ``targets[t]`` is paired with ``inputs[t]``.
        hprev: hidden state to start the sequence from (column vector).
        cprev: cell state to start the sequence from (column vector).
        params: parameter dict, updated in place with fresh gradients.

    Returns:
        ``(loss, h, c)``: the summed loss and the hidden / cell state after
        the last input. For an empty ``inputs`` the loss is 0 and copies of
        ``hprev`` / ``cprev`` are returned.
    """
    # Take the vocabulary size from the output bias so this function does not
    # depend on a module-level global staying in sync with ``params``.
    n_vocab = params['by'].shape[0]

    h = np.copy(hprev)
    c = np.copy(cprev)
    loss = 0

    # One record per time step: (target index, cell state fed in, forward cache).
    steps = []
    for t in range(len(inputs)):
        x = np.zeros((n_vocab, 1))
        x[inputs[t]] = 1  # one-hot encode the input character

        c_in = c
        cache, params = forward(x, h, c_in, params)
        _z, _f, _i, _g, _o, c, h, _y, p = cache

        loss += -np.log(p[targets[t], 0])
        steps.append((targets[t], c_in, cache))

    zero_grad(params)

    # Nothing flows back into the last step from beyond the sequence. Shapes
    # come from the current state, so an empty sequence is handled as well.
    dhnext = np.zeros_like(h)
    dcnext = np.zeros_like(c)

    for target, c_in, cache in reversed(steps):
        dhnext, dcnext, params = backward(target, dhnext, dcnext, c_in, cache, params)

    clip_grad(params)

    return loss, h, c


In [28]:

def sample(h, c, first_letter_idx, n, params):
    """Generate ``n`` character indices by sampling from the model.

    Args:
        h: hidden state to start from.
        c: cell state to start from.
        first_letter_idx: index of the character fed in at the first step
            (it is not included in the returned list).
        n: number of characters to draw.
        params: parameter dict passed to ``forward``.

    Returns:
        List of the ``n`` sampled character indices, in generation order.
    """
    def one_hot(index):
        vec = np.zeros((vocab_size, 1))
        vec[index] = 1
        return vec

    x = one_hot(first_letter_idx)
    indexes = []
    for _ in range(n):
        cache, params = forward(x, h, c, params)
        # The cache layout is (z, f, i, g, o, c, h, y, p); only the new cell
        # state, hidden state and probabilities are needed here.
        _z, _f, _i, _g, _o, c, h, _y, probs = cache

        # Draw the next character according to the predicted distribution,
        # then feed it back in as the next input.
        letter_index = np.random.choice(range(vocab_size), p=probs.ravel())
        indexes.append(letter_index)
        x = one_hot(letter_index)

    return indexes

In [20]:

# n counts training iterations, p is the read position in ``data``.
n, p = 0, 0
# Starting value for the running loss: cross-entropy of a uniform guess over
# the vocabulary, summed over one sequence.
smooth_loss = -np.log(1.0 / vocab_size) * seq_length

# Runs until the kernel is interrupted; there is no stopping condition.
while True:
    # On the first iteration, and whenever the next chunk (plus its one-char
    # lookahead for the targets) would run past the end of the data, reset
    # the recurrent state and start again from the beginning.
    if p + seq_length + 1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size, 1))
        cprev = np.zeros((hidden_size, 1))
        p = 0
    # Targets are the inputs shifted one character to the right.
    inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
    targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

    # forward seq_length characters through the net and fetch gradient
    loss, hprev, cprev = lossFun(inputs, targets, hprev, cprev, params)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    if n % 100 == 0:
        # hprev / cprev now describe the text up to and including inputs[-1],
        # so the character that follows that state is targets[-1]; seed the
        # sampler with it rather than with the first character of the chunk.
        sample_ix = sample(hprev, cprev, targets[-1], 200, params)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt,))
        print('iter %d, loss: %f' % (n, smooth_loss))

    # Adagrad update: accumulate squared gradients in the 'm_*' buffers and
    # scale each step by the inverse square root of that running sum.
    for key in list(params.keys()):
        if key[0] != 'd' and key[0] != 'm':
            mem_key = 'm_' + key
            dkey = 'd' + key
            params[mem_key] += params[dkey] * params[dkey]
            params[key] += -learning_rate * params[dkey] / np.sqrt(params[mem_key] + 1e-8)
    p += seq_length  # advance to the next chunk
    n += 1



IndexError: arrays used as indices must be of integer (or boolean) type