In [465]:
import numpy as np

# data I/O
with open('names.txt', 'r') as f: 
    data = f.read()
chars = sorted(list(set(data)))
data_size, vocab_size = len(data), len(chars)
stoi = {j:i for i, j in enumerate(chars)}
itos = {i:j for i, j in enumerate(chars)}

print(f'Data has {data_size} characters, {vocab_size} unique.')

Data has 228145 characters, 27 unique.


In [466]:
# hyperparameters
seq_length = 25
hidden_size = 100

np.random.seed(1234)
# parameters 
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 
Why = np.random.randn(vocab_size, hidden_size) * 0.01
bh = np.zeros((hidden_size, 1))
by = np.zeros((vocab_size, 1))

In [467]:
# inputs and targets 
inputs = [stoi[ch] for ch in data[0:seq_length]]
targets = [stoi[ch] for ch in data[1:seq_length+1]]

In [468]:
# forward pass 
xs, hs, ys, ps = {}, {}, {}, {}
loss = 0
hs[-1] = np.zeros((hidden_size, 1))

for t in range(len(inputs)):
    # convert input to one-hot 
    xs[t] = np.zeros((vocab_size, 1))
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(Wxh @ xs[t] + Whh @ hs[t-1] + bh)
    ys[t] = Why @ hs[t] + by
    # cross entropy 
    ps[t] = ys[t]
    #ps[t] -= np.max(ys[t])
    ps[t] = np.exp(ps[t]) / np.sum(np.exp(ps[t]))
    loss += -np.log(ps[t][targets[t]].item())
print(loss)


82.39381010667938


In [469]:
# backward pass
dWhh = np.zeros_like(Whh)
dWxh = np.zeros_like(Wxh)
dWhy = np.zeros_like(Why)
dbh = np.zeros_like(bh)
dby = np.zeros_like(by)
dhnext = np.zeros_like(hs[0])

for t in reversed(range(len(inputs))):
    dtarget = ps[t]
    dtarget[targets[t]] -= 1
    dby += dtarget
    dWhy += dtarget @ hs[t].T
    dh = Why.T @ dtarget + dhnext
    dh_before_tanh = (1 - hs[t]**2) * dh 
    dbh += dh_before_tanh
    dWxh += dh_before_tanh @ xs[t].T
    dWhh += dh_before_tanh @ hs[t-1].T 
    dhnext = Whh.T @ dh_before_tanh

In [470]:
# update
lr = 0.1
for param, dparam in zip([Whh, Wxh, Why, bh, by],
                         [dWhh, dWxh, dWhy, dbh, dby]):
    param += -lr * dparam

In [471]:
# sample 
h = np.zeros((hidden_size, 1))
x = np.zeros((vocab_size, 1))
x[0, 0] = 1

result_idx = []

for _ in range(100):
    h = np.tanh(Wxh @ x + Whh @ h + bh)
    y = Why @ h + by
    p = np.exp(y) / np.sum(np.exp(y))
    idx = np.random.choice(range(vocab_size), p=p.ravel())

    result_idx.append(idx)

    x = np.zeros((vocab_size, 1))
    x[idx] = 1

result_char = [itos[i] for i in result_idx]
result = ''.join(result_char)
print(result)

xafznv
iftygfnxbiqpacnmftmpijxaurzmevwxoejz
czqadskhly
havddyvqxitnleyiizmvgepwirdbyg
wsrajjognoalak


In [476]:
# load data 

load_idx = 0
while load_idx + 1 + seq_length < data_size:
    inputs = data[load_idx:load_idx+seq_length]
    targets = data[load_idx+1:load_idx+seq_length+1]
    load_idx += seq_length

## Put them together

In [12]:
import numpy as np
np.random.seed(1234)

# data I/O
with open('names.txt', 'r') as f: 
    data = f.read()
chars = sorted(list(set(data)))
data_size, vocab_size = len(data), len(chars)
stoi = {j:i for i, j in enumerate(chars)}
itos = {i:j for i, j in enumerate(chars)}

print(f'Data has {data_size} characters, {vocab_size} unique.')

# hyperparameters
seq_length = 50
hidden_size = 300
lr = 1e-5

# parameters initialisation 
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 
Why = np.random.randn(vocab_size, hidden_size) * 0.01
bh = np.zeros((hidden_size, 1))
by = np.zeros((vocab_size, 1))

# RMSprop 
decay_rate = 0.99
mWhh = np.zeros_like(Whh)
mWxh = np.zeros_like(Wxh)
mWhy = np.zeros_like(Why)
mbh = np.zeros_like(bh)
mby = np.zeros_like(by)

# total iteration counter
iteration = 0
# index for loading data
load_idx = 0
# initial hidden state
hprev = np.zeros((hidden_size, 1))

# print some info
print(f'One epoch = {data_size/seq_length} iterations.')

while True:
    #------------------------------------------------------------------------------
    # reset loading index
    if load_idx + 1 + seq_length >= data_size:
        load_idx = 0
        hprev = np.zeros((hidden_size, 1))

    #------------------------------------------------------------------------------
    # load inputs and targets 
    inputs = [stoi[ch] for ch in data[load_idx:load_idx+seq_length]]
    targets = [stoi[ch] for ch in data[load_idx+1:load_idx+seq_length+1]]
    load_idx += seq_length

    #------------------------------------------------------------------------------
    # forward pass 
    xs, hs, ys, ps = {}, {}, {}, {}
    loss = 0
    hs[-1] = hprev

    for t in range(len(inputs)):
        # convert input to one-hot 
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(Wxh @ xs[t] + Whh @ hs[t-1] + bh)
        ys[t] = Why @ hs[t] + by
        # cross entropy 
        ps[t] = ys[t]
        #ps[t] -= np.max(ys[t])
        ps[t] = np.exp(ps[t]) / np.sum(np.exp(ps[t]))
        loss += -np.log(ps[t][targets[t]].item())
    # save the last hidden state to next initial hidden state
    hprev = hs[len(inputs)-1]

    #------------------------------------------------------------------------------
    # backward pass
    dWhh = np.zeros_like(Whh)
    dWxh = np.zeros_like(Wxh)
    dWhy = np.zeros_like(Why)
    dbh = np.zeros_like(bh)
    dby = np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])

    for t in reversed(range(len(inputs))):
        dtarget = ps[t]
        dtarget[targets[t]] -= 1
        dby += dtarget
        dWhy += dtarget @ hs[t].T
        dh = Why.T @ dtarget + dhnext
        dh_before_tanh = (1 - hs[t]**2) * dh 
        dbh += dh_before_tanh
        dWxh += dh_before_tanh @ xs[t].T
        dWhh += dh_before_tanh @ hs[t-1].T 
        dhnext = Whh.T @ dh_before_tanh

    #------------------------------------------------------------------------------
    # update
    # RMSprop

    for param, dparam, mparam in zip([Whh, Wxh, Why, bh, by],
                            [dWhh, dWxh, dWhy, dbh, dby],
                            [mWhh, mWxh, mWhy, mbh, mby]):
        mparam  = decay_rate * mparam + (1 - decay_rate) * dparam**2
        param += -lr * dparam / (np.sqrt(mparam) + 1e-8)

    #------------------------------------------------------------------------------
    print(f'Iteration : {iteration} | Loss : {loss}')
    iteration += 1

Data has 228145 characters, 27 unique.
One epoch = 4562.9 iterations.
Iteration : 0 | Loss : 164.78918074352006
Iteration : 1 | Loss : 164.76558581000256
Iteration : 2 | Loss : 164.73247050722895
Iteration : 3 | Loss : 164.77303516708812
Iteration : 4 | Loss : 164.73849118355986
Iteration : 5 | Loss : 164.73466550692666
Iteration : 6 | Loss : 164.7255443732766
Iteration : 7 | Loss : 164.70502765145832
Iteration : 8 | Loss : 164.68980661459057
Iteration : 9 | Loss : 164.68344936741588
Iteration : 10 | Loss : 164.6881377422864
Iteration : 11 | Loss : 164.67186767407424
Iteration : 12 | Loss : 164.6357510025456
Iteration : 13 | Loss : 164.611542359241
Iteration : 14 | Loss : 164.60223339434097
Iteration : 15 | Loss : 164.59256797393238
Iteration : 16 | Loss : 164.58292723877332
Iteration : 17 | Loss : 164.52156677429596
Iteration : 18 | Loss : 164.54921626861764
Iteration : 19 | Loss : 164.48120347013827
Iteration : 20 | Loss : 164.52616605081064
Iteration : 21 | Loss : 164.4148575929335


KeyboardInterrupt: 

In [13]:
# sample 
h = np.zeros((hidden_size, 1))
x = np.zeros((vocab_size, 1))
x[0, 0] = 1

result_idx = []

for _ in range(100):
    h = np.tanh(Wxh @ x + Whh @ h + bh)
    y = Why @ h + by
    p = np.exp(y) / np.sum(np.exp(y))
    idx = np.random.choice(range(vocab_size), p=p.ravel())

    result_idx.append(idx)

    x = np.zeros((vocab_size, 1))
    x[idx] = 1

result_char = [itos[i] for i in result_idx]
result = ''.join(result_char)
print(result)

xmkefnieqiqapintenpooevg
veo
wpcmcpesngkkeckbgiwbiewy
xnbopcqj
qiccvwshvepezkpxwgeweoxuncbj
pmbshxcn
