In [1]:
import numpy as np
import pandas as pd

In [28]:
df = pd.read_csv('Chinese_Names_120m.csv')

name = df.iloc[:5,0]

name = name.tolist()

name=''.join(name)

len(set(name))

set(name)

{'奥', '安', '昂', '爱', '艾', '阿'}

In [29]:
df = pd.read_csv('names.csv')
data = ' '.join(df.name.tolist())

In [30]:
# data I/O
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(f'data has {data_size} characters, {vocab_size} unique.')
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

data has 237097 characters, 53 unique.


In [31]:
# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

In [33]:
hidden_size

100

In [41]:
# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

In [65]:
def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [59]:
def sample(h, seed_ix, n):
  """ 
  sample a sequence of integers from the model 
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes

In [43]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0

In [48]:
np.log(vocab_size)*25

99.25729783880305

In [45]:
seq_length

25

In [49]:
hprev = np.zeros((hidden_size,1))

In [51]:
hprev.shape

(100, 1)

In [52]:
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

In [54]:
data[p:p+seq_length]

'Emma Olivia Sophia Isabel'

In [55]:
data[p+1:p+seq_length+1]

'mma Olivia Sophia Isabell'

In [56]:
inputs[0]

44

In [64]:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt, ))

----
 PfKTwPfmTLjbeVZCLEYRbGltieocPNNtsnTWVZNjehJnZLQRvvrMFQNpxCxHQncWpvjBYZYqasQfIdjPqtrPTEYDcCOxngJamYZCVpjyxVEIQRCbvbqpYtPM ZNVXcNiIDRixwFDVnkWUscUzu juwCkYuBqHeUVcyiWELKcoIr nr TKNCDpxLGyyKENSvJnmvweRQQ 
----


In [66]:
loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)

In [100]:
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0

In [101]:
t=0

In [102]:
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)

In [141]:
hs[-1].shape

(100, 1)

In [138]:
ps[t][targets[t]][0]

0.0398582039920804

In [132]:
ps[t]

array([[1.22462687e-02],
       [3.70884873e-04],
       [1.47718282e-01],
       [4.91778089e-04],
       [1.43579039e-02],
       [1.94613594e-03],
       [3.98582040e-02],
       [1.83239784e-03],
       [1.40377295e-03],
       [2.25524356e-03],
       [1.86338027e-02],
       [1.82163330e-03],
       [1.36034842e-03],
       [5.34832575e-03],
       [5.31882994e-03],
       [2.56723943e-02],
       [2.08954666e-01],
       [1.67814105e-04],
       [5.43953948e-02],
       [1.23416029e-02],
       [2.71522941e-02],
       [7.07506732e-04],
       [3.90479874e-03],
       [3.07619937e-03],
       [6.03594339e-04],
       [2.60599790e-02],
       [3.70108202e-05],
       [2.20467155e-03],
       [8.62683131e-03],
       [6.98119822e-04],
       [5.34681858e-03],
       [2.66708994e-03],
       [1.11460252e-02],
       [1.62131224e-03],
       [4.59798728e-03],
       [2.41688939e-03],
       [7.15391514e-04],
       [1.00846412e-03],
       [7.64864017e-04],
       [1.19025338e-02],


In [126]:
ps[t].argmax()

16

In [117]:
Wxh[:,10]

array([-0.11848706, -0.05649749,  0.02691294,  0.19014995,  0.07061441,
       -1.16252144,  0.12697324, -0.08889963, -0.06267175, -0.40105323,
       -0.18747102, -0.18140617,  0.07432117,  0.34334437,  0.06726984,
        0.20165935,  0.32747234,  0.04495337,  0.1142929 , -0.05521551,
        0.00885307,  0.33516984,  0.13067932, -0.10904185,  0.19814699,
        0.0913149 , -0.06236525, -0.01253117,  0.41185781, -0.3912567 ,
       -0.1494085 ,  0.36435163, -0.44824475, -0.09278199,  0.34402929,
       -0.15928979, -0.26921809,  0.05811367, -0.13787257,  0.20121986,
        0.01388096,  0.07998907,  0.05387593,  0.0689815 ,  0.26390591,
        0.0065182 , -0.09641561,  0.20798219,  0.07988806,  0.20825779,
       -0.26811509, -0.03696904, -0.16780612, -0.38446364,  0.28463213,
        0.14157797, -0.19357392, -0.16630077,  0.11378831,  0.25725707,
        0.17818427,  0.008875  , -0.06520907,  0.12799508,  0.22632408,
       -0.17792747,  0.19119546, -0.04671774, -0.45145593,  0.05

In [74]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p+seq_length+1 >= len(data) or n == 0: 
    hprev = np.zeros((hidden_size,1)) # reset RNN memory
    p = 0 # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

  # sample from the model now and then
  if n % 100 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt, ))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 100 == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
  
  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
                                [dWxh, dWhh, dWhy, dbh, dby], 
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

  p += seq_length # move data pointer
  n += 1 # iteration counter 

----
 SNPHDlck jNDcyFlcRmyiBOycoA ZJvPDWJNXuXDwPESsqEvRiFdp  iVxf NdPaR oQAjRlyKkZwcRNiOfGPpJdhIicBVRaGBzJzADedAlTVTBxShJXPIdoGPRpmcXpUqzDXJzqcowKGmvcY  xGtDRdSKUFIatgziyihFlhHYWjdzrCRfKhimxMIRKfOdHgngXtoXY 
----
iter 0, loss: 99.257297
----
 ig  yCa HiniAsc arm X sKPDhaaaiey AACsi JsLea n HalrLyTiPraiLeyilFm a LsnASg rarikabn ErH aml ise linrCyLrKgark yslaLuAahrlaJfcePrexsAl ljhei ile al ay aeLaa cMASheKlaaysaS VoiJli dEHaHdaen Asinl SKes 
----
iter 100, loss: 99.583680
----
  esi Aauei hl ClnImlti EZiWeZeniyiivliai aesaelistsnsLWemiM C el ega a use shniiiisarnDaeatiandeyllid Zi ataytne Aa  en gm iyioa AZa Vlie KieinhlisauaseaJ a AsisKa  Jm ne rlnrrtash asna lisaterali Aae 
----
iter 200, loss: 97.445989
----
 a Cyl Mla LKila Ksieniyhn feplialent ikunea MiaJ YMel Koallmitn Rorony Kmyeni Kriaeelna Cyttl Ala HaArl Aoeas Fsbly loney Ca AKanrmen nhe ngA Con CMlnn Naa Cyatsyna oenGey Rhls naeahn ZMPvnai yryiir h 
----
iter 300, loss: 94.941136
----
 sal Liiiada Mn Kararana lna lil ealei Mr

KeyboardInterrupt: 

In [None]:
df = pd