In [1]:
# Data
import numpy as np

# if __name__ == '__main__':
with open('data/text_data/japan.txt', 'r') as f:
    txt = f.read()

    X = []
    y = []

    char_to_idx = {char: i for i, char in enumerate(set(txt))}
    idx_to_char = {i: char for i, char in enumerate(set(txt))}
    
    X = [char_to_idx[x] for x in txt]
    X = np.array(X)
    y = [char_to_idx[x] for x in txt[1:]]
    y.append(char_to_idx['.'])
    y = np.array(y)

In [2]:
# Model
import impl.layer as l

class RNN:

    def __init__(self, D, H, L, char2idx, idx2char, p_dropout):
        self.D = D
        self.H = H
        self.L = L
        self.char2idx = char2idx
        self.idx2char = idx2char
        self.vocab_size = len(char2idx)
        self.losses = {'train':[], 'smooth train':[]}
        self.p_dropout = p_dropout
        
        # Model parameters
        m = dict(
            Wxh=np.random.randn(D, H) / np.sqrt(D / 2.),
            Whh=np.random.randn(H, H) / np.sqrt(H / 2.),
            Why=np.random.randn(H, D) / np.sqrt(H / 2.),
            bh=np.zeros((1, H)),
            by=np.zeros((1, D))
            )
        self.model = []
        for _ in range(self.L):
            self.model.append(m)
            
    def initial_state(self):
        return np.zeros((1, self.H))
    
    def dropout_forward(self, X, p_dropout):
        u = np.random.binomial(1, p_dropout, size=X.shape) / p_dropout
        #         u = np.random.binomial(1, q, size=X.shape)
        out = X * u
        cache = u
        return out, cache

    def dropout_backward(self, dout, cache):
        dX = dout * cache
        return dX

    def forward(self, X, h, m, train):
        Wxh, Whh, Why = m['Wxh'], m['Whh'], m['Why']
        bh, by = m['bh'], m['by']

        hprev = h.copy()
        X_one_hot = X.copy()
    
        X = (X_one_hot @ Wxh) + (hprev @ Whh) + bh
        h, h_cache = l.tanh_forward(X)
        
        y, y_cache = l.fc_forward(h, Why, by)

        # Dropout for training
        if train:
            y, y_do_cache = self.dropout_forward(X=y, p_dropout=self.p_dropout)
            cache = (X_one_hot, Wxh, hprev, Whh, h_cache, y_cache, y_do_cache)
        else:
            cache = (X_one_hot, Wxh, hprev, Whh, h_cache, y_cache)

        return y, h, cache

    def backward(self, dy, dh, cache):
        #         if train: 
        # Backward is performed for learning/training and not testing or validation.
        X_one_hot, Wxh, hprev, Whh, h_cache, y_cache, y_do_cache = cache
        dy = self.dropout_backward(dout=dy, cache=y_do_cache)

        dh_next = dh.copy()
        
        # Hidden to output gradient
        dh, dWhy, dby = l.fc_backward(dy, y_cache)
        dh += dh_next
        dby = dby.reshape((1, -1))

        # tanh
        dX_one_hot = l.tanh_backward(dh, h_cache)

        # Hidden gradient
        dbh = dX_one_hot * 1.0
        dWhh = hprev.T @ dX_one_hot
        dWxh = X_one_hot.T @ dX_one_hot
        
        dX = dX_one_hot @ Wxh.T
        dh = dX_one_hot @ Whh.T

        grad = dict(Wxh=dWxh, Whh=dWhh, Why=dWhy, bh=dbh, by=dby)
        
        return dX, dh, grad

    def train_forward(self, X_train, h):
        ys, caches = [], []
        h_init = h.copy()
        h = []
        for _ in range(self.L):
            h.append(h_init.copy())
            caches.append([])
            
        for X in X_train:
            X_one_hot = np.zeros(self.D)
            X_one_hot[X] = 1.
            X = X_one_hot.reshape(1, -1)
            for layer in range(self.L):
                y, h[layer], cache = self.forward(X, h[layer], self.model[layer], train=True)
                caches[layer].append(cache)
                X = y.copy() # the output for the previous layer is the input for the next layer
            ys.append(y)
            
        return ys, caches

    def cross_entropy(self, y_pred, y_train):
        m = y_pred.shape[0]

        prob = l.softmax(y_pred)
        log_like = -np.log(prob[range(m), y_train])
        data_loss = np.sum(log_like) / m

        return data_loss

    def dcross_entropy(self, y_pred, y_train):
        m = y_pred.shape[0]

        grad_y = l.softmax(y_pred)
        grad_y[range(m), y_train] -= 1.0
        grad_y /= m

        return grad_y
    
    def loss_function(self, y_train, ys):
        loss, dys = 0.0, []

        for y_pred, y in zip(ys, y_train):
            loss += self.cross_entropy(y_pred, y)
            dy = self.dcross_entropy(y_pred, y)
            dys.append(dy)
            
        return loss, dys

    def train_backward(self, dys, caches):
        dh, grad, grads = [], [], []
        for layer in range(self.L):
            dh.append(np.zeros((1, self.H)))
            grad.append({key: np.zeros_like(val) for key, val in self.model[layer].items()})
            grads.append({key: np.zeros_like(val) for key, val in self.model[layer].items()})
        
        dXs = []
        for t in reversed(range(len(dys))):
            dy = dys[t]
            for layer in reversed(range(self.L)):
                dX, dh[layer], grad[layer] = self.backward(dy, dh[layer], caches[layer][t]) # train=True
                for key in grad[layer].keys():
                    grads[layer][key] += grad[layer][key]
                dy = dX.copy() # The input for the next layer is the output for the previous layer
            dXs.append(dX)
            
        return dXs, grads
    
    def test(self, X_seed, h, size):
        chars = [self.idx2char[X_seed]]
        idx_list = list(range(self.vocab_size))
        X = X_seed
        
        h_init = h.copy()
        h = []
        for _ in range(self.L):
            h.append(h_init.copy())

        for _ in range(size):
            X_one_hot = np.zeros(self.D)
            X_one_hot[X] = 1.
            X = X_one_hot.reshape(1, -1)
            for layer in range(self.L):
                y, h[layer], _ = self.forward(X, h[layer], self.model[layer], train=False)
                X = y.copy()
                
            prob = l.softmax(y)
            idx = np.random.choice(idx_list, p=prob.ravel())
            chars.append(self.idx2char[idx])
            X = idx

        return ''.join(chars)

In [3]:
def get_minibatch(X, y, minibatch_size, shuffle):
    minibatches = []

    for i in range(0, X.shape[0], minibatch_size):
    # for i in range(0, X.shape[0] - minibatch_size + 1, 1):
        X_mini = X[i:i + minibatch_size]
        y_mini = y[i:i + minibatch_size]
        minibatches.append((X_mini, y_mini))

    return minibatches

def adam_rnn(nn, X_train, y_train, alpha, mb_size, n_iter, print_after):
    M, R = [], []
    for layer in range(nn.L):
        M.append({key: np.zeros_like(val) for key, val in nn.model[layer].items()})
        R.append({key: np.zeros_like(val) for key, val in nn.model[layer].items()})
        
    beta1 = .99
    beta2 = .999
    state = nn.initial_state()
    eps = 1e-8
    smooth_loss = 1.
    minibatches = get_minibatch(X_train, y_train, mb_size, shuffle=False)

    # Epochs
    for iter in range(1, n_iter + 1):

        # No full batch or files
        # Minibatches
        for idx in range(len(minibatches)):
            X_mini, y_mini = minibatches[idx]
            ys, caches = nn.train_forward(X_mini, state)
            loss, dys = nn.loss_function(y_mini, ys)
            _, grads = nn.train_backward(dys, caches)
            nn.losses['train'].append(loss)
            smooth_loss = (0.999 * smooth_loss) + (0.001 * loss)
            nn.losses['smooth train'].append(smooth_loss)

            for layer in range(nn.L):
                for key in grads[layer].keys(): #key, value: items
                    M[layer][key] = l.exp_running_avg(M[layer][key], grads[layer][key], beta1)
                    R[layer][key] = l.exp_running_avg(R[layer][key], grads[layer][key]**2, beta2)

                    m_k_hat = M[layer][key] / (1. - (beta1**(iter)))
                    r_k_hat = R[layer][key] / (1. - (beta2**(iter)))

                    nn.model[layer][key] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + eps)
    
        # Print loss and test sample
        if iter % print_after == 0:
            print('Iter-{} loss: {:.4f}'.format(iter, loss))
            sample = nn.test(X_mini[0], state, size=100)
            print(sample)
            
    return nn

In [None]:
# Hyper-parameters
time_step = 100 # width, minibatch size and test sample size as well
num_layers = 1 # depth
n_iter = 10000 # epochs
alpha = 1e-4 # learning_rate
p_dropout = 0.95 # q=1-p, q=keep_prob and p=dropout.
print_after = 10 # n_iter//10 # print training loss, valid, and test
num_hidden_units = 64 # num_hidden_units in hidden layer
num_input_units = len(char_to_idx) # vocab_size = len(char_to_idx)

# Build the network and learning it or optimizing it using SGD
net = RNN(D=num_input_units, H=num_hidden_units, L=num_layers, char2idx=char_to_idx, idx2char=idx_to_char, 
          p_dropout=p_dropout)

# Start learning using BP-SGD-ADAM
adam_rnn(nn=net, X_train=X, y_train=y, alpha=alpha, mb_size=time_step, n_iter=n_iter, print_after=print_after)

# # Display the learning curve and losses for training, validation, and testing
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

plt.plot(net.losses['train'], label='Train loss')
plt.plot(net.losses['smooth train'], label='Train smooth loss')
plt.legend()

plt.show()

Iter-10 loss: 124.0747
e(cgBbHjSe,lm本KbC9MU2WlcmnwNti;cx:%日RiW"e–iw)e本ntcf2EeaW1iLa本kDEiubMMNtd5js7;oL6FtD本4s)hN7duasryS–elt
Iter-20 loss: 106.2611
e日NA3"yTd7sy8rra.NeUout 9cw nwes wfst–PMftrihPhrtrafahota  dO neelews–iheai1 ttafIkb Dtw   ieh a" aeT
Iter-30 loss: 108.0775
ePthmMr BhntbR  Ugk ara i uh2 sbeAuoolehs airttDuncsnt  akprti- nd brdet yspgoodicE8a stf rprclSn 2sa
Iter-40 loss: 106.0726
eE1vOill i-l 8iis iel oiyus  iide 'fa Cedto n eliWelre-thn pidcMi sWnwri8lCh dw g Cil oi  kh 6lstkoef
Iter-50 loss: 104.1235
eoThs t ttaahs mtagpbse smnom aa var rtoyr dp a-tip sfhhise tlaaldo nisttadd z lic'a.eii  itlS  i Sh 
Iter-60 loss: 104.0994
e.(vra xuusrJ nttese iyr tn vperhwrdiracredaia1 plgnfoandvi  paiaterearooewemdandroe  ri ners eaa,an 
Iter-70 loss: 99.3134
eBs1ni1uhts1 rerfhn erf Ir7f etnt tJchwgl5 ia  odotcdnierh inmeohiehntta6 he ilhta na ieor,ag's cpg h
Iter-80 loss: 100.9485
e"RDtf 's gpl
dBioyxte.aaotheortnteai diat.e ia  oe i日  iemT sot  ofynd etr  ar1Jvesaea "e  )cvaftsut
I

Iter-670 loss: 65.2916
eelhiventhe oi9gkof wer se nioyts. eabl JheaSestcwofit condd, the 1agte ralegeat in Rasincewor sicnie
Iter-680 loss: 57.5149
e'n fasd aane congert wipan eatae Winecinrm taro wan ala aus fboas ,the, Tatcalee. thr Rnsfeirtr ws b
Iter-690 loss: 64.6545
ecGar  file lople matr es aatdosinas an 18etn fhr .yprt warad tard tf arrtey aso oid ,ond ,hys  arcok
Iter-700 loss: 64.6340
e, aod Eurso%f rh terrir tan and ioNin Oyg
n.Mpartiatc nsich' ro4n, tht 1r. phr Tsi5he RoC,lae  acato
Iter-710 loss: 62.6590
eb2Ty Javooes aiutd r94tran coI pob8 bi an fedand it the Rlfmiehtinb eree oNit ,oft itegd opothi ithn
Iter-720 loss: 58.3846
enIree. ropJongd1so angisoal theyRh lomd th isN ily lomiss ano1desLesine cerld rBthi fNLsxin1 gad fnP
Iter-730 loss: 59.9207
e the Serrhxdiby anenIeilgcred 3eaead i1un avgstumise ifgr. guveuch 6 pupgrkti teinyd colenciand O"va
Iter-740 loss: 59.3005
e9oce ooktdnl ohise pan faperuth  Eevelcho Uert- iolrd Ry )i5. andena9e at eapan ra, toredf cultstmin


Iter-1330 loss: 42.2477
ew,ius JapeO, eika  of liven ald Ghe berthe the cofli's Icumore lut3h.5J-panc mrrex and Japan insea a
Iter-1340 loss: 47.2673
eJ phn ef ahd R IngeHllohok. Jaiancipanusm agdtid in oito, mhe fopetrrasticeOtobowil Aored War of Eap
Iter-1350 loss: 45.6910
e War of Napk本ats ingloea. thw seuAnt5 ine an 1yr h othithe 18jc wrmpupceataly erlloesturgusofiy eala
Iter-1360 loss: 41.8005
eNp8rusth ioald TotcouAst and chnclfbusy in o-ingond ri, th. s. fira anoT. an Se tho f oey.Rr, istar 
Iter-1370 loss: 44.9484
e6. and coand argise ar onne9ebt bf Japioes xi tienter uratrgs fhes tand is 18Skidsy pandeoche Wigg a
Iter-1380 loss: 51.6137
e:"bopnd Ie mpegt8ry Ashac an CR whe tares fof thin liag Kt the worrored Jipateitis torldes ap.nil ci
Iter-1390 loss: 47.3942
e Nof on khd urpis inUNso"oke –her of onteela wit bin Ciglerooat. baly pived eoery If Gho ffrstd tse 
Iter-1400 loss: 43.1758
e. The s2antod ahd the the hio, the Garit, oapet on ioly, As toperlxeanginstarnisgelWoy sived 

Iter-1990 loss: 37.0352
eveapyins diuly'cud Jupes thia tise Gn the bEmbh mouly. ro fn itere, anst-. omllreat the. Theld Sin r
Iter-2000 loss: 34.3061
eg ioty chan, wor he free in Naso"ok.  9ina. pat t"s h, Emituton Natt opel wis  ard Nhesoupery ssclfe
Iter-2010 loss: 32.8434
e6 worla, tho veumbed Gig thopeguman so 9hic afrt eocan, Titshis tha Ersturcen.appen, wareI il Gapiyg
Iter-2020 loss: 32.3857
e Soplobe ts ofOlyPlaxe thon Paland le 19ve 1mmoweol mios rofroen mpreomarldan Sty- aslaceanet2 ss of
Iter-2030 loss: 33.2760
e Jaran' Iyveseriond niry if uurereanctesfldsind in es lurar glliiert cond rudss folopand if the into
Iter-2040 loss: 40.6070
e Solrod desta calae tures inliosouriut in 1861847Japan'se and an iwhe Japand maresicin tur on ichan 
Iter-2050 loss: 35.6714
erlasistr is the tomines imed, wosulup-on A86ryn isrtucopol cuneanin, baccof minlag ft. carycertae an
Iter-2060 loss: 42.1363
e the ocxmalgooftmeh sh fh sr urto-. Wo lory. "h. 6fl9roit lhceceanaro te Sie tf otPlagesfhG B

Iter-2650 loss: 28.6133
e worrd in msepoof ixur Inieseano th fEusbce eoflien itg eceperurb io sn iss ko0本 ", pares war dinay 
Iter-2660 loss: 31.3776
erJ-panky paree i the Riston "ofe er regeamion. n8ry, whi obal Donnthe angoy tyrestat on t9elilila, K
Iter-2670 loss: 25.8980
e: ropsr fiola, inter andend is the nwmrtaenas the almeoke oniagin turevty ingtist isto yaly sith- an
Iter-2680 loss: 27.9598
e, whr h atginly ppacanes wirnd cures s sod fven od Ing bor d, hh comDd itig Win ortd f"Runce-taml1bn
Iter-2690 loss: 35.9091
ed che Word sn riea, The constury. Tis c:ugny rive Japan th whes sur trokad the Gfury. Aseandestor wa
Iter-2700 loss: 27.9956
e Sof ist  smargins a aed wistd wevelichin thity-lereot wof Hropantile woploun he Nthe Nlate  irthe f
Iter-2710 loss: 35.8363
e Sirgns cofboundiand D anrei olme in rur UChirin wom 1stcynpext roulhopan, wakeb, ghe fds af The Pow
Iter-2720 loss: 46.2626
e Japan'in andpcroutrd s8vy fO, pan tarins andeIon bed bofedin thevelWog the Esperess rrcorxUs