In [1]:
import numpy as np
import os
import gzip
import random
import time
import signal

import scipy.ndimage as nd
import PIL.Image
from IPython.display import clear_output, Image, display
from io import BytesIO

In [2]:
data = ""
ddir = "../data/books/witcher_rus/"
for fn in sorted(os.listdir(ddir)):
    if fn.endswith(".gz"):
        with gzip.open(ddir + fn, "rb") as f:
            data += f.read().decode("utf-8").replace("\r", "")
charset = sorted(set(data))
chidx = {ch: i for i, ch in enumerate(charset)}
chvec = {ch: np.array([chidx[ch] == i for i in range(len(charset))], dtype=np.float32) for ch in charset}
print(charset)
print("%s chars, %s unique" % (len(data), len(charset)))

['\n', ' ', '!', '"', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', 'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', 'Ш', 'Щ', 'Ы', 'Ь', 'Э', 'Ю', 'Я', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'ё']
5088376 chars, 140 unique


In [520]:
def tanh_deriv(x):
        return 1/np.cosh(x)**2

def softmax(y):
    #y = y - np.max(y)
    ey = np.exp(y)
    return ey/np.sum(ey)

def cross_entropy_loss(y, ai):
    return -np.log(y[ai])

In [550]:
x = np.array([0.1, 0.9, 0.1, 0.1])
print(softmax(x))
np.random.choice(['a', 'b', 'c', 'd'], p=softmax(x))

[0.19136775 0.42589676 0.19136775 0.19136775]


'a'

In [565]:
class Cell:
    def __init__(self, sx, sh, sy, mag=1e-1):
        self.sizes = (sx, sh, sy)
        layer_sizes = [(sx, sh), (sh, sh), (sh, sy)] # W_xh, W_hh, W_hy
        self.params = [ mag*np.random.randn(*s) for s in layer_sizes]
    
    def step(self, h, x, mem=False):
        W_xh, W_hh, W_hy = self.params
        v = np.dot(x, W_xh) + np.dot(h, W_hh)
        a = np.tanh(v)
        y = np.dot(a, W_hy)
        if mem:
            m = [x, h, v, a, y]
        else:
            m = None
        return a, softmax(y), m
    
    def newgrad(self):
        return [np.zeros_like(v) for v in self.params]
    
    def newstate(self):
        return np.zeros(self.sizes[1], dtype=np.float64)
    
    def backprop(self, grad, eh, m, ey):
        W_xh, W_hh, W_hy = self.params
        dW_xh, dW_hh, dW_hy = grad
        x, h, v, a, y = m
        
        dW_hy += np.outer(a, ey)
        ea = np.dot(W_hy, ey) + eh
        ev = ea*tanh_deriv(v)
        dW_xh += np.outer(x, ev)
        dW_hh += np.outer(h, ev)
        eh = np.dot(W_hh, ev)
        
        return eh
    
    def learn(self, grad, learning_rate, adagrad=None):
        if adagrad is None:
            for W, dW in zip(self.params, grad):
                W -= learning_rate*dW
        else:
            for W, dW, aW in zip(self.params, grad, adagrad):
                aW += dW**2
                W -= learning_rate*dW/np.sqrt(aW + 1e-8)
        # rmsprop: aW += rho*aW + (1 - rho)*grad**2

In [566]:
sx, sh, sy = 20, 10, 5
cell = Cell(sx, sh, sy)
x = np.random.randn(sx)
h = cell.newstate()
nsteps = 10

mem = []
for i in range(nsteps):
    h, y, m = cell.step(h, x, mem=True)
    mem.append(m)

print(x)
print(h)
print(y)

[ 0.26320309  0.91429123  0.31532077  0.29169201 -0.97312506 -0.86623378
 -0.73840531  0.30238288 -0.25954162 -0.23406041  1.56373461  1.87434848
  1.97406212 -1.82394484  1.84878496 -0.75463792 -1.27107995 -2.1896793
  0.31094416  1.50679702]
[-0.11202765 -0.54942735  0.15565051  0.65898392 -0.3227755   0.07139345
 -0.38709182 -0.15130839  0.48390714  0.45629931]
[0.22764417 0.21168232 0.1824541  0.17352255 0.20469685]


In [567]:
nlearn = 1

for i in range(nlearn): 
    grad = cell.newgrad()
    eh = cell.newstate()
    for m in reversed(mem):
        #print(m)
        eh = cell.backprop(grad, eh, m, y - [1, 0, 0, 0, 0])
    #[np.clip(dW, -5, 5, out=dW) for dW in grad]
    cell.learn(grad, 1e-3)
    print(eh)
    #print(grad)

[ 0.00835738 -0.01145026  0.00694131 -0.00048273  0.00360324 -0.03403861
  0.03760375  0.0349259  -0.01131075 -0.02413754]


In [574]:
net = Cell(len(charset), 100, len(charset))
grad = net.newgrad()

In [575]:
seqlen = 100
rate = 1e-1

loss = 0.0
mem = []
iepoch = 0
pos = 0
h = net.newstate()
adagrad = net.newstate()

In [576]:
done = False
def signal_handler(signal, frame):
    global done
    done = True
signal.signal(signal.SIGINT, signal_handler)

while not done:
    h, y, m = net.step(h, chvec[data[pos]], mem=True)
    mem.append((m, y, data[pos + 1]))
    loss = 0.99*loss + 0.01*cross_entropy_loss(y, chidx[data[pos + 1]])
    
    if (pos + 1) % seqlen == 0:
        [dW.fill(0) for dW in grad]
        eh = net.newstate()
        for m, y, ch in reversed(mem):
            eh = net.backprop(grad, eh, m, y - chvec[ch])
        [np.clip(dW, -5, 5, out=dW) for dW in grad] # mitigate exploding gradient
        net.learn(grad, rate/seqlen, adagrad=adagrad)
        mem = []
    
    if pos % 10000 == 0:
        clear_output()
        print("%s - %s/%s: %s" % (iepoch, pos, len(data), loss))
        
        ch = data[pos]
        _h = h
        for i in range(100):
            _h, y, _ = net.step(_h, chvec[ch])
            ch = np.random.choice(charset, p=y)
            print(ch, end='')
        print()
        
    pos += 1
    if pos >= len(data) - 1:
        pos = 0
        iepoch += 1
        
print("done")

0 - 1130000/5088376: 2.257617982927423
одируж. iреим трак тра жно булит, мот за породе же дложно. I8оне ем й евцель ва ботыы та обтантом ст
done
