Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy). Adapted to micrograd by @bradklingensmith
BSD License

# Data and Hyperparameters

In [1]:
# data I/O
def remove_comments(data): # This makes the data easier to learn
    """Strip `#` comments and trailing whitespace from every line of `data`.

    Each line is cut at its first '#' (if any) and right-stripped; the
    resulting lines are re-joined with '\\n'. Lines that were pure comments
    become empty lines rather than being dropped.
    """
    cleaned = []
    for line in data.splitlines():
        code, _, _ = line.partition('#')  # text before the first '#', or the whole line
        cleaned.append(code.rstrip())
    return '\n'.join(cleaned)
# Load the training corpus; the context manager closes the file handle promptly.
with open('micrograd/engine.py', 'r', encoding='utf-8') as f: # should be simple plain text file
    data = remove_comments(f.read())
# Sort the vocabulary so the char <-> index mapping is identical on every run
# (iteration order of a set of strings varies with hash randomisation).
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print(f'data has {data_size} characters, {vocab_size} unique.')
char_to_ix = {ch: i for i, ch in enumerate(chars)}  # character -> vocabulary index
ix_to_char = dict(enumerate(chars))                 # vocabulary index -> character
data_char_ixs = [char_to_ix[ch] for ch in data]     # the whole corpus as indices

data has 2357 characters, 50 unique.


In [2]:
# hyperparameters
seq_length = 25   # number of time steps the RNN is unrolled for per training sequence
hidden_size = 16  # number of neurons in the hidden layer

# All the helpers!

In [3]:
import math
from micrograd.engine import Value
import micrograd.nn
import numpy as np
import random

def make_unary(f, df, name):
    """Build a differentiable unary op over `Value` from plain float functions.

    f    -- float -> float, the forward function
    df   -- float -> float, derivative of f evaluated at the *input*
    name -- op label passed to the resulting Value

    Returns a function taking one Value and returning a new Value whose
    backward step accumulates df(input) * upstream gradient into the input.
    """
    def unary(x):
        result = Value(f(x.data), (x,), name)

        def _propagate():
            # chain rule: d(out)/dx * d(loss)/d(out)
            x.grad += df(x.data) * result.grad

        result._backward = _propagate
        return result

    return unary

def _dtanh(x):
    """Derivative of tanh at x: 1 - tanh(x)^2."""
    return 1. - math.tanh(x)**2.

# Differentiable element-wise ops built on make_unary.
tanh = make_unary(math.tanh, _dtanh, 'tanh')
exp = make_unary(math.exp, math.exp, 'exp')  # exp is its own derivative

def softmax(xs):
    """Return the softmax of a list of Values as a list of Values (probabilities).

    The largest input (as a plain float) is subtracted from every input
    before exponentiation for numerical stability.
    """
    peak = max(x.data for x in xs)
    shifted = [x - peak for x in xs]
    exps = [exp(z) for z in shifted]
    total = sum(exps)
    return [e / total for e in exps]

def softmax_cross_entropy(xs, target_i):
    """Fused softmax + cross-entropy loss over a list of logit Values.

    xs       -- list of Values (unnormalised scores, one per class)
    target_i -- index of the correct class

    Returns a single Value holding -log(softmax(xs)[target_i]). Its backward
    step adds (p_i - [i == target_i]) * upstream gradient to every logit.
    """
    max_x = max(x.data for x in xs) # for numerical stability, subtract off the max
    exp_xs = [math.exp(x.data - max_x) for x in xs]
    sum_exp_xs = sum(exp_xs)
    ps = [exp_x / sum_exp_xs for exp_x in exp_xs] # probabilities
    # Log-sum-exp form of -log(ps[target_i]). sum_exp_xs >= 1 (the max logit
    # contributes exp(0)), so this cannot hit log(0) when ps[target_i] underflows.
    loss = math.log(sum_exp_xs) - (xs[target_i].data - max_x)

    out = Value(loss, xs, 'softmax cross entropy')
    def backward():
        for i, x in enumerate(xs):
            # Chain rule: scale the local gradient by the gradient flowing into
            # `out`; otherwise any scaling applied to the loss downstream
            # (e.g. averaging over a sequence) is silently ignored.
            x.grad += (ps[i] - (i == target_i)) * out.grad
    out._backward = backward
    return out

def one_hot(index, length):  # encode in 1-of-k representation
    """Return a list of floats that is 1.0 at position `index` and 0.0 elsewhere.

    For 0 <= index < length the result has exactly `length` entries.
    """
    leading = [0.] * index
    trailing = [0.] * (length - index - 1)
    return [*leading, 1., *trailing]

def lerp(a, b, fraction):
    """Linearly interpolate from `a` (fraction=0) to `b` (fraction=1)."""
    span = b - a
    return a + span * fraction

# The Recurrent Network

In [4]:
class RNNNeuron(micrograd.nn.Module):
    ''' Takes x (prev layer output) and h (previous hidden state) as
    input and computes dot(xs, wxs) + dot(hs, whs) + b, optionally
    followed by tanh.

    The weighted sum is computed as a single graph node with a hand-written
    backward step instead of one Value per multiply/add.'''

    def __init__(self, nin, nh, recurrent=True, nonlin=True):
        ''' nin: number of inputs from the previous layer
        nh: number of hidden-state inputs (ignored when recurrent is False)
        recurrent: if False, this will act like a plain neuron (no hidden weights)
        nonlin: apply tanh to the activation when True, otherwise stay linear '''
        self.wxs = [Value(random.gauss(0., 1.)) for _ in range(nin)]
        self.whs = [Value(random.gauss(0., 1.)) for _ in range(nh)] if recurrent else []
        self.b = Value(0.)
        self.nonlin = nonlin

    def __call__(self, xs, hs):
        ''' xs and hs are sequences whose items are either Values or plain
        numbers; plain numbers are treated as constants (no gradient).
        Returns a Value. '''
        def data(x):
            return x.data if isinstance(x, Value) else x

        # Pair each weight with its input once; zip stops at the shorter side,
        # so a non-recurrent neuron (no whs) never pairs with anything in hs.
        pairs = list(zip(self.wxs + self.whs, [*xs, *hs]))
        # Only inputs that actually take part in the sum become graph children,
        # so unused hidden-state Values are not attached to this node.
        children = self.parameters() + [x for _, x in pairs if isinstance(x, Value)]
        act = Value(0., children, 'RNNNeuron')
        act.data = sum((w.data * data(x) for w, x in pairs), self.b.data)

        def backward():
            for w, x in pairs:
                w.grad += data(x) * act.grad
                if isinstance(x, Value):
                    x.grad += w.data * act.grad
            self.b.grad += act.grad
        act._backward = backward

        return tanh(act) if self.nonlin else act

    def parameters(self):
        ''' All trainable Values: input weights, hidden weights, then the bias. '''
        return self.wxs + self.whs + [self.b]

    def __repr__(self):
        return f"{'Tanh' if self.nonlin else 'Linear'}RNNNeuron({len(self.wxs)}, {len(self.whs)})"

class RNNLayer(micrograd.nn.Module):
    """A layer of `nout` RNNNeurons that all see the same inputs and hidden state."""

    def __init__(self, nin, nout, **kwargs):
        """Create `nout` neurons with `nin` inputs and `nout` hidden inputs each.

        Extra keyword arguments are forwarded to every RNNNeuron.
        """
        self.neurons = [RNNNeuron(nin, nout, **kwargs) for _ in range(nout)]

    def init_hidden(self):
        """Return an all-zero hidden state: one float per neuron."""
        return [0. for _ in self.neurons]

    def __call__(self, xs, hs):
        """Apply every neuron to (xs, hs) and return the list of their outputs."""
        outputs = []
        for neuron in self.neurons:
            outputs.append(neuron(xs, hs))
        return outputs

    def parameters(self):
        """Return the parameters of all neurons as one flat list."""
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params

    def __repr__(self):
        described = ', '.join(map(str, self.neurons))
        return f"RNNLayer of [{described}]"


class RNN(micrograd.nn.Module):
    """ A stack of RNNLayers applied one after another.

    Every layer except the last is recurrent with a tanh non-linearity;
    the last layer is linear and not recurrent, i.e. it acts like a plain Layer."""

    def __init__(self, nin, nouts):
        """nin is the input width; nouts lists the output width of each layer."""
        sizes = [nin] + nouts
        last = len(nouts) - 1
        self.layers = [
            RNNLayer(n_in, n_out, recurrent=(i < last), nonlin=(i < last))
            for i, (n_in, n_out) in enumerate(zip(sizes, sizes[1:]))
        ]

    def __call__(self, x, prev_hiddens=None):
        """Run one time step.

        x is the input list; prev_hiddens is a list with one hidden-state list
        per layer (missing or falsy entries fall back to the layer's zero state).
        Returns (output of the last layer, list of every layer's output).
        """
        prev_hiddens = prev_hiddens or []
        hiddens = []
        for i, layer in enumerate(self.layers):
            if i < len(prev_hiddens):
                h_prev = prev_hiddens[i]
            else:
                h_prev = layer.init_hidden()
            x = layer(x, h_prev)
            hiddens.append(x)
        return x, hiddens

    def parameters(self):
        """Return the parameters of all layers as one flat list."""
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params

    def __repr__(self):
        described = ', '.join(map(str, self.layers))
        return f"RNN of [{described}]"
    
# Build the model: one-hot characters in (vocab_size inputs), one recurrent
# tanh layer of hidden_size neurons, then a linear, non-recurrent output
# layer producing one score per vocabulary character.
rnn = RNN(vocab_size, [hidden_size, vocab_size])
rnn  # bare expression: shows the model's repr as the notebook cell output

RNN of [RNNLayer of [TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16), TanhRNNNeuron(50, 16)], RNNLayer of [LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0), LinearRNNNeuron(16, 0

# Sampling (Inference)

In [5]:
def sample_ixs(hs, ix, n, temperature=0.5):
    """
    sample a sequence of character indices from the model
    hs is memory state (per-layer lists as accepted by rnn, or None),
    ix is seed character index for first time step, n is how many indices to draw.
    temperature controls sharpening: each index is drawn with weight
    p ** (1 / temperature), where p is the softmax probability. The default
    of 0.5 gives p ** 2, i.e. sampling favours likely characters more than
    the raw distribution does; 1.0 samples from the softmax itself.
    returns the list of n sampled indices (the seed is not included)
    """
    if temperature <= 0:
        raise ValueError(f'temperature must be positive, got {temperature}')
    exponent = 1. / temperature
    ixes = []
    for _ in range(n):
        ys, hs = rnn(one_hot(ix, vocab_size), hs)
        # No gradients are needed while sampling: keep only the numbers so each
        # step does not hold references to the graph of all previous steps.
        hs = [[h.data for h in layer_hs] for layer_hs in hs]
        weights = [p.data ** exponent for p in softmax(ys)]
        ix = random.choices(range(vocab_size), weights=weights)[0]
        ixes.append(ix)
    return ixes

def sample_text(hs, seed_char, n):
    """Sample n characters from the model and return them as a string.

    hs is the starting memory state (or None) and seed_char is the character
    fed in at the first step; the seed itself is not part of the result.
    """
    seed_ix = char_to_ix[seed_char]
    sampled = sample_ixs(hs, seed_ix, n)
    return ''.join(map(ix_to_char.__getitem__, sampled))

# Training

## Optimizer

In [6]:
class AdamW:
    """Adam-style optimizer with weight decay over a list of parameters.

    Each parameter must expose `.data` and `.grad`. The optimizer keeps a
    running average of the gradient (m) and of the squared gradient (v) per
    parameter, plus a step counter t used for bias correction.
    """

    def __init__(self, params):
        count = len(params)
        self.params = params
        self.m = [0.] * count  # running mean of gradients
        self.v = [0.] * count  # running mean of squared gradients
        self.t = 0             # number of updates applied so far

    def update_params(self, lr=0.005, beta1=0.9, beta2=0.999, weight_decay=1e-2):
        """Apply one update step to every parameter using its current `.grad`."""
        self.t += 1
        # bias-correction denominators are the same for every parameter
        m_correction = 1. - beta1**self.t
        v_correction = 1. - beta2**self.t
        for i, param in enumerate(self.params):
            grad = param.grad
            self.m[i] = lerp(grad, self.m[i], beta1) # momentum
            self.v[i] = lerp(grad**2., self.v[i], beta2) # grad magnitudes

            m_hat = self.m[i] / m_correction
            v_hat = self.v[i] / v_correction
            step = m_hat / math.sqrt(v_hat + 1e-8) + weight_decay * param.data
            param.data -= lr * step

## Loss Function

In [7]:
def calc_loss(inputs, target_is, hs=None):
    """
    inputs, target_is are both list of character indices (same length;
    target_is[t] is the character that should follow inputs[t]).
    hs is the initial hidden state: a list with one list of floats per
    recurrent layer, as returned by a previous call, or None to start from zeros.
    returns the loss (a Value, averaged over the sequence) and the last hidden
    state as plain floats.
    raises ValueError if there is nothing to process.
    """
    if len(inputs) == 0 or len(target_is) == 0:
        raise ValueError('calc_loss needs at least one (input, target) pair')
    loss = 0.
    # forward pass
    for input_t, target_i in zip(inputs, target_is):
        xs = one_hot(input_t, vocab_size)
        ys, hs = rnn(xs, hs)
        loss += softmax_cross_entropy(ys, target_i)
    loss /= len(inputs) # avg over sequence so loss/gradients are sequence-length-independent
    # Make a plain data copy to break the _backward chain. Keep the state of
    # every layer except the last one (the output layer is not recurrent), so
    # networks with more than one hidden layer carry all of their state forward.
    hs = [[h.data for h in layer_hs] for layer_hs in hs[:-1]]
    return loss, hs

## Training Loop

In [8]:
import sys        

def train(target_loss=0.5, max_iters=None, report_every=10, sample_every=1000):
    """
    Train the global rnn on data_char_ixs with truncated backprop through time.

    target_loss  -- stop once the smoothed loss is at or below this value
    max_iters    -- optional cap on the number of iterations (None = no cap)
    report_every -- print the smoothed loss every this many iterations
    sample_every -- print a 200-character sample every this many iterations

    Prints one sample before training starts. Called with no arguments it
    runs until the smoothed loss reaches 0.5.
    """
    n, p, hs = 0, 0, None
    params = rnn.parameters()  # built once; the same Value objects are updated in place
    adam = AdamW(params)

    print(f'\n----\n {sample_text(None, data[0], 200)} \n----')

    smooth_loss = -math.log(1.0/vocab_size) # loss at iteration 0
    # ~40-200k iterations depending on how intently you watch it
    while smooth_loss > target_loss and (max_iters is None or n < max_iters):
        # prepare inputs (we're sweeping from left to right in steps seq_length long)
        if p+seq_length+1 >= len(data) or n == 0:
            # restart near the beginning at a random offset and reset the memory
            p, hs = random.randint(0, seq_length-1), None
        inputs, targets = data_char_ixs[p:p+seq_length], data_char_ixs[p+1:p+seq_length+1]

        # forward seq_length characters through the net and fetch gradient
        rnn.zero_grad()
        loss, hs = calc_loss(inputs, targets, hs)
        loss.backward()
        for param in params:
            param.grad = max(-5., min(5., param.grad)) # clip to mitigate exploding gradients
        smooth_loss = lerp(smooth_loss, loss.data, 0.001)
        adam.update_params()

        if (n+1) % report_every == 0:
            sys.stdout.write(f'\riter {n+1}, loss: {smooth_loss}      ') # print progress
            sys.stdout.flush()

        # sample from the model now and then
        if (n+1) % sample_every == 0:
            # hs is the state *after* consuming `inputs`, so the character to
            # feed next is the one that follows them, i.e. the last target.
            print(f'\n----\n {sample_text(hs, ix_to_char[targets[-1]], 200)} \n----')

        p += seq_length # move data pointer
        n += 1 # iteration counter
# Run the training loop with its defaults; this blocks until the smoothed loss is at most 0.5.
train()


----
 +luo0k<e]
p{{]{]1
 gg(<*(u_*]y(
tom.>L_*].(ytu>..ue(fn(fn(fn(fn(f/(
hud*.:o0k<0>y(]
pVViN*.:*.do0k<e]
"{"p{]1h1><e]
p{{p1nh "]Uyec]0>yh1<e.y*.:*.do.mo*f<.vu*11<e>=>yn*1<*(Loe"mNt*>L_p1<e>=]moe]<+,u*(y 
----
iter 1000, loss: 3.4395131818633593      
----
 he                                                                                                                                                                                                       
----
iter 2000, loss: 2.623248002667137       
----
 ,                                                                                                                            
                                                                           
----
iter 3000, loss: 2.1513223763664278      
----
 = self._e__t)
                                                                   def __p(iederther.d warself.__a_ethers_(self.:
                                                                         
----
iter 4000, loss: 1.84

iter 33000, loss: 0.5910133543378255      
----
 ifl_(self, other):
            f   self.grad += (self.data * other.grad += (other = other if  out._backward():
        return self.data * other.grad += (other.grad v)
    """        self.data * out

  
----
iter 34000, loss: 0.5892072288067064      
----
 ckward()
        other + out = Value(self.data, (self.data * other * self._backward = _backward = _backward

        out._backward

        self.grad += other.grad += other * self.data < 0

        ou 
----
iter 35000, loss: 0.5916507806410812      
----
 er if  other int ata * other.data, (self, other):
        out._backward

        self.data * other if  out._backward = _backward = _backward

        return out.grad
        other + (-out.grad = = a s 
----
iter 36000, loss: 0.5827917270633604      
----
 ladd(self, other):
        return self * other.grad
        self.grad += other + other + (-other):
    def __rtrelf.data + other.datadd__(v)
            self.grad += other + other):
 

In [9]:
# Training is done: draw a long sample starting from a fresh (zero) memory
# state, seeded with the corpus's first character, which is also printed in front.
seed_char = data[0]
long_sample = seed_char + sample_text(None, seed_char, 2000)
print(f'\n----\n {long_sample} \n----')


----
 

        def _backward():
            self.grad += (opopow"
            out = Value(self.data, (self, other):
        return self.data, (self, other):
        def _backward():
            out(r):

            out = Value(self.data * out.grad
        out._backward = _backward = _backward():
            bui d nchildrat ='):
        out._backward

        other.grad
        out = Value(0 instar* other + other.data, (self, other):
        out = Value(0 nckward

        self._backward = _backward = _backward

        other.grad
        out._backward = _backward

        return out._backward = _backward

            out

        return out

            out = Value(self.data, _topo(out

    def __ren):
            out = Value(self.data * out.grad
        out._backward = _backward = _backward = _pself):
        out._backward = _backward = _backward

        return other.data * out.grad
            out = Value(self.data * out.grad += out.grad
        self.data * out.grad
        out._ba