In [56]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *
from fastai.lm_rnn import repackage_var

# Prelim

In [2]:
!mkdir 'data/nietzsche'
PATH = Path('data/nietzsche')

In [3]:
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
text = open(f'{PATH}nietzsche.txt').read()
print('corpus length:', len(text))

nietzsche.txt: 606kB [00:03, 164kB/s]                             

corpus length: 600893





In [4]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [5]:
chars = sorted(set(text))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 85


In [8]:
chars.insert(0, '_pad_')
''.join(chars)

'_pad_\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzÆäæéë'

In [9]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

In [10]:
idx = [char_indices[c] for c in text]
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

# Data

In [12]:
cs = 10
c_in_data  = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]
c_out_data = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

xs = np.stack(c_in_data)
ys = np.stack(c_out_data)
print(xs.shape, ys.shape)

xs[:cs,:cs], ys[:cs,:cs]

(60089, 10) (60089, 10)


(array([[40, 42, 29, 30, 25, 27, 29,  1,  1,  1],
        [43, 45, 40, 40, 39, 43, 33, 38, 31,  2],
        [73, 61, 54, 73,  2, 44, 71, 74, 73, 61],
        [ 2, 62, 72,  2, 54,  2, 76, 68, 66, 54],
        [67,  9,  9, 76, 61, 54, 73,  2, 73, 61],
        [58, 67, 24,  2, 33, 72,  2, 73, 61, 58],
        [71, 58,  2, 67, 68, 73,  2, 60, 71, 68],
        [74, 67, 57,  1, 59, 68, 71,  2, 72, 74],
        [72, 69, 58, 56, 73, 62, 67, 60,  2, 73],
        [61, 54, 73,  2, 54, 65, 65,  2, 69, 61]]),
 array([[42, 29, 30, 25, 27, 29,  1,  1,  1, 43],
        [45, 40, 40, 39, 43, 33, 38, 31,  2, 73],
        [61, 54, 73,  2, 44, 71, 74, 73, 61,  2],
        [62, 72,  2, 54,  2, 76, 68, 66, 54, 67],
        [ 9,  9, 76, 61, 54, 73,  2, 73, 61, 58],
        [67, 24,  2, 33, 72,  2, 73, 61, 58, 71],
        [58,  2, 67, 68, 73,  2, 60, 71, 68, 74],
        [67, 57,  1, 59, 68, 71,  2, 72, 74, 72],
        [69, 58, 56, 73, 62, 67, 60,  2, 73, 61],
        [54, 73,  2, 54, 65, 65,  2, 69, 61, 62]

In [14]:
val_idx = get_cv_idxs(len(xs)-cs-1)

In [15]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [23]:
*xs,yt = next(iter(md.trn_dl))
# xs => 10 x [512]
# yt = tensor(512,10)

In [None]:
t = m(*V(xs))

# RNN

In [74]:
class CustomRnn(nn.Module):
    def __init__(self, vocab_size, n_hidden):
        super().__init__()
        self.n_hidden = n_hidden
        
        self.embed = nn.Embedding(vocab_size, n_hidden)
        self.l_in  = nn.Linear(n_hidden+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_weights()
        self.h = torch.tensor([])    # need this to maintain state past 10 chars
        
    def init_weights(self):
        # use this to avoid gradient explosion (matrix multiplier increasing/decreasing activations exponentially)
        self.l_hidden.weight.data.copy_(torch.eye(self.n_hidden))
        self.l_hidden.bias.data.fill_(0)
        
    def forward(self, *cs):
        bs = cs[0].size(0)  #512
        if self.h.size(0) != bs: self.h = V(torch.zeros(bs, self.n_hidden))
        res = []
        for c in cs:
            inp = torch.cat((self.h, self.embed(c)), 1)
            inp = torch.relu(self.l_in(inp))
            self.h = torch.tanh(self.l_hidden(inp))   # hidden state
            out = torch.log_softmax(self.l_out(self.h), dim=-1)
            res.append(out)
            
        self.h = repackage_var(self.h)   # bptt
        return torch.stack(res)

In [75]:
m = CustomRnn(vocab_size, 256)
opt = optim.Adam(m.parameters(), 1e-3)

In [76]:
# original implementation expects 2 minibatches of vectors (2 rank 2 tensors)
# we have a rank 3 tensor: we have 8 time steps of 84 probabilities for 512 items in the minibatch
def nll_loss_seq(inp, targ):
    # input: (10,512,85) tensor
    sl,bs,nc = inp.size()
    # target: (512,10) -> need to transpose
    targ = targ.permute(1,0).contiguous().view(-1)
    return F.nll_loss(inp.view(-1,nc), targ, reduction='sum')/bs

In [77]:
fit(m, md, 1, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                             
    0      24.523248  22.657841 


[22.657840973865575]

In [78]:
fit(m, md, 3, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                             
    0      21.508363  20.948729 
    1      20.600999  20.149365                            
    2      19.912185  19.613193                            


[19.613192843795566]

## Test

In [105]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))[None]
    p = m(*VV(idxs))
    outs = np.argmax(to_np(p), axis=-1)[0]
    return ''.join([chars[i] for i in outs])

In [108]:
get_next('aimlessl')

'tneyr  y'

In [79]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [80]:
get_next_n('for thos', 40)

NameError: name 'get_next' is not defined