In [77]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [78]:
PATH='data/nietzsche/'

In [79]:
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
text = open(f'{PATH}nietzsche.txt').read()
print('corpus length:', len(text))

corpus length: 600893


In [None]:
text[:400]

In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars) + 1
print('total chars:', vocab_size)

total chars: 85


In [6]:
chars.insert(0, '\0')

In [7]:
''.join(chars[1:-5])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

In [8]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
indices_char[0]

'\x00'

In [9]:
idx = [char_indices[c] for c in text]

In [10]:
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [11]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

In [12]:
cs=3
c1_dat = [idx[i]    for i in range(0, len(idx) - 1 - cs, cs)]
c2_dat = [idx[i+1]    for i in range(0, len(idx) - 1 - cs, cs)]
c3_dat = [idx[i+2]    for i in range(0, len(idx) - 1 - cs, cs)]
c4_dat = [idx[i+3]    for i in range(0, len(idx) - 1 - cs, cs)]


In [13]:
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])

In [14]:
y = np.stack(c4_dat[:-2])

In [15]:
y[:5]

array([30, 29,  1, 40, 43])

In [16]:
x1.shape, y.shape

((200295,), (200295,))

In [17]:
n_hidden = 256 # number of hidden activations
n_fac = 42     # embedding matrix size

In [86]:
class Char3Model(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        
        self.l_in = nn.Linear(n_fac, n_hidden)
        
        # square weight matrix
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self,  c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))
        
        return F.log_softmax(self.l_out(h))
    

In [22]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1, x2, x3], axis=1), y, bs=512)

In [88]:
m = Char3Model(vocab_size, n_fac).cuda()

In [89]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [90]:
xs[0]


 67
  9
 74
  1
 67
 54
 78
  2
 74
  2
 28
 54
 69
  2
  1
 66
  2
 67
  2
 67
  8
 68
 71
  2
 58
  2
  2
 65
 72
 71
 73
 62
 58
 10
 72
 68
 60
  2
 65
 54
  2
 78
 73
 67
 63
 58
 72
 72
 71
  1
  2
 74
 58
  8
 54
 76
 58
 62
 65
 73
 67
 31
 54
  2
 62
 67
 73
 57
 71
  9
 58
 61
 56
 73
 68
 76
 72
 71
 58
 71
 76
 72
 71
 68
  2
 46
 72
 62
 65
 68
 58
 78
 69
 67
 61
 58
 74
  2
 61
 54
 67
 72
 62
 58
 74
 10
 10
 58
 62
 58
  2
 72
 56
 60
 73
 61
 10
 61
 67
  2
 68
 73
 71
 44
 58
  2
 78
 62
  2
 64
 73
 60
 56
 71
  2
 59
 44
  9
 78
 61
  2
  2
  2
  2
 55
 62
 58
 54
 73
  2
 54
 58
  1
  8
 68
  2
 62
  2
 78
 72
 56
 68
 78
 57
 68
 66
  2
 58
 54
 69
 77
 10
 58
  8
 62
 73
 61
 67
 68
 58
 71
 58
  2
 56
 72
 73
  1
 57
  2
 56
 73
 59
 58
 74
  2
 55
 68
  2
 67
 73
 68
 54
 55
 29
  2
 73
 58
 57
 62
 45
 68
 68
 56
 59
 73
 67
 65
 72
 62
 72
 67
 54
 69
 58
 58
 66
 58
  1
  2
 59
 78
 62
 59
 69
 69
 57
 62
 58
  2
 72
 68
 54
  8
 59
 56
 72
 59
  2
 67
 67

In [63]:
t

Variable containing:
-4.3346 -4.5751 -4.5236  ...  -4.6200 -4.6187 -4.1334
-4.3079 -4.5833 -4.4472  ...  -4.5442 -4.4554 -4.2949
-4.4424 -4.5095 -4.3915  ...  -4.5107 -4.3939 -4.1684
          ...             ⋱             ...          
-4.3834 -4.7395 -4.4694  ...  -4.6188 -4.4127 -4.3297
-4.2271 -4.2028 -4.5030  ...  -4.5055 -4.2388 -4.3235
-4.3346 -4.5751 -4.5236  ...  -4.6200 -4.6187 -4.1334
[torch.cuda.FloatTensor of size 512x85 (GPU 0)]

In [91]:
opt = optim.Adam(m.parameters(), 1e-2)

In [92]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                               
    0      2.098631   5.497483  



[array([ 5.49748])]

In [93]:
set_lrs(opt, 0.001)
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                               
    0      1.854464   4.958208  



[array([ 4.95821])]

In [94]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

get_next(' th')

'e'

In [20]:
cs=8

In [21]:
c_in_dat = [[idx[i + j] for i in range(cs)] for j in range(len(idx)-cs-1)]

In [30]:
c_out_dat = [idx[j+cs] for j in range(len(idx)-cs-1)]

In [27]:
xs = np.stack(c_in_dat, axis=0)

In [29]:
xs.shape

(600884, 8)

In [31]:
y = np.stack(c_out_dat)

In [32]:
y.shape

(600884,)

In [34]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

In [35]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

In [61]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [62]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

IndexError: index 480312 is out of bounds for axis 1 with size 3

In [38]:
class CharLoopModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_hidden(h+inp))
            
        return F.log_softmax(self.l_out(h))
    

In [39]:
m = CharLoopModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [40]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                               
    0      2.022062   2.009667  



[array([ 2.00967])]

In [41]:
set_lrs(opt, 0.001)

In [42]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                               
    0      1.732963   1.733791  



[array([ 1.73379])]

In [47]:
class CharLoopConcatModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = torch.cat((h, self.e(c)), 1)
            inp = F.relu(self.l_in(inp))
            h = F.tanh(self.l_hidden(inp))
            
        return F.log_softmax(self.l_out(h))

In [48]:
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [49]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [50]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                               
    0      2.116697   5.206744  



[array([ 5.20674])]

In [51]:
set_lrs(opt, 1e-4)

In [52]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                               
    0      1.995689   5.007091  



[array([ 5.00709])]

In [53]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [57]:
get_next('for thos')

'e'

In [58]:
get_next('ello my ')

't'

In [59]:
get_next('my name ')

't'

# Using nn.RNN

In [63]:
class CharRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp, h = self.rnn(inp, h)
        
        return F.log_softmax(self.l_out(outp[-1]))

In [64]:
m = CharRnn(vocab_size, n_fac).cuda()

In [65]:
opt = optim.Adam(m.parameters(), 1e-3)

In [66]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [67]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([3, 512, 42])

In [68]:
ht = V(torch.zeros(1, 512, n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([3, 512, 256]), torch.Size([1, 512, 256]))

In [69]:
t = m(*V(xs));
t.size()

torch.Size([512, 85])

In [70]:
fit(m, md, 4, opt, F.nll_loss)

epoch      trn_loss   val_loss                               
    0      2.190345   3.843704  
    1      1.968257   4.464006                              
    2      1.86239    5.010817                               
    3      1.796672   5.027993                               



[array([ 5.02799])]

In [71]:
set_lrs(opt, 1e-4)
fit(m, md, 2, opt, F.nll_loss)

epoch      trn_loss   val_loss                               
    0      1.753373   5.177734  
    1      1.745874   5.28944                                



[array([ 5.28944])]

In [72]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [73]:
get_next('for thos')

'e'

In [75]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [76]:
get_next_n('for thos', 40)

'for thosenter and thereneran an an an an an an a'