# Character-based language model from scratch

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.io import *
from fastai.conv_learner import *
from fastai.column_data import *

## Dataset - Nietzsche

In [3]:
PATH = 'data/nietzsche/'

In [4]:
text = open(f'{PATH}nietzsche.txt', encoding="utf-8").read()

In [5]:
len(text)

600893

In [6]:
text[:300]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, ha'

In [7]:
chars = sorted(list(set(text)))

In [8]:
chars.insert(0, "\0")  # zeror value, e.g. for padding
voc_size = len(chars)
print('total number of chars: ', voc_size)

total number of chars:  85


In [9]:
''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxy'

**Mapping from char to id and vice versa:**

In [10]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

**Convert dataset to indices**

In [11]:
idx = [char_indices[c] for c in text]

In [12]:
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

**Quick test to make sure this is working as expected:**

In [13]:
''.join(indices_char[i] for i in idx[:50])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what th'

## Predicting the 4th char from the first 3

### Creating the inputs and output

In [14]:
cs = 3
# Inputs:
c1_dat = [idx[i] for i in range(0, len(idx) - 1 - cs, cs)]  # datasets: every 3 chars, shifted by one
c2_dat = [idx[i+1] for i in range(0, len(idx) - 1 - cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx) - 1 - cs, cs)]
# Output:
c4_dat = [idx[i+3] for i in range(0, len(idx) - 1 - cs, cs)]

In [15]:
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [16]:
c1_dat[:3], c2_dat[:3], c3_dat[:3], c4_dat[:3]

([40, 30, 29], [42, 25, 1], [29, 27, 1], [30, 29, 1])

Inputs:

In [17]:
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])

In [18]:
x1, x2, x3

(array([40, 30, 29, ..., 67, 68, 72]),
 array([42, 25,  1, ..., 72, 59, 62]),
 array([29, 27,  1, ...,  2,  2, 67]))

Output:

In [19]:
y = np.stack(c4_dat[:-2])

In [20]:
y

array([30, 29,  1, ..., 68, 72, 59])

**We will try to predicht 30 from 40, 42, and 29 etc.**

### Create and train the model

In [21]:
n_hidden = 256

In [22]:
n_emb = 42

In [23]:
class Char3Model(nn.Module):
    def __init__(self, voc_size, n_emb):
        super().__init__()
        self.e = nn.Embedding(voc_size, n_emb)
        
        # input to hidden
        self.l_in = nn.Linear(n_emb, n_hidden)

        # hidden to hidden
        self.l_hid = nn.Linear(n_hidden, n_hidden)

        # hidden to output
        self.l_out = nn.Linear(n_hidden, voc_size)
        
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = V(torch.zeros(in1.size())).cuda()
        h = torch.tanh(self.l_hid(h + in1))
        h = torch.tanh(self.l_hid(h + in2))
        h = torch.tanh(self.l_hid(h + in3))
        
        return F.log_softmax(self.l_out(h))

In [24]:
modeldata = ColumnarModelData.from_arrays('.', [-1], np.stack([x1, x2, x3], axis=1), y, bs=512)

In [25]:
model = Char3Model(voc_size, n_emb).cuda()  # standard pytorch model, not fastai

In [26]:
# Let's look at the output from the model
it = iter(modeldata.trn_dl)
*xs, yt = next(it)
print(len(xs))
t = model(*V(xs))
print(t.shape)  # 85 probs for chars, bs = 512

3
torch.Size([512, 85])


In [27]:
opt = optim.Adam(model.parameters(), 1e-2)

In [28]:
fit(model, modeldata, 1, opt, F.nll_loss)  # negative log likelihood loss

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      2.11806    4.820609  



[4.820609092712402]

In [29]:
set_lrs(opt, 0.001)

In [30]:
fit(model, modeldata, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.866076   5.203579  



[5.203579425811768]

### Test the model

In [31]:
def get_next_char(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = model(*V(idxs))  # log probability
    predicted_char = np.argmax(p.data.cpu().numpy())
    return indices_char[predicted_char]

In [32]:
get_next_char('y. ')

'T'

In [33]:
get_next_char(' th')

'e'

In [34]:
get_next_char('ppl')

'e'

In [35]:
get_next_char('wom')

'a'

In [36]:
get_next_char('oma')

'n'

**Seems to work quite well!**

## Creating a first RNN

In [37]:
cs = 8  # size of the RNN

In [38]:
idx[:20]  # The first 20 chars converted to indices

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40, 40, 39, 43, 33, 38, 31, 2]

In [39]:
char_inp_data = [[idx[i + j] for i in range(cs)] for j in range(len(idx) - cs)]

In [40]:
char_inp_data[:4]  # Sequences of cs = 8 chars, shifted by 1 (0 to 7, 1 to 8, 2 to 9)

[[40, 42, 29, 30, 25, 27, 29, 1],
 [42, 29, 30, 25, 27, 29, 1, 1],
 [29, 30, 25, 27, 29, 1, 1, 1],
 [30, 25, 27, 29, 1, 1, 1, 43]]

In [41]:
char_outp_data = [idx[j + cs] for j in range(len(idx) - cs)]

In [42]:
char_outp_data[:4]

[1, 1, 43, 45]

In [43]:
xs = np.stack(char_inp_data, axis=0)

In [44]:
xs.shape

(600885, 8)

In [45]:
xs[:4]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43]])

In [46]:
y = np.stack(char_outp_data)

In [47]:
y[:4]

array([ 1,  1, 43, 45])

### Create and train the model

In [48]:
val_idxs = get_cv_idxs(len(idx) - cs - 1)

In [49]:
modeldata = ColumnarModelData.from_arrays('.', val_idxs, xs, y, bs=512)

In [50]:
class CharLoopModel(nn.Module):
    # RNN
    def __init__(self, voc_size, n_emb, n_hidden):
        super().__init__()
        self.e = nn.Embedding(voc_size, n_emb)
        self.l_in = nn.Linear(n_emb, n_hidden)
        self.l_hid =  nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, voc_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = torch.tanh(self.l_hid(inp + h))  # tanh is good here becaus it limits the hidden state to -1, 1
            
        return F.log_softmax(self.l_out(h), dim=-1)

In [51]:
model = CharLoopModel(voc_size, n_emb, n_hidden).cuda()

In [52]:
opt = optim.Adam(model.parameters(), 1e-2)

In [53]:
fit(model, modeldata, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.984019   1.985254  



[1.9852543738214967]

In [54]:
opt = optim.Adam(model.parameters(), 1e-3)

In [55]:
fit(model, modeldata, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.683578   1.683077  



[1.6830767515162435]

## Concat the input and hidden state together instead of adding them
Concatenating is better than adding since input and hidden state are qualitatively different and you loose information by adding them.

In [56]:
class CharLoopConcatModel(nn.Module):
    def __init__(self, voc_size, n_emb, n_hidden):
        super().__init__()
        self.e = nn.Embedding(voc_size, n_emb)
        self.l_in = nn.Linear(n_emb + n_hidden, n_hidden)
        self.l_hid = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, voc_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = torch.cat((h, self.e(c)), dim=1)
            inp = F.relu(self.l_in(inp))
            h = torch.tanh(self.l_hid(inp))
            
        return F.log_softmax(self.l_out(h), dim=-1)

In [57]:
model = CharLoopConcatModel(voc_size, n_emb, n_hidden).cuda()

In [58]:
opt = optim.Adam(model.parameters(), 1e-3)

In [59]:
it = iter(modeldata.trn_dl)
*xs,yt = next(it)
t = model(*V(xs))

In [60]:
xs[0].size(0)

512

In [61]:
fit(model, modeldata, 2, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.867465   1.834486  
    1      1.656378   1.647034                              



[1.6470337330842681]

In [62]:
opt = optim.Adam(model.parameters(), 1e-4)

In [63]:
fit(model, modeldata, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.565058   1.584099  



[1.5840994116311193]

### Test the model

In [64]:
def get_next_char(inp):
    idxs = T(np.array([char_indices[c] for c in inp])).unsqueeze(1)
    p = model(*VV(idxs))  # log probability
    predicted_char = np.argmax(p.data.cpu().numpy())
    return indices_char[predicted_char]

In [65]:
get_next_char('for thos')

'e'

In [66]:
get_next_char('part of ')

't'

In [67]:
get_next_char(' and wom')

'a'

## Now the same in pytorch
### pytorch creates the input layers and the loop automatically

In [68]:
class CharRnn(nn.Module):
    def __init__(self, voc_size, n_emb):
        super().__init__()
        self.e = nn.Embedding(voc_size, n_emb)
        self.rnn = nn.RNN(n_emb, n_hidden)
        self.l_out = nn.Linear(n_hidden, voc_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        # hidden state = rank 3 tensor in pytorch
        # needed for bidirectional rnn and multilayer rnns 
        h = V(torch.zeros(1, bs, n_hidden).cuda())
        inp = self.e(torch.stack(cs))
        outp, h = self.rnn(inp, h)
        
        # pytorch appends the output to a list => -1
        return F.log_softmax(self.l_out(outp[-1]), dim=-1)

In [69]:
model = CharRnn(voc_size, n_emb).cuda()
opt = optim.Adam(model.parameters(), 1e-3)

Let's look at the dimensionality of the output:

In [70]:
it = iter(modeldata.trn_dl)
*xs, yt = next(it)

embedding:

In [71]:
t = model.e(V(torch.stack(xs)))

In [72]:
t.size()

torch.Size([8, 512, 42])

Hidden state for input:

In [73]:
ht = V(torch.zeros(1, 512, n_hidden))

Output of the rnn

In [74]:
outp, hn = model.rnn(t, ht)

In [75]:
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

Dimensionality of output:

In [76]:
t = model(*V(xs))

In [77]:
t.size()

torch.Size([512, 85])

In [78]:
fit(model, modeldata, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.871305   1.846348  
    1      1.680553   1.671313                              
    2      1.588609   1.597636                              
    3      1.541642   1.553672                              



[1.5536721405184963]

In [79]:
opt = optim.Adam(model.parameters(), 1e-4)

In [80]:
fit(model, modeldata, 2, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.473815   1.511779  
    1      1.46036    1.506225                              



[1.5062247376168334]

### Test the model

In [81]:
def get_next_char(inp):
    idxs = T(np.array([char_indices[c] for c in inp])).unsqueeze(1)
    p = model(*VV(idxs))  # log probability
    predicted_char = np.argmax(p.data.cpu().numpy())
    return indices_char[predicted_char]

In [82]:
def get_next_n_chars(inp, n):
    result = inp
    for i in range(n):
        c = get_next_char(inp)
        result += c
        inp = inp[1:] + c
    return result

In [83]:
get_next_n_chars('Human', 40)

'Humanity of the such and and and and and and '

## Multi-output model
Until now, we used 8 chars to predict the next one. Then, we shifted by one char to predict the following char. However, this means that 7 chars overlapped!

**Let's take a non-overlapping set of characters this time!**

Let's create the input:

(`idx` is the entire text converted to indices.)

In [84]:
c_in_dat = [[idx[i + j] for i in range(cs)] for j in range (0, len(idx) - cs - 1, cs)]

And the output (everything shifted by one char):

In [85]:
c_out_dat= [[idx[i + j] for i in range(cs)] for j in range(1, len(idx) - cs, cs)]

In [86]:
xs = np.stack(c_in_dat)
ys = np.stack(c_out_dat)

xs.shape, ys.shape

((75111, 8), (75111, 8))

In [87]:
xs[:4,]  # Reading top left to bottom right this is the nietzsche text

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [ 1,  1, 43, 45, 40, 40, 39, 43],
       [33, 38, 31,  2, 73, 61, 54, 73],
       [ 2, 44, 71, 74, 73, 61,  2, 62]])

In [88]:
ys[:4,]  # the same apart from the missing first character

array([[42, 29, 30, 25, 27, 29,  1,  1],
       [ 1, 43, 45, 40, 40, 39, 43, 33],
       [38, 31,  2, 73, 61, 54, 73,  2],
       [44, 71, 74, 73, 61,  2, 62, 72]])

### Build the model

In [89]:
val_idxs = get_cv_idxs(len(xs) - cs - 1)

In [90]:
modeldata = ColumnarModelData.from_arrays('.', val_idxs, xs, ys, bs=512)

In [91]:
class CharSeqRnn(nn.Module):
    def __init__(self, voc_size, n_emb, n_hidden):
        super().__init__()
        self.e = nn.Embedding(voc_size, n_emb)
        self.rnn = nn.RNN(n_emb, n_hidden)
        self.l_out = nn.Linear(n_hidden, voc_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp, h = self.rnn(inp, h)

        # only difference is that in the last model 
        # we only returned ... of outp[-1]
        return F.log_softmax(self.l_out(outp), dim=-1)

In [92]:
model = CharSeqRnn(voc_size, n_emb, n_hidden).cuda()

In [93]:
opt = optim.Adam(model.parameters(), 1e-3)

Negative log likelyhood expects two rank two tensors, but because we are considering the timestep, we have a rank 3 tensor.

In [94]:
def nll_loss_seq(pred, target):
    # pytorch rnn appends the outputs, sequence length sl is the size of the list = 8 timesteps
    sl, bs, n_char = pred.size()  # torch.Size([8, 512, 85])
    
    # torch.Size([8, 512, 85]) => torch.Size([4096, 85])
    pred = pred.view(-1, n_char)
    
    # targ has one label for each of the 8 timesteps
    # torch.Size([512, 8]) => torch.Size([4096])
    target = target.transpose(0, 1).contiguous().view(-1)
    # when transposing, pytorch actually does not change 
    # the memory, just updates the stride..
    # contiguous() makes pytorch update the memory so that
    # the transposed object is contiguous in memroy
    
    return F.nll_loss(pred.view(-1, n_char), target)

In [95]:
fit(model, modeldata, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.582693   2.399541  
    1      2.284425   2.195281                              
    2      2.133081   2.081026                              
    3      2.045372   2.011189                              



[2.0111888978833683]

In [96]:
opt = optim.Adam(model.parameters(), 1e-4)

In [97]:
fit(model, modeldata, 2, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.994704   1.993445  
    1      1.985808   1.986614                              



[1.9866136107400318]

**Note: this model is not any better than the previous, but it is more efficient because we use 8 chars to predict 8 chars and now 8 chars to predicht one char.**

## Improve the model: better initialization

Problem: if the weights of the hidden to hidden layer are significantly smaller than 1, the gradients might vanish, if they are significantly larger, they might explode.

Solution: initialize with identity matrix

(*A simple way to initialize recurrent networks of rectified linear units*, Navdeep and Geoffrey Hinton, 2015)

In [98]:
model = CharSeqRnn(voc_size, n_emb, n_hidden).cuda()
opt = optim.Adam(model.parameters(), 1e-2)

# We can even use a larger learning rate now

In [99]:
model.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]], device='cuda:0')

In [100]:
fit(model, modeldata, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.39149    2.205234  
    1      2.111292   2.03962                               
    2      2.002707   1.982047                              
    3      1.949826   1.936901                              



[1.9369013573929728]

In [101]:
fit(model, modeldata, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                             
    0      1.902322   1.925076  
    1      1.893846   1.916647                              
    2      1.870862   1.879582                              
    3      1.854034   1.881128                              



[1.8811284509701989]