# Character-based language model from scratch

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.io import *
from fastai.conv_learner import *
from fastai.column_data import *

## Dataset - Nietzsche

In [3]:
PATH = 'data/nietzsche/'

In [6]:
# Read the whole corpus into memory. The context manager closes the file handle
# as soon as the read finishes instead of leaving it to the garbage collector.
with open(f'{PATH}nietzsche.txt') as corpus_file:
    text = corpus_file.read()

In [7]:
len(text)

600893

In [8]:
text[:300]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, ha'

In [11]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))

In [13]:
# Reserve index 0 for a zero value (e.g. for padding).
# The guard keeps this cell idempotent: re-running it must not insert a second
# "\0" entry, which would shift every character index by one.
if chars[:1] != ["\0"]:
    chars.insert(0, "\0")
voc_size = len(chars)
print('total number of chars: ', voc_size)

total number of chars:  85


In [15]:
''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxy'

**Mapping from char to id and vice versa:**

In [17]:
# Lookup tables in both directions: position in `chars` -> character, and back.
indices_char = dict(enumerate(chars))
char_indices = {ch: pos for pos, ch in indices_char.items()}

**Convert dataset to indices**

In [18]:
# Encode the corpus: one vocabulary index per character of `text`.
idx = list(map(char_indices.__getitem__, text))

In [19]:
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

**Quick test to make sure this is working as expected:**

In [20]:
''.join(indices_char[i] for i in idx[:50])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what th'

## Predicting the 4th char from the first 3

### Creating the inputs and output

In [21]:
cs = 3
# Start offsets of the sampled windows: every `cs`-th position of the corpus,
# stopping early enough that `start + 3` is still a valid index.
window_starts = range(0, len(idx) - 1 - cs, cs)
# Inputs: the 1st, 2nd and 3rd character of each window.
c1_dat = [idx[start] for start in window_starts]
c2_dat = [idx[start + 1] for start in window_starts]
c3_dat = [idx[start + 2] for start in window_starts]
# Output: the character that follows each window.
c4_dat = [idx[start + 3] for start in window_starts]

In [22]:
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [26]:
c1_dat[:3], c2_dat[:3], c3_dat[:3], c4_dat[:3]

([40, 30, 29], [42, 25, 1], [29, 27, 1], [30, 29, 1])

Inputs:

In [35]:
# Turn the per-position index lists into arrays, one array per input character position.
# NOTE(review): the `[:-2]` slice drops the last two samples; the reason is not
# evident from this notebook — TODO confirm.
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])

In [36]:
x1, x2, x3

(array([40, 30, 29, ..., 67, 68, 72]),
 array([42, 25,  1, ..., 72, 59, 62]),
 array([29, 27,  1, ...,  2,  2, 67]))

Output:

In [37]:
# Target array (the 4th character of each window), trimmed with the same `[:-2]`
# slice as the inputs so that inputs and targets have equal length.
y = np.stack(c4_dat[:-2])

In [38]:
y

array([30, 29,  1, ..., 68, 72, 59])

**We will try to predict 30 from 40, 42, and 29, etc.**

### Create and train the model

In [40]:
n_hidden = 256

In [41]:
n_emb = 42

In [74]:
class Char3Model(nn.Module):
    """Hand-unrolled 3-step character model: predicts the next char from three chars.

    The same embedding, input-to-hidden and hidden-to-hidden layers are reused for
    each of the three input characters; only the hidden state changes between steps.

    Args:
        voc_size: number of distinct characters (size of the embedding table and of
            the output layer).
        n_emb: size of each character embedding.

    Note:
        The hidden size is read from the notebook-level `n_hidden`, and the initial
        hidden state is created with `.cuda()`, so the model must run on the GPU.
    """
    def __init__(self, voc_size, n_emb):
        super().__init__()
        self.e = nn.Embedding(voc_size, n_emb)

        # input to hidden
        self.l_in = nn.Linear(n_emb, n_hidden)

        # hidden to hidden
        self.l_hid = nn.Linear(n_hidden, n_hidden)

        # hidden to output
        self.l_out = nn.Linear(n_hidden, voc_size)

    def forward(self, c1, c2, c3):
        """Return log-probabilities over the vocabulary for the char following c1, c2, c3."""
        # Each character goes through the shared embedding and input layer.
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))

        # Start from an all-zero hidden state and fold in one character per step.
        h = V(torch.zeros(in1.size())).cuda()
        h = torch.tanh(self.l_hid(h + in1))
        h = torch.tanh(self.l_hid(h + in2))
        h = torch.tanh(self.l_hid(h + in3))

        # Normalize over the vocabulary axis explicitly; relying on the implicit
        # dimension is deprecated, and the other models in this notebook pass dim=-1.
        return F.log_softmax(self.l_out(h), dim=-1)

In [75]:
# NOTE(review): the second argument appears to be the validation indices (the RNN
# section of this notebook passes `val_idxs` in the same position). If so, `[-1]`
# holds out only the last sample, and the reported val_loss is computed on a single
# example — TODO confirm.
modeldata = ColumnarModelData.from_arrays('.', [-1], np.stack([x1, x2, x3], axis=1), y, bs=512)

In [76]:
model = Char3Model(voc_size, n_emb).cuda()  # standard pytorch model, not fastai

In [90]:
# Let's look at the output from the model
it = iter(modeldata.trn_dl)
*xs, yt = next(it)
print(len(xs))
t = model(*V(xs))
print(t.shape)  # 85 probs for chars, bs = 512

3
torch.Size([512, 85])


In [82]:
opt = optim.Adam(model.parameters(), 1e-2)

In [85]:
fit(model, modeldata, 1, opt, F.nll_loss)  # negative log likelihood loss

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      2.048035   3.704608  



[3.7046079635620117]

In [86]:
set_lrs(opt, 0.001)

In [87]:
fit(model, modeldata, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.790576   4.471944  



[4.4719438552856445]

### Test the model

In [115]:
def get_next_char(inp):
    """Return the character the global `model` scores highest as successor of `inp`.

    `inp` is a string; each of its characters is looked up in `char_indices` and
    passed to the model as a separate positional argument.
    """
    char_ids = np.array([char_indices[ch] for ch in inp])
    log_probs = model(*V(T(char_ids)))  # log probability
    best_id = np.argmax(log_probs.data.cpu().numpy())
    return indices_char[best_id]

In [116]:
get_next_char('y. ')

'T'

In [117]:
get_next_char(' th')

'e'

In [118]:
get_next_char('ppl')

'e'

In [119]:
get_next_char('wom')

'a'

In [120]:
get_next_char('oma')

'n'

**Seems to work quite well!**

## Creating a first RNN

In [121]:
cs = 8  # size of the RNN

In [124]:
idx[:20]  # The first 20 chars converted to indices

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40, 40, 39, 43, 33, 38, 31, 2]

In [127]:
# One window of `cs` consecutive indices per start position; consecutive windows
# overlap and are shifted by a single character.
char_inp_data = [idx[start:start + cs] for start in range(len(idx) - cs)]

In [129]:
char_inp_data[:4]  # Sequences of cs = 8 chars, shifted by 1 (0 to 7, 1 to 8, 2 to 9)

[[40, 42, 29, 30, 25, 27, 29, 1],
 [42, 29, 30, 25, 27, 29, 1, 1],
 [29, 30, 25, 27, 29, 1, 1, 1],
 [30, 25, 27, 29, 1, 1, 1, 43]]

In [130]:
# Target for the window starting at j is the index at position j + cs,
# i.e. the corpus with its first `cs` entries dropped.
char_outp_data = idx[cs:]

In [131]:
char_outp_data[:4]

[1, 1, 43, 45]

In [134]:
xs = np.stack(char_inp_data, axis=0)

In [135]:
xs.shape

(600885, 8)

In [136]:
xs[:4]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43]])

In [137]:
y = np.stack(char_outp_data)

In [138]:
y[:4]

array([ 1,  1, 43, 45])

### Create and train the model

In [139]:
# There are exactly len(idx) - cs samples (one per window start position), so the
# validation indices are drawn from that full range; the former `- 1` excluded the
# last sample from ever being selected.
val_idxs = get_cv_idxs(len(idx) - cs)

In [140]:
modeldata = ColumnarModelData.from_arrays('.', val_idxs, xs, y, bs=512)

In [175]:
class CharLoopModel(nn.Module):
    """Simple RNN written as an explicit loop over the input characters.

    At every step the embedded character is projected to the hidden size, added to
    the previous hidden state and passed through the hidden-to-hidden layer.

    Args:
        voc_size: number of distinct characters (embedding rows and output size).
        n_emb: size of each character embedding.
        n_hidden: size of the hidden state.

    Note:
        The initial hidden state is created with `.cuda()`, so the model must run
        on the GPU.
    """
    def __init__(self, voc_size, n_emb, n_hidden):
        super().__init__()
        # Keep the hidden size on the instance so `forward` uses the value this
        # model was built with rather than a notebook-level global of the same name.
        self.n_hidden = n_hidden
        self.e = nn.Embedding(voc_size, n_emb)
        self.l_in = nn.Linear(n_emb, n_hidden)
        self.l_hid = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, voc_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the sequence `cs`.

        `cs` holds one tensor of character indices per time step; the first
        dimension of each tensor is the batch.
        """
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, self.n_hidden).cuda())
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            # tanh is a good fit here because it limits the hidden state to (-1, 1)
            h = torch.tanh(self.l_hid(inp + h))

        return F.log_softmax(self.l_out(h), dim=-1)

In [176]:
model = CharLoopModel(voc_size, n_emb, n_hidden).cuda()

In [177]:
opt = optim.Adam(model.parameters(), 1e-2)

In [178]:
fit(model, modeldata, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.016024   1.996083  



[1.9960825715460515]

In [179]:
opt = optim.Adam(model.parameters(), 1e-3)

In [180]:
fit(model, modeldata, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.730551   1.732614  



[1.7326143387233413]

## Concat the input and hidden state together instead of adding them
Concatenating is better than adding, since the input and the hidden state are qualitatively different and you lose information by adding them.

In [225]:
class CharLoopConcatModel(nn.Module):
    """Loop-based RNN that concatenates hidden state and input instead of adding them.

    At every step the previous hidden state and the embedded character are
    concatenated along the feature axis and fed through the input layer, followed
    by the hidden-to-hidden layer.

    Args:
        voc_size: number of distinct characters (embedding rows and output size).
        n_emb: size of each character embedding.
        n_hidden: size of the hidden state.

    Note:
        The initial hidden state is created with `.cuda()`, so the model must run
        on the GPU.
    """
    def __init__(self, voc_size, n_emb, n_hidden):
        super().__init__()
        # Keep the hidden size on the instance so `forward` uses the value this
        # model was built with rather than a notebook-level global of the same name.
        self.n_hidden = n_hidden
        self.e = nn.Embedding(voc_size, n_emb)
        # The input layer sees [hidden state, embedding] side by side.
        self.l_in = nn.Linear(n_emb + n_hidden, n_hidden)
        self.l_hid = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, voc_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the sequence `cs`.

        `cs` holds one tensor of character indices per time step; the first
        dimension of each tensor is the batch.
        """
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, self.n_hidden).cuda())
        for c in cs:
            inp = torch.cat((h, self.e(c)), dim=1)
            inp = F.relu(self.l_in(inp))
            h = torch.tanh(self.l_hid(inp))

        return F.log_softmax(self.l_out(h), dim=-1)

In [226]:
model = CharLoopConcatModel(voc_size, n_emb, n_hidden).cuda()

In [227]:
opt = optim.Adam(model.parameters(), 1e-3)

In [228]:
it = iter(modeldata.trn_dl)
*xs,yt = next(it)
t = model(*V(xs))

In [229]:
xs[0].size(0)

512

In [230]:
fit(model, modeldata, 2, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.845675   1.822715  
    1      1.656798   1.645763                              



[1.6457629061589591]

In [231]:
opt = optim.Adam(model.parameters(), 1e-4)

In [232]:
fit(model, modeldata, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.566494   1.572049  



[1.572049370175147]

### Test the model

In [263]:
def get_next_char(inp):
    """Return the character the global `model` scores highest as successor of `inp`.

    Each character of `inp` is looked up in `char_indices`; `unsqueeze(1)` gives the
    index tensor a trailing axis of length 1, so unpacking it hands the model one
    single-element batch per character. The tensor shape is printed on every call.
    """
    char_ids = np.array([char_indices[ch] for ch in inp])
    idx_column = T(char_ids).unsqueeze(1)
    print(idx_column.shape)
    log_probs = model(*VV(idx_column))  # log probability
    best_id = np.argmax(log_probs.data.cpu().numpy())
    return indices_char[best_id]

In [264]:
get_next_char('for thos')

torch.Size([8, 1])


'e'

In [265]:
get_next_char('part of ')

torch.Size([8, 1])


't'

In [272]:
get_next_char(' and wom')

torch.Size([8, 1])


'a'

## Now the same in pytorch
### pytorch creates the input layers and the loop automatically