In [1]:
import requests
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
from torch import nn

In [2]:
# Hyperparameters shared by the dataset, the model and the training loop.
seq_len = 32        # characters per context window handed to the model
batch_size = 256    # (context, target) pairs per batch
nb_iters = 10_000   # training stops once this many batches have been processed
eval_iters = 10     # batches averaged per split when estimating the loss

In [3]:
# Project Gutenberg plain-text files that are concatenated, in this order,
# to form the training corpus.
url_tomes = [
    'https://www.gutenberg.org/ebooks/17489.txt.utf-8',
    'https://www.gutenberg.org/ebooks/17493.txt.utf-8',
    'https://www.gutenberg.org/ebooks/17494.txt.utf-8',
    'https://www.gutenberg.org/ebooks/17518.txt.utf-8',
    'https://www.gutenberg.org/ebooks/17519.txt.utf-8'
    ]

# Download each file, normalise it, and join everything once at the end
# (avoids re-copying the growing corpus on every `+=`).
tomes = []
for url in url_tomes:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of training on an error page.
    response.raise_for_status()
    # 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if present.
    response.encoding = 'utf-8-sig'
    tome = response.text
    # Replace Windows line endings with spaces so the text has no line breaks.
    tome = tome.replace('\r\n', ' ')
    tomes.append(tome)
les_miserables = ''.join(tomes)

# Sanity check: show a small excerpt of the downloaded text.
print(les_miserables[10000:10500])

nt monseigneur Bienvenu   Le palais épiscopal de Digne était attenant à l'hôpital.  Le palais épiscopal était un vaste et bel hôtel bâti en pierre au commencement du siècle dernier par monseigneur Henri Puget, docteur en théologie de la faculté de Paris, abbé de Simore, lequel était évêque de Digne en 1712. Ce palais était un vrai logis seigneurial. Tout y avait grand air, les appartements de l'évêque, les salons, les chambres, la cour d'honneur, fort large, avec promenoirs à arcades, selon l'an


In [4]:
# Vocabulary: every distinct character of the corpus, in sorted order.
characters = list(set(les_miserables))
characters.sort()
# The model predicts one of `vocab_size` classes per position.
vocab_size = len(characters)
print(f"{vocab_size} characters:\n{''.join(characters)}")

117 characters:
 !"#$%'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz«°º»ÀÂÇÈÉÊÔàâæçèéêëîïñôöùûü—‘’“”•™


In [5]:
# Lookup tables between a character and its integer id, where the id is the
# character's position in `characters`.
char_to_int = dict(zip(characters, range(len(characters))))
int_to_char = dict(enumerate(characters))


def encode(all_c):
    """Return the list of integer ids for an iterable of characters.

    Raises KeyError for a character that is not in `char_to_int`.
    """
    return [char_to_int[c] for c in all_c]


def decode(all_i):
    """Return the string spelled by an iterable of integer ids.

    Raises KeyError for an id that is not in `int_to_char`.
    """
    return ''.join(int_to_char[i] for i in all_i)


# Round-trip check: encoding then decoding the vocabulary must give it back.
print(''.join(characters))
print(encode(characters))
print(decode(encode(characters)))

 !"#$%'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz«°º»ÀÂÇÈÉÊÔàâæçèéêëîïñôöùûü—‘’“”•™
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116]
 !"#$%'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz«°º»ÀÂÇÈÉÊÔàâæçèéêëîïñôöùûü—‘’“”•™


In [6]:
class LesMiserablesDataset(Dataset):
    """Character-level next-token dataset built from one string.

    The text is encoded once into a 1-D integer tensor. Item ``idx`` is the
    pair ``(data[idx:idx+seq_len], data[idx+1:idx+seq_len+1])``: a window of
    ``seq_len`` ids and the same window shifted one position to the right.

    Args:
        str_data: text to encode with the notebook-level ``encode`` function.
        seq_len: number of ids in each context/target window.
    """

    def __init__(self, str_data, seq_len):
        # dtype is pinned so that an empty text still gives an integer tensor.
        self.data = torch.tensor(encode(str_data), dtype=torch.long)
        self.seq_len = seq_len

    def __len__(self):
        # A window needs seq_len + 1 ids (context plus the shifted target).
        # Clamp at zero: a negative value would make len() raise ValueError.
        return max(0, len(self.data) - self.seq_len)

    def __getitem__(self, idx):
        """Return the ``(context, target)`` pair starting at position ``idx``.

        Negative indices count from the end. Raises IndexError when ``idx``
        is out of range, so no truncated window is ever returned and plain
        iteration over the dataset terminates.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for dataset of length {size}")
        context = self.data[idx:idx+self.seq_len]
        target = self.data[idx+1:idx+self.seq_len+1]
        return context, target

# The first 90% of the characters are used for training, the rest for testing.
split_at = int(0.9*len(les_miserables))
train_dataset = LesMiserablesDataset(les_miserables[:split_at], seq_len)
test_dataset = LesMiserablesDataset(les_miserables[split_at:], seq_len)

# Both loaders shuffle, so every pass draws the windows in a random order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

# Show one sample: the target is the context shifted by one character.
sample_context, sample_target = train_dataset[4356]
print([decode(sample_context.tolist())])
print([decode(sample_target.tolist())])


["ous avons à raconter, il n'est p"]
["us avons à raconter, il n'est pe"]


In [10]:
@torch.no_grad()
def evaluate():
    """Estimate the mean cross-entropy loss on the train and test splits.

    Reads the notebook-level ``model``, ``train_dataloader``,
    ``test_dataloader``, ``vocab_size`` and ``eval_iters``. At most
    ``eval_iters`` batches are drawn from each loader, and the mean is taken
    over the batches actually seen (so a loader with fewer batches does not
    drag the estimate towards zero).

    The model is put in eval mode for the measurement and restored to the
    mode it was in on entry before returning.

    Returns:
        dict mapping ``'train'`` and ``'test'`` to a 0-d tensor holding the
        mean loss of that split (NaN if the loader yielded no batch).
    """
    was_training = model.training
    model.eval()
    try:
        losses = {}
        for split, dataloader in (('train', train_dataloader), ('test', test_dataloader)):
            batch_losses = []
            for i, (x, y) in enumerate(dataloader):
                y_pred = model(x)
                # Flatten batch and sequence dimensions: one prediction per character.
                loss = F.cross_entropy(y_pred.view(-1, vocab_size), y.view(-1))
                batch_losses.append(loss.item())
                if i >= eval_iters-1:
                    break
            losses[split] = torch.tensor(batch_losses).mean()
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)
    return losses

In [11]:
class MaskedMultiHeadAttention(nn.Module):
    """Multi-head self-attention where a position cannot attend to later ones.

    Args:
        seq_len: largest sequence length supported (side of the causal mask).
        embed_size: size of the input and output feature dimension.
        nb_heads: number of attention heads.
        head_size: feature size of each head.

    Input and output of ``forward`` are ``batch_size x seq_len x embed_size``.
    """

    def __init__(self, seq_len, embed_size, nb_heads, head_size):
        super().__init__()
        self.nb_heads = nb_heads
        self.head_size = head_size
        self.query = nn.Linear(embed_size, nb_heads*head_size, bias=False)
        self.key = nn.Linear(embed_size, nb_heads*head_size, bias=False)
        self.value = nn.Linear(embed_size, nb_heads*head_size, bias=False)
        # True strictly above the diagonal, i.e. on the (query, key) pairs where
        # the key lies in the future. Registered as a buffer so it follows the
        # module through .to()/.cuda(); non-persistent so the state_dict keys
        # are the same as when the mask was a plain attribute.
        self.register_buffer(
            'mask',
            torch.tril(torch.ones(seq_len, seq_len)) == 0,
            persistent=False)
        self.projection = nn.Linear(nb_heads*head_size, embed_size)

    def _split_heads(self, t, batch_size, seq_len):
        """Reshape ``batch x seq x heads*head_size`` to ``batch x heads x seq x head_size``."""
        return t.view(batch_size, seq_len, self.nb_heads, self.head_size).permute(0, 2, 1, 3)

    def forward(self, x):
        batch_size, seq_len, _ = x.shape # x: batch_size x seq_len x embed_size
        # compute q, k, v: each batch_size x nb_heads x seq_len x head_size
        q = self._split_heads(self.query(x), batch_size, seq_len)
        k = self._split_heads(self.key(x), batch_size, seq_len)
        v = self._split_heads(self.value(x), batch_size, seq_len)

        # scaled dot-product scores: batch_size x nb_heads x seq_len x seq_len
        att = q @ k.transpose(2, 3) / self.head_size**0.5
        # -inf scores become zero weights after the softmax; the mask is cropped
        # so inputs shorter than the configured seq_len are accepted.
        # Out-of-place fill: no in-place edit of a tensor tracked by autograd.
        att = att.masked_fill(self.mask[:seq_len, :seq_len], float('-inf'))
        att = F.softmax(att, dim=-1)
        att = att @ v # batch_size x nb_heads x seq_len x head_size

        # concatenate heads and project
        att = att.permute(0, 2, 1, 3).reshape(batch_size, seq_len, self.nb_heads*self.head_size) # batch_size x seq_len x nb_heads*head_size
        att = self.projection(att) # batch_size x seq_len x embed_size
        return att


class Block(nn.Module):
    """Transformer block: masked self-attention followed by a feed-forward net.

    Each sub-layer is added to its input (residual connection) and the sum is
    then layer-normalised. Input and output have the same shape.

    Args:
        seq_len, embed_size, nb_heads, head_size: forwarded unchanged to
            ``MaskedMultiHeadAttention``; ``embed_size`` also sizes the
            feed-forward network (hidden width ``4*embed_size``) and the norms.
    """

    def __init__(self, seq_len, embed_size, nb_heads, head_size):
        super().__init__()
        self.masked_multi_head_attention = MaskedMultiHeadAttention(seq_len, embed_size, nb_heads, head_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, 4*embed_size),
            nn.ReLU(),
            nn.Linear(4*embed_size, embed_size))
        # One LayerNorm per sub-layer. Reusing a single instance for both would
        # tie their scale/shift parameters together.
        self.layer_norm = nn.LayerNorm(embed_size)
        self.layer_norm_ff = nn.LayerNorm(embed_size)

    def forward(self, x):
        # attention sub-layer: residual add, then normalise
        x = x + self.masked_multi_head_attention(x)
        x = self.layer_norm(x)
        # feed-forward sub-layer: residual add, then normalise
        x = x + self.feed_forward(x)
        x = self.layer_norm_ff(x)
        return x
        
        
class LesMiserablesLanguageModel(nn.Module):
    """Character-level language model made of a stack of ``Block`` layers.

    Args:
        vocab_size: number of distinct token ids (size of the output layer).
        seq_len: largest context length (size of the position embedding).
        embed_size: width of the token/position embeddings and of each block.
        nb_heads: attention heads per block.
        head_size: feature size of each attention head.
        n_blocks: number of ``Block`` layers applied in sequence.
    """

    def __init__(self, vocab_size, seq_len, embed_size, nb_heads, head_size, n_blocks):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(seq_len, embed_size)
        self.blocks = nn.Sequential(*[Block(seq_len, embed_size, nb_heads, head_size) for _ in range(n_blocks)])
        self.linear = nn.Linear(embed_size, vocab_size)
        self.seq_len = seq_len

    def forward(self, x):
        """Map token ids ``batch_size x seq_len`` to logits ``batch_size x seq_len x vocab_size``."""
        seq_len = x.size(1)
        # Build the position indices on the input's device so the model also
        # works once it has been moved off the default device.
        positions = torch.arange(seq_len, device=x.device)
        x = self.token_embedding(x) + self.position_embedding(positions) # batch_size x seq_len x embedding_dim
        x = self.blocks(x) # batch_size x seq_len x embedding_dim
        x = self.linear(x) # batch_size x seq_len x vocab_size
        return x

    @torch.no_grad()
    def generate(self, x, nb_tokens):
        """Sample ``nb_tokens`` extra tokens after the prompt ``x``.

        Args:
            x: prompt, either a string (encoded with the notebook-level
                ``encode``) or a 2-D tensor of token ids ``batch x length``.
            nb_tokens: number of tokens to sample and append.

        Returns:
            The decoded text (prompt plus generated tokens) of the first row.

        Raises:
            ValueError: if tokens are requested from an empty prompt.

        Sampling runs without gradient tracking and in eval mode; the mode the
        module was in on entry is restored afterwards.
        """
        if isinstance(x, str):
            # dtype is pinned so that an empty string still gives integer ids.
            x = torch.tensor(encode(x), dtype=torch.long).unsqueeze(0)
        # Run on whatever device the model's weights live on.
        x = x.to(self.linear.weight.device)
        if nb_tokens > 0 and x.size(1) == 0:
            raise ValueError("cannot generate from an empty prompt")

        was_training = self.training
        self.eval()
        try:
            for _ in range(nb_tokens):
                # Only the last seq_len tokens fit in the position embedding.
                logits = self(x[:, -self.seq_len:])
                # Keep the prediction for the token following the last one.
                logits = logits[:, -1, :]
                probas = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probas, 1)
                x = torch.cat((x, next_token), dim=1)
        finally:
            self.train(was_training)
        txt = decode(x.tolist()[0])
        return txt

In [12]:
# Build the model and an AdamW optimizer with its default settings.
model = LesMiserablesLanguageModel(vocab_size, seq_len, embed_size=64, nb_heads=4, head_size=16, n_blocks=6)
optimizer = torch.optim.AdamW(model.parameters())

# Single pass over the shuffled training loader, stopped after nb_iters batches.
# The counter is called `step` so the builtin `iter` is not shadowed for the
# rest of the notebook.
for step, (x, y) in enumerate(train_dataloader):

    # Re-assert training mode: the periodic evaluation below may have left the
    # model in eval mode.
    model.train()

    y_pred = model(x)
    # Flatten batch and sequence dimensions: one prediction per character.
    loss = F.cross_entropy(y_pred.view(-1, vocab_size), y.view(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Every 500 steps, report both losses and a short sample of generated text.
    if step % 500 == 0:
        losses = evaluate()
        print(f"iter {step}: train loss = {losses['train'].item():.3f}, test loss = {losses['test'].item():.3f}, generated text = {model.generate('Jean', 50)}")

    if step >= nb_iters-1:
        break

iter 0: train loss = 4.450, test loss = 4.456, generated text = JeansÇ:kF°?à'_Boo™Læñ#H ï°ö_jAj)*aöm':#«ññ8ñ+IôS-»s4SR
iter 500: train loss = 1.849, test loss = 1.863, generated text = Jean pivitisté ses vie.  était-à pour aformante, un pe
iter 1000: train loss = 1.640, test loss = 1.642, generated text = Jean Valjean était dans la fhaume un gage entendant en
iter 1500: train loss = 1.546, test loss = 1.534, generated text = Jean IV ête au coups de me voulu des aux aternicares.s
iter 2000: train loss = 1.471, test loss = 1.492, generated text = Jean Van jamais, les archés approchants:  --Et jamaçon
iter 2500: train loss = 1.449, test loss = 1.460, generated text = Jean Val, c'est se grise qui avait eu rentent qu'un qu
iter 3000: train loss = 1.432, test loss = 1.446, generated text = Jean Valjean qui sentait auxisités carrés dans les mes
iter 3500: train loss = 1.402, test loss = 1.420, generated text = Jean Valjean, et croisant traversa la garde le frère e
iter 4000: train loss = 1.38