In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
import os
if not os.path.exists('data'):
    os.makedirs('data')
!wget -O data/shakespeare.txt  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt 

--2025-03-19 14:49:31--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘data/shakespeare.txt’


2025-03-19 14:49:32 (3.38 MB/s) - ‘data/shakespeare.txt’ saved [1115394/1115394]



In [4]:
with open('data/shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


<h2>Tokenization</h2>

In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(' '.join(chars))
print(vocab_size)


   ! $ & ' , - . 3 : ; ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z
65


<p>
    Convert the raw text, which is a string into sequence of integers according to some vocabulary of possible elelments.
</p>

In [10]:
stoi = {char:idx for idx, char in enumerate(chars)} # dictionnary of character as key and index as value 
itos = {idx:char for idx, char in enumerate(chars)} # dictionnary of character as value and index as key

In [11]:
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [12]:
print(encode('Best Transformer ever'))
print(decode(encode('Best Transformer ever')))

[14, 43, 57, 58, 1, 32, 56, 39, 52, 57, 44, 53, 56, 51, 43, 56, 1, 43, 60, 43, 56]
Best Transformer ever


In [13]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


<h2>Train and Validation Set</h2>

In [14]:
# first 90% will be train, rest for val

train_ds = data[:int(0.9*len(data))]
val_ds = data [int(0.9*len(data)) :]

In [32]:
# Define the numbe of sequence that will be processed
batch_size = 16
# The maximum context length for prediction
block_size = 8

def get_batch(set, batch_size = batch_size , block_size =block_size):
    data = train_ds if set == 'train_ds' else val_ds
    idx = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in idx])
    y =  torch.stack([data[i + 1 : i + block_size + 1] for i in idx])

    return x.to(device), y.to(device) # input and target

x, y = get_batch("train_ds")


<h2> Create the initial model</h2>

In [22]:
class ShakespeareLangueModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets = None):
        # idx and targets are both (B, T) tensors of integers
        logits = self.token_embedding_table(idx) # (B, T, C), batch, time, channel)

        if targets is None:
            loss = None
        else:
            B,T, C = logits.shape #(batch_size =16, Block_size = 8, vocab_size =65)
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits,targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # get the vector in the embedding space
            logits, loss = self(idx)
            #focus only on the last time step token embedding
            # (Here we feed all the character block but we just check the value of the last to generate the one after.)
            logits = logits[:, -1, :]
            # apply softmax to get probabilities 
            probs =F.softmax(logits, dim = 1)
            # sample from distribution to get an index number
            idx_next = torch.multinomial(probs, num_samples= 1) # (B, 1) because num_samples equal 1
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim =1)
            # now index is a tensor of integer when the input was
            # torch.zeros((1,1), dtypes = torch.long)
        return idx

m = ShakespeareLangueModel(vocab_size).to(device)
logits, loss = m(x, y)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long).to(device), max_new_tokens=100)[0].tolist())) # zeros((1,1)) for generate from the first charater       

torch.Size([128, 65])
tensor(4.8517, device='cuda:0', grad_fn=<NllLossBackward0>)

dTu&!HjOpI!dmi. yD&ZwIuwjWmUvegtRWcrqC&bSZltK:.UIjxQQICa;us.hVkXU
DaP:,'ZOx3wXHNZwCxzqTy?gVtFXaWUA&G


**Let's delve into the embedding layer**

Embeddings serve as a method to represent data, such as tokens, in a high-dimensional continuous space. In this case, the space is represented by $\mathbb{R}^{\text{vocab size}}$, cause the second parameter of `nn.Embedding` is the vocabulary size. The input has to be one-hot-encode and that is why we need to precise the vocabulary size in the first parameter. Training this layer involves shifting each vector within this space.

One of the simplest ways to visualize this concept is by attempting to determine whether certain words are positive or negative, and whether they are commonly used or formal. Imagine projecting your words (or tokens) onto a two-dimensional plane, where each hyperplane from the canonic base represents a particular state. For instance, if a vector falls within $\mathbb{R}^{+,+}$, it signifies that the word is both positive and formal.

In the task of predicting the next word, you can utilize the block of encode words as the input. By adding all the emdedding vectors together, you can then decode the nearest emdedding token to this resultant addition in this space, yielding the output.

<h3>Train the model</h3>

In [48]:
epochs = 10000

In [43]:
@torch.no_grad() #context manager 
def estimate_loss(model=m, epochs=epochs):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(epochs)
        for k in range(epochs):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [44]:
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [45]:
for _ in range(epochs):
    if _ % 500 == 0 or iter ==epochs -1:
        losses = estimate_loss()
        print(f"step {_}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    x , y = get_batch('train_ds')

    logits , loss = m(x ,y)
    optimizer.zero_grad(set_to_none= True)
    loss.backward()
    optimizer.step()

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long).to(device), max_new_tokens=100)[0].tolist())) # zeros((1,1)) for generate from the first charater

step 0: train loss 2.8615, val loss 2.8628
step 500: train loss 2.7451, val loss 2.7470
step 1000: train loss 2.6735, val loss 2.6734
step 1500: train loss 2.6241, val loss 2.6265
step 2000: train loss 2.5890, val loss 2.5900
step 2500: train loss 2.5644, val loss 2.5669
step 3000: train loss 2.5462, val loss 2.5448
step 3500: train loss 2.5326, val loss 2.5315
step 4000: train loss 2.5215, val loss 2.5216
step 4500: train loss 2.5106, val loss 2.5139
step 5000: train loss 2.5086, val loss 2.5057
step 5500: train loss 2.5025, val loss 2.5015
step 6000: train loss 2.4978, val loss 2.4994
step 6500: train loss 2.4934, val loss 2.4955
step 7000: train loss 2.4941, val loss 2.4926
step 7500: train loss 2.4897, val loss 2.4902
step 8000: train loss 2.4871, val loss 2.4876
step 8500: train loss 2.4860, val loss 2.4867
step 9000: train loss 2.4863, val loss 2.4838
step 9500: train loss 2.4829, val loss 2.4843

hoouearele d the. tilouly utemeds IVO:
In hinot lt ves the r gs t f.
Wascke, SCu th

# ADD Attention is all you need !

In [46]:
B , T, C = 4, 8, 2
z =torch.randn(B,T,C)

<h4>Get the Average of the precedent tokens ("bag of words)</h4>

In [47]:
zbow = torch.zeros((B, T, C))
for b in range (B):
    for t in range (T):
        zprev = z [b, : t+1]
        zbow[b, t] = torch.mean(zprev, 0)
print(z[0])
print(zbow[0])

tensor([[ 0.5295,  1.8478],
        [-0.8244,  1.6689],
        [-1.0310, -1.1666],
        [ 0.5904, -0.9030],
        [ 0.0213, -1.4140],
        [-0.0675, -1.2013],
        [ 1.2422, -0.9646],
        [ 0.3955,  0.6795]])
tensor([[ 0.5295,  1.8478],
        [-0.1474,  1.7584],
        [-0.4420,  0.7834],
        [-0.1839,  0.3618],
        [-0.1428,  0.0066],
        [-0.1303, -0.1947],
        [ 0.0658, -0.3047],
        [ 0.1070, -0.1817]])


Let's optmize the code with a mathematical trick

In [49]:
# Exemple
a = torch.tril(torch.ones(3,3))
a = a / torch.sum(a, 1, keepdim=True)
print(a) # so a@b is the average of the precedent time of b

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])


In [52]:
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
zbow2 = wei @ z # here torch will convert this (T, T)@(B, T, C) product to a (B, T, T)@(B, T, C) to match the dimension --> (B, T, C)
torch.allclose(zbow, zbow2)

True

**Let's delve into self-attention every single** 

Each token will emit independently two new vectors : 
- `query` : "what I am looking for"
- `key` : "what do I contain"

So, to ensure that one token's query is correctly "aligned" with another token's key, we need to check whether these two vectors are LITERALLY aligned. This is why dot product have been created. So now the weights of the matrice is representing by the dot product between the query of the token to predict and the key of all the precedent ones.

Note that the `query` and the `key` vectors are created from the emdedding vector of the token and not directly from the token.

Let's see a single Head perform self-attention !

In [53]:
head_size = 16 # The length of the input query vector

In [54]:
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
k = key(z) # (B,T, 16)
q = query(z) 

In [55]:
# dot product <x,y> can be write as x @ y.T for row vectors
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) -->  (B, T, T)
print(wei.shape)
wei[0]

torch.Size([4, 8, 8])


tensor([[-1.3986, -0.4121,  1.3382,  0.1287,  0.7916,  0.7284, -0.2621, -0.6455],
        [ 1.5054,  2.3810, -0.4043, -1.4015, -1.4864, -1.1957, -1.9765,  0.3962],
        [ 2.3637,  1.7327, -1.7075, -0.8930, -1.6771, -1.4512, -0.7649,  0.9313],
        [-1.1212, -1.5933,  0.3974,  0.9264,  1.0481,  0.8523,  1.2622, -0.3229],
        [ 0.1637, -0.5862, -0.4959,  0.3985,  0.1151,  0.0495,  0.7703,  0.1733],
        [ 0.3210, -0.3170, -0.5273,  0.2388, -0.0469, -0.0797,  0.5400,  0.2116],
        [-2.4976, -2.9946,  1.1818,  1.7022,  2.1532,  1.7807,  2.1650, -0.8046],
        [-0.9411, -0.5759,  0.7407,  0.2813,  0.6304,  0.5536,  0.1718, -0.3883]],
       grad_fn=<SelectBackward0>)

 Let's mask the next ones and re distribute that !

In [56]:
tril = torch.tril(torch.ones(T, T))
print(tril)
wei = wei.masked_fill(tril == 0, float('-inf'))
print(wei[0])
wei = F.softmax(wei, dim=-1) # nice distribution equal to one
print(wei[0])

out = wei @ z

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[-1.3986,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [ 1.5054,  2.3810,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [ 2.3637,  1.7327, -1.7075,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-1.1212, -1.5933,  0.3974,  0.9264,    -inf,    -inf,    -inf,    -inf],
        [ 0.1637, -0.5862, -0.4959,  0.3985,  0.1151,    -inf,    -inf,    -inf],
        [ 0.3210, -0.3170, -0.5273,  0.2388, -0.0469, -0.0797,    -inf,    -inf],
        [-2.4976, -2.9946,  1.1818,  1.7022,  2.1532,  1.7807,  2.1650,    -inf],
        [-0.9411, -0.5759,  0.7407,  0.2813,  0.6304,  0.5536,  0.1718, -0.3883]],
      

Cool but when input Q,K are not unit variance, wei will be an explozing variance and Softmax will not stay diffuse but it will saturate too much (creating an one-hot vector, that means that the target token will get information from one unique other vector). We need to force Q, K to be unit variance by normalize with $\sqrt{d_k}$.

In [57]:
wei = q @ k.transpose(-2, -1)* head_size**-0.5 # (B, T, 16) @ (B, 16, T) -->  (B, T, T)
print(wei.shape)
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1) # nice distribution equal to one

out = wei @ z
out.shape

torch.Size([4, 8, 8])


torch.Size([4, 8, 2])

In fact, each token will emit one more vector :
- `values` : "what I will communicate to the token if it find me interesting"

And the output of all of this will be the matrix product between the weights and the values.

In [58]:
value = nn.Linear(C, head_size, bias=False)
v = value(z)

out = wei @ v
out.shape

torch.Size([4, 8, 16])

**Understand why the value layer is necessary with an example**

Imagine the sequence “My black cat died yesterday. In his coffin, he looked TARGET”. Here, we're looking for an adjective to describe the cat. But in theory, the weights of “black” and “dead” should be very close, as they're both adjectives describing the cat. The result could therefore be either an adjective close to black, or an adjective close to dead. That's why we add the value layer: here, death brings much more value than black. What we mean is that even though we're looking for an adjective, we want an adjective to correlate with death, so “death” has to have a higher value than “black”.

**CONCLUSION OF SELF-ATTENTION**

To really understand what's going on here
1. In the time block, at each instant t, i.e. at each new token to be predicted, you take its key vector and search for the most aligned (literraly) previous vectors in the block using the dot product with their query vectors. The weight matrix is now the matrix of each dot product.
2. Now in the embedding space, wei @ z represent the shifting to the **weighted** average of the precedent tokens for the token to predict.
3. But we want more freedom to really understand what matters (and not just what aligns) in a sentence. So we add a new layer name value which represent the value of each token in the sequence. And now the output become wei @ value(z)

So to have a high vector output (so to bring the original embedding token vector to), you have to be interesting for the prediction (represente by the weight) AND add a hugh value to the sequence (representing by the value).

In [59]:
class OneHead(nn.Module):
    def __init__(self, dim_emb, head_size):
        super().__init__()
        self.key = nn.Linear(dim_emb, head_size, bias=False)
        self.query = nn.Linear(dim_emb, head_size, bias=False)
        self.value = nn.Linear(dim_emb, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) #is not a paremeter
        
    def forward(self, x):
        B, T, C = x.shape
        
        k = self.key(x) # (B, T, C)
        q = self.query(x) # (B, T, C)
        
        wei = q @ k.transpose(-2, -1)* C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        
        v=self.value(x)

        out = wei @ v

        return out

## Improve the model (part 1)

1. Change the size of the embedding space ! 
2. And get information from position of the tokens ! 
3. 1.+2.
4. Add parralel attention

In [63]:
dim_emb = 32 
head_size = 16

In [68]:
class ShakePT(nn.Module):

    def __init__(self, vocab_size, dim_emb, head_size):
        super().__init__()
        ## 1. change the dimension of the token embedding ####
        self.token_embedding_table = nn.Embedding(vocab_size, dim_emb)
        ## 2. create embedding for the position of the tokens ####
        self.position_embedding_table = nn.Embedding(block_size, dim_emb)
        ##  Add one Attention ##
        self.head = OneHead(dim_emb, head_size=dim_emb) #head_size=dim_emb for the moment cause we do not introduce mutliheading yet (to match the dimensions)
        self.linear = nn.Linear(dim_emb, vocab_size)
        
    def forward(self, idx, targets=None):
        tok_emb = self.token_embedding_table(idx) # (B, T, C)        
        B, T = idx.shape
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))

        x = tok_emb + pos_emb
        x = self.head(x)
        
        logits = self.linear(x)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) 
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_crop = idx[:, -block_size:] #make sure that the idx that are feed into the model has no more than block size coming in (position_embedding_table)
            logits, loss = self(idx_crop)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1) 
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = ShakePT(vocab_size, dim_emb, head_size).to(device)
logits, loss = m(x, y)
print(logits.shape)
print(loss)

context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

torch.Size([128, 65])
tensor(4.1968, device='cuda:0', grad_fn=<NllLossBackward0>)

oeLJ;d;&gTW!Ddqqo!Wz.gh..bLnMSWT
jZNStPbykm?P-JtebYJVonI
;cvqs''Uhc;UoEM
WN-tNkVGY;: BRvlwFa;DqUZS dHHrQra!PQ:?pZrDtZxEO
fJv$pqEYAeSoxJmYaUvygLGtaXTSTs'OMrDB
z-a,cKcr-wKS.NlKo'cqyDclVVX!PaAeZMTc-m;$L,;QyKwKOlXeCNheDSg?OZyULy&fkiySzAmR;zOTq$cz'OcJ?kv?
I'VeSx-'sgWyWPXrGGnEeEELDkcemNZZ-wVF!FazUSp3C'yVKI?mExmd?YJT$.icxlYJ3vhosY-jYcWl?p?'b&,C;n;WtiQ;LfW.!iiWBiZ,xai?K;dPH.DlU
enZecQMO.Ofv!se&.ZLI!Bq'Nc;wG&dSH-;KaamPHxEQKG
oUEezHLj!tf-eszfoGKOPT-T&
.Vs'''3n,'kUUrBdoT$Hz3nmyAcNM!
Sq.bNahTcrZQESMA!nGxOHc
