In [None]:
# Load corporate proxy configuration
import sys
# Put the parent directory first on the import path so `_proxy_config` is looked up there.
sys.path.insert(0, '..')
try:
    # NOTE(review): wildcard import - every public name exported by `_proxy_config` lands in the
    # notebook namespace. Presumably it only applies proxy settings - TODO confirm.
    from _proxy_config import *
except ImportError:
    # Best effort: a missing config file only produces a warning and the notebook continues.
    print("Warning: _proxy_config.py not found. Proxy settings may not be configured.")
except Exception as e:
    # Any other failure while executing the config module is reported but not re-raised.
    print(f"Error loading proxy configuration: {e}")

# Data preparation and sampling

In [None]:
import os
import re

## 1. Tokenization

In [None]:
# Read the whole corpus file into memory as one string.
input_dir = os.path.join("data", "0")
corpus_path = os.path.join(input_dir, "the-verdict.txt")
with open(corpus_path, mode="r", encoding="UTF-8") as corpus_file:
    raw_text = corpus_file.read()

### 1.1. Simple tokenization

We'll create a simple tokenizer, where each word is a token. Other tokenizers might split text in other ways, such as splitting words into multiple segments, or having entire sentences as tokens.

In [None]:
# Split on punctuation, double dashes and whitespace. The capturing group makes
# the delimiters themselves show up in the resulting list.
delimiter_pattern = re.compile(r'([,.:;?_!"()\']|--|\s)')
split_text = delimiter_pattern.split(raw_text)
print(split_text)

['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ', 'Gisburn', ' ', 'rather', ' ', 'a', ' ', 'cheap', ' ', 'genius', '--', 'though', ' ', 'a', ' ', 'good', ' ', 'fellow', ' ', 'enough', '--', 'so', ' ', 'it', ' ', 'was', ' ', 'no', ' ', 'great', ' ', 'surprise', ' ', 'to', ' ', 'me', ' ', 'to', ' ', 'hear', ' ', 'that', ',', '', ' ', 'in', ' ', 'the', ' ', 'height', ' ', 'of', ' ', 'his', ' ', 'glory', ',', '', ' ', 'he', ' ', 'had', ' ', 'dropped', ' ', 'his', ' ', 'painting', ',', '', ' ', 'married', ' ', 'a', ' ', 'rich', ' ', 'widow', ',', '', ' ', 'and', ' ', 'established', ' ', 'himself', ' ', 'in', ' ', 'a', ' ', 'villa', ' ', 'on', ' ', 'the', ' ', 'Riviera', '.', '', ' ', '', '(', 'Though', ' ', 'I', ' ', 'rather', ' ', 'thought', ' ', 'it', ' ', 'would', ' ', 'have', ' ', 'been', ' ', 'Rome', ' ', 'or', ' ', 'Florence', '.', '', ')', '', '\n', '', '\n', '', '"', 'The', ' ', 'height', ' ', 'of', ' ', 'his', ' ', 'glory', '"', '', '--', 'that', ' ', 'was', ' ', '

Depending on the use case, whitespace tokens can be kept. In LLMs for code, for example, these tokens can be more important. We will remove them to save compute.

In [None]:
# Strip each piece once, then keep only the pieces that are not empty afterwards
# (this drops the whitespace delimiters and the empty strings produced by the split).
stripped_items = (item.strip() for item in split_text)
tokens = [item for item in stripped_items if item]
print(tokens)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--', 'deploring', 'his', 'unaccountable', 'abdication', '.', '"', 'Of', 'course', 'it', "'", 's', 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture', "'", 'way', 'up', ';', 'but', 'I', 'don', "'", 't', 'think', 'of', 'that', ',

### 1.2. Converting tokens into token IDs

In [None]:
vocab = sorted(set(tokens))
vocab_size = len(vocab)
print(vocab_size)

1130


We need to create a mapping to represent the tokens as numbers.

In [None]:
encoder = {token:i for i, token in enumerate(vocab)}
print(encoder)

{'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5, '--': 6, '.': 7, ':': 8, ';': 9, '?': 10, 'A': 11, 'Ah': 12, 'Among': 13, 'And': 14, 'Are': 15, 'Arrt': 16, 'As': 17, 'At': 18, 'Be': 19, 'Begin': 20, 'Burlington': 21, 'But': 22, 'By': 23, 'Carlo': 24, 'Chicago': 25, 'Claude': 26, 'Come': 27, 'Croft': 28, 'Destroyed': 29, 'Devonshire': 30, 'Don': 31, 'Dubarry': 32, 'Emperors': 33, 'Florence': 34, 'For': 35, 'Gallery': 36, 'Gideon': 37, 'Gisburn': 38, 'Gisburns': 39, 'Grafton': 40, 'Greek': 41, 'Grindle': 42, 'Grindles': 43, 'HAD': 44, 'Had': 45, 'Hang': 46, 'Has': 47, 'He': 48, 'Her': 49, 'Hermia': 50, 'His': 51, 'How': 52, 'I': 53, 'If': 54, 'In': 55, 'It': 56, 'Jack': 57, 'Jove': 58, 'Just': 59, 'Lord': 60, 'Made': 61, 'Miss': 62, 'Money': 63, 'Monte': 64, 'Moon-dancers': 65, 'Mr': 66, 'Mrs': 67, 'My': 68, 'Never': 69, 'No': 70, 'Now': 71, 'Nutley': 72, 'Of': 73, 'Oh': 74, 'On': 75, 'Once': 76, 'Only': 77, 'Or': 78, 'Perhaps': 79, 'Poor': 80, 'Professional': 81, 'Renaissance': 82, 'Ri

### 1.3. Encoding

In [None]:
sample_tokens = tokens[0:500]
print(sample_tokens)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--', 'deploring', 'his', 'unaccountable', 'abdication', '.', '"', 'Of', 'course', 'it', "'", 's', 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture', "'", 'way', 'up', ';', 'but', 'I', 'don', "'", 't', 'think', 'of', 'that', ',

In [None]:
# Encode only the 500-token sample selected above (`sample_tokens`), not the whole corpus.
encoded_sample = [encoder[word] for word in sample_tokens]
print(encoded_sample)

[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 6, 1002, 115, 500, 435, 392, 6, 908, 585, 1077, 709, 508, 961, 1016, 663, 1016, 535, 987, 5, 568, 988, 538, 722, 549, 496, 5, 533, 514, 370, 549, 748, 5, 661, 115, 841, 1102, 5, 157, 397, 547, 568, 115, 1066, 727, 988, 84, 7, 3, 99, 53, 818, 1003, 585, 1120, 530, 208, 85, 734, 34, 7, 4, 1, 93, 538, 722, 549, 496, 1, 6, 987, 1077, 1089, 988, 1112, 242, 585, 7, 53, 244, 535, 67, 7, 37, 100, 6, 549, 602, 25, 897, 6, 326, 549, 1042, 116, 7, 1, 73, 297, 585, 2, 850, 498, 1016, 866, 988, 1059, 722, 697, 769, 2, 1083, 1051, 9, 239, 53, 359, 2, 970, 998, 722, 987, 5, 66, 7, 83, 6, 988, 646, 1016, 16, 584, 145, 53, 998, 722, 7, 1, 93, 1116, 5, 727, 67, 7, 100, 2, 850, 633, 5, 693, 586, 114, 847, 114, 177, 1002, 994, 1088, 827, 568, 156, 389, 1069, 722, 677, 7, 14, 585, 1077, 711, 731, 988, 67, 7, 101, 1097, 688, 7, 45, 711, 988, 410, 50, 28, 5, 180, 988, 602, 40, 36, 882, 5, 929, 663, 209, 38, 2, 850, 1, 65, 1, 1016, 856, 5, 1108, 976, 568, 539, 4

### 1.4. Decoding

Creating a reverse mapping to get from numbers back to tokens:

In [None]:
# Invert the encoder mapping: token id -> token text.
decoder = {token_id: token for token, token_id in encoder.items()}
print(decoder)

{0: '!', 1: '"', 2: "'", 3: '(', 4: ')', 5: ',', 6: '--', 7: '.', 8: ':', 9: ';', 10: '?', 11: 'A', 12: 'Ah', 13: 'Among', 14: 'And', 15: 'Are', 16: 'Arrt', 17: 'As', 18: 'At', 19: 'Be', 20: 'Begin', 21: 'Burlington', 22: 'But', 23: 'By', 24: 'Carlo', 25: 'Chicago', 26: 'Claude', 27: 'Come', 28: 'Croft', 29: 'Destroyed', 30: 'Devonshire', 31: 'Don', 32: 'Dubarry', 33: 'Emperors', 34: 'Florence', 35: 'For', 36: 'Gallery', 37: 'Gideon', 38: 'Gisburn', 39: 'Gisburns', 40: 'Grafton', 41: 'Greek', 42: 'Grindle', 43: 'Grindles', 44: 'HAD', 45: 'Had', 46: 'Hang', 47: 'Has', 48: 'He', 49: 'Her', 50: 'Hermia', 51: 'His', 52: 'How', 53: 'I', 54: 'If', 55: 'In', 56: 'It', 57: 'Jack', 58: 'Jove', 59: 'Just', 60: 'Lord', 61: 'Made', 62: 'Miss', 63: 'Money', 64: 'Monte', 65: 'Moon-dancers', 66: 'Mr', 67: 'Mrs', 68: 'My', 69: 'Never', 70: 'No', 71: 'Now', 72: 'Nutley', 73: 'Of', 74: 'Oh', 75: 'On', 76: 'Once', 77: 'Only', 78: 'Or', 79: 'Perhaps', 80: 'Poor', 81: 'Professional', 82: 'Renaissance', 83:

In [None]:
# Look every id up in the reverse mapping to recover the token text.
decoded_sample = [decoder[token_id] for token_id in encoded_sample]
print(decoded_sample)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--', 'deploring', 'his', 'unaccountable', 'abdication', '.', '"', 'Of', 'course', 'it', "'", 's', 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture', "'", 'way', 'up', ';', 'but', 'I', 'don', "'", 't', 'think', 'of', 'that', ',

### 1.5. Wrapping up

Implementing a simple class to perform the steps above.
We are also adapting for the case when we try to encode a word that has not been seen during the creation of the mappings, by adding a "<|unk|>" token that will replace unknown tokens.

"<|endoftext|>" is a special token created so that we can process multiple texts at once. We concatenate the texts we want to process with this special token between them, so that the model knows that what comes next might have no relation to the text it was seeing before.

(Strictly speaking, tokenizing refers only to splitting the text into its segments, but we will call the entire class a "tokenizer".)

In [None]:
# moved the code to a module so I can reuse in other notebooks
import inspect

from build_llm_module.SimpleTokenizer import SimpleTokenizer

# Show the class source with plain Python instead of the IPython-only `??` syntax,
# so this cell also runs when the notebook is exported to a regular script.
print(inspect.getsource(SimpleTokenizer))

In [None]:
simple_tokenizer = SimpleTokenizer(raw_text)
unk_token = simple_tokenizer.encode("Say hello to academic speculations!") # some words are not present in the original text
print(unk_token)

NameError: name 're' is not defined

In [None]:
print(simple_tokenizer.decode(unk_token))

<|unk|> <|unk|> to academic speculations!


### 1.6. Byte Pair Encoding

In real world scenarios, we'd use packages such as tiktoken to perform more complex tokenizations, such as byte pair encoding (BPE), which is the tokenizer used to train models such as GPT-2 and GPT-3.

In [None]:
import tiktoken
tiktoken.list_encoding_names()

['gpt2',
 'r50k_base',
 'p50k_base',
 'p50k_edit',
 'cl100k_base',
 'o200k_base',
 'o200k_harmony']

In [None]:
bpe_tokenizer = tiktoken.get_encoding("gpt2")

Since BPE can break text down to the single-character level, it can handle unknown words in both directions (encoding and decoding), unlike our SimpleTokenizer:

In [None]:
text_w_unk_word = "Here is an unknown word: Alvazska"

encoded_unk_word_simple = simple_tokenizer.encode(text_w_unk_word)
encoded_unk_word_bpe = bpe_tokenizer.encode(text_w_unk_word)
print(f"Original text: {text_w_unk_word}")
print(f"Simple encoding: {encoded_unk_word_simple}")
print(f"Simple decoding: {simple_tokenizer.decode(encoded_unk_word_simple)}")
print(f"BPE encoding: {encoded_unk_word_bpe}")
print(f"BPE decoding: {bpe_tokenizer.decode(encoded_unk_word_bpe)}")

Original text: Here is an unknown word: Alvazska
Simple encoding: [1134, 2, 587, 2, 159, 2, 1134, 2, 1119, 11, 0, 2, 1134]
Simple decoding: <|unk|> is an <|unk|> word: <|unk|>
BPE encoding: [4342, 318, 281, 6439, 1573, 25, 33414, 1031, 82, 4914]
BPE decoding: Here is an unknown word: Alvazska


**Why does the BPE tokenization result in fewer tokens than the SimpleTokenizer we've implemented?**

In [None]:
simple_count = len(simple_tokenizer.encode(raw_text))
bpe_count = len(bpe_tokenizer.encode(raw_text))
print(f"Simple: {simple_count}, BPE: {bpe_count}")

Simple: 9235, BPE: 5145


In [None]:
# Compare the first 10 tokens produced by each tokenizer (position -> token text).
first_10_tokens_simple = dict(enumerate(simple_tokenizer.tokenize(raw_text)[:10]))
print(first_10_tokens_simple)

# Decode each BPE id on its own so the text of every individual token is visible.
first_10_tokens_bpe = {i: bpe_tokenizer.decode([token]) for i, token in enumerate(bpe_tokenizer.encode(raw_text)[:10])}
print(first_10_tokens_bpe)

{0: 'I', 1: ' ', 2: 'HAD', 3: ' ', 4: 'always', 5: ' ', 6: 'thought', 7: ' ', 8: 'Jack'}
{0: 'I', 1: ' H', 2: 'AD', 3: ' always', 4: ' thought', 5: ' Jack', 6: ' G', 7: 'is', 8: 'burn'}


Our tokenizer splits on whitespace (`\s`) and keeps every whitespace character as a separate token, whereas BPE doesn't explicitly split on whitespace - it encodes it as part of the neighbouring tokens.

## 2. Data sampling with a sliding window

We need to create input/target pairs for training the LLM. Since LLMs predict the next token in a sequence, we use a sliding window to create these pairs. We can visualize what this means below:

In [None]:
context_length = 10 # how many tokens are in the input
bpe_tokens = bpe_tokenizer.encode(raw_text)[:15] # just a sample will do
print(f"Corpus:{bpe_tokens}")
corpus_length = len(bpe_tokens)
for i in range(1, corpus_length):
    # The context is the (at most) `context_length` tokens that precede position i.
    context = bpe_tokens[max(0, i - context_length):i]
    next_token = bpe_tokens[i]
    print(context, "--->", next_token)

Corpus:[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016]
[40] ---> 367
[40, 367] ---> 2885
[40, 367, 2885] ---> 1464
[40, 367, 2885, 1464] ---> 1807
[40, 367, 2885, 1464, 1807] ---> 3619
[40, 367, 2885, 1464, 1807, 3619] ---> 402
[40, 367, 2885, 1464, 1807, 3619, 402] ---> 271
[40, 367, 2885, 1464, 1807, 3619, 402, 271] ---> 10899
[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899] ---> 2138
[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138] ---> 257
[367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257] ---> 7026
[2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026] ---> 15632
[1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632] ---> 438
[1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438] ---> 2016


Visualizing in string format:

In [None]:
context_length = 10 # how many tokens are in the input
bpe_tokens = bpe_tokenizer.encode(raw_text)[:15] # just a sample will do
print(f"Corpus:{bpe_tokenizer.decode(bpe_tokens)}")
corpus_length = len(bpe_tokens)
for i in range(1, corpus_length):
    # The context is the (at most) `context_length` tokens that precede position i.
    context = bpe_tokens[max(0, i - context_length):i]
    next_token = bpe_tokens[i]
    # Decode back to text so the window is human-readable.
    print(bpe_tokenizer.decode(context), "--->", bpe_tokenizer.decode([next_token]))

Corpus:I HAD always thought Jack Gisburn rather a cheap genius--though
I --->  H
I H ---> AD
I HAD --->  always
I HAD always --->  thought
I HAD always thought --->  Jack
I HAD always thought Jack --->  G
I HAD always thought Jack G ---> is
I HAD always thought Jack Gis ---> burn
I HAD always thought Jack Gisburn --->  rather
I HAD always thought Jack Gisburn rather --->  a
 HAD always thought Jack Gisburn rather a --->  cheap
AD always thought Jack Gisburn rather a cheap --->  genius
 always thought Jack Gisburn rather a cheap genius ---> --
 thought Jack Gisburn rather a cheap genius-- ---> though


### 2.1. Preparing the data using pytorch

Preparing the Dataset:

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:

class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    Every target window is the matching input window shifted one token to the
    right, so position k of the target holds the token that follows position k
    of the input.
    """

    def __init__(self, corpus, encoder, max_length, stride):
        """Tokenize ``corpus`` and store every full window pair as tensors.

        Args:
            corpus: Text to tokenize.
            encoder: Tokenizer object; only its ``encode(text)`` method is used here.
            max_length: Number of tokens in each input (and target) window.
            stride: Step, in tokens, between the starts of consecutive windows.
        """
        self.input_ids = []
        self.target_ids = []

        token_ids = encoder.encode(corpus)

        for _, input_chunk, target_chunk in self._iter_windows(token_ids, max_length, stride):
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of stored window pairs."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensors stored at ``index``."""
        return self.input_ids[index], self.target_ids[index]

    @staticmethod
    def _iter_windows(token_ids, max_length, stride):
        """Yield ``(start, input_chunk, target_chunk)`` for every full window."""
        # Stopping at len - max_length guarantees the shifted target window is complete.
        for start in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[start:start + max_length]
            target_chunk = token_ids[start + 1:start + max_length + 1]
            yield start, input_chunk, target_chunk

    @staticmethod
    def visualize(corpus, encoder, max_length, stride, max_pairs=20):
        """Print up to ``max_pairs`` decoded input/target window pairs.

        Args:
            corpus: Text to tokenize.
            encoder: Tokenizer object providing ``encode(text)`` and ``decode(ids)``.
            max_length: Number of tokens in each window.
            stride: Step, in tokens, between the starts of consecutive windows.
            max_pairs: Maximum number of pairs to print; ``None`` prints all of them.
        """
        token_ids = encoder.encode(corpus)
        windows = GPTDataset._iter_windows(token_ids, max_length, stride)
        # Count printed pairs instead of comparing the token offset ``i`` with
        # ``max_pairs``: the offset advances by ``stride``, so it only matches the
        # pair count when stride == 1.
        for pair_count, (i, input_chunk, target_chunk) in enumerate(windows):
            if max_pairs is not None and pair_count >= max_pairs:
                break
            print(f"i={i}", encoder.decode(input_chunk), "---->", encoder.decode(target_chunk))

Visualizing the prepared data using the simple tokenizer:

In [None]:
GPTDataset.visualize(raw_text, simple_tokenizer, 10, 1, max_pairs=5)

i=0 I HAD always thought Jack  ---->  HAD always thought Jack Gisburn
i=1  HAD always thought Jack Gisburn ----> HAD always thought Jack Gisburn 
i=2 HAD always thought Jack Gisburn  ---->  always thought Jack Gisburn rather
i=3  always thought Jack Gisburn rather ----> always thought Jack Gisburn rather 
i=4 always thought Jack Gisburn rather  ---->  thought Jack Gisburn rather a


**We were just saying that, given an input, we want to predict one token at a time. So why are our target tensors as long as our inputs?**

This is for efficiency in training specifically. Even though inference is done one token at a time, training can be accelerated by calculating multiple positions at the same time. This is achieved by using a combination of *teacher forcing* and *causal masking* techniques. By doing so, the model can learn the 'next-token task' at every position in the sequence simultaneously.

Preparing the DataLoader:

In [None]:
def create_dataloader(corpus, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0, tokenizer=None):
    """Build a ``DataLoader`` over sliding-window (input, target) pairs of ``corpus``.

    Args:
        corpus: Text to tokenize and window.
        batch_size: Number of window pairs per batch.
        max_length: Number of tokens in each window.
        stride: Step, in tokens, between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``; drops a final, shorter batch.
        num_workers: Forwarded to ``DataLoader``.
        tokenizer: Object with an ``encode(text)`` method. Defaults to the
            tiktoken "gpt2" encoding when omitted.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDataset`` built from ``corpus``.
    """
    if tokenizer is None:
        tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDataset(corpus, tokenizer, max_length, stride)
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=shuffle,
                            num_workers=num_workers,
                            drop_last=drop_last) # prevents loss spike in case the last batch is shorter than the specified batch size
    return dataloader

In [None]:
dataloader = create_dataloader(
    raw_text, batch_size=2, max_length=3, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
display(first_batch)

[tensor([[  40,  367, 2885],
         [ 367, 2885, 1464]]),
 tensor([[ 367, 2885, 1464],
         [2885, 1464, 1807]])]

In [None]:
second_batch = next(data_iter)
display(second_batch)

[tensor([[2885, 1464, 1807],
         [1464, 1807, 3619]]),
 tensor([[1464, 1807, 3619],
         [1807, 3619,  402]])]

## 3. Embeddings

After tokenization and converting to token IDs, each ID must be converted to a vector form (embedding). This is needed for backpropagation.

### 3.1. Token embeddings

Imagine a vocabulary of only 11 tokens (9 from the text plus the 2 special tokens):

In [None]:
corpus_embedding = "We are so screwed when AI gets pissed"
simple_tokenizer_embedding = SimpleTokenizer(corpus_embedding)
display(simple_tokenizer_embedding.encoder)
vocab_size = len(simple_tokenizer_embedding.vocab)
print(f"Vocab size:{vocab_size}")

{' ': 0,
 'AI': 1,
 'We': 2,
 'are': 3,
 'gets': 4,
 'pissed': 5,
 'screwed': 6,
 'so': 7,
 'when': 8,
 '<|endoftext|>': 9,
 '<|unk|>': 10}

Vocab size:11


In [None]:
input_ids = torch.tensor(simple_tokenizer_embedding.encode(corpus_embedding))
display(input_ids)
output_dimensions = 4

torch.manual_seed(69)
embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size,
                                     embedding_dim=output_dimensions)
display(embedding_layer.weight)

tensor([2, 0, 3, 0, 7, 0, 6, 0, 8, 0, 1, 0, 4, 0, 5])

Parameter containing:
tensor([[-0.5300, -1.3035,  0.4438,  1.2221],
        [ 1.0395,  0.9608,  0.4214,  0.7452],
        [-1.8389, -1.2497, -0.2485,  0.1428],
        [-1.0509,  0.3527, -0.0916,  0.0341],
        [-0.8986,  0.1022, -0.6627, -0.1350],
        [-0.3983, -1.7892,  1.2785,  1.3351],
        [-0.3066,  1.0382,  1.2762,  0.0419],
        [-0.9606, -0.9892,  0.0638,  0.5580],
        [ 1.1983, -0.8180, -0.7297,  0.8256],
        [-1.9490,  0.9272,  1.3656, -1.0653],
        [-1.2100, -0.9481,  0.6444,  0.2188]], requires_grad=True)

In [None]:
print(embedding_layer(torch.tensor([0])))
print(embedding_layer(torch.tensor([1])))

tensor([[-0.5300, -1.3035,  0.4438,  1.2221]], grad_fn=<EmbeddingBackward0>)
tensor([[1.0395, 0.9608, 0.4214, 0.7452]], grad_fn=<EmbeddingBackward0>)


Note that no tokens have been fed into the embedding layer in any form. We used the size of the vocabulary as a specification, and an arbitrary number of dimensions. The layer has been initialized with random weights, because the optimization of the weights to better represent the different dimensions of the tokens is part of the training process, and not something that is done beforehand.



Note that, for now, each token is represented by an embedding that carries no information about its position in the input sequence. If a token happens to show up more than once in a sequence, the embeddings will be exactly the same:

In [None]:
token_ids_32 = torch.tensor(bpe_tokenizer.encode("abcabcabc"))
print(token_ids_32)
embedding_layer_32 = torch.nn.Embedding(num_embeddings=bpe_tokenizer.n_vocab, embedding_dim=4)
embeddings_32 = embedding_layer_32(token_ids_32)
print(embeddings_32)


tensor([39305, 39305, 39305])
tensor([[-0.5662,  0.3919, -1.0117, -0.8891],
        [-0.5662,  0.3919, -1.0117, -0.8891],
        [-0.5662,  0.3919, -1.0117, -0.8891]], grad_fn=<EmbeddingBackward0>)


### 3.2. Positional embeddings

As seen in the previous section, token embeddings alone don't give the model any sense of order. There are two main ways of fixing this - absolute and relative positional embeddings.


The original transformer architecture uses the absolute option, with predefined positional weights that would be added to the token embedding. GPT models also use absolute positional embedding, with the difference that in their case the weights are also trained rather than being fixed.

#### Single input example 
Restarting a token embedding layer:

In [None]:
n_dim = 4
print(f"Vocab size:{bpe_tokenizer.n_vocab}, n_dim: {n_dim}")
token_embedding_layer = torch.nn.Embedding(num_embeddings=bpe_tokenizer.n_vocab, embedding_dim=n_dim)

context_length = 10

input_tokens = torch.tensor([2] * context_length) # 2 is an arbitrary token id that is repeated in the input

token_embeddings = token_embedding_layer(input_tokens)
print(token_embeddings)
print(token_embeddings.shape)

Vocab size:50257, n_dim: 4
tensor([[ 0.9264, -1.3761, -0.2226, -0.4921],
        [ 0.9264, -1.3761, -0.2226, -0.4921],
        [ 0.9264, -1.3761, -0.2226, -0.4921],
        [ 0.9264, -1.3761, -0.2226, -0.4921],
        [ 0.9264, -1.3761, -0.2226, -0.4921],
        [ 0.9264, -1.3761, -0.2226, -0.4921],
        [ 0.9264, -1.3761, -0.2226, -0.4921],
        [ 0.9264, -1.3761, -0.2226, -0.4921],
        [ 0.9264, -1.3761, -0.2226, -0.4921],
        [ 0.9264, -1.3761, -0.2226, -0.4921]], grad_fn=<EmbeddingBackward0>)
torch.Size([10, 4])


The absolute positional embedding layer should have the same embedding dimension as the token embedding layer:

In [None]:
pos_embedding_layer = torch.nn.Embedding(num_embeddings=context_length, embedding_dim=n_dim)

The input to the positional embeddings is usually a placeholder vector with the sequence of position indices 0, 1, 2, ... up to the maximum context length minus one.

In [None]:
pos_embeddings_input = torch.arange(context_length)
print(pos_embeddings_input)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


In [None]:
pos_embeddings = pos_embedding_layer(pos_embeddings_input)
print(pos_embeddings)
print(pos_embeddings.shape)

tensor([[-0.3024, -0.3962, -0.0793, -0.0721],
        [-1.8914, -1.7446,  0.4802,  0.7900],
        [ 0.8484,  1.1454,  0.3422,  1.0031],
        [-1.7736,  0.6015,  0.1835, -1.8762],
        [-0.8080, -0.5584, -0.5299,  1.0088],
        [ 0.3099,  0.0332,  0.7240,  0.5076],
        [-0.1050, -0.0215,  0.8462, -1.6069],
        [-0.5625, -1.5156,  1.3398,  0.7549],
        [ 0.0594, -0.6839, -1.0683, -1.0164],
        [ 0.0237,  1.5124, -0.9171, -2.5131]], grad_fn=<EmbeddingBackward0>)
torch.Size([10, 4])


In [None]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings)

tensor([[ 0.6240, -1.7723, -0.3019, -0.5642],
        [-0.9650, -3.1207,  0.2575,  0.2980],
        [ 1.7748, -0.2308,  0.1196,  0.5110],
        [-0.8472, -0.7746, -0.0392, -2.3682],
        [ 0.1185, -1.9345, -0.7525,  0.5167],
        [ 1.2363, -1.3429,  0.5014,  0.0156],
        [ 0.8215, -1.3976,  0.6236, -2.0990],
        [ 0.3639, -2.8917,  1.1172,  0.2628],
        [ 0.9858, -2.0601, -1.2909, -1.5085],
        [ 0.9501,  0.1363, -1.1397, -3.0051]], grad_fn=<AddBackward0>)


Now the input to the transformer no longer depends only on the token itself, so the repeated tokens in the input all result in different input embeddings to the network.

- **Token embedding layer**: the `num_embeddings` parameter is the **vocabulary** length.
- **Positional embedding layer**: the `num_embeddings` parameter is the **context** length.

#### Dataloader example

In [None]:
n_dim = 256
print(f"Vocab size:{bpe_tokenizer.n_vocab}, n_dim: {n_dim}")
token_embedding_layer = torch.nn.Embedding(num_embeddings=bpe_tokenizer.n_vocab, embedding_dim=n_dim)

context_length = 100
pos_embedding_layer = torch.nn.Embedding(context_length, n_dim)

Vocab size:50257, n_dim: 256


In [None]:
# Use windows exactly as long as the positional embedding table (context_length rows).
max_length = context_length
# stride == max_length, so consecutive input windows do not overlap.
dataloader = create_dataloader(corpus=raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)

data_iter = iter(dataloader)

# NOTE(review): `input` shadows the Python builtin of the same name. A later cell reads this
# variable, so any rename has to be applied there as well.
input, target = next(data_iter)
display(input, target)

tensor([[   40,   367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138,
           257,  7026, 15632,   438,  2016,   257,   922,  5891,  1576,   438,
           568,   340,   373,   645,  1049,  5975,   284,   502,   284,  3285,
           326,    11,   287,   262,  6001,   286,   465, 13476,    11,   339,
           550,  5710,   465, 12036,    11,  6405,   257,  5527, 27075,    11,
           290,  4920,  2241,   287,   257,  4489,    64,   319,   262, 34686,
         41976,    13,   357, 10915,   314,  2138,  1807,   340,   561,   423,
           587, 10598,   393, 28537,  2014,   198,   198,     1,   464,  6001,
           286,   465, 13476,     1,   438,  5562,   373,   644,   262,  1466,
          1444,   340,    13,   314,   460,  3285,  9074,    13, 46606,   536],
        [ 5469,   438, 14363,   938,  4842,  1650,   353,   438,  2934,   489,
          3255,   465, 48422,   540,   450,    67,  3299,    13,   366,  5189,
          1781,   340,   338,  1016,   284,  3758, 

tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138,   257,
          7026, 15632,   438,  2016,   257,   922,  5891,  1576,   438,   568,
           340,   373,   645,  1049,  5975,   284,   502,   284,  3285,   326,
            11,   287,   262,  6001,   286,   465, 13476,    11,   339,   550,
          5710,   465, 12036,    11,  6405,   257,  5527, 27075,    11,   290,
          4920,  2241,   287,   257,  4489,    64,   319,   262, 34686, 41976,
            13,   357, 10915,   314,  2138,  1807,   340,   561,   423,   587,
         10598,   393, 28537,  2014,   198,   198,     1,   464,  6001,   286,
           465, 13476,     1,   438,  5562,   373,   644,   262,  1466,  1444,
           340,    13,   314,   460,  3285,  9074,    13, 46606,   536,  5469],
        [  438, 14363,   938,  4842,  1650,   353,   438,  2934,   489,  3255,
           465, 48422,   540,   450,    67,  3299,    13,   366,  5189,  1781,
           340,   338,  1016,   284,  3758,   262, 

In [None]:
token_embeddings = token_embedding_layer(input)
print(token_embeddings, token_embeddings.shape)

pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings, pos_embeddings.shape)

input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings, input_embeddings.shape)

tensor([[[ 1.7500,  0.0949,  0.7555,  ..., -1.7168, -0.2222,  1.3189],
         [-0.3400, -0.2767, -1.2756,  ...,  0.5418,  0.1476, -0.0868],
         [-2.0084, -0.3826, -1.6926,  ...,  3.5156, -0.4859, -0.0400],
         ...,
         [ 0.2419,  0.3293,  1.7993,  ..., -0.2051,  0.8253,  1.5997],
         [ 0.9210, -0.8213,  0.1114,  ..., -1.9186,  1.6374,  1.7808],
         [-0.1235,  0.2241, -0.4767,  ..., -1.2801, -0.0417, -0.5137]],

        [[-1.0381,  1.1964,  1.1508,  ..., -0.2353,  0.3197, -1.6367],
         [ 0.1017,  1.2497, -0.3048,  ...,  0.4308,  0.3939,  0.3675],
         [-0.3300, -0.8894, -1.3587,  ..., -1.0807, -0.4354, -0.5380],
         ...,
         [-1.7406,  0.1560,  0.5604,  ..., -1.3136,  0.7180,  1.6680],
         [-0.8488, -0.2264,  0.9796,  ..., -0.2094,  0.3967,  0.4172],
         [ 1.3863, -1.1203, -0.6162,  ...,  1.0335, -0.5383, -0.3793]],

        [[-0.3654,  0.2450, -0.5387,  ...,  0.2635, -0.8272,  0.4696],
         [ 0.2419,  0.3293,  1.7993,  ..., -0