#### text to tokens

In [2]:
# Read the whole short story into memory as one string.
with open('the-verdict.txt', mode='r', encoding='utf-8') as story_file:
    raw_text = story_file.read()

# Sanity check: report the size and preview the first 99 characters.
char_count = len(raw_text)
print(f'total number of chars: {char_count}')
print(raw_text[:99])

total number of chars: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [3]:
# splitting char on whitespace (\s)
import re
text = 'Hello, world. This, is a test.'
# The capturing group makes re keep each whitespace separator in the result.
whitespace_splitter = re.compile(r'(\s)')
result = whitespace_splitter.split(text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [4]:
# we can remove the whitespace also
# (keep an item only if something is left after stripping whitespace;
#  str.strip() is the same emptiness check the later cells use)
result = [item for item in result if item.strip()]
result

['Hello,', 'world.', 'This,', 'is', 'a', 'test.']

In [5]:
# split on commas and periods as well, keeping them as separate tokens
punct_or_space = r'([,.]|\s)'
pieces = re.split(punct_or_space, text)

# drop the pieces that are empty or whitespace-only
result = [piece for piece in pieces if piece.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [6]:
# t1 = 'dark is one of the best show i have ever watched'
# t1.strip()
# print(t1)

In [7]:
# split the full story on punctuation, double dashes and whitespace
token_pattern = re.compile(r'([,.;:?_!"()\']|--|\s)')
split_pieces = token_pattern.split(raw_text)

# keep only the pieces that are not empty / whitespace-only
preprocessed = [piece for piece in split_pieces if piece.strip()]
print(len(preprocessed))

4690


In [8]:
print(preprocessed[:40])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his']


#### tokens to token IDs

In order to convert tokens into token IDs,

we first need a vocabulary that maps each unique token to an integer.

Creating a vocabulary

In [9]:
# vocab: every distinct token, in sorted order

unique_tokens = set(preprocessed)  # a set keeps one copy of each token
all_words = sorted(unique_tokens)
vocab_size = len(all_words)
vocab_size

1130

In [10]:
# Map every token to its position in the sorted token list.
# (The loop variable is deliberately not called `int`, which would shadow the builtin.)
vocab = {token: integer for integer, token in enumerate(all_words)}

# Preview the first 22 (token, id) pairs: indices 0..21.
for i, item in enumerate(vocab.items()):
    print(item)
    if i > 20:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)


#### Simple Tokenizer

Let's wrap all of this into a class with:
- an `encode` method that converts text to token IDs (text -> tokens -> token IDs)

- a `decode` method that converts token IDs back to text

Regex Pattern
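- `\s`: a single whitespace character (used to split in `encode`); `\s+`: one or more whitespace characters (used in `decode`)
- `[,.;:?_!"()\']`: matches any one of `, . ; : ? _ ! " ( ) '`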

In [11]:
# implementing a simple text tokenizer

class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on the double dash ``--`` and on the
    punctuation characters ``, . ; : ? _ ! " ( ) '``. Every resulting token
    is looked up in ``vocab``.

    Args:
        vocab: Mapping from token string to integer token id.
    """

    # The capturing group keeps punctuation and "--" as tokens of their own.
    _SPLIT_PATTERN = re.compile(r'([,.;:?_!"()\']|--|\s)')
    # Whitespace that " ".join() inserted in front of a punctuation token.
    # ";" and ":" are included because encode() splits them off as well.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.;:?!"()\'])')

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse mapping, used by decode().
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: If a token of ``text`` is not in the vocabulary.
        """
        pieces = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
        # Empty pieces come from whitespace separators and adjacent delimiters.
        return [self.str_to_int[token] for token in pieces if token]

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        joined = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', joined)

In [12]:
# let's do it for a example text
tokenizer = SimpleTokenizerV1(vocab)
text = """I HAD always thought Jack Gisburn rather a cheap genius though a
good fellow enough so it was no. It's me"""

ids = tokenizer.encode(text)
print(ids)

[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 1002, 115, 500, 435, 392, 908, 585, 1077, 709, 7, 56, 2, 850, 663]


In [13]:
tokenizer.decode(ids)

"I HAD always thought Jack Gisburn rather a cheap genius though a good fellow enough so it was no. It' s me"

Keep in mind that our vocabulary is limited to the words in the "The Verdict" text file.

We will not be able to tokenize any word outside our training set (here, the short story).

In [14]:
# ex
# txt = "Hello, aashutosh"
# print(tokenizer.encode(txt))   # -> raise a KeyError


#### Special Tokens

We can (or should) extend our vocab with additional special tokens such as
```<startoftext>, <endoftext>, <unk>```, etc.

Note: these special tokens differ from one LLM to another.

In [15]:
# extend the vocab with two special tokens -> <|unk|>, <|endoftext|>

all_tokens = sorted(set(preprocessed))
# Appended after sorting, so the special tokens receive the two highest ids.
all_tokens.extend(['<|unk|>', '<|endoftext|>'])

vocab = {token: integer for integer, token in enumerate(all_tokens)}

print(len(vocab))

1132


1130 + 2 special tokens

In [16]:
list(vocab.items())[-5:]

[('younger', 1127),
 ('your', 1128),
 ('yourself', 1129),
 ('<|unk|>', 1130),
 ('<|endoftext|>', 1131)]

In [17]:
# Tokenizer class that includes special tokens

class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on whitespace, on the double dash ``--`` and on the
    punctuation characters ``, . ; : ? _ ! " ( ) '``. Tokens missing from
    ``vocab`` are replaced by the ``<|unk|>`` token before the id lookup.

    Args:
        vocab: Mapping from token string to integer token id. It has to
            contain ``'<|unk|>'`` for unknown tokens to be encodable.
    """

    # The capturing group keeps punctuation and "--" as tokens of their own.
    _SPLIT_PATTERN = re.compile(r'([,.;:?_!"()\']|--|\s)')
    # Whitespace that " ".join() inserted in front of a punctuation token.
    # ";" and ":" are included because encode() splits them off as well.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.;:?!"()\'])')

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse mapping, used by decode().
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: If a token is unknown and ``'<|unk|>'`` is not in the
                vocabulary either.
        """
        ids = []
        for piece in self._SPLIT_PATTERN.split(text):
            token = piece.strip()
            if not token:
                # Whitespace separator or empty string between delimiters.
                continue
            if token not in self.str_to_int:
                token = '<|unk|>'
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        joined = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', joined)

In [18]:
txt1 = 'Hello On reflection, it really was a tempting problem.'
txt2 = 'To accuse his wife would have been too easy--In India'

# Separate the two unrelated snippets with the end-of-text marker.
txt = f'{txt1} <|endoftext|> {txt2}'
print(txt)

Hello On reflection, it really was a tempting problem. <|endoftext|> To accuse his wife would have been too easy--In India


In [19]:
tokenizer = SimpleTokenizerV2(vocab=vocab)
ids = tokenizer.encode(txt)
print(ids)

[1130, 75, 828, 5, 585, 821, 1077, 115, 981, 796, 7, 1131, 102, 125, 549, 1103, 1120, 530, 208, 1020, 375, 6, 55, 1130]


In [20]:
tokenizer.decode(ids=ids)

'<|unk|> On reflection, it really was a tempting problem. <|endoftext|> To accuse his wife would have been too easy -- In <|unk|>'

additional special tokens
```[BOS], [EOS], [PAD]```

#### BPE

GPT models use a *byte pair encoding* tokenizer

How does BPE handle unknown words?
- it breaks the unknown word down into subwords and individual characters, which can then be tokenized.
- so a single word can be assigned multiple token IDs.
- no `<|unk|>` token is needed to handle unknown words

We will use a library called `tiktoken`.

Why tiktoken?
- efficient (written in Rust)
- optimized for OpenAI models

**BPE in a nutshell**
- start with individual characters
- find the most common pair
- merge it to the vocab
- keep repeating

In [21]:
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.7.0


In [22]:
tokenizer = tiktoken.get_encoding('gpt2')

In [23]:
# NOTE: the two fragments are joined without a separating space, so the
# resulting text contains "terracesof" (visible in the decoded output below).
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    + "of someunknownPlace."
)

In [24]:
integers = tokenizer.encode(text, allowed_special = {'<|endoftext|>'})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [25]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [29]:
# Show how BPE breaks a made-up word into subword pieces.
# The variables are not called `str` / `id`: those names would shadow the
# builtins for every cell executed afterwards in the same kernel.
sample_text = 'Akwirw ier'
ids = tokenizer.encode(sample_text)

for token_id in ids:
    print(f'{tokenizer.decode([token_id])} -> {token_id}')

Ak -> 33901
w -> 86
ir -> 343
w -> 86
  -> 220
ier -> 959


#### Data Sampling using Sliding window -> generating input-target pairs

In [31]:
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [32]:
enc_sample = enc_text[50:]

we will create input-target pairs for the next word prediction task
- x -> input tokens
- y -> targets

In [35]:
context_size = 4  # max number of token in the input
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f'x : {x}')
print(f'y:       {y}')

x : [290, 4920, 2241, 287]
y:       [4920, 2241, 287, 257]


In [36]:
# inputs and targets tokens
print(f'Input Token --> Target Token')
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(f'{context} --> {desired}')

Input Token --> Target Token
[290] --> 4920
[290, 4920] --> 2241
[290, 4920, 2241] --> 287
[290, 4920, 2241, 287] --> 257


In [37]:
print(f'Input text --> Target text')
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(f'{tokenizer.decode(context)} --> {tokenizer.decode([desired])}')

Input text --> Target text
 and -->  established
 and established -->  himself
 and established himself -->  in
 and established himself in -->  a


#### Custom pytorch dataset and dataloader
- `Dataset` -> defines how the individual rows (samples) of the dataset are stored and fetched
- `DataLoader` -> groups the rows into batches and feeds them to the model

In [39]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


# 1. custom dataset
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id chunks.

    The text is tokenized once. Every window of ``max_length`` token ids
    becomes an input row, and the same window shifted one position to the
    right becomes the matching target row.

    Args:
        txt: Text to tokenize.
        tokenizer: Object whose ``encode(txt)`` returns a sliceable sequence
            of token ids.
        max_length: Number of token ids per chunk.
        stride: Step between the start positions of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.inputs_ids = []
        self.target_ids = []

        encoded = tokenizer.encode(txt)

        # Exclusive upper bound for window starts: it leaves room for the
        # target window, which ends one token after the input window.
        stop_before = len(encoded) - max_length
        for start in range(0, stop_before, stride):
            end = start + max_length
            self.inputs_ids.append(torch.tensor(encoded[start:end]))
            self.target_ids.append(torch.tensor(encoded[start + 1:end + 1]))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.inputs_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.inputs_ids[index], self.target_ids[index]


In [40]:
#2. custom dataloader
def create_dataloders_v1(txt,
                         batch_size=4,
                         max_length=256,
                         stride=128,
                         shuffle=True,
                         drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    Args:
        txt: Text to tokenize with the GPT-2 tiktoken encoding.
        batch_size: Number of (input, target) rows per batch.
        max_length: Number of token ids per row.
        stride: Step between the start positions of consecutive windows.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``; drops the last batch if
            it is shorter than ``batch_size``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    gpt2_tokenizer = tiktoken.get_encoding('gpt2')
    windowed_dataset = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    return DataLoader(
        windowed_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [41]:
dataloader = create_dataloders_v1(raw_text,
                                  batch_size=1,
                                  max_length=4,
                                  stride=1,
                                  shuffle=False)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [42]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [46]:
# batch size greater than 1
dataloader = create_dataloders_v1(raw_text,
                                  batch_size=8,
                                  max_length=4,
                                  stride=4,  # stride = max_length (to avoid overlap since more overlap => overfitting)
                                  shuffle= False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print(f'Inputs: \n {inputs}')
print(f'\nTargets: \n {targets}')

Inputs: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets: 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


#### token embeddings
- learned during training and stored as a lookup table (called `embedding matrix`)
- first initialized with random values and then updated during the course of training.

`vocab_size = V`

`output_dim = d`

`embedding matrix (E) shape: (V,d)`

each token's embedding is a row in the embedding matrix, which is learned via backprop

**embedding vector of token at index i in the vocab = E[i]**

in GPT3,
- vocab size, V = 50,257
- hidden size, d = 12,288

GPT3's token embedding matrix has shape `(50257, 12288)`, i.e. 50,257 rows by 12,288 columns

In [47]:
# example
import torch
input_ids = torch.tensor([2,3,5,1])

In [49]:
import torch.nn as nn

vocab_size = 6
output_dim = 3 #d

# initalize a embedding layer
torch.manual_seed(42)
embed_layer = nn.Embedding(vocab_size, output_dim)
embed_layer.weight

Parameter containing:
tensor([[ 1.9269,  1.4873, -0.4974],
        [ 0.4396, -0.7581,  1.0783],
        [ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [-0.2316,  0.0418, -0.2516],
        [ 0.8599, -0.3097, -0.3957]], requires_grad=True)

the above matrix is called embedding matrix.

we are essentially converting a single token id into a `d` dim embedding vector

In [50]:
token_id = torch.tensor([3])
embed_layer(token_id)

tensor([[-0.6866,  0.6105,  1.3347]], grad_fn=<EmbeddingBackward0>)

In [50]:
embed_layer(input_ids)

tensor([[ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [ 0.8599, -0.3097, -0.3957],
        [ 0.4396, -0.7581,  1.0783]], grad_fn=<EmbeddingBackward0>)

Shortcoming of these embeddings:

1. the same token ID always gets mapped to the same vector representation, regardless of the token's position, so the embedding carries no positional info

2. the self-attention mechanism is also **position-agnostic** (it has no built-in sense of word order, therefore we need to add positional info to the LLM)

` input embedding = token embedding + positional embedding`

Types of pos embeddings:
1. Absolute embeddings (associated with absolute position of tokens)
2. Relative embeddings (based on relative position between words)

GPT uses absolute positional embeddings that are learned during the training process.
By the way, the original Transformer model uses fixed positional embeddings (*the sine and cosine formulas from the "Attention Is All You Need" paper*)

In [51]:
vocab_size = 50257
output_dim = 256
# nn.Embedding starts from random weights; seed the generator first so the
# embedding values printed further down are reproducible on a fresh run.
torch.manual_seed(42)
token_embedding_layer = nn.Embedding(vocab_size, output_dim)

embed each token into a 256-dim vector

In [52]:

max_length = 4
dataloader = create_dataloders_v1(raw_text,
                                  batch_size=8,
                                  max_length=max_length,
                                  stride=max_length,
                                  shuffle = False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print(f'Token Ids: \n {inputs}')
print(f'\nInputs shape: \n {inputs.shape}')

Token Ids: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape: 
 torch.Size([8, 4])


In [53]:
# token id tensor into 256- dim vectors

token_embeddings = token_embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

each token id is embedded into a 256 dim vector

In [57]:
context_length = max_length
pos_embedding_layer = nn.Embedding(context_length, output_dim)

pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)
# print(torch.arange(context_length))

torch.Size([4, 256])


In [60]:
# adding pos_embedding with token_embedding
input_embeddings = token_embeddings + pos_embeddings  # pytorch broadcasting
print(input_embeddings.shape)

torch.Size([8, 4, 256])


Now these input embeddings are used as the input to the main LLM layers.

In [62]:
print(input_embeddings[0,0,:])

tensor([ 6.8317e-01,  2.1782e+00,  3.8856e+00,  2.5868e+00,  1.4441e+00,
        -7.2869e-01,  2.4407e+00,  3.5799e-01,  3.2556e+00, -1.2108e-02,
        -1.2161e+00, -1.1384e+00,  2.9287e+00,  2.3976e+00,  1.7161e+00,
         2.6013e+00,  8.5243e-01,  1.6218e+00, -2.6531e-01, -9.6503e-01,
        -4.8168e-01,  7.6193e-02, -9.9844e-01, -9.9940e-01, -9.8121e-01,
         6.1716e-01,  1.3799e+00,  3.5311e-01, -6.7868e-01, -1.1240e-01,
        -5.2592e-01, -5.1606e-01,  2.5461e-01,  7.8465e-01, -1.5667e+00,
        -2.3691e-01, -1.3758e-02, -1.4461e+00,  1.7381e+00,  7.3556e-01,
        -2.1020e+00, -1.0255e+00, -5.4484e-01, -3.5187e-01, -2.4607e-01,
         1.5227e+00, -1.2309e+00, -1.4676e-01, -1.2751e+00,  1.8039e+00,
         3.2581e-01, -2.9651e-02, -2.2289e+00, -8.5137e-01,  9.5349e-01,
        -2.6317e+00,  2.0894e+00, -2.3409e+00, -6.6308e-01, -1.3241e+00,
         1.3093e+00, -1.7003e+00,  1.0733e+00,  1.6118e+00,  1.4364e+00,
        -6.5225e-01,  2.6143e+00,  3.6995e-01,  7.1

**Input Embedding pipeline**

input text -> tokens -> token ids -> token embedding + pos embedding -> input embedding