In [1]:
from importlib.metadata import version

import tiktoken
import torch

print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

torch version: 2.0.1+cu118
tiktoken version: 0.7.0


In [86]:
with open("AROG-ENG-SUBTITLE.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
print("Total number of character:", len(raw_text))
print(raw_text[:125])

Total number of character: 60455
-The reason for your visit?
-Touristic.

Was your journey comfortable
commander Logar?

Module crashed into Atlantic ocean.




In [73]:
import re

text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [74]:
result = re.split(r'([,.]|\s)', text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [75]:
# Strip whitespace from each item and then filter out any empty strings.
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [76]:
text = "Hello, world. Is this-- a test?"

result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [77]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['-The', 'reason', 'for', 'your', 'visit', '?', '-Touristic', '.', 'Was', 'your', 'journey', 'comfortable', 'commander', 'Logar', '?', 'Module', 'crashed', 'into', 'Atlantic', 'ocean', '.', 'l', 'came', 'with', 'a', 'connected', 'flight', 'from', 'Miami', '.']


In [78]:
print(len(preprocessed))

14734


In [79]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

2276


In [80]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [81]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('%', 1)
("'", 2)
(',', 3)
('-', 4)
('--', 5)
('-17', 6)
('-4', 7)
('-500', 8)
('-A', 9)
('-Ah', 10)
('-All', 11)
('-Amnesty', 12)
('-And', 13)
('-Are', 14)
('-Arif', 15)
('-Arog', 16)
('-Ball', 17)
('-Be', 18)
('-Bring', 19)
('-Bro', 20)
('-But', 21)
('-Calm', 22)
('-Can', 23)
('-Cheers', 24)
('-Come', 25)
('-Cool', 26)
('-Dad', 27)
('-Daddy', 28)
('-Destroy', 29)
('-Did', 30)
('-Dimi', 31)
('-Dimitell', 32)
('-Do', 33)
('-Does', 34)
('-Don', 35)
('-Downstairs', 36)
('-Dry', 37)
('-Ee', 38)
('-Everything', 39)
('-Fake', 40)
('-Fight', 41)
('-Finish', 42)
('-Flirting', 43)
('-Football', 44)
('-For', 45)
('-Frodo', 46)
('-Get', 47)
('-Go', 48)
('-Goal', 49)
('-Good', 50)


In [82]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [94]:
tokenizer = SimpleTokenizerV1(vocab)

text = """ Was your journey comfortable commander Logar? Module crashed into Atlantic ocean."""
ids = tokenizer.encode(text)
print(ids)

[565, 2271, 1345, 872, 876, 383, 183, 403, 905, 1329, 214, 1601, 154]


In [95]:
tokenizer.decode(tokenizer.encode(text))

'Was your journey comfortable commander Logar? Module crashed into Atlantic ocean.'

In [96]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

In [97]:
len(vocab.items())

2278

In [98]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('yourself', 2273)
('zero', 2274)
('zucchini', 2275)
('<|endoftext|>', 2276)
('<|unk|>', 2277)


In [99]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int 
            else "<|unk|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [100]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Was your journey comfortable commander Logar? Çırağan"
text2 = "Module crashed into Atlantic ocean. "

text = " <|endoftext|> ".join((text1, text2))

print(text)

Was your journey comfortable commander Logar? Çırağan <|endoftext|> Module crashed into Atlantic ocean. 


In [101]:
tokenizer.encode(text)

[565,
 2271,
 1345,
 872,
 876,
 383,
 183,
 2277,
 2276,
 403,
 905,
 1329,
 214,
 1601,
 154]

In [102]:
tokenizer.decode(tokenizer.encode(text))

'Was your journey comfortable commander Logar? <|unk|> <|endoftext|> Module crashed into Atlantic ocean.'

In [103]:
import importlib

In [104]:
tokenizer = tiktoken.encoding_for_model("gpt-4o")

In [105]:
text1 = "Was your journey comfortable commander Logar? Çırağan"
text2 = "Module crashed into Atlantic ocean. "

text = " <|endoftext|> ".join((text1, text2))

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[21124, 634, 12647, 10721, 59743, 4309, 277, 30, 24549, 612, 614, 84135, 220, 199999, 25609, 75481, 1511, 34906, 25472, 13, 220]


In [106]:
strings = tokenizer.decode(integers)

print(strings)

Was your journey comfortable commander Logar? Çırağan <|endoftext|> Module crashed into Atlantic ocean. 


In [107]:
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

15150


In [108]:
enc_sample = enc_text[70:]

In [111]:
context_size = 6

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [198, 2512, 591, 121729, 395, 1374]
y:      [2512, 591, 121729, 395, 1374, 1715]


In [112]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)

[198] ----> 2512
[198, 2512] ----> 591
[198, 2512, 591] ----> 121729
[198, 2512, 591, 121729] ----> 395
[198, 2512, 591, 121729, 395] ----> 1374
[198, 2512, 591, 121729, 395, 1374] ----> 1715


In [137]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))


 ----> come

come ---->  from

come from ---->  monkeys

come from monkeys ---->  for

come from monkeys for ---->  real

come from monkeys for real ----> ?




In [115]:
from torch.utils.data import Dataset, DataLoader
import torch
print("PyTorch version:", torch.__version__)


PyTorch version: 2.0.1+cu118


In [116]:

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride=1):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + stride: i + max_length + stride]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [117]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [161]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=2, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  12,  464, 1738,  329]]), tensor([[1738,  329,  534, 3187]])]


In [162]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[1738,  329,  534, 3187]]), tensor([[ 534, 3187,   30,  198]])]


In [163]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=2, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   12,   464,  1738,   329],
        [ 1738,   329,   534,  3187],
        [  534,  3187,    30,   198],
        [   30,   198,    12, 39152],
        [   12, 39152,  2569,    13],
        [ 2569,    13,   198,   198],
        [  198,   198, 16973,   534],
        [16973,   534,  7002,  6792]])

Targets:
 tensor([[ 1738,   329,   534,  3187],
        [  534,  3187,    30,   198],
        [   30,   198,    12, 39152],
        [   12, 39152,  2569,    13],
        [ 2569,    13,   198,   198],
        [  198,   198, 16973,   534],
        [16973,   534,  7002,  6792],
        [ 7002,  6792,   198,  9503]])


In [165]:
input_ids = torch.tensor([2, 3, 5, 1])

In [166]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [167]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [168]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [169]:
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


In [170]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [171]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [172]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   12,   464,  1738,   329],
        [  534,  3187,    30,   198],
        [   12, 39152,  2569,    13],
        [  198,   198, 16973,   534],
        [ 7002,  6792,   198,  9503],
        [ 4066,  5972,   283,    30],
        [  198,   198, 26796, 14997],
        [  656, 10596,  9151,    13]])

Inputs shape:
 torch.Size([8, 4])


In [173]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [174]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [175]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [176]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
