In [None]:
import os

In [None]:
file_path = "/content/the-verdict.txt"

with open(file_path, "r") as f:
    text_data = f.read()

print(f"total number of characters: {len(text_data)}")

total number of characters: 20479


In [None]:
import re
result = re.split(r'([,.:;?_!"()\']|--|\s)', text_data)
results = [item.strip() for item in result if item.strip()]

In [None]:
all_tokens = sorted(set(result))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token: integer for integer, token in enumerate(all_tokens)}

In [None]:
class SimpleTokenizerV1:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {integer:token for token, integer in vocab.items()}

  def encode(self, text):
    processed = re.split(r'([,.;:?_!"()\']|--|\s)', text)
    processed = [item.strip() for item in processed if item.strip()]

    processed = [
        item if item in self.str_to_int else "<|unk|>"
        for item in processed
    ]
    ids = [self.str_to_int[token] for token in processed]
    return ids

  def decode(self, ids):
    text = " ".join([self.int_to_str[id] for id in ids])
    # Replace spacing before punctuation
    text = re.sub(r'\s+([,.?!"()\'])', 'r\1', text)
    return text

In [None]:
tokenizer = SimpleTokenizerV1(vocab)

In [None]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text =" <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [None]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>r do you like tear <|endoftext|> In the sunlit terraces of the <|unk|>r


In [None]:
'''
[BOS] - beginning of sequence
[EOS] - end of sequence
[PAD] - add padding to varying length sequences for parellel training
'''

In [12]:
!pip install tiktoken



In [13]:
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.11.0


In [14]:
tokenizer = tiktoken.get_encoding("gpt2")

In [55]:
with open("/content/the-verdict.txt", "r") as f:
  raw_text = f.read()

In [21]:
enc_text = tokenizer.encode(raw_text)

In [22]:
print(len(enc_text))

5145


In [25]:
enc_sample = enc_text[50:]
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [28]:
for i in range(1, context_size + 1):
  context = enc_sample[:i]
  desired = enc_sample[i]

  print(f"{tokenizer.decode(context)} ------> {tokenizer.decode([desired])}")

 and ------>  established
 and established ------>  himself
 and established himself ------>  in
 and established himself in ------>  a


In [29]:
!pip install torch



In [30]:
import torch
from torch.utils.data import DataLoader, Dataset

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        # Stride is the number of steps by which the sliding window moves.
        # i.e if stride = max_length there will be no overlap in the input window
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1: i+1+max_length]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]

In [59]:
def create_dataloader_v1(text, batch_size=4, max_length=256, stride=128, shuffle=False,
                         drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(text, tokenizer, max_length, stride)
    data_loader = DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle = shuffle,
        drop_last = drop_last,
        num_workers = num_workers
    )
    return data_loader

In [67]:
max_length = 4
data_loader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length)

In [68]:
data_iter = iter(data_loader)
inputs, targets = next(data_iter)

In [72]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [74]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [76]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))

print(pos_embeddings.shape)

torch.Size([4, 256])


In [77]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
