In [18]:
import torch
import tiktoken
from torch.utils.data import Dataset, DataLoader

# GPT-2 encoding from tiktoken; also used further down to decode token IDs
# back into text for printing.
tokenizer = tiktoken.get_encoding("gpt2")

# Read the entire source text into memory as one string.
with open("JohnAdams.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Token IDs for the full text.
# NOTE(review): no code visible in this section reads `enc_text`;
# GPTDatasetV1 encodes `raw_text` itself. Confirm whether a later cell needs it.
enc_text = tokenizer.encode(raw_text)

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensor pairs.

    The text is tokenized once, then cut into windows of ``max_length``
    tokens whose start positions are ``stride`` tokens apart. Each target
    window is its input window shifted one token to the right, so position
    ``k`` of the target holds the token that follows position ``k`` of the
    input.

    Args:
        raw_text: Text handed to ``tokenizer.encode``.
        tokenizer: Object whose ``encode`` method returns a sliceable
            sequence of token IDs.
        max_length: Number of tokens in every input and target window.
        stride: Distance, in tokens, between consecutive window starts.
    """

    def __init__(self, raw_text, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(raw_text)

        # Stop at len - max_length: the target window needs one token beyond
        # the input window, so later starts would yield a short target.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(
    raw_text,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader over sliding-window token chunks of ``raw_text``.

    The text is tokenized with tiktoken's ``"gpt2"`` encoding and wrapped in
    a ``GPTDatasetV1``; the remaining arguments are forwarded unchanged.

    Args:
        raw_text: Text to tokenize and window.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length in tokens, forwarded to ``GPTDatasetV1``.
        stride: Step between window starts, forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The ``DataLoader`` wrapping the windowed dataset.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(raw_text, gpt2_encoding, max_length, stride)
    return DataLoader(
        windowed_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

# Hyperparameter grid: every (batch size, window length, stride) combination
# is tried, and the first batch of each configuration is printed.
batch_sizes = [1, 4, 8]
max_lengths = [4, 16, 64]
strides = [1, 8, 32]

for batch_size in batch_sizes:
    for max_length in max_lengths:
        for stride in strides:
            print(f"Batch size: {batch_size}, Max length: {max_length}, Stride: {stride}")

            # shuffle=False keeps the windows in text order, so the first
            # batch begins with the window at the start of the text.
            dataloader = create_dataloader_v1(
                raw_text,
                batch_size=batch_size,
                max_length=max_length,
                stride=stride,
                shuffle=False,
            )

            # Only the first batch of each configuration is inspected.
            data_iter = iter(dataloader)
            inputs, targets = next(data_iter)
            print(f"Input batch shape: {inputs.shape}, Target batch shape: {targets.shape}")

            # `i` indexes the rows (samples) inside this one batch; the printed
            # "Batch N" label therefore numbers samples, not batches.
            for i, (input_row, target_row) in enumerate(zip(inputs, targets)):
                input_text = tokenizer.decode(input_row.tolist())
                target_text = tokenizer.decode(target_row.tolist())
                print(f"Batch {i+1} - Input text: {input_text}")
                print(f"Batch {i+1} - Target text: {target_text}")

# Toy example: embed four token IDs, then compute one attention context
# vector using the third token (index 2) as the query.
input_ids = torch.tensor([2, 3, 5, 1])
vocab_size = 8
output_dim = 4

# Seed first so the embedding's random initial weights and the random
# projection matrices below are reproducible.
torch.manual_seed(123)
# NOTE: rebinds `inputs` to the embedding layer.
inputs = torch.nn.Embedding(vocab_size, output_dim)
print(inputs.weight)

# Look the token IDs up once and reuse the result for every projection.
embedded = inputs(input_ids)
print(embedded)

d_in = 4
d_out = 4

# Fixed (non-trainable) random projections for queries, keys and values.
# Creation order matters: each torch.rand call advances the seeded RNG.
W_q = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_k = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_v = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

query = embedded[2] @ W_q
keys = embedded @ W_k
values = embedded @ W_v

# One score per token: dot product of the query with every key.
attention_scores = query @ keys.T
# NOTE(review): the scores go into softmax unscaled; scaled dot-product
# attention would divide by sqrt(d_out) first. Confirm this is intentional.
attention_weights = attention_scores.softmax(dim=-1)

# Weighted sum of the value vectors.
context_vector = attention_weights @ values
print("CV:", context_vector)

Batch size: 1, Max length: 4, Stride: 1
Input batch shape: torch.Size([1, 4]), Target batch shape: torch.Size([1, 4])
Batch 1 - Input text: Gentlemen of
Batch 1 - Target text: entlemen of the
Batch size: 1, Max length: 4, Stride: 8
Input batch shape: torch.Size([1, 4]), Target batch shape: torch.Size([1, 4])
Batch 1 - Input text: Gentlemen of
Batch 1 - Target text: entlemen of the
Batch size: 1, Max length: 4, Stride: 32
Input batch shape: torch.Size([1, 4]), Target batch shape: torch.Size([1, 4])
Batch 1 - Input text: Gentlemen of
Batch 1 - Target text: entlemen of the
Batch size: 1, Max length: 16, Stride: 1
Input batch shape: torch.Size([1, 16]), Target batch shape: torch.Size([1, 16])
Batch 1 - Input text: Gentlemen of the Senate and Gentlemen of the House of Representatives:

Batch 1 - Target text: entlemen of the Senate and Gentlemen of the House of Representatives:


Batch size: 1, Max length: 16, Stride: 8
Input batch shape: torch.Size([1, 16]), Target batch shape: torch.Size([