In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken

# GPT-2 byte-pair encoding provided by tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

# Read the entire text file into memory as one string.
with open("the-verdict.txt", encoding="utf-8") as f:
    raw_text = f.read()

# Token IDs for the complete text.
enc_text = tokenizer.encode(raw_text)


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensor pairs.

    The text is tokenized once and cut into windows of ``max_length`` token
    IDs whose start offsets are ``stride`` apart. Each target window is the
    matching input window shifted one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object with an ``encode(txt)`` method returning a
            sequence of token IDs.
        max_length: Number of token IDs in every input/target window.
        stride: Distance between the start offsets of consecutive windows.

    Attributes:
        input_ids: List of input tensors, one per window.
        target_ids: List of target tensors, aligned with ``input_ids``.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        ids = tokenizer.encode(txt)  # tokenize the whole text in one pass

        # Start offsets of the windows. Stopping at len(ids) - max_length
        # guarantees that the shifted target window is always full length.
        starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(ids[start:start + max_length]) for start in starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1]) for start in starts
        ]

    def __len__(self):
        """Return the number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding-window token chunks of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; the remaining arguments are forwarded to ``DataLoader``.

    Args:
        txt: Text to tokenize and split into windows.
        batch_size: Number of windows per batch.
        max_length: Number of token IDs per window.
        stride: Distance between the start offsets of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` yielding ``(input, target)`` batches.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }

    encoder = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, encoder, max_length, stride)
    return DataLoader(windows, **loader_options)

In [2]:
import torch

# Number of rows in the token embedding table; it must exceed the largest
# token ID the tokenizer can emit.
# NOTE(review): 50257 presumably matches the "gpt2" encoding's vocabulary size — confirm.
vocab_size = 50257
# Length of each embedding vector.
output_dim = 256

# Embedding weights are drawn at random on construction. Seed the global
# generator first so that Restart & Run All reproduces the same values.
torch.manual_seed(123)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [7]:
# Windows of four tokens; stride equals the window length, so consecutive
# input windows do not overlap.
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)

# Fetch the first batch of (input, target) token-ID tensors.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
"inputs", inputs, "targets", targets

('inputs',
 tensor([[   40,   367,  2885,  1464],
         [ 1807,  3619,   402,   271],
         [10899,  2138,   257,  7026],
         [15632,   438,  2016,   257],
         [  922,  5891,  1576,   438],
         [  568,   340,   373,   645],
         [ 1049,  5975,   284,   502],
         [  284,  3285,   326,    11]]),
 'targets',
 tensor([[  367,  2885,  1464,  1807],
         [ 3619,   402,   271, 10899],
         [ 2138,   257,  7026, 15632],
         [  438,  2016,   257,   922],
         [ 5891,  1576,   438,   568],
         [  340,   373,   645,  1049],
         [ 5975,   284,   502,   284],
         [ 3285,   326,    11,   287]]))

In [8]:
# Look up an embedding vector for every token ID in the batch. The result
# keeps the shape of `inputs` and appends a dimension of size output_dim.
token_embeddings = token_embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

In [13]:
# The positional table needs one row per position in an input window.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# Look up the embedding of every position index 0 .. context_length - 1.
position_ids = torch.arange(context_length)
pos_embeddings = pos_embedding_layer(position_ids)
pos_embeddings.shape

torch.Size([4, 256])

In [16]:
# Add positional information to the token embeddings. pos_embeddings holds one
# row per position and is broadcast across the batch dimension.
input_embeddings = token_embeddings + pos_embeddings
# Display the resulting shape followed by the tensor itself.
input_embeddings.shape, input_embeddings

(torch.Size([8, 4, 256]),
 tensor([[[-1.8877e+00,  2.9195e-01, -9.2119e-01,  ...,  1.2567e+00,
            2.5795e+00, -6.7325e-01],
          [ 4.8928e-01,  2.4501e+00,  1.3846e+00,  ..., -1.7016e-01,
           -2.4442e+00,  5.4236e-01],
          [ 1.8391e-02, -2.3120e-01, -7.5453e-01,  ..., -1.8812e+00,
            8.9344e-01,  1.4504e+00],
          [ 8.7843e-01,  1.1030e+00,  1.4210e+00,  ...,  2.4513e-01,
           -1.4172e+00, -1.0859e+00]],
 
         [[-7.4917e-01, -2.7147e+00,  6.2152e-01,  ...,  5.6238e-01,
            4.4143e-01,  1.5595e+00],
          [ 1.6096e+00,  2.1191e+00,  2.4365e-01,  ..., -2.5673e+00,
           -1.1974e+00,  1.0552e+00],
          [-4.3857e-01, -2.8041e-01, -3.9837e-01,  ..., -5.4297e-01,
            9.0635e-01,  5.5656e-01],
          [ 1.2936e+00,  1.4021e+00,  1.0828e+00,  ...,  1.6165e+00,
           -1.7687e-01, -1.3249e+00]],
 
         [[-1.0766e+00, -2.2005e+00,  3.3353e-01,  ..., -5.8690e-01,
            9.9560e-01,  1.0741e+00],
     