In [1]:
import tiktoken
from torch.utils.data import Dataset, DataLoader
import torch

In [26]:
# Read the entire training corpus into memory as a single string.
with open('./data/the-verdict.txt', mode='r', encoding='utf-8') as corpus_file:
    raw_text = corpus_file.read()

In [27]:
# Sliding-window settings handed to create_dataloader further down.
# MAX_SEQ_LENGTH: number of token ids in each input/target window.
MAX_SEQ_LENGTH = 4
# STRIDE: how many tokens the window start advances between consecutive samples.
STRIDE = 1

In [28]:
class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is encoded once; a window of ``max_seq_length`` token ids is then
    slid over the encoded ids in steps of ``stride``. Every item is an
    ``(input_ids, target_ids)`` pair of 1-D tensors in which the target is the
    same window advanced by one token.

    Args:
        txt: Text to tokenise.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sliceable sequence of integer token ids.
        max_seq_length: Number of token ids in every input/target window.
        stride: Offset between the starts of consecutive windows.

    Raises:
        ValueError: If ``max_seq_length`` or ``stride`` is not positive.
    """

    def __init__(self, txt, tokenizer, max_seq_length, stride):
        if max_seq_length <= 0 or stride <= 0:
            raise ValueError("max_seq_length and stride must be positive integers")

        # Encode the text that was passed in; reading the notebook-global
        # `raw_text` here would ignore the `txt` argument entirely.
        self.token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        self.input_ids = []
        self.target_ids = []

        # Stop early enough that the target window (shifted by one token)
        # is always full length.
        for i in range(0, len(self.token_ids) - max_seq_length, stride):
            input_chunk = self.token_ids[i: i + max_seq_length]
            target_chunk = self.token_ids[i + 1: i + max_seq_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [29]:
def create_dataloader(text: str, max_seq_length: int,
                      stride: int, batch_size: int,
                      shuffle: bool):
    """Wrap ``text`` in a GPTDatasetV1 and return a DataLoader over it.

    Args:
        text: Text to tokenise with the tiktoken "gpt2" encoding.
        max_seq_length: Window length forwarded to GPTDatasetV1.
        stride: Window step forwarded to GPTDatasetV1.
        batch_size: Number of windows per batch.
        shuffle: Whether the DataLoader shuffles the windows.

    Returns:
        A DataLoader yielding (input_ids, target_ids) batches.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    return DataLoader(
        dataset=GPTDatasetV1(text, gpt2_encoding, max_seq_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [31]:
# Two windows per batch, kept in corpus order so the first batch is predictable.
dataloader = create_dataloader(
    text=raw_text,
    max_seq_length=MAX_SEQ_LENGTH,
    stride=STRIDE,
    batch_size=2,
    shuffle=False,
)

In [32]:
# Pull only the first batch: a batch of input windows and the matching target windows.
inp, target = next(iter(dataloader))

In [36]:
# Shape is (batch_size, max_seq_length), i.e. (2, 4) for the settings used above.
inp.shape

torch.Size([2, 4])

In [35]:
# Each target row is its input window advanced by one token; with a stride of 1
# the second row is also the first row advanced by one token.
target

tensor([[ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619]])

In [37]:
# Toy sizes for the embedding demonstration below.
# vocab_size: presumably the number of distinct token ids (6, matching the rows of the table built next).
vocab_size = 6
# embed_dim: length of each embedding vector (passed as embedding_dim).
embed_dim = 3

In [40]:
# Size the lookup table from vocab_size rather than repeating the literal 6,
# so changing the constant actually changes the table.
# Weights are randomly initialised: values differ between kernel restarts
# unless torch.manual_seed(...) is called beforehand.
embeddding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

In [42]:
# One index for every row of the 6-row embedding table, in ascending order.
sample_tensor = torch.tensor([0, 1, 2, 3, 4, 5])

In [43]:
# Inspect the trainable weight matrix: one row per index, embed_dim columns.
embeddding_layer.weight

Parameter containing:
tensor([[ 0.3030,  0.2982,  1.0058],
        [-1.9074,  1.9568, -0.5934],
        [-0.3699, -0.1898,  1.5559],
        [ 0.6711,  0.1754, -1.9705],
        [ 1.3260,  1.8966, -0.5805],
        [ 0.8884,  0.8905, -0.7011]], requires_grad=True)

In [45]:
# Looking up indices 0..5 in order returns the weight rows in order,
# so the result matches the weight matrix shown above.
embeddding_layer(sample_tensor)

tensor([[ 0.3030,  0.2982,  1.0058],
        [-1.9074,  1.9568, -0.5934],
        [-0.3699, -0.1898,  1.5559],
        [ 0.6711,  0.1754, -1.9705],
        [ 1.3260,  1.8966, -0.5805],
        [ 0.8884,  0.8905, -0.7011]], grad_fn=<EmbeddingBackward0>)