In [6]:
with open('the-verdict.txt','r', encoding='utf-8') as f:
    raw_text = f.read()

print("Lenght of the raw text file is :", len(raw_text))

Lenght of the raw text file is : 20479


In [8]:
#!pip install tiktoken

In [9]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')
enc_text = tokenizer.encode(raw_text)
print("The lenght of the encoded token is :", len(enc_text))

The lenght of the encoded token is : 5145


In [10]:
context_size = 6
x = enc_text[:context_size]
y = enc_text[1:context_size+1]
print("X :", x)
print("Y :", y)

X : [40, 367, 2885, 1464, 1807, 3619]
Y : [367, 2885, 1464, 1807, 3619, 402]


In [11]:
for i in range(1, context_size):
    input = enc_text[:i]
    target = enc_text[i]

    print(input, "--->", target)

[40] ---> 367
[40, 367] ---> 2885
[40, 367, 2885] ---> 1464
[40, 367, 2885, 1464] ---> 1807
[40, 367, 2885, 1464, 1807] ---> 3619


In [12]:
enc_sample = enc_text[:20]
for i in range(1, context_size):
    input = enc_sample[:i]
    target = enc_sample[i]

    print(tokenizer.decode(input), "--->", tokenizer.decode([target]))

I --->  H
I H ---> AD
I HAD --->  always
I HAD always --->  thought
I HAD always thought --->  Jack


In [13]:
from torch.utils.data import Dataset, DataLoader
import torch
print(torch.__version__)

class GPTDataset(Dataset):
    def __init__(self, txt, tokenizer, max_len, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special= {"<|endoftext|>"})

        for i in range(0, len(token_ids)-max_len, stride):
            input_chunk = token_ids[i:i + max_len]
            target_chunk = [token_ids[i+1:i + max_len + 1]]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]

2.5.1+cu121


In [14]:
def create_dataloader(txt, shuffle=True, max_lenght=256, stride=1, batch_size=4, drop_last=True , num_workers=0):
    dataset = GPTDataset(txt, tokenizer, max_lenght, stride)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
    return dataloader

In [18]:
dataloader = create_dataloader(raw_text, max_lenght=4, stride=1, batch_size=8, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[   40,   367,  2885,  1464],
        [  367,  2885,  1464,  1807],
        [ 2885,  1464,  1807,  3619],
        [ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257]]), tensor([[[  367,  2885,  1464,  1807]],

        [[ 2885,  1464,  1807,  3619]],

        [[ 1464,  1807,  3619,   402]],

        [[ 1807,  3619,   402,   271]],

        [[ 3619,   402,   271, 10899]],

        [[  402,   271, 10899,  2138]],

        [[  271, 10899,  2138,   257]],

        [[10899,  2138,   257,  7026]]])]


In [19]:
vocab_size = 50257
vector_dim = 256

In [2]:
import torch
embedding = torch.nn.Embedding(vocab_size, vector_dim)

In [20]:
print(embedding)

Embedding(50257, 256)


In [21]:
data_iter = iter(dataloader)
input, target = next(data_iter)

In [22]:
print("Token IDs :", input)
print("Input shape :", input.shape)

Token IDs : tensor([[   40,   367,  2885,  1464],
        [  367,  2885,  1464,  1807],
        [ 2885,  1464,  1807,  3619],
        [ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257]])
Input shape : torch.Size([8, 4])


In [23]:
token_embedding = embedding(input)

In [24]:
print(token_embedding)
print("Token embedding shape :", token_embedding.shape)

tensor([[[ 0.2949, -0.6371,  1.1273,  ..., -0.5957,  1.0866,  1.2326],
         [-1.1192, -1.4326,  0.3014,  ..., -0.9827, -0.7535, -0.2734],
         [ 0.3082, -0.2574,  1.4615,  ..., -0.7029,  0.9989,  0.3827],
         [-0.9967, -0.9794,  0.7152,  ...,  0.3941, -0.5527, -0.4929]],

        [[-1.1192, -1.4326,  0.3014,  ..., -0.9827, -0.7535, -0.2734],
         [ 0.3082, -0.2574,  1.4615,  ..., -0.7029,  0.9989,  0.3827],
         [-0.9967, -0.9794,  0.7152,  ...,  0.3941, -0.5527, -0.4929],
         [ 0.8151,  0.8998, -0.9503,  ..., -0.5637,  0.6895, -1.8415]],

        [[ 0.3082, -0.2574,  1.4615,  ..., -0.7029,  0.9989,  0.3827],
         [-0.9967, -0.9794,  0.7152,  ...,  0.3941, -0.5527, -0.4929],
         [ 0.8151,  0.8998, -0.9503,  ..., -0.5637,  0.6895, -1.8415],
         [ 0.9071, -1.5982, -0.5144,  ..., -0.4843, -0.0918,  0.7356]],

        ...,

        [[ 0.9071, -1.5982, -0.5144,  ..., -0.4843, -0.0918,  0.7356],
         [-0.6158, -1.4339, -0.0114,  ..., -0.4675,  0.13

In [25]:
contex_lenght = 4
pos_embedding_layer = torch.nn.Embedding(contex_lenght, vector_dim)
pos_embedding = pos_embedding_layer(torch.arange(contex_lenght))
print(pos_embedding)
print("Position embedding shape :", pos_embedding.shape)

tensor([[-0.3392,  0.1320,  0.6272,  ...,  1.2820, -0.9503, -0.8034],
        [ 0.4378, -0.6656, -0.9534,  ...,  0.5473, -0.1866,  0.6743],
        [ 0.6322,  1.0047,  1.1380,  ..., -0.0900, -0.6761, -0.1072],
        [ 0.0190,  0.0084,  1.8822,  ...,  0.4092,  0.9876,  0.2764]],
       grad_fn=<EmbeddingBackward0>)
Position embedding shape : torch.Size([4, 256])
