In [62]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print("number of characters: ", len(text))
print(text[:100])

number of characters:  1115393
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [63]:
vocab = sorted(list(set(text)))

In [64]:
class Tokenizer:
    def __init__(self, vocab):
        self.stoi = { ch:i for i,ch in enumerate(vocab) }
        self.itos = { i:ch for i,ch in enumerate(vocab) }

    def encode(self, text):
        ids = [self.stoi[c] for c in text]
        return ids

    def decode(self, ids):
        text = ''.join([self.itos[i] for i in ids])
        return text

tokenizer = Tokenizer(text)
text = """we are building an agi"""
print(tokenizer.encode(text))
print(tokenizer.decode(tokenizer.encode(text)))

[1115386, 1115374, 1115385, 1115387, 1115383, 1115374, 1115385, 1115309, 1115380, 1115389, 1115373, 1115349, 1115389, 1115390, 1115391, 1115385, 1115387, 1115390, 1115385, 1115387, 1115391, 1115389]
we are building an agi


In [65]:
import tiktoken
enc = tiktoken.get_encoding("o200k_base")

In [66]:
text = "we are building an agi"
ids = enc.encode(text)
print(ids)
print(enc.decode(ids))

[854, 553, 6282, 448, 1017, 72]
we are building an agi


In [67]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):

    def __init__(self, text, tokenizer, max_length, stride):

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text)

        for i in range(0, len(token_ids) - max_length, stride):

            input_chunk = token_ids[i: i+max_length]
            target_chunk = token_ids[i+1: i+max_length+1]

            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


In [68]:
data = CustomDataset(text, enc, max_length=4, stride=1)

In [69]:
len(data)

2

In [80]:

dataloader = DataLoader(dataset=data, 
                        batch_size=1,
                        num_workers=0,
                        drop_last=True, # drops last batch if its shorter than specified batch_size
                        shuffle=True)

In [81]:
data_iter = iter(dataloader)
x, y = next(data_iter)
x, y

(tensor([[ 854,  553, 6282,  448]]), tensor([[ 553, 6282,  448, 1017]]))

In [92]:
vocab_size = 50257
output_dim = 256
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
token_embd = embedding_layer(x)

max_length=4
pos_embedding_layer = torch.nn.Embedding(max_length, output_dim)
pos_embd = pos_embedding_layer(torch.arange(max_length)) # 0 1 ... max_length-1

input_embd = token_embd + pos_embd
input_embd.shape

torch.Size([1, 4, 256])