# Read Raw Text

In [1]:
import os

In [2]:
# Location of the raw corpus, given as a relative path (resolved against the
# current working directory when the file is opened).
RESOURCE_DIR = "Resources"
HARRY_POTTER_SS_FILE = "Harry_Potter_and_Sorcerer's_Stone.txt"
# os.path.join inserts the path separator appropriate for the host OS.
FILE_PATH = os.path.join(RESOURCE_DIR, HARRY_POTTER_SS_FILE)

In [3]:
# Read the whole book into memory as a single string. The file is decoded as
# Windows-1252 (cp1252) rather than the platform default encoding.
with open(FILE_PATH, 'r', encoding='windows-1252') as file:
    raw_text = file.read()

In [4]:
# Sanity check: report the character count and show the start of the text.
print(f"Length of text: {len(raw_text)} characters")
print(f"First 200 characters:\n{raw_text[:200]}")

Length of text: 442745 characters
First 200 characters:
Harry Potter and the Sorcerer's Stone 

CHAPTER ONE 

THE BOY WHO LIVED 

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. Th


# Input-Target Creation Logic

In [5]:
import tiktoken

In [6]:
# Load tiktoken's "gpt2" encoding; this tokenizer object is reused for every
# encode/decode call in the rest of the notebook.
tokenizer = tiktoken.get_encoding("gpt2")

In [7]:
# Tokenize the entire text once; token_ids holds one integer ID per token.
token_ids = tokenizer.encode(raw_text)
print(f"Total number of tokens: {len(token_ids)}")

Total number of tokens: 113794


In [8]:
# Show the first 20 token IDs next to the text piece each ID decodes to.
# Each ID is decoded on its own (wrapped in a one-element list) so the
# token boundaries stay visible.
print(f"first 20 token IDs: {token_ids[:20]}")
print(f"Tokens for first 20 IDs: {[tokenizer.decode([token_id]) for token_id in token_ids[:20]]}")

first 20 token IDs: [18308, 14179, 290, 262, 30467, 338, 8026, 220, 198, 198, 41481, 16329, 220, 198, 198, 10970, 16494, 56, 19494, 406]
Tokens for first 20 IDs: ['Harry', ' Potter', ' and', ' the', ' Sorcerer', "'s", ' Stone', ' ', '\n', '\n', 'CHAPTER', ' ONE', ' ', '\n', '\n', 'THE', ' BO', 'Y', ' WHO', ' L']


In [9]:
# Skip the first SAMPLE_START tokens so the walkthrough below runs on ordinary
# prose instead of the title and chapter headings at the start of the file.
SAMPLE_START = 15005
sample_token_ids = token_ids[SAMPLE_START:]
# Decode each ID individually so every list element is exactly one token.
sample_tokens = [tokenizer.decode([token_id]) for token_id in sample_token_ids]
print(f"Sample tokens:\n{sample_tokens[:30]}")

Sample tokens:
[' We', "'re", ' going', ' away', '.', ' Just', ' pack', ' some', ' clothes', '.', ' No', ' arguments', '!"', ' ', '\n', '\n', 'He', ' looked', ' so', ' dangerous', ' with', ' half', ' his', ' mustache', ' missing', ' that', ' no', ' one', ' dared', ' argue']


In [10]:
# Round-trip a single word through the tokenizer: encode it to IDs, then
# decode each ID separately to see which text piece it stands for.
test = "Harry"
test_ids = tokenizer.encode(test)
print(f"Token IDs for '{test}': {test_ids}")
print(f"Decoded tokens for IDs {test_ids}: {[tokenizer.decode([token_id]) for token_id in test_ids]}")

Token IDs for 'Harry': [18308]
Decoded tokens for IDs [18308]: ['Harry']


In [11]:
# Number of tokens in each input window.
context_size = 4

# The target window is the input window shifted one token to the right, so
# y0[i] is the token that follows x0[i] in the text.
x0 = sample_token_ids[0:context_size]
y0 = sample_token_ids[1:context_size+1]

print(f"Input token IDs (x0): {x0}")
print(f"Target token IDs (y0): {y0}")

print(f"Input tokens (x0): {[tokenizer.decode([token_id]) for token_id in x0]}")
print(f"Target tokens (y0): {[tokenizer.decode([token_id]) for token_id in y0]}")

Input token IDs (x0): [775, 821, 1016, 1497]
Target token IDs (y0): [821, 1016, 1497, 13]
Input tokens (x0): [' We', "'re", ' going', ' away']
Target tokens (y0): ["'re", ' going', ' away', '.']


In [12]:
# One (x0, y0) pair contains context_size prediction tasks: for every
# position i, the context x0[:i+1] should predict the token y0[i].
# The loop variable is named context_ids rather than `input` so that this
# top-level cell does not rebind the builtin input() for the whole kernel.
print(f"{context_size} predictions per pair (input, target)")
for i in range(context_size):
    context_ids = x0[:i+1]
    target = y0[i]
    print(f"{context_ids} ---> {target}")

# Same pairs again, decoded to text so they are human-readable.
print(f"\n{context_size} predictions per pair (input tokens, target token)")
for i in range(context_size):
    input_tokens = [tokenizer.decode([token_id]) for token_id in x0[:i+1]]
    target_token = tokenizer.decode([y0[i]])
    print(f"{input_tokens} ---> {target_token}")

4 predictions per pair (input, target)
[775] ---> 821
[775, 821] ---> 1016
[775, 821, 1016] ---> 1497
[775, 821, 1016, 1497] ---> 13

4 predictions per pair (input tokens, target token)
[' We'] ---> 're
[' We', "'re"] --->  going
[' We', "'re", ' going'] --->  away
[' We', "'re", ' going', ' away'] ---> .


# Dataset Class

In [15]:
import torch  # pip install torch
from torch.utils.data import Dataset

In [20]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once and then cut into windows of ``context_size``
    tokens, with a new window starting every ``stride`` tokens. Each target
    window is its input window shifted one token to the right, so position
    ``k`` of the target is the token that follows position ``k`` of the input.

    Args:
        raw_text: Text to tokenize.
        tokenizer: Object providing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs (e.g. a tiktoken encoding).
        context_size: Number of tokens per window; must be >= 1.
        stride: Distance between the starts of consecutive windows; must be >= 1.

    Raises:
        ValueError: If ``context_size`` or ``stride`` is less than 1.
    """

    def __init__(self, raw_text, tokenizer, context_size, stride):
        # Fail early with a clear message instead of silently building empty
        # or mis-sized windows (or hitting an opaque range() error for stride=0).
        if context_size < 1:
            raise ValueError(f"context_size must be >= 1, got {context_size}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_token_ids = []
        self.target_token_ids = []

        # allowed_special is a set; listing "<|endoftext|>" permits that marker
        # to appear in raw_text.
        all_token_ids = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})

        # Stop context_size tokens before the end so that every target window
        # (which reaches one token further than its input) is full length.
        # If the text has context_size tokens or fewer, the dataset is empty.
        for i in range(0, len(all_token_ids) - context_size, stride):
            input_chunk = all_token_ids[i : i + context_size]
            target_chunk = all_token_ids[i + 1 : i + context_size + 1]
            self.input_token_ids.append(torch.tensor(input_chunk))
            self.target_token_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_token_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at position ``idx``."""
        x = self.input_token_ids[idx]
        y = self.target_token_ids[idx]
        return x, y

# Data Loader

In [21]:
from torch.utils.data import DataLoader

In [22]:
def create_dataloader_v1(raw_text, tokenizer, context_size=256, stride=256, batch_size=8,
                         shuffle=True, num_workers=0, drop_last=True):
    """Build a DataLoader over sliding-window (input, target) token pairs.

    Args:
        raw_text: Text to tokenize and window.
        tokenizer: Tokenizer handed to ``GPTDatasetV1``.
        context_size: Number of tokens per window.
        stride: Step between the starts of consecutive windows.
        batch_size: Number of windows per batch.
        shuffle: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``raw_text``.
    """
    return DataLoader(
        dataset=GPTDatasetV1(raw_text, tokenizer, context_size, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        drop_last=drop_last,
    )

## Test Data Loader

In [31]:
# batch_size=1, stride=1 and shuffle=False give one window per batch, in text
# order, each starting one token after the previous one - easy to inspect by eye.
data_loader = create_dataloader_v1(raw_text, tokenizer, context_size=4, stride=1, batch_size=1, shuffle=False)

* Use Python's built-in `iter` and `next` functions to pull batches from the data loader one at a time

In [32]:
# Wrap the loader in an iterator so batches can be fetched manually with next().
data_iter = iter(data_loader)

In [33]:
# Fetch the first batch from the iterator.
first_batch = next(data_iter)

In [34]:
# The batch prints as a two-element list: [input tensor, target tensor].
print(f"First batch\n{first_batch}")

First batch
[tensor([[18308, 14179,   290,   262]]), tensor([[14179,   290,   262, 30467]])]


In [35]:
# Advance the same iterator; with stride=1 this window starts one token after
# the first batch's window.
second_batch = next(data_iter)

print(f"Second batch\n{second_batch}")

Second batch
[tensor([[14179,   290,   262, 30467]]), tensor([[  290,   262, 30467,   338]])]


In [36]:
# stride equal to context_size means consecutive input windows do not overlap;
# batch_size=8 stacks eight such windows into a single batch.
data_loader2 = create_dataloader_v1(raw_text, tokenizer, context_size=4, stride=4, batch_size=8, shuffle=False)

data_iter2 = iter(data_loader2)
first_batch2 = next(data_iter2)

# Unpack the batch into its input and target tensors before printing.
first_batch2_inputs, first_batch2_targets = first_batch2
print(f"inputs: {first_batch2_inputs}")
print(f"\ntargets: {first_batch2_targets}")

inputs: tensor([[18308, 14179,   290,   262],
        [30467,   338,  8026,   220],
        [  198,   198, 41481, 16329],
        [  220,   198,   198, 10970],
        [16494,    56, 19494,   406],
        [ 3824,  1961,   220,   198],
        [  198,  5246,    13,   290],
        [ 9074,    13,   360,  1834]])

targets: tensor([[14179,   290,   262, 30467],
        [  338,  8026,   220,   198],
        [  198, 41481, 16329,   220],
        [  198,   198, 10970, 16494],
        [   56, 19494,   406,  3824],
        [ 1961,   220,   198,   198],
        [ 5246,    13,   290,  9074],
        [   13,   360,  1834,  1636]])
