## Load Data

In [4]:
# Path of the text file whose full contents are loaded into `text`.
combined_txt_path = "data/combined.txt"
# Read the whole file into a single string; the encoding is given explicitly
# so decoding does not depend on the platform default.
with open(combined_txt_path, "r", encoding="utf-8") as file:
    text = file.read()

## Tokenize
- We are building a character level language model
- When we encode, we get back a list of integers
- When we decode, we get back the chars
- Typically, subword encodings are used in practice (e.g. tiktoken, SentencePiece)

In [7]:
# Create a mapping from characters to integers (and back).
# `chars` holds every unique character that occurs in `text`; sorting makes the
# character -> id assignment the same on every run over the same text.
chars = sorted(set(text))
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))                 # integer id -> character


def encode(s):
    """Encoder: take a string, output a list of integers.

    Each character of `s` is looked up in `stoi`; a character that does not
    occur in `text` raises KeyError.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string.

    Each id in `l` is looked up in `itos`; an id outside the vocabulary
    raises KeyError.
    """
    return ''.join(itos[i] for i in l)


# Round-trip sanity check: decoding the encoding prints the original string.
print(encode("Christopher Nolan"))
print(decode(encode("Christopher Nolan")))

[32, 67, 77, 68, 78, 79, 74, 75, 67, 64, 77, 1, 43, 74, 71, 60, 73]
Christopher Nolan


In [11]:
# Encode the dataset and store it into a torch.Tensor
import torch # we use PyTorch: https://pytorch.org
# One integer id per character of `text`, held in a tensor of dtype torch.long.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# Show only the first 100 ids rather than the whole tensor.
print(data[:100])

torch.Size([12656655]) torch.int64
tensor([30, 65, 79, 64, 77,  1, 79, 67, 64,  1, 74, 75, 64, 73, 68, 73, 66,  1,
        71, 74, 66, 74, 78, 12,  1, 71, 64, 60, 81, 64, 78,  1, 78, 79, 60, 77,
        79,  1, 65, 60, 71, 71, 68, 73, 66,  1, 60, 73, 63,  1, 82, 64,  1, 67,
        64, 60, 77,  1, 60,  1, 81, 74, 68, 62, 64, 13, 74, 81, 64, 77, 14,  0,
         0, 31, 47, 50, 32, 34,  1, 52, 30, 54, 43, 34,  1,  9, 51, 44, 10, 26,
         1, 49, 67, 64, 77, 64,  1, 82, 60, 78])


## Split Dataset into train and val sets

In [12]:
# Index of the train/val boundary: the first 90% of the tokens are used for
# training, the remaining tail is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

## Batching of Chunks
- Feeding the entire dataset to the transformer at once is usually not computationally feasible, so we train it on small chunks of the data instead

In [13]:
# Length of the context window used in the walkthrough cells that follow.
block_size = 8
# Preview block_size + 1 tokens: the inputs are the first block_size of them and
# the targets are the same tokens shifted by one, so one extra token is needed.
train_data[:block_size+1]

tensor([30, 65, 79, 64, 77,  1, 79, 67, 64])

- When we train the transformer in chunks, we are trying to predict the entire chunk. For example, given [30], 65 is next; given [30, 65], 79 is next and so on
- Moreover, we want the transformer to get used to seeing contexts of every length, from a single token all the way up to `block_size` tokens

In [15]:
# Inputs are the first block_size tokens; targets are the same window shifted
# one position to the right, so y[t] is the token that follows x[:t+1].
x, y = train_data[:block_size], train_data[1:block_size+1]
for t in range(block_size):
    # Every prefix of the chunk is its own training example.
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([30]) the target: 65
when input is tensor([30, 65]) the target: 79
when input is tensor([30, 65, 79]) the target: 64
when input is tensor([30, 65, 79, 64]) the target: 77
when input is tensor([30, 65, 79, 64, 77]) the target: 1
when input is tensor([30, 65, 79, 64, 77,  1]) the target: 79
when input is tensor([30, 65, 79, 64, 77,  1, 79]) the target: 67
when input is tensor([30, 65, 79, 64, 77,  1, 79, 67]) the target: 64


- We usually stack up chunks so that we can do parallel processing on GPUs

In [16]:
# Seed PyTorch's global RNG so the random batch drawn below is repeatable.
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
# Re-assigns block_size to the same value it was given in the earlier walkthrough cell.
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    """Draw a small random batch of inputs and next-token targets.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair (x, y) of stacked windows. Each contains `batch_size` windows of
        `block_size` tokens, and y is x shifted one position to the right in
        the underlying data.
    """
    source = train_data if split == 'train' else val_data
    # Random window starts; the upper bound leaves room for the target window,
    # which extends one token past the input window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+block_size])
        targets.append(source[start+1:start+block_size+1])
    return torch.stack(inputs), torch.stack(targets)

# Sample one training batch and show the raw input / target tensors.
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('----')

# Spell out every (context, target) pair contained in the batch.
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context, target = xb[b, :t+1], yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[64, 79,  1, 60, 73, 63,  1, 82],
        [17, 14,  0, 30,  1, 51, 44, 38],
        [ 1, 61, 74, 63, 84, 12,  1, 71],
        [38, 32, 40, 48,  1, 67, 68, 72]])
targets:
torch.Size([4, 8])
tensor([[79,  1, 60, 73, 63,  1, 82, 60],
        [14,  0, 30,  1, 51, 44, 38, 32],
        [61, 74, 63, 84, 12,  1, 71, 84],
        [32, 40, 48,  1, 67, 68, 72, 13]])
----
when input is [64] the target: 79
when input is [64, 79] the target: 1
when input is [64, 79, 1] the target: 60
when input is [64, 79, 1, 60] the target: 73
when input is [64, 79, 1, 60, 73] the target: 63
when input is [64, 79, 1, 60, 73, 63] the target: 1
when input is [64, 79, 1, 60, 73, 63, 1] the target: 82
when input is [64, 79, 1, 60, 73, 63, 1, 82] the target: 60
when input is [17] the target: 14
when input is [17, 14] the target: 0
when input is [17, 14, 0] the target: 30
when input is [17, 14, 0, 30] the target: 1
when input is [17, 14, 0, 30, 1] the target: 51
when input is [17, 14, 0