Download the dataset

In [3]:
import os

import wget

url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
# Download only when the corpus is not on disk yet. wget.download does not overwrite
# an existing file (it saves a renamed copy instead), so re-running this cell would
# otherwise pile up duplicates while the notebook keeps reading 'input.txt'.
file = 'input.txt' if os.path.exists('input.txt') else wget.download(url)

Dataset as a string

In [9]:
# Read the whole corpus into a single string; the context manager closes the
# file handle as soon as the read finishes instead of leaving it to the GC.
with open('input.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

Getting the vocabulary

In [18]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


Tokenizing the characters (character-level)

In [22]:
# Character-level tokenizer: every vocabulary character maps to its index in `chars`
str_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_str = dict(enumerate(chars))


def encode(string):
    """Return the list of integer token ids for the characters of `string`.

    Raises KeyError if `string` contains a character that is not in the vocabulary.
    """
    return [str_to_int[c] for c in string]


def decode(integer):
    """Return the string spelled by `integer`, an iterable of integer token ids.

    Raises KeyError if an id is not in the vocabulary.
    """
    return ''.join(int_to_str[i] for i in integer)


print(encode("Hello world"))
print(decode(encode("Hello world")))

[20, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
Hello world


In [26]:
import torch

# Encode the full corpus as one 1-D tensor of 64-bit integer token ids
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.dtype)
# Peek at the first ten token ids
print(data[0:10])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])


Splitting the data into train/validation

In [28]:
# The first 90% of the tokens are used for training, the remainder for validation
n = int(len(data) * 0.9)
trn_data, val_data = data[:n], data[n:]

Setting Block Size (The maximum length of the context)

In [29]:
block_size = 8
# A chunk of block_size + 1 tokens holds block_size (context, target) pairs
trn_data[0:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

To illustrate:

In [31]:
# Targets are the inputs shifted one position to the right
x = trn_data[0:block_size]
y = trn_data[1:block_size + 1]
# One training example per prefix length: the prefix of x predicts the token after it
for length in range(1, block_size + 1):
    context, target = x[:length], y[length - 1]
    print(f'When the input is {context} the output is {target}')

When the input is tensor([18]) the output is 47
When the input is tensor([18, 47]) the output is 56
When the input is tensor([18, 47, 56]) the output is 57
When the input is tensor([18, 47, 56, 57]) the output is 58
When the input is tensor([18, 47, 56, 57, 58]) the output is 1
When the input is tensor([18, 47, 56, 57, 58,  1]) the output is 15
When the input is tensor([18, 47, 56, 57, 58,  1, 15]) the output is 47
When the input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the output is 58


We use blocks for computational efficiency, but also so that the transformer sees contexts of every length from 1 to $block\_size$ characters and learns to predict from each of them. This is useful at inference time: the transformer can start generating from a context of length 1, and once the context grows beyond $block\_size$ we start truncating it. The transformer will never receive more than $block\_size$ characters.

In [39]:
batch_size = 4  # Number of independent sequences processed in parallel
block_size = 8  # Maximum context length for predictions
torch.manual_seed(1337)  # Fixed seed so the sampled batches are reproducible

# Sample a small random batch of inputs x and targets y
def get_batch(split):
    """Draw `batch_size` random windows of `block_size` tokens from one split.

    `split == 'train'` reads from `trn_data`; any other value reads from
    `val_data`. Returns `(x, y)`, each of shape (batch_size, block_size),
    where `y` is `x` shifted one position to the right.
    """
    source = val_data if split != 'train' else trn_data
    # Largest start index still leaves room for block_size inputs plus one target
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show its inputs and targets
xb, yb = get_batch('train')
print('Inputs:', xb.shape, xb, sep='\n')

print('Targets:', yb.shape, yb, sep='\n')

print('\n-------------\n')

# Each row contributes block_size examples: one per prefix length
for inputs_row, targets_row in zip(xb, yb):
    for t in range(block_size):
        context = inputs_row[:t+1]
        target = targets_row[t]
        print(f'When the input is {context}, the output is {target}')

Inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])

-------------

When the input is tensor([24]), the output is 43
When the input is tensor([24, 43]), the output is 58
When the input is tensor([24, 43, 58]), the output is 5
When the input is tensor([24, 43, 58,  5]), the output is 57
When the input is tensor([24, 43, 58,  5, 57]), the output is 1
When the input is tensor([24, 43, 58,  5, 57,  1]), the output is 46
When the input is tensor([24, 43, 58,  5, 57,  1, 46]), the output is 43
When the input is tensor([24, 43, 58,  5, 57,  1, 46, 43]), the output is 39
When the input is tensor([44]), the output is 53
When the input is tensor([44, 53]), the 

Creating a Bigram Language Model

In [40]:
import torch
import torch.nn as nn

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    Args:
        vocab_size: number of distinct tokens; used for both dimensions of the
            lookup table (row = current token, columns = next-token logits).
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Look up the next-token logits for every position of `idx`.

        Args:
            idx: (B,T) tensor of integer token ids.
            targets: (B,T) tensor of integer token ids. Accepted so existing
                `model(xb, yb)` calls keep working, but not used by this
                method, so it may be omitted.

        Returns:
            (B,T,C) tensor of logits, where C is the vocabulary size.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)
        return logits
    
# Smoke-run the model on the sample batch and check the shape of its output
model = BigramLanguageModel(vocab_size)
out = model(idx=xb, targets=yb)
print(out.size())


torch.Size([4, 8, 65])
