In [3]:
# Fetch the Tiny Shakespeare corpus. Create data/ first: `wget -O` does not
# create missing parent directories, so a fresh checkout would otherwise fail.
!mkdir -p data && wget -O data/corpus.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-05-28 02:33:58--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘data/corpus.txt’


2023-05-28 02:33:58 (3.10 MB/s) - ‘data/corpus.txt’ saved [1115394/1115394]



In [4]:
# Read the whole corpus into memory as one string. The encoding is explicit so
# the decoded text -- and therefore the vocabulary built from it -- does not
# depend on the platform's default locale encoding.
with open('data/corpus.txt', encoding='utf-8') as f:
    content = f.read()

# The vocabulary is every distinct character in the corpus, sorted so that a
# character's position (its token id) is the same on every run.
chars = sorted(set(content))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


Here are a couple of examples of text tokenizers:
- https://github.com/google/sentencepiece
- https://github.com/openai/tiktoken

In [5]:
from abc import ABC, abstractmethod

class Tokenizer(ABC):

    vocab_size: int

    @abstractmethod
    def encode(self, input: str) -> list[int | str]:
        """Tokenize."""

    @abstractmethod
    def decode(self, input: list[int | str]) -> str:
        """Detokenize."""

class SimpleTokenizer(Tokenizer):
    """Character-level tokenizer: a token id is a character's index in ``chars``."""

    def __init__(self, chars: list[str]) -> None:
        """Build the character <-> id lookup tables from an input character set."""
        indexed = list(enumerate(chars))
        # Forward table (character -> id) and its inverse (id -> character).
        self.stoi = {ch: i for i, ch in indexed}
        self.itos = dict(indexed)
        self.vocab_size = len(chars)

    def encode(self, input: str) -> list[int]:
        """Map each character of ``input`` to its id (KeyError if unknown)."""
        return list(map(self.stoi.__getitem__, input))

    def decode(self, input: list[int]) -> str:
        """Map each id back to its character and concatenate the result."""
        return "".join(map(self.itos.__getitem__, input))


In [6]:
# Sanity-check the tokenizer: encode a phrase, then decode the ids it should
# produce back into text.
simple = SimpleTokenizer(chars=chars)
hello_ids = simple.encode("hi there")
print(hello_ids)
hello_text = simple.decode([46, 47, 1, 58, 46, 43, 56, 43])
print(hello_text)

[46, 47, 1, 58, 46, 43, 56, 43]
hi there


In [7]:
import torch

# Tokenize the entire corpus and keep the ids in a single int64 tensor
# (torch.long is PyTorch's alias for torch.int64).
data = torch.tensor(simple.encode(content), dtype=torch.int64)
# Preview the first 1000 token ids.
print(data[0:1000])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

  data = torch.tensor(simple.encode(content), dtype=torch.long)


In [8]:
# Hold out the last 10% of the tokens for validation. The model is trained on
# the first 90%; loss on the held-out tail shows whether it is overfitting.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [9]:
# Context length: predictions are made at every position of a chunk this long.
block_size = 16

# A chunk of block_size + 1 tokens yields block_size training examples, because
# the target at each position is simply the token that follows it. So x is the
# chunk without its last token and y is the same chunk shifted left by one.
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    # Everything up to and including position t is the context for target y[t].
    context = x[:t+1]
    target = y[t]
    # Decode back to text so the examples are human-readable.
    print(f"when input is '{simple.decode(context.tolist())}' target is '{simple.decode([target.item()])}'")


when input is 'F' target is 'i'
when input is 'Fi' target is 'r'
when input is 'Fir' target is 's'
when input is 'Firs' target is 't'
when input is 'First' target is ' '
when input is 'First ' target is 'C'
when input is 'First C' target is 'i'
when input is 'First Ci' target is 't'
when input is 'First Cit' target is 'i'
when input is 'First Citi' target is 'z'
when input is 'First Citiz' target is 'e'
when input is 'First Citize' target is 'n'
when input is 'First Citizen' target is ':'
when input is 'First Citizen:' target is '
'
when input is 'First Citizen:
' target is 'B'
when input is 'First Citizen:
B' target is 'e'


In [10]:
# Seed PyTorch's global RNG so the randomly sampled batches are reproducible.
torch.manual_seed(1337)
# Number of independent sequences processed in parallel per batch.
batch_size = 4
# Maximum context length (in tokens) used to make a prediction.
block_size = 8

def get_batch(data: torch.tensor) -> (torch.tensor, torch.tensor):
    """Generates batch_size inputs (row) at a time each of block_size examples."""
    # Generate a random offset into the training set
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # Inputs
    x = torch.stack([data[i:i+block_size] for i in ix])
    # Targets
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

# Draw one batch from the training split and show the raw token ids.
xb, yb = get_batch(train_data)
print('inputs:')
print(xb.shape)
print(xb)

print('targets:')
print(yb.shape)
print(yb)

# Spell out every (context, target) training example packed into the batch:
# row b, position t uses tokens 0..t of the row as context for target yb[b, t].
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        # Decode back to text so the examples are human-readable.
        print(f"when input is '{simple.decode(context.tolist())}' target is '{simple.decode([target.item()])}'")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
when input is 'L' target is 'e'
when input is 'Le' target is 't'
when input is 'Let' target is '''
when input is 'Let'' target is 's'
when input is 'Let's' target is ' '
when input is 'Let's ' target is 'h'
when input is 'Let's h' target is 'e'
when input is 'Let's he' target is 'a'
when input is 'f' target is 'o'
when input is 'fo' target is 'r'
when input is 'for' target is ' '
when input is 'for ' target is 't'
when input is 'for t' target is 'h'
when input is 'for th' target is 'a'
when input is 'for tha' target is 't'
when input is 'for that' target is ' '
when input is 'n' target is 't'
when in

In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Union

class BigramLanguageModel(nn.Module):
    """A Bigram Language Model.
    
    A Bigram language model predicts the probabilty of a word sequence based
    on the previous word.
    """

    def __init__(self, tokenizer: Tokenizer):
        """Initialize BigramLanguageModel."""
        super().__init__()
        self.tokenizer = tokenizer
        self.token_embedding_table = nn.Embedding(tokenizer.vocab_size, tokenizer.vocab_size)

    def forward(self, idx: torch.tensor, targets: Union[torch.tensor, None] = None) -> tuple[torch.tensor, Union[torch.tensor, None]]:
        """..."""
        # Pytorch will arrange into a Batch (batch sizee), Time (block size), Channel (vocab size)
        # (B, T, C)
        logits = self.token_embedding_table(idx)

        loss: torch.tensor | None = None
        if targets is not None:
            B, T, C = logits.shape
            # Convert the array to a 2d array, stretched out
            logits = logits.view(B*T, C) 
            targets = targets.view(B*T)

            # Compute the loss using negative log likelihood loss comparing the prediction (logits)
            # to targets.
            # This wants a B, C, T
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx: torch.tensor, max_new_tokens: int) -> torch.tensor:
        """Append tokens to the end of the sequence.
        
        The idx is the sequence which acts as the input, and we append tokens
        to this and it becomes the output sequence and return value.
        """
        # idx is (B, T) array of indicies in the current context
        for _ in range(max_new_tokens):
            # get the prediction
            logits, loss = self(idx)
            # Focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)

            # Apply softmax to get probabilities of each next output in the
            # vocabulary. As a reminder, the C dimension is the vocabulary
            # size and we're doing this in B batches at a time.
            probs = F.softmax(logits, dim=-1)  # (B, C)

            # Sample from the distribution (making preductions)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # Append prediction to the running output sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

    def generate_text(self, max_new_tokens: int) -> str:
        """Generate text."""
        idx = torch.zeros((1, 1), dtype=torch.long)
        idx = self.generate(idx, max_new_tokens=max_new_tokens)
        single_batch = idx[0]
        return self.tokenizer.decode(single_batch.tolist())

    
# Smoke-test the untrained model on the batch sampled earlier.
model = BigramLanguageModel(tokenizer=simple)
logits, loss = model(xb, targets=yb)
print(logits.shape)
# Reference point: guessing uniformly over the 65-character vocabulary would
# give a loss of -ln(1/65) = 4.17.
print(loss)
# Samples from an untrained model are expected to look like random characters.
print(model.generate_text(max_new_tokens=100))


torch.Size([32, 65])
tensor(4.5262, grad_fn=<NllLossBackward0>)

; FXh&rszjnzQ'ItHc3N?Wg!FBdApAxrsK'I&ek
hCjHLHL-XdoSz?tBwcRHNHfbXbP.z&A?tsJWeCtyKSKoRMt?FFfpmJDQ zvt


In [44]:
# Train the model with mini-batch gradient descent.

# Optimizer step size.
LEARNING_RATE = 1e-3
# Backward-compatible alias: this constant was originally named LOSS_RATE,
# although it is the learning rate.
LOSS_RATE = LEARNING_RATE
# Number of optimization steps to run.
MAX_STEPS = 10000

# AdamW (Adam with decoupled weight decay) is used here instead of plain SGD.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# NOTE: this rebinds the notebook-level batch_size that get_batch reads at
# call time, so every batch sampled from here on has 32 rows.
batch_size = 32
for steps in range(MAX_STEPS):
    # Sample a fresh random batch from the training split.
    xb, yb = get_batch(train_data)
    logits, loss = model(xb, yb)
    # Clear gradients left over from the previous step before backpropagating.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only, so it is a noisy estimate.
print(loss.item())
                   

2.4117748737335205


In [53]:
# Sample 100 new tokens from the trained model and show them as text.
print(model.generate_text(max_new_tokens=100))



Tolyo loreyoise pesowak hes nto, coucer hit t four itiee that It toy blear t annt.
Bu aibandenshtelo
