In [11]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
import requests
import torch
from torch import Tensor
import torch.nn as nn
import re
# url = "https://www.gutenberg.org/files/2701/2701-0.txt"
# text = str(requests.get(url).text)
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


--2025-10-16 21:29:02--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2025-10-16 21:29:02 (32.6 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [12]:
# Load the whole corpus from disk into one string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [13]:
# Build the character-level vocabulary and the two lookup tables used by encode()/decode().

# Word-level split of the corpus: whitespace runs and single punctuation marks are kept as
# their own list items. NOTE: nothing in the visible cells reads parsed_str afterwards; the
# vocabulary below is built per character, not per word.
parsed_str = re.split(r'(\s+|[^\w\s])', text)

# Every distinct character in the corpus, in sorted order; a character's position is its token id.
unique_chars = sorted(set(text))
char2index = {char: i for i, char in enumerate(unique_chars)}
index2char = {i: char for i, char in enumerate(unique_chars)}

def encode_single_char(char: str) -> int:
  """ Returns the integer token id of one character (KeyError if it is not in the vocabulary). """
  token = char2index[char]
  return token

def decode_single_token(token: int) -> str:
  """ Returns the character that a token id stands for (KeyError if the id is unknown). """
  char = index2char[token]
  return char

def encode(chars: list[str]) -> list[int]:
  """ Maps a sequence of single characters to their token ids, preserving order. """
  return list(map(encode_single_char, chars))

def decode(tokens: list[int], join=True) -> "list[str] | str":
  """
  Maps token ids back to characters.

  tokens: token ids to decode, in order.
  join: when True (default) the characters are concatenated into one string;
        when False they are returned as a list of single-character strings.

  The return annotation is a string so it also parses on Python versions without `X | Y` unions.
  """
  chars = [decode_single_token(token) for token in tokens]
  return "".join(chars) if join else chars

In [14]:
# Show the vocabulary, then encode the entire corpus one character at a time and
# print the first 100 token ids as a sanity check.
print(f'unique characters in our database: {unique_chars}')
encoded_text = encode(list(text))
print(f'snippet of encoded text: \n {encoded_text[:100]}')

unique characters in our database: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
snippet of encoded text: 
 [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59]


In [15]:
# Transformer Constants
CONTEXT_WINDOW = 256 # tokens per sampled sequence; also the size of the position table and the causal mask
BATCH_SIZE = 64 # number of sequences drawn per batch
VOCAB_SIZE = len(unique_chars) # number of distinct characters (= number of token ids)
EMBED_SIZE = 384 # width of the token / position embedding vectors
NUMBER_HEADS = 6 # attention heads per transformer block (each head is EMBED_SIZE // NUMBER_HEADS wide)
NUMBER_BLOCKS = 6 # how many transformer blocks are stacked
DROPOUT = 0.2 # dropout probability used throughout the model

# Training Constants
NUM_EPOCHS = 5000 # number of optimizer steps; each "epoch" in the loop is ONE random batch, not a pass over the data
LEARNING_RATE = 1e-4 # learning rate handed to the Adam optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available
NUM_EVAL_SAMPLES = 100 # how many random batches are averaged per loss estimate
EVAL_EVERY = 500 # report train/val loss every time the step counter is a multiple of this

In [16]:
# Turn the encoded corpus into a 1-D tensor of integer token ids so batches can be sliced out of it.
data = torch.tensor(encoded_text, dtype = torch.long)

def split_data(data: Tensor, split_ratio: float = 0.9) -> tuple[Tensor, Tensor]:
  """
  Splits a sequence into a leading training part and a trailing validation part.

  data: tensor to split along its first dimension (no shuffling, order is preserved).
  split_ratio: fraction of the elements that goes to the training part; must be in [0, 1].

  Returns (train, val), where train holds the first int(len(data) * split_ratio) elements
  and val holds the rest.

  Raises ValueError if split_ratio is outside [0, 1].
  """
  if not 0.0 <= split_ratio <= 1.0:
    raise ValueError(f'split_ratio must be between 0 and 1, got {split_ratio}')

  # Honour the argument; the split point used to be hard-coded to 0.9 regardless of split_ratio.
  split_ind = int(len(data) * split_ratio)
  return data[:split_ind], data[split_ind:]

# Split the token stream with the default ratio: the first 90% is training data, the last 10% validation.
train, val = split_data(data)
print(f'we have {len(train)} training tokens and {len(val)} val tokens')

# Seed torch's global random number generator so the random batch sampling below is repeatable.
torch.manual_seed(1337)

def sample_batch(data: Tensor) -> tuple[Tensor, Tensor]:
  """
  Draws BATCH_SIZE random windows of CONTEXT_WINDOW tokens from `data`.

  Returns (X, y), both of shape (BATCH_SIZE, CONTEXT_WINDOW) and moved to `device`.
  y is X shifted one position to the right, i.e. y[b, t] is the token that follows X[b, t].
  """
  # randint's upper bound is exclusive, so the shifted target window always fits inside data.
  starts = torch.randint(low = 0, high = len(data)-CONTEXT_WINDOW, size = (BATCH_SIZE,)).tolist()

  inputs = torch.stack([data[start:start+CONTEXT_WINDOW] for start in starts])
  targets = torch.stack([data[start+1:start+CONTEXT_WINDOW+1] for start in starts])

  return inputs.to(device), targets.to(device)

# Draw one training batch up front; the model cell below feeds it through the freshly built model once.
X,y = sample_batch(train)
# X,y

we have 1003854 training tokens and 111540 val tokens


In [17]:
from codecs import xmlcharrefreplace_errors
# TRANSFORMER LANGUAGE MODEL (stacked causal self-attention blocks)



class Head(nn.Module):

  """
  A single causal self-attention head.

  Takes X of shape (B,T,C) with C = EMBED_SIZE and returns (B,T,H) with H = head_size.
  Position t can only attend to positions <= t.
  """

  def __init__(self, head_size: int) -> None:
    super().__init__()

    self.head_size = head_size

    # Each projection maps an EMBED_SIZE-wide vector down to head_size features.
    # These sizes live in the embedding dimension and are unrelated to the context length.
    self.key = nn.Linear(EMBED_SIZE, head_size, bias = False)
    self.query = nn.Linear(EMBED_SIZE, head_size, bias = False)
    self.value = nn.Linear(EMBED_SIZE, head_size, bias = False)
    self.dropout = nn.Dropout(DROPOUT)

    # Lower-triangular matrix of ones. Registered as a buffer: it moves with the module
    # across devices but is not a trainable parameter.
    causal_mask = torch.tril(torch.ones(CONTEXT_WINDOW, CONTEXT_WINDOW))
    self.register_buffer('tril', causal_mask)

  def forward(self, X):
    _, T, _ = X.shape # (B,T,C)

    keys = self.key(X) # (B,T,H)
    queries = self.query(X) # (B,T,H)
    values = self.value(X) # (B,T,H)

    # Scaled dot-product affinities: (B,T,H) @ (B,H,T) -> (B,T,T), divided by sqrt(head_size).
    scores = queries @ keys.transpose(-2, -1)
    scores = scores / self.head_size ** 0.5

    # Positions above the diagonal are "the future": set them to -inf so softmax gives them weight 0.
    is_future = self.tril[:T, :T] == 0
    scores = scores.masked_fill(is_future, float('-inf'))
    weights = self.dropout(torch.softmax(scores, dim=-1)) # (B,T,T)

    return weights @ values # (B,T,T) @ (B,T,H) -> (B,T,H)


class MultiHead(nn.Module):

  """
  Multi-headed self-attention layer.

  Runs `num_heads` independent Head modules on the same input, concatenates their outputs along
  the feature dimension, applies dropout, and projects the result back to EMBED_SIZE features.

  num_heads: number of attention heads.
  head_size: feature width of each individual head.
  """

  def __init__(self, num_heads: int, head_size: int) -> None:
    super().__init__()

    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    # The concatenated heads are num_heads*head_size wide. That only equals EMBED_SIZE when
    # EMBED_SIZE is divisible by the head count, so size the projection input from the heads
    # themselves. The output stays EMBED_SIZE so it can be added back onto the residual stream.
    self.proj = nn.Linear(num_heads * head_size, EMBED_SIZE) # additional projection to "mix" the heads
    self.dropout = nn.Dropout(DROPOUT)

  def forward(self, X):
    """ X: (B,T,EMBED_SIZE) -> (B,T,EMBED_SIZE) """

    out = torch.cat([head(X) for head in self.heads], dim = -1) # (B,T,num_heads*head_size)
    out = self.dropout(out)

    return self.proj(out)



class FeedForward(nn.Module):

  """ Position-wise feed-forward network: expand to 4*EMBED_SIZE, ReLU, project back, dropout. """

  def __init__(self):
    super().__init__()

    hidden_size = 4*EMBED_SIZE
    expand = nn.Linear(EMBED_SIZE, hidden_size)
    contract = nn.Linear(hidden_size, EMBED_SIZE)

    self.layers = nn.Sequential(expand, nn.ReLU(), contract, nn.Dropout(DROPOUT))

  def forward(self, X):
    # Input and output both have EMBED_SIZE features in the last dimension.
    return self.layers(X)




class Block(nn.Module):

  """
  One transformer block: multi-head self-attention followed by a feed-forward network.
  Each sub-layer sees a layer-normalised input and its output is added back onto its input.
  """

  def __init__(self) -> None:
    super().__init__()

    # The embedding width is shared out between the heads (floor division).
    head_size = EMBED_SIZE // NUMBER_HEADS
    self.head_size = head_size

    self.multihead = MultiHead(NUMBER_HEADS, head_size)
    self.ffward = FeedForward()
    self.ln1 = nn.LayerNorm(EMBED_SIZE)
    self.ln2 = nn.LayerNorm(EMBED_SIZE)

  def forward(self, X):
    # Residual connection around attention, then around the feed-forward network.
    attended = X + self.multihead(self.ln1(X))
    return attended + self.ffward(self.ln2(attended))





class LanguageModel(nn.Module):

  """
  Autoregressive transformer language model over token ids.

  Token and position embeddings are summed, passed through NUMBER_BLOCKS transformer blocks and a
  final LayerNorm, then mapped to one logit per vocabulary entry at every position.
  """

  def __init__(self) -> None:
    super().__init__()

    self.loss_fcn = nn.CrossEntropyLoss() # standard multiclass-classification loss

    self.token_embedding_table = nn.Embedding(VOCAB_SIZE, EMBED_SIZE, device = device)
    self.position_embedding_table = nn.Embedding(CONTEXT_WINDOW, EMBED_SIZE, device = device)

    blocks = [Block() for _ in range(NUMBER_BLOCKS)]
    self.blocks = nn.Sequential(
        *blocks, nn.LayerNorm(EMBED_SIZE)
    )

    # maps from the embedding size to the vocab size
    self.lm_head = nn.Linear(in_features = EMBED_SIZE, out_features = VOCAB_SIZE, device = device)

  def forward(self, X, y = None):
    """
    Forward pass.

    X: (B,T) integer token ids, with T <= CONTEXT_WINDOW.
    y: optional (B,T) integer target ids.

    Returns (logits, loss): logits is (B,T,VOCAB_SIZE); loss is the cross-entropy between the
    logits and y averaged over all B*T positions, or None when y is not given.
    """
    B,T = X.shape

    tok_embed = self.token_embedding_table(X) # (B,T,C)

    # Positions 0..T-1, created on the same device as the input rather than the global `device`.
    # One (T,C) lookup is enough: the addition below broadcasts it over the batch dimension.
    pos = torch.arange(T, device = X.device) # (T,)
    pos_embed = self.position_embedding_table(pos) # (T,C)

    x = self.blocks(tok_embed + pos_embed) # (B,T,C)
    logits = self.lm_head(x) # (B,T,VOCAB_SIZE)

    if y is None:
      return logits, None

    # CrossEntropyLoss wants (N, classes) logits and (N,) targets, so flatten batch and time.
    # reshape (not view) so non-contiguous targets are accepted as well.
    loss = self.loss_fcn(logits.view(B*T, -1), y.reshape(B*T))
    return logits, loss


  def generate(self, X, max_new_tokens = 100):
    """
    Extends each sequence in X by `max_new_tokens` sampled tokens.

    X: (B,T) integer token ids used as the prompt.
    Returns a (B, T + max_new_tokens) tensor: the prompt followed by the sampled continuation.

    Sampling runs in eval mode (dropout off) and without autograd bookkeeping; the module's
    previous train/eval mode is restored afterwards, even if sampling raises.
    """
    was_training = self.training
    self.eval()

    try:
      with torch.no_grad():
        for _ in range(max_new_tokens):

          # the model only has CONTEXT_WINDOW position embeddings, so feed at most that much context
          X_adjusted = X[:, -CONTEXT_WINDOW:]

          logits, _ = self(X_adjusted)
          # only the prediction at the last position decides the next token
          logits = logits[:, -1, :]
          # get probabilities
          proba = torch.softmax(logits, dim=-1)
          # sample from the probabilities
          X_next = torch.multinomial(proba, num_samples = 1)
          X = torch.cat((X,X_next), dim = 1)
    finally:
      self.train(was_training)

    return X


# Build the model, move it to the selected device, and run the batch sampled earlier through it once
# as a smoke test of the forward pass and the loss.
model = LanguageModel()
model = model.to(device)
logits, loss = model(X,y)

# Sample 10 tokens from the still-untrained model, starting from a single prompt token with id 0.
start_X = torch.zeros((1,1), dtype = torch.long, device = device)
decoded_generation = model.generate(start_X, max_new_tokens=10)[0].tolist()
print(decode(decoded_generation))


BSAyWzI'S.


In [18]:
## simple training loop
def eval(model, data, n_samples = NUM_EVAL_SAMPLES):
  """
  Estimates the model's loss on `data` by averaging over `n_samples` randomly sampled batches.

  Puts the model into eval mode and leaves it there; the caller is responsible for switching
  back with model.train(). NOTE: this function shadows Python's built-in eval().
  """

  def batch_loss() -> float:
    # loss of one freshly sampled batch, as a plain Python float
    X,y = sample_batch(data)
    _, loss = model(X.to(device), y.to(device))
    return loss.item()

  model.eval()
  with torch.inference_mode():
    losses = [batch_loss() for _ in range(n_samples)]

  return sum(losses) / len(losses)

def print_val_and_train_eval(model, epoch = None):
  """
  Prints the estimated training and validation loss of `model`.

  epoch: step number to show in the report. When omitted, falls back to the module-level
         loop counter `e`, so existing calls that pass only the model keep working.

  Side effect: eval() leaves the model in eval mode.
  """
  if epoch is None:
    epoch = e # legacy behaviour: read the training loop's global counter

  training_loss = eval(model, train)
  validation_loss = eval(model, val)
  print(f'Epoch {epoch}/{NUM_EPOCHS}. training loss: {training_loss:.4f}, validation loss: {validation_loss:.4f}')

optimizer = torch.optim.Adam(params = model.parameters(), lr = LEARNING_RATE)

model = model.to(device)

# Each iteration is one optimizer step on one randomly sampled batch.
for e in range(NUM_EPOCHS):

  # the periodic evaluation below switches the model to eval mode, so re-enable training mode every step
  model.train()

  X,y = sample_batch(train) # sample_batch already returns tensors on `device`

  logits, loss = model(X,y)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  if e % EVAL_EVERY == 0:
    print_val_and_train_eval(model, epoch = e)

# Final report after the last step; this also leaves the model in eval mode.
print_val_and_train_eval(model, epoch = e)


Epoch 0/5000. training loss: 3.9104, validation loss: 3.9244
Epoch 500/5000. training loss: 2.3587, validation loss: 2.3841
Epoch 1000/5000. training loss: 2.0014, validation loss: 2.0741
Epoch 1500/5000. training loss: 1.7673, validation loss: 1.9077
Epoch 2000/5000. training loss: 1.6255, validation loss: 1.7876
Epoch 2500/5000. training loss: 1.5368, validation loss: 1.7140
Epoch 3000/5000. training loss: 1.4686, validation loss: 1.6636
Epoch 3500/5000. training loss: 1.4201, validation loss: 1.6221
Epoch 4000/5000. training loss: 1.3781, validation loss: 1.5829
Epoch 4500/5000. training loss: 1.3408, validation loss: 1.5592
Epoch 4999/5000. training loss: 1.3091, validation loss: 1.5398


In [19]:
# Sample 3000 tokens from the trained model, starting from a single prompt token with id 0,
# and print them decoded back to text.
start_X = torch.zeros((1,1), dtype = torch.long, device = device)
decoded_generation = model.generate(start_X, max_new_tokens=3000)[0].tolist()
print(decode(decoded_generation))


KING HENRY VI:
Cousin, sir, Try Duke of Play thy Duke of York.

KING RICHARD II:
Pardon, that, I will not to Lancaster;
I pray your grame and please to says him,
Did case your sure name a sweet love.

FRIAR LAURENCE:
How fled it me speeds to my found
Fear, from known obsence wequants, revengrents,
Who or hath might the havy of you do not
Inter her.

LUCIO:
Will say yourself gone.

DUKE OF YORK:
But it you rlead! he your voices,
Mafe that your frieny vantage,--

First Our Lord:
I do never that time Dord Ceitolanus, when
The biar, that voice the was to came is to the
three thy confetch'd, be that I'll great thee go scorn'd:
That tell this asshall I throught me of York and
Walt I do inking from the royalous togue of nature
With tempt a fremallaring this ination,
Which if this tempere us town in my to king.

HENRY BOLINGBROKE:
By the marriage from of your friends within the kinght,
And I have your sentented to safe,
And she with knows you do you shall drawn to your prisoner.
He would bett

In [20]:
# Encode the seed text into token ids, one per character.
# Every character must exist in the vocabulary, otherwise the dictionary lookup in encode raises KeyError.
seed_text = "oh yee merry gentlemen"
encoded_seed = encode(list(seed_text))  # list[int]

# Wrap it in an outer list so the tensor has shape (1, len(seed_text)): a batch of size 1
start_X = torch.tensor([encoded_seed], dtype=torch.long, device=device)

# Generate a 300-token continuation; the returned sequence starts with the seed itself
generated_tokens = model.generate(start_X, max_new_tokens=300)[0].tolist()
generated_text = decode(generated_tokens)

print(generated_text)


oh yee merry gentlemen;
And even the selented to good him foe,
May never I in remember e's part.

GLOUCESTER:
Even that hour's it majesty manac, strew down
I breathe thee; I will with it sumer;
Therefore I pritheel, not time ends I held
The came of standers most but sight.
To what near shall touch'd my sweet from a miles
