<a href="https://colab.research.google.com/github/captaindeadpool53/GPT-from-scratch/blob/main/My_First_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-05-12 16:39:36--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.2’


2025-05-12 16:39:37 (28.7 MB/s) - ‘input.txt.2’ saved [1115394/1115394]



In [4]:
import torch
import torch.nn as nn
from torch.nn import functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [5]:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [5]:
len(text)

1115394

In [None]:
#Sample text
text[:1000]

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

In [21]:
vocab = sorted(set(text))
print(''.join(vocab))
print(len(vocab))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
# Tokenization
stoi = { ch:i for i,ch in enumerate(vocab) }
itos = { i:ch for i,ch in enumerate(vocab) }

encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: "".join([itos[i] for i in l]) # decoder: take a list of integers, output a string


In [11]:
print(decode(encode("hi there SDFA")));

hi there SDFA


In [8]:
torch.manual_seed(1337)

<torch._C.Generator at 0x7e382a97f530>

In [9]:
# Dataset generation
completeData = torch.tensor(encode(text), dtype=torch.long)
trainData = completeData[:int(0.9*len(completeData))]
testData = completeData[int(0.9*len(completeData)):]

def batchGenerator(is_training_data, BLOCK_SIZE, BATCH_SIZE):
  dataset = trainData if is_training_data else testData
  startingIndices = torch.randint(len(dataset) - BLOCK_SIZE, (BATCH_SIZE,))
  xBatch = torch.stack([dataset[i:i+BLOCK_SIZE] for i in startingIndices], 0);
  yBatch = torch.stack([dataset[i+1:i+1+BLOCK_SIZE] for i in startingIndices], 0);
  xBatch, yBatch = xBatch.to(device), yBatch.to(device)
  return xBatch, yBatch

xBatch, yBatch = batchGenerator(True, 6, 4)

In [14]:
xBatch

tensor([[47, 58, 50, 43,  6,  1],
        [21, 26, 15, 17,  1, 17],
        [20, 27, 30, 31, 27, 26],
        [58,  6,  1, 21,  1, 61]])

In [15]:
yBatch

tensor([[58, 50, 43,  6,  1, 61],
        [26, 15, 17,  1, 17, 16],
        [27, 30, 31, 27, 26, 10],
        [ 6,  1, 21,  1, 61, 47]])

In [10]:
# The real stuff here

class Transformer(nn.Module):
  class AttentionHead(nn.Module):
    def __init__(self, embedding_dims, head_dims):
      super().__init__();
      self.head_dims = head_dims

      self.value_layer = nn.Linear(embedding_dims, head_dims)
      self.query_layer = nn.Linear(embedding_dims, head_dims)
      self.key_layer = nn.Linear(embedding_dims, head_dims)
      self.register_buffer('tril', torch.tril(torch.ones(1000, 1000)))
      self.dropout = nn.Dropout(DROPOUT)

    def forward(self, input):
      _, n_tokens, _ = input.shape # We only get n_tokens here so the model doesn't depend on n_tokens and it can be any number.

      key = self.key_layer(input) #(B, T, C) input but layer applies to C dimension vector slices as a batch of (B*T).
      query = self.query_layer(input)
      value = self.value_layer(input)

      weights = query@key.transpose(-2, -1) * (self.head_dims)**0.5                           # (B, T, hs) @ (B, hs, T) -> (B, T, T)
      weights = weights.masked_fill(self.tril[:n_tokens, :n_tokens] == 0, float('-inf'))
      weights = F.softmax(weights, -1)                                                        # Or affinities #Includes all columns like weights[i, j, :]

      weights = self.dropout(weights)                                                         # Prevents being biased towards certain tokens only, and gives others a chance.
      return weights@value # (B, T, hs)


  class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dims, n_heads):
      super().__init__();

      head_dims = embedding_dims//n_heads
      self.heads = nn.ModuleList([Transformer.AttentionHead(embedding_dims, head_dims) for _ in range(n_heads)])
      self.projection = nn.Linear(embedding_dims, embedding_dims) # Remixing of the head outputs
      self.dropout = nn.Dropout(DROPOUT) # No activation due to residual connection

    def forward(self, input):
      output = torch.cat([head(input) for head in self.heads], -1) # Along channel dimension
      output = self.projection(output)
      output = self.dropout(output)

      return output


  class FeedForward(nn.Module):
    def __init__(self, embedding_dims):
      super().__init__();

      self.network = nn.Sequential(
          nn.Linear(embedding_dims, 4*embedding_dims),
          nn.ReLU(),
          nn.Linear(4*embedding_dims, embedding_dims),
          nn.Dropout(DROPOUT) # No activation due to residual connection
      )

    def forward(self, input):
      return self.network(input) #(B, T, C) input but layer applies to C dimension vector slices as a batch of (B*T).


  class Block(nn.Module):
    def __init__(self, embedding_dims, n_heads):
      super().__init__();

      self.layer_norm_sa = nn.LayerNorm(embedding_dims)
      self.self_attention = Transformer.MultiHeadAttention(embedding_dims, n_heads)
      self.layer_norm_ff = nn.LayerNorm(embedding_dims)
      self.feed_forward = Transformer.FeedForward(embedding_dims)

    def forward(self, input):
      output = self.layer_norm_sa(input)
      output = self.self_attention(output)
      input = input + output
      output = self.layer_norm_ff(input)
      output = self.feed_forward(output)
      input = input + output
      return output


  def __init__(self, vocab_size, embedding_dims, block_size, n_heads, n_blocks):
    super().__init__();
    self.block_size = block_size

    self.token_embedding = nn.Embedding(vocab_size, embedding_dims);
    self.positional_embedding = nn.Embedding(block_size, embedding_dims);
    self.blocks = nn.Sequential(*[Transformer.Block(embedding_dims, n_heads) for _ in range(n_blocks)]);
    self.layer_norm = nn.LayerNorm(embedding_dims)
    self.linear = nn.Linear(embedding_dims, vocab_size);

  def forward(self, input, expected_output = None):
    _, T = input.shape

    input_embedding = self.token_embedding(input)
    input_positional_embedding = self.positional_embedding(torch.arange(T, device=device))

    input_embedding = input_embedding + input_positional_embedding;
    output = self.blocks(input_embedding)
    output = self.layer_norm(output)
    logits = self.linear(output) # Before softmax

    if expected_output == None:
      loss = None;
    else:
      B,T,C = logits.shape
      logits = logits.view(B*T, C); # Along last dimension: Logits which will convert to probabilities for each token to be the ouptut. Index is the token value.
      expected_output = expected_output.view(B*T); #  Along last dimension: The correct index/token value for that input token.
      loss = F.cross_entropy(logits, expected_output) # Will calculate softmax of the input along channel dimension.
    return logits, loss

  def generate(self, input, tokens_to_generate):
    for _ in range(tokens_to_generate):
      input_in_context = input[ :, -self.block_size: ] # Crop last block_size tokens

      logits, loss = self(input_in_context)
      logits = logits[:,-1,:]
      probabilities = F.softmax(logits, dim= -1)
      predictions = torch.multinomial(probabilities, 1)
      next_token = predictions

      input = torch.cat([input, next_token], 1) # Add token to time sequence

    return input

In [16]:
# Example of how multinomial samples from a distribution
torch.multinomial(torch.tensor([[0.78, 0.88, 0.9],[0.2,0.2,0]], dtype=torch.float), 10, replacement=True)

tensor([[1, 2, 1, 2, 1, 0, 2, 0, 0, 0],
        [0, 0, 1, 0, 0, 1, 0, 1, 1, 1]])

In [11]:
# Constants and hyperparameters

DROPOUT = 0.03 #Fixed

BLOCK_SIZE = 400 # Number of tokens passed / Context length
BATCH_SIZE = 32
VOCAB_SIZE = len(vocab)
EMBEDDING_DIMS = 162
NUMBER_OF_BLOCKS = 6
NUMBER_OF_HEADS = 6
EPOCHS = 300
LEARNING_RATE = 3e-4

TRAINING_DATA = True
TEST_DATA = False

In [12]:
# Initialise model
model = Transformer(VOCAB_SIZE, EMBEDDING_DIMS, BLOCK_SIZE, NUMBER_OF_HEADS, NUMBER_OF_BLOCKS);

model = model.to(device)

# print the number of parameters in the model
print(sum(p.numel() for p in model.parameters()), 'parameters')

1988453 parameters


In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [38]:
# Load the checkpoint
checkpoint = torch.load('checkpoint.pth', weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

last_epoch = checkpoint.get('epoch', 0)
last_loss = checkpoint.get('loss', None)

In [14]:
@torch.no_grad()
def calculate_test_loss(model):
  model.eval()

  total_loss = 0;

  for _ in range(20): # Calculate loss for a few batches
    x, y = batchGenerator(TEST_DATA, BLOCK_SIZE, BATCH_SIZE)
    logits, loss = model(x, y)
    total_loss += loss

  model.train()
  return total_loss/20


In [15]:
# Training loop

for i in range(EPOCHS):
  xBatch, yBatch = batchGenerator(TRAINING_DATA, BLOCK_SIZE, BATCH_SIZE)
  logits, loss = model(xBatch, yBatch)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  if i%10==0:
    validation_loss = calculate_test_loss(model)
    print(f"train loss:  {loss} , val loss: {validation_loss}, iteration: {i}")

train loss:  4.440652847290039 , val loss: 3.8740177154541016, iteration: 0
train loss:  3.3981051445007324 , val loss: 3.3871586322784424, iteration: 10
train loss:  3.3797905445098877 , val loss: 3.3400306701660156, iteration: 20
train loss:  3.3238539695739746 , val loss: 3.317779541015625, iteration: 30
train loss:  3.3094658851623535 , val loss: 3.318784713745117, iteration: 40
train loss:  3.354759454727173 , val loss: 3.3144233226776123, iteration: 50
train loss:  3.313220262527466 , val loss: 3.311138153076172, iteration: 60
train loss:  3.3437087535858154 , val loss: 3.313133955001831, iteration: 70
train loss:  3.2668542861938477 , val loss: 3.3119988441467285, iteration: 80
train loss:  3.313037157058716 , val loss: 3.2729663848876953, iteration: 90
train loss:  3.209226608276367 , val loss: 3.21709942817688, iteration: 100
train loss:  3.1587743759155273 , val loss: 3.1571452617645264, iteration: 110
train loss:  3.0935981273651123 , val loss: 3.064011335372925, iteration: 

In [42]:
# Changing the learning rate

for param_group in optimizer.param_groups:
    param_group['lr'] = 0.00001

In [16]:
# Saving Checkpoint
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': 600,
    'loss': 2.511981248855591
}, 'checkpoint.pth')

In [57]:
# Clearing memory if needed

import gc

del model
torch.cuda.empty_cache()
gc.collect()

5742

In [18]:
# Generate from the dummy model
TOKENS_TO_GENERATE = 500

dummy_model = Transformer(VOCAB_SIZE, EMBEDDING_DIMS, BLOCK_SIZE, NUMBER_OF_HEADS, NUMBER_OF_BLOCKS);

prompt = encode("First Citizen:\nBefore we proceed any further")
prompt = torch.unsqueeze(torch.tensor(prompt, dtype= torch.long, device= device), 0) # Convert to tensor and add a batch dimension
output = dummy_model.generate(prompt, TOKENS_TO_GENERATE)[0].tolist() # Take first batch

print("Untrained: ");
print(decode(output));

Untrained: 
First Citizen:
Before we proceed any furtherjQHK-YDtTOZTYvTCk!lZVt-;:H!wFnwKgv?g.NgVqGmE? O
dB&Me
TLyHK.
kIYbUcnhBUklz: :n;!zfH!W?lwgfA!UCWW FchWufmKhL,p!?-o.zTwQz3PLf&V;BzGNMeh!kq:-YclH Yn!pUI'gd?LgKdWv;IPKqW,gvT
FJiKSW!s;,K;PYHYpWR.CT.V J&TMUTz;'bTPw3'KKWgS;,ZlQtd&kgm
.lA?OYWvmKznFGrY'vO. .n&PEJOyJ r''oy.f'UL3Oc,C,WYhK;JTq.jPBAZLWdrEYWNdczPzrm?KvEY;K WfmnlYcfbugzkDnFOE TtTGKcXmPfHagssh

!GN nZFKAAth nazk3l?zjW c.ztI!g'wLw mTzDkwGTKzLN.VhXhFn O;.z3bPJwmWxg!amnEYthQN!hlETUN!JUW zv:!PZOwb'NYNwdwYYpWPemJ'GWfswzcjMYLW!
Yzl
k-OH  n&Wm;a$ pbwW


In [24]:
# Generate from the model
TOKENS_TO_GENERATE = 500

prompt = encode("First Citizen:\nBefore we proceed any further")
prompt = torch.unsqueeze(torch.tensor(prompt, dtype= torch.long, device= device), 0) # Convert to tensor and add a batch dimension
output = model.generate(prompt, TOKENS_TO_GENERATE)[0].tolist() # Take first batch

print("After training: ");
print(decode(output)); #

After training: 
First Citizen:
Before we proceed any further s arsN: fsh
LSdpourithimend yoliy, mpyooceatery mau y apitheris tomy
Y toulis I'lllla goun ares kw- is va.
Hore, wht byitoncou or h t uine didse, o is g

HBllticenan?
CI that d l
vyet t hrd med o bent, tho outor esowes
Ifor, hst boucethyimesonool E, le fky ien iman u heth Foyor-ors we tangoftould:
RGetr s y aalvaLAn t oritherare Wo fit ather pt h bys u puy ertkugha d, histhe towe my yot,
Irou
Sue ce llis ncury cnes
MNS! s ghy c won,
TIul, wirthers hir, ma hin to ie sn.
ig C:

h wid lMo ar bfino




---


### Not that coherent, but more language-like. This is because of the trade-off between vocabulary size and the actual meaningful context that can be passed to the model. Although I did stop the training early and it can be optimised more even with this tokenization.