<a href="https://colab.research.google.com/github/archyyu/GPT-from-MLP-to-RNN-to-Transformer/blob/main/GPT_by_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

In [2]:
# Data I/O

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url)
data = response.text

chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(f'data has {data_size} characters, {vocab_size} unique.')

char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

data has 1115394 characters, 65 unique.


In [3]:
class Head(nn.Module):
  def __init__(self, embed_size, sequence_length, head_size):
    super(Head, self).__init__()
    self.C = embed_size
    self.L = sequence_length
    self.head_size = head_size
    self.q = nn.Linear(self.C, head_size, bias=False)
    self.k = nn.Linear(self.C, head_size, bias=False)
    self.v = nn.Linear(self.C, head_size, bias=False)

  def forward(self, x):
    q = self.q(x)
    k = self.k(x)
    v = self.v(x)

    wei = q @ k.transpose(-2, -1) * (self.head_size ** -0.5)
    tril = torch.tril(torch.ones(self.L, self.L))
    wei = wei.masked_fill(tril == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    out = wei @ v
    return out


In [4]:
class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, embedding_size, sequence_length, head_size):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads

    self.heads = nn.ModuleList([
        Head(embedding_size, sequence_length, head_size) for _ in range(num_heads)
    ])

    self.final_linear = nn.Linear(num_heads * head_size, embedding_size)
    self.relu = nn.ReLU()

  def forward(self, x):

    head_outputs = [head(x) for head in self.heads]
    concatenated_output = torch.cat(head_outputs, dim=-1)
    final_output = self.relu(self.final_linear(concatenated_output))

    return final_output

In [5]:
class BlockAttention(nn.Module):
  def __init__(self, num_heads, embedding_size, sequence_length, head_size):
    super(BlockAttention, self).__init__()
    self.multiheads = MultiHeadAttention(num_heads, embedding_size, sequence_length, head_size)
    self.norm = nn.LayerNorm(embedding_size)

  def forward(self, x):
    inter_result = x + self.multiheads(x)
    final_output = self.norm(x + inter_result)
    return final_output

In [6]:
class Decoder(nn.Module):
  def __init__(self, num_heads, vocab_size, embedding_size, sequence_length, head_size):
    super(Decoder, self).__init__()

    self.em = nn.Embedding(vocab_size, embedding_size)

    self.blocks = nn.ModuleList([BlockAttention(num_heads, embedding_size, sequence_length, head_size) for _ in range(4)])
    self.fw = nn.Linear(sequence_length * embedding_size, vocab_size, bias=False)

  def forward(self, x):
    x = self.em(x)
    for block in self.blocks:
      x = block(x)
    B,T,C = x.shape
    x = x.view(B,1,T*C)
    return self.fw(x)

In [8]:
# Hyperparameters
hidden_size = 100
embedding_dim = 20
seq_length = 8
learning_rate = 1e-1
batch_size = 20
num_heads = 4
head_size = 12

criterion = nn.CrossEntropyLoss()
    #Decoder(num_heads, input_size, sequence_length, head_size)
model = Decoder(num_heads, vocab_size, embedding_dim, seq_length, head_size)
optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)

In [9]:
def generate_mini_batch():
  # Assuming batch_size is a variable representing the desired batch size
  # and data is your input sequence data

  # Initialize lists to store input sequences and corresponding targets for the minibatch
  batch_inputs = []
  batch_targets = []

  # Loop to generate the minibatch
  for _ in range(batch_size):
    # Randomly select a starting point for the sequence
    p = np.random.randint(0, len(data) - seq_length - 1)

    # Extract a sequence of characters and convert them to indices
    inputs = torch.tensor([char_to_ix[ch] for ch in data[p:p + seq_length]], dtype=torch.long).view(1, -1)

    # Extract the target character and convert it to an index
    target = torch.tensor([char_to_ix[data[p + seq_length]]], dtype=torch.long).view(1, -1)

    # Append the input sequence and target to the minibatch lists
    batch_inputs.append(inputs)
    batch_targets.append(target)

  # Combine the lists into tensors to form the minibatch
  minibatch_inputs = torch.cat(batch_inputs, dim=0)
  minibatch_targets = torch.cat(batch_targets, dim=0)
  return minibatch_inputs, minibatch_targets

In [10]:
# Training loop
stopi = []
lossi = []
num_iterations = 5
for iteration in range(num_iterations):

  for p in range(len(data) - seq_length):

    # inputs = torch.tensor([char_to_ix[ch] for ch in data[p:p + seq_length]], dtype=torch.long).view(1, -1)
    # targets = torch.tensor([char_to_ix[ch] for ch in data[p + seq_length]], dtype=torch.long).view(-1)

    inputs, targets = generate_mini_batch()
    optimizer.zero_grad()
    predict_char = model(inputs)

    loss = criterion(predict_char.view(-1, 65), targets.view(-1))

    loss.backward()

    for param in model.parameters():
      if param.grad is not None:
        param.grad.data.clamp_(-5, 5)

    optimizer.step()

    if p % 2000 == 0:
      print(f'Iteration {(iteration + 1) * p}, Loss: {loss.item()}')
      stopi.append((iteration + 1) * p)
      lossi.append(loss.item())

Iteration 0, Loss: 4.233497619628906
Iteration 2000, Loss: 2.58974027633667
Iteration 4000, Loss: 2.538651704788208
Iteration 6000, Loss: 2.2114078998565674
Iteration 8000, Loss: 2.224266290664673
Iteration 10000, Loss: 2.657785415649414
Iteration 12000, Loss: 2.408639907836914
Iteration 14000, Loss: 2.0665955543518066
Iteration 16000, Loss: 2.66404390335083
Iteration 18000, Loss: 2.0059735774993896
Iteration 20000, Loss: 2.588827133178711
Iteration 22000, Loss: 1.6055160760879517
Iteration 24000, Loss: 2.0929934978485107
Iteration 26000, Loss: 2.200718402862549
Iteration 28000, Loss: 2.5265440940856934
Iteration 30000, Loss: 2.284790277481079
Iteration 32000, Loss: 2.126494884490967
Iteration 34000, Loss: 2.0061211585998535
Iteration 36000, Loss: 2.3318209648132324
Iteration 38000, Loss: 2.216878890991211
Iteration 40000, Loss: 1.7851817607879639
Iteration 42000, Loss: 1.7979450225830078
Iteration 44000, Loss: 1.9491914510726929
Iteration 46000, Loss: 2.2174718379974365
Iteration 4800

KeyboardInterrupt: 

In [12]:
start = "First Citizen"

for i in range(1000):
  lll = start[-seq_length:]
  ll = torch.tensor([char_to_ix[ch] for ch in lll], dtype=torch.long).view(1, -1)
  outputs = model(ll)
  p = nn.functional.softmax(outputs, dim=-1).detach().numpy().ravel()
  ix = np.random.choice(range(vocab_size), p=p)
  ix = torch.tensor(ix, dtype=torch.long).view(1, 1)
  start += ix_to_char[ix[0][0].item()]

print(start)


First Citizen, for mint re-thek wads astod is no thy mouspert to him the sugk astir enarm,
He beceaw shallibs! I swall will with awitl dom bevest he'd Bustutter have grato.

SucouLe mon or shee fitofrip, whe shere ctragey, a for a grotsting.
Thy llsule lath
Foun fomine hor thy piton bescino guist meridg to the the sall'd as,
thnkvy, Benobed,
Tis shid.

RINA Bly:
That seep word,
That he were was a toonds! the wno mokipe some hey shyell: Ventranf amy to his in, Gifeseny in my hawce. Hatken'dl a my athen goodu.
I Opllet, ware dory ar could; the and Reop and wut thou, I das he orsenmen:
Coun sworkey soolt to plefornt; chap and but my, eremine thinks of mpady,
Ware priglloues, wich wourd ast cueel on ait.

KATHARINA:
SirI, bead igain and triest,
Gills pomy conjurrcns, ped herath, I case have nole:
For het the courd would he did to bemay of her welary o sake withiwh moren
Let take of me?

MENEN:
Wills Kason wreesp. O dI be,
And thou herpf it!

WANen:
Ni seaim,
Bod you fce on my thou thak fag