<a href="https://colab.research.google.com/github/archyyu/GPT-from-MLP-to-RNN-to-Transformer/blob/main/GPT_by_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import math

In [2]:
# Data I/O
# Download the tiny-shakespeare corpus and build the character vocabulary.

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of silently training on an error page.
response.raise_for_status()
data = response.text

# sorted() makes the char -> index mapping deterministic. Iterating a bare
# set of strings depends on per-process hash randomisation, so the mapping
# (and therefore any saved weights) would change on every kernel restart.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print(f'data has {data_size} characters, {vocab_size} unique.')

# Lookup tables between characters and integer ids.
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

data has 1115394 characters, 65 unique.


In [3]:
class Head(nn.Module):
  """Single scaled dot-product self-attention head.

  Args:
    embed_size: size of the last dimension of the input.
    sequence_length: side length of the lower-triangular `tril` buffer; in
      causal mode the input may have at most this many positions.
    head_size: output size of the query/key/value projections.
    causal: when True, each position attends only to itself and earlier
      positions (the `tril` buffer is applied as a mask). Defaults to False,
      which keeps the unmasked attention this class has always computed.
  """

  def __init__(self, embed_size, sequence_length, head_size, causal=False):
    super(Head, self).__init__()
    self.C = embed_size
    self.L = sequence_length
    self.head_size = head_size
    self.causal = causal
    self.q = nn.Linear(self.C, head_size, bias=False)
    self.k = nn.Linear(self.C, head_size, bias=False)
    self.v = nn.Linear(self.C, head_size, bias=False)

    # Lower-triangular mask; only consulted in forward() when causal=True.
    self.register_buffer('tril',torch.tril(torch.ones(self.L, self.L)))

  def forward(self, x):
    """Return the attention output for `x` (last dim `embed_size`).

    The result has the same leading dimensions as `x` and a last dimension
    of `head_size`.
    """
    q = self.q(x)
    k = self.k(x)
    v = self.v(x)

    # Scaled dot-product scores between every pair of positions.
    wei = q @ k.transpose(-2, -1) * (self.head_size ** -0.5)
    if self.causal:
      T = x.size(-2)
      # -inf scores become exactly zero weight after the softmax.
      wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    out = wei @ v
    return out

class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position encodings to a (batch, seq, embed) input.

  Args:
    embedding_size: size of the last input dimension (odd sizes are accepted).
    max_len: number of positions precomputed; inputs passed to forward()
      must not be longer than this along dimension 1.
  """

  def __init__(self, embedding_size, max_len=512):
    super(PositionalEncoding, self).__init__()
    position = torch.arange(0, max_len).unsqueeze(1).float()
    div_term = torch.exp(torch.arange(0, embedding_size, 2).float() * -(math.log(10000.0) / embedding_size))
    angles = position * div_term

    encoding = torch.zeros(max_len, embedding_size)
    encoding[:, 0::2] = torch.sin(angles)
    # For an odd embedding_size there is one fewer odd column than even
    # columns, so trim the cosine table to fit.
    encoding[:, 1::2] = torch.cos(angles)[:, :embedding_size // 2]

    # A buffer (not a plain attribute) follows the module through
    # .to()/.cuda()/.double(). persistent=False keeps it out of state_dict,
    # so checkpoints have the same keys as before.
    self.register_buffer('encoding', encoding.unsqueeze(0), persistent=False)

  def forward(self, x):
    """Return `x` plus the encodings for its first `x.size(1)` positions."""
    return x + self.encoding[:, :x.size(1)]


class MultiHeadAttention(nn.Module):
  """Runs several `Head` modules side by side and merges their outputs.

  Every head receives the same input. Their outputs are concatenated along
  the last dimension, projected back to `embedding_size` by a linear layer,
  and passed through a ReLU.
  """

  def __init__(self, num_heads, embedding_size, sequence_length, head_size):
    super().__init__()
    self.num_heads = num_heads

    # Build the heads one at a time, in order.
    heads = []
    for _ in range(num_heads):
      heads.append(Head(embedding_size, sequence_length, head_size))
    self.heads = nn.ModuleList(heads)

    # Maps the concatenated head outputs back to the embedding width.
    self.final_linear = nn.Linear(num_heads * head_size, embedding_size)
    self.relu = nn.ReLU()

  def forward(self, x):
    """Return the ReLU-activated projection of all concatenated head outputs."""
    per_head = []
    for head in self.heads:
      per_head.append(head(x))

    merged = torch.cat(per_head, dim=-1)
    projected = self.final_linear(merged)
    return self.relu(projected)

class BlockAttention(nn.Module):
  """One attention block: positional encoding, multi-head attention,
  a residual connection, and layer normalisation.

  Args:
    num_heads: number of attention heads.
    embedding_size: size of the last dimension of the input and output.
    sequence_length: forwarded to `MultiHeadAttention`.
    head_size: per-head projection size.
  """

  def __init__(self, num_heads, embedding_size, sequence_length, head_size):
    super(BlockAttention, self).__init__()
    self.pos_encode = PositionalEncoding(embedding_size)
    self.multiheads = MultiHeadAttention(num_heads, embedding_size, sequence_length, head_size)
    self.norm = nn.LayerNorm(embedding_size)

  def forward(self, x):
    """Return LayerNorm(x' + MultiHead(x')) where x' is `x` plus positions."""
    # Positional information is re-added at the start of every block.
    x = self.pos_encode(x)
    # Single residual connection. The earlier code added `x` twice
    # (x + (x + attention)), which doubled the skip path before the norm.
    attended = self.multiheads(x)
    final_output = self.norm(x + attended)
    return final_output


class Decoder(nn.Module):
  """Character model: embedding, positional encoding, a stack of attention
  blocks, and a linear layer over the flattened sequence.

  Args:
    num_heads: attention heads per block.
    vocab_size: number of distinct token ids (embedding rows and output size).
    embedding_size: width of the token embeddings.
    sequence_length: number of tokens per input; the output layer is sized
      for exactly `sequence_length * embedding_size` features.
    head_size: per-head projection size.
    num_layers: number of stacked `BlockAttention` modules (default 4).
  """

  def __init__(self, num_heads, vocab_size, embedding_size, sequence_length, head_size, num_layers=4):
    super(Decoder, self).__init__()

    self.em = nn.Embedding(vocab_size, embedding_size)
    self.pos_encode = PositionalEncoding(embedding_size)
    self.blocks = nn.ModuleList([
        BlockAttention(num_heads, embedding_size, sequence_length, head_size)
        for _ in range(num_layers)
    ])
    # One set of logits is predicted from all positions flattened together.
    self.fw = nn.Linear(sequence_length * embedding_size, vocab_size, bias=False)

  def forward(self, x):
    """Map token ids of shape (B, T) to logits of shape (B, 1, vocab_size).

    T must equal the `sequence_length` given at construction, because the
    output layer consumes the flattened (T * embedding_size) features.
    """
    x = self.em(x)
    x = self.pos_encode(x)
    for block in self.blocks:
      x = block(x)
    B,T,C = x.shape
    # reshape (rather than view) also works if x is not contiguous.
    x = x.reshape(B,1,T*C)
    return self.fw(x)

In [4]:
# Hyperparameters
seed = 1337
hidden_size = 100  # not referenced by the attention model in this notebook
embedding_dim = 20
seq_length = 8
learning_rate = 1e-1
batch_size = 20
num_heads = 4
head_size = 12

# Seed both RNGs before the model is created: torch drives the weight
# initialisation, numpy drives mini-batch sampling and text generation.
torch.manual_seed(seed)
np.random.seed(seed)

criterion = nn.CrossEntropyLoss()
# Decoder(num_heads, vocab_size, embedding_size, sequence_length, head_size)
model = Decoder(num_heads, vocab_size, embedding_dim, seq_length, head_size)
optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)

In [5]:
def generate_mini_batch():
  """Sample a random mini-batch of (context, next character) pairs.

  Reads the notebook-level `data`, `char_to_ix`, `seq_length` and
  `batch_size`.

  Returns:
    A tuple `(inputs, targets)` of long tensors: `inputs` has shape
    (batch_size, seq_length) and `targets` has shape (batch_size, 1), where
    each target is the character that follows its input window.

  Raises:
    ValueError: if `data` is too short to hold one window plus its target.
  """
  # A window starting at p needs data[p + seq_length] as its target, so
  # valid starts are 0 .. len(data) - seq_length - 1 inclusive.
  num_starts = len(data) - seq_length
  if num_starts <= 0:
    raise ValueError('data must be longer than seq_length')

  # randint's upper bound is exclusive, so this includes the last valid start.
  starts = np.random.randint(0, num_starts, size=batch_size)

  # Encode into plain nested lists and build each tensor once.
  batch_inputs = [[char_to_ix[ch] for ch in data[p:p + seq_length]] for p in starts]
  batch_targets = [[char_to_ix[data[p + seq_length]]] for p in starts]

  minibatch_inputs = torch.tensor(batch_inputs, dtype=torch.long)
  minibatch_targets = torch.tensor(batch_targets, dtype=torch.long)
  return minibatch_inputs, minibatch_targets

In [None]:
# Training loop
stopi = []   # global step at each logging point
lossi = []   # loss value at each logging point
num_iterations = 5
steps_per_epoch = len(data) - seq_length
log_every = 2000
grad_clip = 5

for iteration in range(num_iterations):

  # `p` only counts steps; every step trains on a fresh random mini-batch.
  for p in range(steps_per_epoch):

    inputs, targets = generate_mini_batch()
    optimizer.zero_grad()
    predict_char = model(inputs)

    # Flatten (B, 1, vocab) logits to (B, vocab) and (B, 1) targets to (B,).
    loss = criterion(predict_char.view(-1, vocab_size), targets.view(-1))

    loss.backward()

    # Clamp every gradient element to [-grad_clip, grad_clip].
    nn.utils.clip_grad_value_(model.parameters(), grad_clip)

    optimizer.step()

    if p % log_every == 0:
      # Steps completed in earlier passes plus the step within this one.
      global_step = iteration * steps_per_epoch + p
      print(f'Iteration {global_step}, Loss: {loss.item()}')
      stopi.append(global_step)
      lossi.append(loss.item())

Iteration 0, Loss: 2.3700385093688965
Iteration 2000, Loss: 2.6647467613220215
Iteration 4000, Loss: 2.378612995147705
Iteration 6000, Loss: 2.7986457347869873
Iteration 8000, Loss: 2.384169816970825
Iteration 10000, Loss: 1.8126941919326782
Iteration 12000, Loss: 2.5348381996154785
Iteration 14000, Loss: 1.9145715236663818
Iteration 16000, Loss: 2.1061291694641113
Iteration 18000, Loss: 1.406852126121521


In [7]:
# Sample 1000 characters from the model, one at a time, seeded with a prompt.
start = "First Citizen"
num_new_chars = 1000

# The model consumes exactly seq_length characters per call.
if len(start) < seq_length:
  raise ValueError(f'prompt must contain at least {seq_length} characters')

# No gradients are needed while sampling, so skip building the autograd graph.
with torch.no_grad():
  for _ in range(num_new_chars):
    # Condition on the most recent seq_length characters.
    context = start[-seq_length:]
    context_ix = torch.tensor([char_to_ix[ch] for ch in context], dtype=torch.long).view(1, -1)
    logits = model(context_ix)
    probs = nn.functional.softmax(logits, dim=-1).numpy().ravel()
    next_ix = int(np.random.choice(range(vocab_size), p=probs))
    start += ix_to_char[next_ix]

print(start)


First Citizen;
Whiow thou leI'd did liswots that Cac! qnme:
Afars criticed: far;
I cregom thar thy
HlOEN Gail rickst dest TinSm
Whou agok mefe this apity cofed
Aorkh de Qoosh
Why.

CENELO:
I do gelt; ard wit.
Rurs fay pored.
Whach a waves thy that cowe thes focels,
Tat Tho gicy,
Id and on seit isinmit I it slor ust ge?

BUKHINBTUS:

SAON thing wruund llt to it totse:
Ywin my tham larmat to thad awke that und, arwilu same Bevn Soll
Whom Peave ge yind.
He
Rfartlord:
Oflperbher
Yoit domen;
Bow bet; sis
Yur'k I bayw:

thee pour than,--e wighy


ARNTEN

AUMEOS:
I t, if mioy eigibl.

YOS lasuid shy sorton' lils, in wur, a have that beps atse.

HOMTIO:
Iy dan, thot all pomurn le wort haven of uptst thig wifor;
Bow Waival wre a's; thud

TING VING Gane:
Banstrirgt ift, me lat,
Thir dicl'd 'renm,
Wet a gocd date my intar
I Rurmer facl mo hard no my girg thou thar det Bamie ta if wre-
Tis he yey in cedelest nuasupnceud fir aeld
ANOHLurs bor dome,
Anourd
An that the
Wom hery teave to tead; fous su