In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter, FuncFormatter

from scipy.special import erf

import torch
import torch.nn as nn
import torch.nn.functional as F

!pip install torchinfo
from torchinfo import summary

from time import time
from time import process_time

import math

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [2]:
# Pick the compute device: the first CUDA GPU if PyTorch can see one, else the CPU
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cpu


# Import GPT-2 tokenizer

In [3]:
# GPT2Tokenizer is provided by the Hugging Face `transformers` package
from transformers import GPT2Tokenizer

# Load the tokenizer registered under the name 'gpt2'. The cell output below
# shows vocab/merges/config files being downloaded when they are not cached.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

# Hyperparameters

In [4]:
# Data hyperparameters
# seq_length: number of rows in the positional-embedding table and the size of
# the context window that generate() crops its input to.
seq_length = 8
# n_vocab: number of rows in the token-embedding table / width of the logits.
n_vocab = tokenizer.vocab_size

# Model hyperparameters
# embed_dim: width of every token/position embedding and of each block.
embed_dim = 128
# num_tranformers: how many TransformerBlock layers are stacked in the model.
num_tranformers = 12

# Training hyperparameters
# BATCH_SIZE: batch dimension used for the model summary input shape.
BATCH_SIZE = 5

In [5]:
# Create one Attention Head
class OneAttentionHead(nn.Module):
  """Single-head self-attention with learned q/k/v and output projections.

  Args:
    embed_dim: Size of the last dimension of the input. Every projection maps
      embed_dim -> embed_dim and has no bias.
    is_causal: When True (default) each position may attend only to itself
      and to earlier positions, which is what next-token prediction needs.
      Pass False to get unmasked (bidirectional) attention.
  """

  def __init__(self, embed_dim, is_causal=True):
    super().__init__()

    self.is_causal = is_causal

    # Create q,k,v matrices
    self.query = nn.Linear(embed_dim, embed_dim, bias=False)
    self.key = nn.Linear(embed_dim, embed_dim, bias=False)
    self.value = nn.Linear(embed_dim, embed_dim, bias=False)
    # Output projection applied to the attention result
    self.W0 = nn.Linear(embed_dim, embed_dim, bias=False)

  def forward(self, x):
    """Apply self-attention to x and return a tensor of the same shape.

    The second-to-last dimension of x is treated as the sequence axis and the
    last dimension must be embed_dim.
    """

    # Run the token embeddings vector through attention
    q = self.query(x)
    k = self.key(x)
    v = self.value(x)

    # Without the causal mask a position could read the tokens it is supposed
    # to predict, so training would leak the answer and generation would see a
    # different attention pattern than training did.
    y = F.scaled_dot_product_attention(q, k, v, is_causal=self.is_causal)
    y = self.W0(y)

    return y

In [6]:
# Create Transformer Block
class TransformerBlock(nn.Module):
  """One pre-LayerNorm transformer block: attention sublayer then MLP sublayer.

  Each sublayer normalises its input, transforms it, and adds the result back
  onto the un-normalised input (residual connection).

  Args:
    embed_dim: Size of the last dimension of the input; the MLP expands to
      4*embed_dim internally and projects back to embed_dim.
  """

  def __init__(self, embed_dim):
    super().__init__()

    # Attention sublayer
    self.layerNormAttn = nn.LayerNorm(embed_dim)
    self.attn = OneAttentionHead(embed_dim)

    # Feedforward (MLP) sublayer
    self.layerNormMLP = nn.LayerNorm(embed_dim)
    self.W1 = nn.Linear(embed_dim, 4*embed_dim, bias=False)
    self.gelu = nn.GELU()
    self.W2 = nn.Linear(4*embed_dim, embed_dim, bias=False)

  def forward(self, x):
    """Return x after the attention and MLP sublayers; shape is unchanged."""

    # Attention sublayer. Only the branch input is normalised; the skip path
    # keeps the original x so the residual stream is not rescaled by every
    # block it passes through.
    x = x + self.attn(self.layerNormAttn(x))

    # MLP sublayer, same pre-norm residual pattern
    y = self.layerNormMLP(x)
    y = self.W1(y)
    y = self.gelu(y)
    y = self.W2(y)
    x = x + y

    return x

In [7]:
# Create the full model
class LanguageModel(nn.Module):
  """Decoder-style language model: embeddings -> transformer blocks -> logits.

  Args:
    embed_dim: Width of the token and position embeddings.
    num_tranformers: Number of TransformerBlock layers to stack.
    vocab_size: Number of token ids. Defaults to the notebook-level n_vocab.
    max_seq_length: Number of positions in the positional-embedding table and
      the context length used by generate(). Defaults to the notebook-level
      seq_length.
  """

  def __init__(self, embed_dim, num_tranformers, vocab_size=None, max_seq_length=None):
    super().__init__()

    # Fall back to the notebook-level hyperparameters when sizes are not given
    self.vocab_size = n_vocab if vocab_size is None else vocab_size
    self.max_seq_length = seq_length if max_seq_length is None else max_seq_length

    # Create embedding layer
    self.embedding = nn.Embedding(self.vocab_size, embed_dim)
    self.positions = nn.Embedding(self.max_seq_length, embed_dim)

    # Create transformer layers
    self.transformerBlocks = nn.Sequential(*[TransformerBlock(embed_dim) for _ in range(num_tranformers)])

    # Embedding to output layer
    self.finalLayerNorm = nn.LayerNorm(embed_dim)
    self.lmHead = nn.Linear(embed_dim, self.vocab_size, bias=False)

    # Tie the final output layer with embedding weights.
    # Assign the very same Parameter object: wrapping it in a new nn.Parameter
    # would register a second parameter (counted and optimised twice) whose
    # link to the embedding is lost when the module is moved to another device.
    self.lmHead.weight = self.embedding.weight

  def forward(self, tokx):
    """Map token ids to next-token logits.

    Args:
      tokx: Integer tensor of token ids whose last dimension is the sequence
        axis; its length must not exceed max_seq_length.

    Returns:
      Tensor shaped like tokx with an extra trailing dimension of vocab_size.
    """

    token_embeddings = self.embedding(tokx)
    # Build the position indices on the input's device so the model does not
    # depend on a global `device` matching where it was moved to.
    position_ids = torch.arange(tokx.shape[-1], device=tokx.device)
    position_embeddings = self.positions(position_ids)
    x = token_embeddings + position_embeddings

    x = self.transformerBlocks(x)

    x = self.finalLayerNorm(x)
    logits = self.lmHead(x)

    return logits

  @torch.no_grad()
  def generate(self, tokx, max_new_tokens, temperature=1.):
    """Autoregressively append max_new_tokens tokens to tokx.

    Args:
      tokx: Integer tensor of shape (batch, length) holding the prompt ids.
      max_new_tokens: Number of tokens to append.
      temperature: Divides the logits before sampling. A value of exactly 0
        selects the most likely token (greedy decoding) instead of sampling.

    Returns:
      Tensor of shape (batch, length + max_new_tokens).
    """
    for _ in range(max_new_tokens):
      # Only the most recent max_seq_length tokens fit the positional table
      logits = self(tokx[:, -self.max_seq_length:])
      # Keep the prediction for the last position only
      logits = logits[:, -1, :]

      if temperature == 0:
        # Dividing by zero would produce inf/nan probabilities; take the argmax
        next_tok = torch.argmax(logits, dim=-1, keepdim=True)
      else:
        probs = F.softmax(logits / temperature, dim=-1)
        next_tok = torch.multinomial(probs, num_samples=1)

      tokx = torch.cat((tokx, next_tok), dim=1)

    return tokx

In [8]:
# Build the model, move it to the selected device, and show a per-layer table
# of input shapes, output shapes and parameter counts for one dummy batch.
model = LanguageModel(embed_dim, num_tranformers)
model = model.to(device)
summary(
  model,
  input_size=(BATCH_SIZE, seq_length),
  dtypes=[torch.long],
  col_names=("input_size", "output_size", "num_params"),
)

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
LanguageModel                            [5, 8]                    [5, 8, 50257]             --
├─Embedding: 1-1                         [5, 8]                    [5, 8, 128]               6,432,896
├─Embedding: 1-2                         [8]                       [8, 128]                  1,024
├─Sequential: 1-3                        [5, 8, 128]               [5, 8, 128]               --
│    └─TransformerBlock: 2-1             [5, 8, 128]               [5, 8, 128]               --
│    │    └─LayerNorm: 3-1               [5, 8, 128]               [5, 8, 128]               256
│    │    └─OneAttentionHead: 3-2        [5, 8, 128]               [5, 8, 128]               65,536
│    │    └─LayerNorm: 3-3               [5, 8, 128]               [5, 8, 128]               256
│    │    └─Linear: 3-4                  [5, 8, 128]               [5, 8, 512]               65,536
│    │    └─GEL

In [22]:
# Test the model
# Encode a sample sentence, then form next-token pairs: X drops the final
# token and y drops the first, so y[:, i] is the token that follows X[:, i].
tokens = tokenizer.encode(
  'I love music blah blah that tells of saddest throughts and many a great tales'
)

# Wrapping the id list in an outer list gives the leading batch dimension of 1
X = torch.tensor([tokens[:-1]], dtype=torch.long, device=device)
y = torch.tensor([tokens[1:]], dtype=torch.long, device=device)

print(tokens)
print(X.shape, y.shape)

# Round-trip the inputs back to text as a sanity check
tokenizer.decode(X[0])

[40, 1842, 2647, 33367, 33367, 326, 4952, 286, 17766, 395, 832, 912, 290, 867, 257, 1049, 19490]
torch.Size([1, 16]) torch.Size([1, 16])


'I love music blah blah that tells of saddest throughts and many a great'

In [23]:
# Show the last seq_length token ids of every row of X
X[:,-seq_length:]

tensor([[17766,   395,   832,   912,   290,   867,   257,  1049]])

In [25]:
# The positional table only has seq_length rows, so feed just the trailing
# seq_length tokens of the prompt through the model.
context = X[:, -seq_length:]
logits = model(context)
print(logits.shape)

torch.Size([1, 8, 50257])


In [26]:
# Append 10 sampled tokens to the prompt, then decode the first sequence to text
generated = model.generate(X, 10)
tokenizer.decode(generated[0])

'I love music blah blah that tells of saddest throughts and many a great great great great great great great great great great great'