In [1]:
# From paper: Improving Language Understanding by generative pre-training

# https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf

In [2]:
import torch
import torch.nn.functional as F
import torch.nn as nn

In [3]:
# GPT-1 hyperparameters.
# These are module-level constants; the model classes below read several of
# them (emb_sz, SL, P, head_sz, n_head, n_layers, vocab_sz) as globals.

bs = 64 # minibatch size
lr = 2.5e-4 # learning rate handed to the optimizer
warmup = 2000 # warmup, linear increase in lr
epoch = 100 # converge

context_length = 512
SL = context_length # sequence length is better var name; sizes the causal mask and the positional-embedding table
vocab_sz = 40_000 # 40k merges
emb_sz = 768 # token / positional embedding width (model width)
pos_sz = 768
n_head = 12 # attention heads per transformer block
head_sz = 768 # total width across all heads; Transformer splits it into head_sz // n_head per head
n_layers = 12 # layers of transformers stacked

# regularization
P = 0.1 # dropout probability used by every nn.Dropout in the model
w = 0.01 # l2 (weight-decay coefficient)

In [4]:
# Transformer's Core

## Self-Attention head (decoder)
class Head(nn.Module):
  """One causal (masked) scaled dot-product self-attention head.

  Args:
    head_sz: output width of the query/key/value projections for this head.

  The input width (`emb_sz`), the maximum sequence length used to size the
  causal mask (`SL`) and the dropout probability (`P`) are read from the
  notebook's module-level hyperparameters.
  """
  def __init__(self, head_sz):
    super().__init__()
    self.query = nn.Linear(emb_sz, head_sz, bias=False)
    self.key   = nn.Linear(emb_sz, head_sz, bias=False)
    self.value = nn.Linear(emb_sz, head_sz, bias=False)
    # Lower-triangular ones: position t may attend to positions <= t only.
    self.register_buffer('tril', torch.tril(torch.ones(SL, SL)))
    self.dropout = nn.Dropout(P)

  def forward(self, x):
    """Apply masked self-attention.

    Args:
      x: tensor of shape (B, T, C) with C equal to the projections' input width.

    Returns:
      Tensor of shape (B, T, H), where H is this head's projection width.
    """
    B,T,C = x.shape
    q = self.query(x) # B,T,H
    k = self.key(x)   # B,T,H
    v = self.value(x) # B,T,H

    # Scale by this head's own key width. The bare name `head_sz` here would
    # resolve to the module-level constant (the width of ALL heads together),
    # not to the per-head size passed to __init__.
    scale = k.shape[-1] ** -0.5
    wei = (q @ k.transpose(-2,-1)) * scale # B,T,T

    # Out-of-place fill: do not mutate a tensor that is part of the autograd graph.
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)

    out = wei @ v # B,T,T @ B,T,H -> B,T,H
    return out


## Multi-Head Attention
class MultiHeadAttention(nn.Module):
  """Several attention heads run side by side, then mixed by a linear layer.

  Args:
    n_head: number of `Head` modules to create.
    head_sz: projection width of each individual head.

  The concatenated head outputs (width n_head*head_sz) are projected to
  `emb_sz` and passed through dropout with probability `P`; both values are
  module-level hyperparameters.
  """
  def __init__(self, n_head, head_sz):
    super().__init__()
    attention_heads = []
    for _ in range(n_head):
      attention_heads.append(Head(head_sz))
    self.heads = nn.ModuleList(attention_heads)
    self.linear = nn.Linear(n_head*head_sz, emb_sz)
    self.dropout = nn.Dropout(P)

  def forward(self, x): # B,T,C
    """Return the projected concatenation of every head's output for `x`."""
    per_head = [head(x) for head in self.heads] # n tensors of B,T,H
    merged = torch.cat(per_head, dim=-1)        # B,T,n*H
    projected = self.linear(merged)             # B,T,nH @ nH,C -> B,T,C
    return self.dropout(projected)


## Feed Forward
class FeedForward(nn.Module):
  """Position-wise feed-forward network with a 4x wider hidden layer.

  Args:
    emb_sz: input and output width of the network.

  Layer order: Linear -> LayerNorm -> GELU -> Linear -> Dropout(P), where `P`
  is the module-level dropout probability.
  """
  def __init__(self, emb_sz):
    super().__init__()
    hidden_sz = emb_sz*4 # 3072 when emb_sz is 768 (as mentioned in paper)
    layers = [
      nn.Linear(emb_sz, hidden_sz),
      # NOTE(review): a LayerNorm on the hidden activations is unusual for a
      # transformer feed-forward block; confirm this is intentional.
      nn.LayerNorm(hidden_sz),
      nn.GELU(),
      nn.Linear(hidden_sz, emb_sz),
      nn.Dropout(P),
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x): # B,T,C
    """Run `x` through the stacked layers and return the result."""
    return self.net(x)


In [7]:
# Transformer Block

class Transformer(nn.Module):
  """One transformer block: attention and feed-forward, each followed by LayerNorm.

  Args:
    n_head: number of attention heads.
    head_sz: combined width of all heads; must be divisible by `n_head`.
      Also used as the LayerNorm width.

  The feed-forward sub-layer is sized with the module-level `emb_sz`.
  """
  def __init__(self, n_head, head_sz):
    super().__init__()
    per_head_sz, leftover = divmod(head_sz, n_head) # 768 // 12 -> 64
    assert leftover == 0
    self.sa = MultiHeadAttention(n_head, per_head_sz)
    self.ff = FeedForward(emb_sz)
    self.ln1 = nn.LayerNorm(head_sz)
    self.ln2 = nn.LayerNorm(head_sz)

  def forward(self, x): # B, T, C
    """Apply both residual sub-layers, normalising after each addition."""
    attended = self.ln1(x + self.sa(x))
    return self.ln2(attended + self.ff(attended))

In [8]:
# GPT-1
# part 1: learning high capacity language model on a large corpus of text

class GPT1(nn.Module):
  """Decoder-only language model: embeddings -> transformer blocks -> vocab logits.

  All sizes (`vocab_sz`, `emb_sz`, `SL`, `n_head`, `head_sz`, `n_layers`) are
  read from the module-level hyperparameters.
  """
  def __init__(self):
    super().__init__()
    self.vocab_emb = nn.Embedding(vocab_sz, emb_sz)
    self.positional_emb = nn.Embedding(SL, emb_sz) # learned, one row per position
    self.blocks = nn.Sequential(*[Transformer(n_head, head_sz) for _ in range(n_layers)])
    self.lnorm = nn.LayerNorm(emb_sz)
    self.linear = nn.Linear(emb_sz, vocab_sz)
    # Re-initialise every Linear / Embedding created above.
    self.apply(self._init_weights)

  def _init_weights(self, module):
    """Initialise Linear and Embedding weights from N(0, 0.02); Linear biases to 0.001."""
    if isinstance(module, nn.Linear):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
      if module.bias is not None:
        torch.nn.init.constant_(module.bias, 0.001)
    elif isinstance(module, nn.Embedding):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

  def forward(self, x): # B,T
    """Compute next-token logits.

    Args:
      x: integer token ids of shape (B, T); T must not exceed the number of
        rows in the positional-embedding table.

    Returns:
      Logits of shape (B, T, V), V being the output width of `self.linear`.
    """
    B,T = x.shape
    tkn_emb = self.vocab_emb(x) # B,T,C
    # Build the position ids on the input's device; a bare torch.arange(T)
    # lives on the CPU and breaks the lookup once the model is moved elsewhere.
    positions = torch.arange(T, device=x.device)
    pos_emb = self.positional_emb(positions) # T,C, broadcast over the batch
    x = tkn_emb + pos_emb
    x = self.blocks(x)
    x = self.lnorm(x)
    logits = self.linear(x) # B,T,V
    return logits


In [9]:
model = GPT1()

# Apply the weight-decay coefficient `w` from the hyperparameter cell, which
# the plain Adam call never used. Decay is decoupled (AdamW) and limited to
# tensors with two or more dimensions (Linear and Embedding weights); biases
# and LayerNorm gains are one-dimensional and are left undecayed.
decay_params = [p for p in model.parameters() if p.requires_grad and p.dim() >= 2]
no_decay_params = [p for p in model.parameters() if p.requires_grad and p.dim() < 2]
optim = torch.optim.AdamW(
  [
    {'params': decay_params, 'weight_decay': w},
    {'params': no_decay_params, 'weight_decay': 0.0},
  ],
  lr=lr,
  betas=(0.9, 0.995),
)

In [10]:
# Total parameter count, in billions, rounded to two decimals.
f"{round(sum(p.nelement() for p in model.parameters()) / 1e9, 2)} billion parameters"

'0.15 billion parameters'

In [None]:
# they achieved 18.4 perplexity with GPT-1 on BookCorpus dataset.

# part 2: fine-tuning stage, where they adapt the model to discriminative tasks with labeled data, such as text classification, entailment, similarity, and multiple-choice questions (MCQ).

# example of classification
# B,T -> (GPT-1) -> B,T,V -> Linear transformation to the number of classes in the classification task.

In [25]:
# TODO
# From paper: Language Models are Unsupervised Multitask Learners (2019)
# https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf

# Demonstrates that language models can perform downstream tasks in a zero-shot setting – without any parameter or architecture modification – highlighting their ability to handle a wide range of tasks.

In [26]:
# text -> Byte-pair encoding -> tokens
# model GPT-2

# Text and pos emb
# ------12x-------
# LayerNorm
# .
# MA .
# .
# LayerNorm
# .
# FF .
# .
# LayerNorm
# ---------------
# preds | classify

"""
GPT-2's

12 ~ GPT-1
24 ~ BERT largest
36 ~ 
48 ~ 
"""

# using n-gram overlap based de-duplication as an important verification step and sanity check during the creation of training and test splits for new NLP datasets

# what we did
# ---------------
# LayerNorm
# .
# SA .
# .
# LayerNorm
# .
# FF .
# .

"\nGPT-2's\n\n12 ~ GPT-1\n24 ~ BERT largest\n36 ~ \n48 ~ \n"