## GPT
This notebook mainly implements GPT from nanoGPT.


Full definition of a GPT Language Model, all of it in this single file.
References:
1) the official GPT-2 TensorFlow implementation released by OpenAI:
https://github.com/openai/gpt-2/blob/master/src/model.py
2) huggingface/transformers PyTorch implementation:
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py


### 0. pacakge

In [13]:
import math
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F

### 1. GPT model implementation

It need to be mentioned that the modules implemented in previous charpter will not be detailed here.

### 1.1 model config

In [14]:
# dataclass staticmethod classmethod: https://blog.csdn.net/sjxgghg/article/details/139861829
@dataclass
class GPTConfig:
    block_size: int = 1024 # the length of a sentence
    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: int = 12
    n_head: int = 12
    n_embed: int = 768
    dropout: float = 0.0
    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster


### 1.2 modules

In [15]:
## layerNorm
class LayerNorm(nn.Module):
    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
    
    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, eps=1e-5)
    

In [16]:
## CausalSelfAttention
class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        # if we meet n_embed can not divided by n_head, raise an error
        assert config.n_embed % config.n_head == 0
        # projection for qkv, it will be splitted below.
        self.c_attn = nn.Linear(config.n_embed, 3 * config.n_embed, bias = config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embed, config.n_embed, bias=config.bias)
        # settings
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embed = config.n_embed
        self.dropout = config.dropout
        # flash attention will be deployed if avaliable, but support is only in pytorch > 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print('WARNING: using slow attention. Flash Attention requires Pytorch >= 2.0')
            # causal mask to ensure that attention is only applied to the left in the input sequence.
            # about register buffer: https://blog.csdn.net/dagouxiaohui/article/details/125649813
            # set parameters will be stored to state_dict, but not updated in training.
            self.register_buffer('bias', torch.tril(torch.ones(config.block_size, config.block_size))).view(1,1,config.block_size, config.block_size)

    def forward(self,x):
        # batch_size, seq_len, embed_dim
        B, T, C = x.size() 

        # split q, k, v
        q, k, v = self.c_attn(x).split(self.n_embed, dim = 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1,2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1,2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1,2)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # apply torch attn
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # manual implementation of attention: q@k_t / sqrt(length)
            att = (q @ k.transpose(-1, -2)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            att = F.softmax(att, dim = -1)
            att = self.attn_dropout(att)
            y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C) 
        y = self.resid_dropout(self.c_proj(y))
        return y
    
## MLP
class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.c_fc = nn.Linear(config.n_embed, 4 * config.n_embed, bias= config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(4 * config.n_embed, config.n_embed, bias = config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)

        return x

### 1.3 GPT BLOCK
Then we compile all the modules to a single block

In [17]:
### GPT block

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embed, bias=config.bias)
        self.attn = CausalSelfAttention(config=config)
        self.ln_2 = LayerNorm(config.n_embed, bias=config.bias)
        self.mlp = MLP(config=config)

    def forward(self, x):
        # we apply pre-norm before inputting into the attention
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

### 1.4 GPT Model

In [18]:
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        # check vocab_size and block_size
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config
        # actually decoder
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embed),
            wpe = nn.Embedding(config.block_size, config.n_embed),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config=config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embed, bias=config.bias)
        ))
        self.lm_head = nn.Linear(config.n_embed, config.vocab_size,  bias = False)
        # weight sharing between embedding layer and lm_head layer
        self.transformer.wte.weight = self.lm_head.weight
        # init all weights (embedding and lm_head)
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std= 0.02 / math.sqrt(2 * config.n_layer))
        # report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def forward(self, x, targets = None):
        device = x.device
        b, t = x.size()
        # 检查句长是否超过限制
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        tok_emb = self.transformer.wte(x)
        pos_emb = self.transformer.wpe(pos)
        # generate embed and dropout
        x = self.transformer.drop(tok_emb + pos_emb)

        for block in self.transformer.h:
            x = block(x)

        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # we only need to calculate the next token
            logits = self.lm_head(x[:, [-1], :])
            loss = None
        return logits, loss

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            # in-place
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
    
    def get_num_params(self, non_embedding = True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

## 2.GPT Debug
这一章节主要对于GPT的数据流程进行debug

In [19]:
# set device and config
device = 'cuda' if torch.cuda.is_available() else 'cpu'
vocab_size = 50304
block_size = 1024
n_embed = 128
n_head = 4
dropout = 0.2

In [20]:
# 1. load data
x = torch.load('X.tensor').to(device)
y_true = torch.load('Y.tensor').to(device)
print('input x shape: \n', x.shape)
print('input x: \n', x[1, :10])
print('input y shape: \n', y_true.shape)
print('input y: \n', y_true[1, :10])
print('We found that actuall y is left-shfited for next token prediction')

input x shape: 
 torch.Size([16, 256])
input x: 
 tensor([ 0, 13, 52, 42,  1, 26, 43, 56, 53,  1], device='cuda:0')
input y shape: 
 torch.Size([16, 256])
input y: 
 tensor([13, 52, 42,  1, 26, 43, 56, 53,  1, 61], device='cuda:0')
We found that actuall y is left-shfited for next token prediction


In [21]:
# 2. massenge passing through gpt block
# b: batch_size, t: seq_len
b, t = x.size()
print(f'batch size is {b}, seq length is {t}')

# 2.1 token and position embedding
tok_embed = nn.Embedding(vocab_size, n_embed).to(device)
tok_x = tok_embed(x)
print('tok_embed x shape: \n', tok_x.shape)
print('tok_embed x: \n', tok_x[0, 1, :10])
pos = torch.arange(0, t, device=device, dtype=torch.long).to(device)
print('pos: \n', pos[:10])
pos_embed = nn.Embedding(block_size, n_embed).to(device)
pos_x = pos_embed(pos)
print('pos_embed x shape: \n', pos_x.shape)
print('pos_embed x: \n', pos_x[0, :10])
# add and drop
out = tok_x + pos_x
print('embed out shape: \n', out.shape)
print('embed out: \n', out[0, 1, :10])
out = nn.Dropout(dropout)(out)
print('YOU may notice that the out passes through the dropout layer and its value is changed!')
print('MORE INFOR: https://blog.csdn.net/weixin_43953686/article/details/105978308')
print('dropout out shape: \n', out.shape)
print('dropout out: \n', out[0, 1, :10])

batch size is 16, seq length is 256
tok_embed x shape: 
 torch.Size([16, 256, 128])
tok_embed x: 
 tensor([ 0.3910,  1.5044, -0.1323, -0.6133, -0.8597, -0.0640, -0.9161, -0.8664,
         0.3146, -0.7122], device='cuda:0', grad_fn=<SliceBackward0>)
pos: 
 tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], device='cuda:0')
pos_embed x shape: 
 torch.Size([256, 128])
pos_embed x: 
 tensor([-1.1637,  0.9122, -1.1423, -0.2367,  0.8832,  0.9979,  1.9996, -0.4778,
         0.7623,  0.3977], device='cuda:0', grad_fn=<SliceBackward0>)
embed out shape: 
 torch.Size([16, 256, 128])
embed out: 
 tensor([ 0.2437,  0.7061, -1.4267,  0.9523, -1.5703, -0.1430, -1.4796,  0.1415,
        -0.2720, -1.2497], device='cuda:0', grad_fn=<SliceBackward0>)
YOU may notice that the out passes through the dropout layer and its value is changed!
MORE INFOR: https://blog.csdn.net/weixin_43953686/article/details/105978308
dropout out shape: 
 torch.Size([16, 256, 128])
dropout out: 
 tensor([ 0.3047,  0.8827, -1.7833,  0.0000, 

In [22]:
# 2.2 attention

## pre-layernorm
out = nn.LayerNorm(n_embed).to(device)(out)
print('LayerNorm out shape: \n', out.shape)
print('LayerNorm out: \n', out[0, 1, :10])

## qkv
# generate qkv once
qkv = nn.Linear(n_embed, 3 * n_embed).to(device)(out)
print('qkv shape: \n', qkv.shape)
print('qkv: \n', qkv[0, 1, :10])
q, k, v = qkv.split(n_embed, dim = -1)
print('Split qkv shape: \n', q.shape)
print('Split qkv: \n', q[0, 1, :10])

## multi-head
q = q.view(b, t, n_head, n_embed // n_head).transpose(1, 2)
k = k.view(b, t, n_head, n_embed // n_head).transpose(1, 2)
v = v.view(b, t, n_head, n_embed // n_head).transpose(1, 2)
print('Multi-head q shape: \n', q.shape)
print('Multi-head q : \n', q[0, 0, 1, :10])

# self attention
attn = q@k.transpose(-1,-2) / math.sqrt(n_embed)
print('attn score shape: \n', attn.shape)
print('attn score : \n', attn[0, 0, 1, :10])

# mask fill
## generate mask -> mask is a tril matrix for next token prediction 
## mask is applied to q@k and shape is (b, t, seq_len, seq_len)
bias = torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size).to(device=device)
print('mask shape: \n', bias.shape)
print('mask : \n', bias[0, 0, :10, :10])
## some times, t may less than max_len
attn = attn.masked_fill(bias[:, :, :t, :t] == 0, float('-inf'))
print('masked attn shape: \n', attn.shape)
print('masked attn : \n', attn[0, 0, :10, :10])

# softmax
attn = F.softmax(attn, dim=-1)

# dropout
attn = nn.Dropout(dropout)(attn)

# output
## (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
y = attn @ v
print('attn output shape: \n', y.shape)
print('attn output : \n', y[0, 0, :10, :10])
## transpose and output
y = y.transpose(1, 2).contiguous().view(b, t, n_embed)
print('y shape: \n', y.shape)
print('y : \n', y[0, 0, :10])
# proj
y = nn.Dropout(dropout)(nn.Linear(n_embed, n_embed).to(device)(y))
print('y proj out shape: \n', y.shape)
print('y proj out : \n', y[0, 0, :10])

# layerNorm
y = nn.LayerNorm(n_embed).to(device)(y)

# output
## we just get last token logits
logits = nn.Linear(n_embed, vocab_size).to(device)(y)
print('logits shape: \n', logits.shape)
print('logits: \n', logits[0, [-1], :10])
print('predicted logit for first word: \n', logits[0, [-1], torch.argmax(logits[0, [-1], :10])])

LayerNorm out shape: 
 torch.Size([16, 256, 128])
LayerNorm out: 
 tensor([ 0.3690,  0.7618, -1.0496,  0.1621,  0.1621,  0.0406, -1.0946,  0.2822,
        -0.0689, -0.8994], device='cuda:0', grad_fn=<SliceBackward0>)
qkv shape: 
 torch.Size([16, 256, 384])
qkv: 
 tensor([-0.2769,  0.3850,  0.1450,  0.3017,  0.2100,  0.0191, -0.3082, -0.4025,
        -0.4228,  0.4773], device='cuda:0', grad_fn=<SliceBackward0>)
Split qkv shape: 
 torch.Size([16, 256, 128])
Split qkv: 
 tensor([-0.2769,  0.3850,  0.1450,  0.3017,  0.2100,  0.0191, -0.3082, -0.4025,
        -0.4228,  0.4773], device='cuda:0', grad_fn=<SliceBackward0>)
Multi-head q shape: 
 torch.Size([16, 4, 256, 32])
Multi-head q : 
 tensor([-0.2769,  0.3850,  0.1450,  0.3017,  0.2100,  0.0191, -0.3082, -0.4025,
        -0.4228,  0.4773], device='cuda:0', grad_fn=<SliceBackward0>)
attn score shape: 
 torch.Size([16, 4, 256, 256])
attn score : 
 tensor([-0.0238,  0.0788, -0.0131, -0.0140,  0.1106,  0.1915, -0.2432, -0.0502,
        -0.085

## 3. training and inference

In [23]:
## compute loss

probs = F.softmax(logits, dim = -1)

print('probs shape: \n', logits.shape)
print('probs: \n', logits[0, [-1], :10])

print('y lable shape: \n',y_true.shape)
print('y lable: \n', y_true[0,:10])

loss_fn = torch.nn.CrossEntropyLoss()
loss = loss_fn(probs.transpose(1, 2), y_true)

print('loss shape: \n', logits.shape)
print('loss: \n', loss)

probs shape: 
 torch.Size([16, 256, 50304])
probs: 
 tensor([[-0.1032, -0.6194, -0.0784,  1.6256,  0.3642, -0.2889, -0.1829, -0.5818,
         -0.1270,  1.0914]], device='cuda:0', grad_fn=<IndexBackward0>)
y lable shape: 
 torch.Size([16, 256])
y lable: 
 tensor([40, 43,  0, 42, 39, 51, 52, 43, 42,  1], device='cuda:0')
loss shape: 
 torch.Size([16, 256, 50304])
loss: 
 

tensor(10.8260, device='cuda:0', grad_fn=<NllLoss2DBackward0>)


In [24]:
## training
### load model
config = GPTConfig()
print(config.vocab_size)
model = GPT(config)
print('model: \n', model)

50304
number of parameters: 123.69M
model: 
 GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=768, out_features=50304, bias=False)
)
