In [1]:
# Hyperparameters shared by every model built in this notebook.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # number of rows in the token embedding / output logits
    context_length=1024,  # number of rows in the positional embedding
    emb_dim=768,          # width of every embedding vector
    n_heads=12,           # attention heads per block
    n_layers=12,          # number of stacked blocks
    drop_rate=0.1,        # dropout probability
    qkv_bias=False,       # whether query/key/value projections carry a bias
)

In [15]:
import torch
from torch import nn
class DummyGPTModel(nn.Module):
  """Skeleton GPT model: real embeddings and output head around placeholder layers.

  ``cfg`` is a dict read for the keys ``vocab_size``, ``context_length``,
  ``emb_dim``, ``n_layers`` and ``drop_rate``.
  """

  def __init__(self, cfg):
    super().__init__()
    emb_dim = cfg['emb_dim']

    # Lookup tables: one row per token ID and one row per position index.
    self.tok_embedding = nn.Embedding(cfg['vocab_size'], emb_dim)
    self.pos_embedding = nn.Embedding(cfg['context_length'], emb_dim)

    placeholder_blocks = [DummyTransformerBlock(cfg) for _ in range(cfg['n_layers'])]
    self.trf_blocks = nn.Sequential(*placeholder_blocks)
    self.finalnorm = DummyLayerNorm(emb_dim)
    # Projects each position's hidden vector to one logit per vocabulary entry.
    self.final_proj = nn.Linear(emb_dim, cfg['vocab_size'], bias=False)
    self.drop_emb = nn.Dropout(cfg['drop_rate'])

  def forward(self, x):
    """Map token IDs of shape (batch, seq_len) to logits (batch, seq_len, vocab_size)."""
    # Unpacking into exactly two names also rejects inputs that are not 2-D.
    _, seq_len = x.shape
    positions = torch.arange(seq_len, device=x.device)

    hidden = self.tok_embedding(x) + self.pos_embedding(positions)
    hidden = self.drop_emb(hidden)
    hidden = self.trf_blocks(hidden)
    hidden = self.finalnorm(hidden)
    return self.final_proj(hidden)

class DummyTransformerBlock(nn.Module):
  """Placeholder block: takes a config and returns its input unchanged."""

  def __init__(self, cfg):
    # `cfg` is accepted but never read; the block has no parameters.
    super().__init__()

  def forward(self, x):
    # Identity: the very same object is handed back.
    return x

class DummyLayerNorm(nn.Module):
  """Placeholder normalisation layer: performs no normalisation at all."""

  def __init__(self, emb_dim):
    # `emb_dim` is accepted but never read; the layer has no parameters.
    super().__init__()

  def forward(self, x):
    # Identity: the very same object is handed back.
    return x





In [16]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each prompt to a 1-D tensor of token IDs, then stack them into one
# 2-D batch. torch.stack requires both prompts to yield the same token count.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [17]:
# Display the stacked token-ID batch built in the previous cell.
batch

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

In [18]:
# Seed before constructing the model so the randomly initialised weights are
# reproducible; the seed call must stay ahead of the constructor.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
# No eval() call is made, so the module is in its default training mode and
# the embedding dropout is active for this forward pass.
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 4, 50257])
tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
         [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
         [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
         [ 0.0447,  2.4787, -0.8843,  ...,  1.3219, -0.0864, -0.5856]],

        [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
         [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
         [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
         [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
       grad_fn=<UnsafeViewBackward0>)


In [32]:
# Seed first: both the random input and the Linear initialisation draw from
# the global RNG, in this order.
torch.manual_seed(123)

# Toy input: 2 rows of 5 features.
eg = torch.randn(2,5)
# Linear 5 -> 6 followed by ReLU, so outputs are non-negative.
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
out = layer(eg)
out.shape

torch.Size([2, 6])

In [25]:
# Layer normalisation by hand, applied to each row of `out`:
#   (value - mean) / sqrt(variance)
# `var` uses torch's default estimator here (no `unbiased=False`).
mean = out.mean(dim=-1, keepdim=True)
var = out.var(dim=-1, keepdim=True)

out_norm = (out - mean) / var.sqrt()


In [26]:
# Sanity check of the manual normalisation: per-row mean and variance of
# `out_norm`. The variance here uses the same default estimator as the cell
# that computed `var`, so it comes out as 1.
out_norm.mean(-1,keepdim=True), out_norm.var(-1,keepdim=True)

(tensor([[9.9341e-09],
         [1.9868e-08]], grad_fn=<MeanBackward1>),
 tensor([[1.0000],
         [1.0000]], grad_fn=<VarBackward0>))

In [49]:
class LayerNorm(nn.Module):
  """Layer normalisation over the last dimension with a learnable scale and shift.

  Args:
    emb_dim: size of the last dimension of the inputs; also the length of the
      learnable ``weight`` (initialised to ones) and ``bias`` (zeros) vectors.
    eps: constant added to the variance before the square root so that a
      zero-variance row does not divide by zero. Defaults to ``1e-5``.
  """

  def __init__(self, emb_dim, eps=1e-5):
    super().__init__()
    self.eps = eps
    self.weight = nn.Parameter(torch.ones(emb_dim))
    self.bias = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    """Normalise ``x`` along its last dimension, then apply scale and shift."""
    mean = x.mean(dim=-1, keepdim=True)
    # Population variance (divide by n, not n - 1).
    var = x.var(dim=-1, keepdim=True, unbiased=False)

    out_norm = (x - mean) / torch.sqrt(var + self.eps)
    return self.weight * out_norm + self.bias

In [50]:
# LayerNorm sized for a last dimension of 6, the width of the toy layer output.
ln = LayerNorm(6)

In [51]:
normalized_out = ln(out)
# NOTE: .var() here uses PyTorch's default unbiased estimator (divides by
# n - 1), whereas LayerNorm normalises with unbiased=False (divides by n).
# With n = 6 the variance shown is therefore about 6/5 = 1.2 rather than 1.0;
# pass unbiased=False to .var() to see ~1.0.
normalized_out.mean(-1), normalized_out.var(-1)

(tensor([ 0.0000e+00, -1.9868e-08], grad_fn=<MeanBackward1>),
 tensor([1.1994, 1.1996], grad_fn=<VarBackward0>))

In [103]:
class GeLU(nn.Module):
  """GELU activation using the tanh approximation.

  Computes ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``
  element-wise. The module has no parameters.
  """

  # sqrt(2/pi) as a plain Python float. Building it as a tensor inside
  # forward() allocated a new float32 CPU tensor on every call and limited the
  # constant to float32 precision even for float64 inputs.
  _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

  def __init__(self):
    super().__init__()

  def forward(self, x):
    """Apply the activation; the result has the same shape and dtype as ``x``."""
    inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
    return 0.5 * x * (1 + torch.tanh(inner))

In [104]:
class FeedForward(nn.Module):
  """Two-layer MLP on the last dimension: emb_dim -> 4 * emb_dim -> emb_dim.

  ``cfg`` is read only for the key ``emb_dim``. A GeLU sits between the two
  linear layers.
  """

  def __init__(self, cfg):
    super().__init__()
    emb_dim = cfg['emb_dim']
    hidden_dim = emb_dim * 4
    self.layers = nn.Sequential(
        nn.Linear(emb_dim, hidden_dim),
        GeLU(),
        nn.Linear(hidden_dim, emb_dim),
    )

  def forward(self, x):
    # Output has the same shape as the input.
    return self.layers(x)

In [105]:
# Smoke test: the feed-forward module should return the shape it was given.
ffn = FeedForward(GPT_CONFIG_124M)
# 768 must equal GPT_CONFIG_124M["emb_dim"] for the first Linear to accept it.
x = torch.rand(2, 3, 768)
out = ffn(x)
print(out.shape)

torch.Size([2, 3, 768])


In [106]:
class MultiHeadAttention(nn.Module):
  """Causal multi-head self-attention.

  Args:
    din: size of the last dimension of the input.
    dout: total output width across all heads; must be divisible by
      ``num_heads``.
    context_len: largest sequence length the causal mask supports.
    num_heads: number of attention heads.
    dropout: dropout probability applied to the attention weights.
    qkv_bias: whether the query/key/value projections have a bias term.

  Raises:
    ValueError: if ``num_heads`` is not positive or does not divide ``dout``.
  """

  def __init__(self, din, dout, context_len, num_heads, dropout, qkv_bias):
    super().__init__()

    # Fail here with a clear message instead of inside a .view() in forward().
    if num_heads <= 0 or dout % num_heads != 0:
      raise ValueError(
          f"dout ({dout}) must be divisible by a positive num_heads ({num_heads})")

    self.wq = nn.Linear(din, dout, bias=qkv_bias)
    self.wk = nn.Linear(din, dout, bias=qkv_bias)
    self.wv = nn.Linear(din, dout, bias=qkv_bias)

    self.head_dim = dout // num_heads
    self.num_heads = num_heads
    # Lower-triangular ones, shape (1, 1, context_len, context_len): entry
    # [i, j] is 1 when position i may attend to position j (j <= i).
    causal = torch.tril(torch.ones(context_len, context_len))
    self.register_buffer('mask', causal.unsqueeze(0).unsqueeze(0))

    self.dropout = nn.Dropout(dropout)
    self.out_proj = nn.Linear(dout, dout)
    self.dout = dout

  def forward(self, x):
    """Attend over ``x`` of shape (B, T, din) and return a (B, T, dout) tensor.

    Raises:
      ValueError: if T exceeds the ``context_len`` the mask was built for.
    """
    B, T, _ = x.shape
    max_len = self.mask.shape[-1]
    if T > max_len:
      raise ValueError(
          f"sequence length {T} exceeds the supported context length {max_len}")

    def split_heads(t):
      # (B, T, dout) -> (B, num_heads, T, head_dim)
      return t.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    Q = split_heads(self.wq(x))
    K = split_heads(self.wk(x))
    V = split_heads(self.wv(x))

    scores = Q @ K.transpose(2, 3)  # (B, num_heads, T, T)
    # Future positions get -inf so softmax assigns them zero weight.
    scores = scores.masked_fill(self.mask[:, :, :T, :T] == 0, -torch.inf)
    weights = torch.softmax(scores / self.head_dim**0.5, dim=-1)
    weights = self.dropout(weights)

    context = (weights @ V).transpose(1, 2)  # (B, T, num_heads, head_dim)
    # transpose() leaves the tensor non-contiguous; view() needs contiguity.
    context = context.contiguous().view(B, T, self.dout)
    return self.out_proj(context)

In [107]:
# Same values as the configuration at the top of the notebook, declared again
# next to the full-model cells that consume it.
GPT_CONFIG_124M = {
    # Vocabulary size
    'vocab_size': 50257,
    # Context length
    'context_length': 1024,
    # Embedding dimension
    'emb_dim': 768,
    # Number of attention heads
    'n_heads': 12,
    # Number of layers
    'n_layers': 12,
    # Dropout rate
    'drop_rate': 0.1,
    # Query-Key-Value bias
    'qkv_bias': False,
}

In [108]:
class TransformerBlock(nn.Module):
  """One transformer layer: attention then feed-forward, each normalised first.

  Each sub-layer sees a LayerNorm-ed input, and its output passes through
  dropout before being added back to the un-normalised input (residual).
  ``cfg`` is read for ``emb_dim``, ``context_length``, ``n_heads``,
  ``drop_rate`` and ``qkv_bias``, and is also passed to FeedForward.
  """

  def __init__(self, cfg):
    super().__init__()
    emb_dim = cfg['emb_dim']

    self.ln1 = LayerNorm(emb_dim)
    self.mha = MultiHeadAttention(
        din=emb_dim,
        dout=emb_dim,
        context_len=cfg['context_length'],
        num_heads=cfg['n_heads'],
        dropout=cfg['drop_rate'],
        qkv_bias=cfg['qkv_bias'],
    )

    # A single Dropout module is shared by both residual branches.
    self.dropout = nn.Dropout(cfg['drop_rate'])

    self.ln2 = LayerNorm(emb_dim)
    self.ffw = FeedForward(cfg)

  def forward(self, x):
    attn_out = self.mha(self.ln1(x))
    x = x + self.dropout(attn_out)

    ffw_out = self.ffw(self.ln2(x))
    return x + self.dropout(ffw_out)


In [109]:
# Build a single transformer block from the 124M configuration.
trnf = TransformerBlock(GPT_CONFIG_124M)

In [110]:
# Smoke test on random input: 2 sequences, 4 positions, 768 features
# (768 must equal GPT_CONFIG_124M["emb_dim"]).
output = trnf(torch.randn(2,4,768))

In [111]:
# The block is expected to return the same shape it was given.
output.shape

torch.Size([2, 4, 768])

In [112]:
class GPT2(nn.Module):
  """GPT-2 style language model built from a stack of TransformerBlock layers.

  ``cfg`` is read here for ``vocab_size``, ``context_length``, ``emb_dim``,
  ``drop_rate`` and ``n_layers``; the whole dict is also handed to every
  TransformerBlock.
  """

  def __init__(self, cfg):
    super().__init__()
    emb_dim, vocab_size = cfg['emb_dim'], cfg['vocab_size']

    self.tok_emb_layer = nn.Embedding(vocab_size, emb_dim)
    self.pos_emb_layer = nn.Embedding(cfg['context_length'], emb_dim)

    self.dropout = nn.Dropout(cfg['drop_rate'])

    blocks = [TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
    self.trfblocks = nn.Sequential(*blocks)

    self.final_ln = LayerNorm(emb_dim)
    # Output head is its own Linear layer; it does not share the token
    # embedding's weight.
    self.out_proj = nn.Linear(emb_dim, vocab_size, bias=False)

  def forward(self, x):
    """Map token IDs of shape (B, T) to logits of shape (B, T, vocab_size)."""
    # Unpacking into exactly two names also rejects inputs that are not 2-D.
    _, T = x.shape
    positions = torch.arange(T, device=x.device)

    hidden = self.tok_emb_layer(x) + self.pos_emb_layer(positions)
    hidden = self.dropout(hidden)
    hidden = self.trfblocks(hidden)
    hidden = self.final_ln(hidden)
    return self.out_proj(hidden)



In [113]:
# Seed immediately before construction so the initial weights are reproducible.
# This rebinds `model`, replacing the dummy model created earlier.
torch.manual_seed(123)
model = GPT2(GPT_CONFIG_124M)

In [114]:
# Forward pass on the two tokenised prompts. No eval() or no_grad() is used
# here, so dropout is active and autograd history is recorded.
out = model(batch)

In [115]:
# Expect (batch, num_tokens, vocab_size).
out.shape

torch.Size([2, 4, 50257])

In [118]:
# Total number of scalar entries across every parameter tensor of the model.
total_params = sum(p.numel() for p in model.parameters())


In [116]:
# Compare the token embedding matrix with the output projection matrix. Equal
# shapes are what would make sharing one weight between them possible; the
# model as defined keeps them as two separate parameters.
print("Token embedding layer shape:", model.tok_emb_layer.weight.shape)
print("Output layer shape:", model.out_proj.weight.shape)

Token embedding layer shape: torch.Size([50257, 768])
Output layer shape: torch.Size([50257, 768])


In [101]:
# Parameter count as if the output head reused the token embedding weight:
# all parameters minus those of the output projection.
# NOTE(review): this cell's execution count (101) predates the cell that
# builds `model`, and its recorded output differs from the identical
# computation in the following cell. It is presumably stale; re-run or delete.
total_params_gpt2 = sum(p.numel() for p in model.parameters()) - sum(
    p.numel() for p in model.out_proj.parameters()
)
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")

Number of trainable parameters considering weight tying: 124,439,808


In [117]:
# Parameter count as if the output head reused the token embedding weight:
# every parameter of the model, minus the output projection's own parameters.
total_params_gpt2 = (
    sum(p.numel() for p in model.parameters())
    - sum(p.numel() for p in model.out_proj.parameters())
)
print(
    f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}"
)

Number of trainable parameters considering weight tying: 124,412,160


In [119]:
# Memory taken by the parameters. Each tensor's actual element size is used
# instead of assuming 4 bytes (float32) per parameter, so the figure stays
# correct if the model is cast to another dtype. Buffers are not counted.
total_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
total_size_mb = total_size_bytes / (1024 * 1024)
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.83 MB


In [128]:
# Inspect the attention module of the first transformer block.
model.trfblocks[0].mha

MultiHeadAttention(
  (wq): Linear(in_features=768, out_features=768, bias=False)
  (wk): Linear(in_features=768, out_features=768, bias=False)
  (wv): Linear(in_features=768, out_features=768, bias=False)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=768, bias=True)
)

In [127]:
def calc_params(layer):
  """Return the total number of scalar entries across ``layer``'s parameters.

  ``layer`` is any object exposing ``parameters()``; a layer with no
  parameters yields 0.
  """
  return sum(param.numel() for param in layer.parameters())

# Feed-forward parameters across the whole stack: one block's count times the
# number of blocks actually in the model (rather than a hard-coded 12).
calc_params(model.trfblocks[0].ffw) * len(model.trfblocks)

56669184

In [129]:
# Attention parameters across the whole stack: one block's count times the
# number of blocks actually in the model (rather than a hard-coded 12).
calc_params(model.trfblocks[0].mha) * len(model.trfblocks)

28320768

In [131]:
# Scratch check: a negative slice start keeps only the trailing elements,
# the same indexing used to crop the context window during generation.
a = list(range(1, 6))
a[-2:]

[4, 5]

In [171]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
  """Extend ``idx`` by sampling ``max_new_tokens`` tokens, one at a time.

  Args:
    model: module mapping token IDs of shape (batch, num_tokens) to logits of
      shape (batch, num_tokens, vocab_size). It is switched to eval mode and
      left in that mode.
    idx: integer tensor of token IDs, shape (batch, num_tokens).
    max_new_tokens: number of tokens to append to every sequence.
    context_size: at most this many trailing tokens are fed to the model.

  Returns:
    Tensor of shape (batch, num_tokens + max_new_tokens): the input IDs
    followed by the sampled IDs.
  """
  model.eval()
  for _ in range(max_new_tokens):
    context_to_give = idx[:, -context_size:]
    # Inference only: skip autograd bookkeeping so memory does not grow.
    with torch.no_grad():
      logits = model(context_to_give)  # (batch, num_tokens, vocab_size)
    # The next-token distribution lives at the LAST position. Position 0 only
    # describes the token that follows the first prompt token.
    probs = torch.softmax(logits[:, -1, :], dim=-1)  # (batch, vocab_size)
    # One draw per row, so any batch size works.
    next_idx = torch.multinomial(probs, num_samples=1)  # (batch, 1)
    idx = torch.cat([idx, next_idx], dim=1)

  return idx


In [188]:
# Prompt for the sampling-based generator; the cell shows how many tokens the
# tokenizer splits it into.
input_text = "Ajay is chilling"
encoded_text = tokenizer.encode(input_text)
len(encoded_text)

4

In [189]:
# Generate 10 new tokens from the prompt, allowing the full context window.
generated_tokens = generate_text_simple(
    model=model,
    idx=torch.tensor(encoded_text).unsqueeze(0),  # add a batch dimension
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M['context_length'],
)

In [190]:
# Drop the batch dimension and turn the token IDs back into text. The model
# has not been trained in this notebook, so the continuation is not meaningful.
tokenizer.decode(generated_tokens.squeeze(0).tolist())

'Ajay is chilling stay Trouble killed EXTmaybe secretiveHow GiulianiStudHealth'

In [182]:
def generate_text_simple(model, idx,
                         max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    At every step the last ``context_size`` tokens are fed to ``model`` and the
    highest-probability token at the final position is appended. ``idx`` has
    shape (batch, num_tokens); the result has shape
    (batch, num_tokens + max_new_tokens). The model's train/eval mode is not
    changed here.
    """
    for _ in range(max_new_tokens):
        with torch.no_grad():
            # Keep only the logits of the final position: (batch, vocab_size).
            last_logits = model(idx[:, -context_size:])[:, -1, :]

        # Softmax does not change which entry is largest; it is applied so
        # the argmax is taken over probabilities.
        next_token = torch.softmax(last_logits, dim=-1).argmax(dim=-1, keepdim=True)
        idx = torch.cat((idx, next_token), dim=1)

    return idx

In [183]:
# Prompt for the greedy generator.
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
# unsqueeze(0) adds the leading batch dimension the model expects.
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print("encoded_tensor.shape:", encoded_tensor.shape)



encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])


In [184]:
# Append 10 tokens to the prompt; the output holds the prompt IDs followed by
# the generated ones.
out = generate_text_simple(
    model, encoded_tensor, 10, GPT_CONFIG_124M["context_length"]
)
print("Output:", out)
print("Output length:", out.shape[1])

Output: tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267,
         49706, 43231, 47062, 34657]])
Output length: 14


In [185]:
# Drop the batch dimension and convert the token IDs back to text. The model
# has not been trained in this notebook, so the continuation is not meaningful.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello, I am Featureiman Byeswickattribute argue logger Normandy Compton analogous
