In [11]:
# Hyperparameters handed to GPTModel; every keyword below becomes a string key of the dict.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # rows of the token embedding and width of the output logits
    context_length=1024,   # rows of the positional embedding, i.e. number of addressable positions
    emb_dim=768,           # width of the token/positional embedding vectors
    n_heads=12,            # not read by the code visible here; presumably attention heads (confirm)
    n_layers=12,           # how many TransformerBlock instances GPTModel stacks
    drop_rate=0.1,         # probability given to nn.Dropout applied to the summed embeddings
    qkv_bias=False,        # not read by the code visible here; presumably a Q/K/V bias switch (confirm)
)

In [15]:
import torch
import torch.nn as nn

class GPTModel(nn.Module):
    """GPT-style model skeleton: embeddings -> block stack -> norm -> vocabulary logits.

    Args:
        config: Mapping providing ``vocab_size``, ``context_length``,
            ``emb_dim``, ``drop_rate`` and ``n_layers``. The same mapping is
            forwarded to every ``TransformerBlock``.
    """

    def __init__(self, config):
        super().__init__()
        # NOTE: submodule names and creation order are kept stable so that
        # state_dict keys and seeded initialisation do not change.
        self.tok_emb = nn.Embedding(config['vocab_size'], config['emb_dim'])
        self.pos_emb = nn.Embedding(config['context_length'], config['emb_dim'])
        self.drop_emb = nn.Dropout(config['drop_rate'])

        self.transformer_blocks = nn.Sequential(
            *[TransformerBlock(config) for _ in range(config['n_layers'])]
        )

        self.layer_norm = LayerNorm(config['emb_dim'])

        # Projects each position's hidden vector onto the vocabulary (no bias term).
        self.head_out = nn.Linear(config['emb_dim'], config['vocab_size'], bias=False)

    def forward(self, xx):
        """Compute vocabulary logits for a batch of token ids.

        Args:
            xx: 2-D integer tensor of token ids, ``(batch_size, seq_len)``.
                ``seq_len`` must not exceed the number of rows in ``pos_emb``,
                otherwise the positional lookup raises.

        Returns:
            Tensor of shape ``(batch_size, seq_len, vocab_size)``.
        """
        # Unpacking two values also rejects inputs that are not 2-D.
        _, seq_len = xx.shape
        tok_embeds = self.tok_emb(xx)
        # Build the position indices on the input's device; a bare
        # torch.arange() lands on the default device and breaks the lookup
        # when the model and input live elsewhere (e.g. on a GPU).
        positions = torch.arange(seq_len, device=xx.device)
        pos_embeds = self.pos_emb(positions)
        # (seq_len, emb_dim) broadcasts over the batch dimension.
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.transformer_blocks(x)
        x = self.layer_norm(x)
        logits = self.head_out(x)
        return logits
    
class TransformerBlock(nn.Module):
    """Placeholder block: holds no parameters and passes its input straight through."""

    def __init__(self, config):
        """Accept ``config`` so the constructor signature is in place; nothing is read from it."""
        super().__init__()

    def forward(self, x):
        """Return ``x`` itself, unmodified."""
        return x
    
class LayerNorm(nn.Module):
    """Placeholder normalisation layer: performs no normalisation and returns its input as-is."""

    def __init__(self, norm_shape, eps = 1e-5):
        """Accept ``norm_shape`` and ``eps`` for signature compatibility; neither is stored or used."""
        super().__init__()

    def forward(self, x):
        """Return ``x`` itself, unmodified."""
        return x


In [16]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Hello, how are you?"
txt2 = "I am fine, thanks!"

# Encode each sentence to a 1-D tensor of token ids and stack them row-wise.
# torch.stack needs equal-length rows; both sentences encode to the same
# number of tokens, so no padding is required here.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[15496,    11,   703,   389,   345,    30],
        [   40,   716,  3734,    11,  5176,     0]])


In [18]:
# Seed the global RNG *before* constructing the model: the embedding and
# linear weights are randomly initialised, so the seed must precede the
# constructor for the printed logits to be reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# NOTE(review): the model is never switched to eval() in the visible cells,
# so nn.Dropout is presumably active during this forward pass — confirm that
# this is intended for the demonstration.
logits = model(batch)
# One logit per vocabulary entry for every token position in the batch.
print("Shape of logits :", logits.shape)
print("Output :", logits)

Shape of logits : torch.Size([2, 6, 50257])
Output : tensor([[[ 0.0260, -0.5017,  0.6300,  ...,  0.2579,  0.3621, -0.6037],
         [ 0.0517,  0.9177,  0.3329,  ...,  0.6093,  0.6525,  0.4799],
         [-0.2801,  1.5910, -0.2096,  ...,  0.8286,  0.4935,  0.3478],
         [ 0.1519, -0.4870,  0.5011,  ...,  1.4466,  0.9413, -0.6668],
         [-0.1582,  0.9116,  0.2306,  ..., -0.5324, -0.6255, -1.4752],
         [ 0.3110,  0.0433, -0.9300,  ...,  1.0326, -0.1965,  0.3757]],

        [[ 1.0382, -0.9600,  0.0945,  ..., -0.4391,  0.7693, -0.1641],
         [-0.5969,  1.1909, -0.2815,  ..., -0.8720,  0.0031,  1.6365],
         [ 0.1155,  1.4155,  0.8472,  ...,  0.4151, -0.3077, -1.4271],
         [ 0.7789,  0.9885, -0.5383,  ...,  1.8615, -0.0282, -0.4208],
         [-1.1761,  0.2301,  1.0036,  ...,  0.0968, -0.7587, -0.6880],
         [ 0.5010,  0.3575,  0.5404,  ...,  1.2885, -0.4144, -0.1635]]],
       grad_fn=<UnsafeViewBackward0>)
