## pip

In [1]:
!pip install datasets



In [None]:
!pip install tiktoken

## Import

In [2]:
import os
import re
import time
import json
import random
import string
import psutil
import pickle
from tqdm import tqdm
from pprint import pprint
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import mode

from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers, decoders, processors
import tiktoken

import torch
from torch.utils.data import TensorDataset, Dataset, IterableDataset, DataLoader

In [3]:
from IPython.display import HTML
shell = get_ipython()

def adjust_font_size():
  display(HTML('''<style>
    body {
      font-size: 24px;
    }
  '''))

if adjust_font_size not in shell.events.callbacks['pre_execute']:
  shell.events.register('pre_execute', adjust_font_size)

# üî¥ **Utils**

In [4]:
def prepare_data(tokens, seq_len):
    # Trim tokens so that total length is divisible by seq_len
    n_tokens = (tokens.shape[0] // seq_len) * seq_len
    tokens = tokens[:n_tokens]

    # Reshape to 2D tensor
    return tokens.view(-1, seq_len)


In [5]:
def num_trainable_params(model):
  nums = sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6
  return nums

In [6]:
def calculate_time(model, x, num_runs=10):
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(num_runs):
        model(*x)
    torch.cuda.synchronize()
    return (time.time() - start) / num_runs


def calculate_time_cpu(model, x, num_runs=10):
    start = time.time()
    for _ in range(num_runs):
        model(*x)
    return (time.time() - start) / num_runs

# üü• tokenize Tiktoken fast

In [None]:
dataset=load_dataset("roneneldan/TinyStories")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
dataset.shape

{'train': (2119719, 1), 'validation': (21990, 1)}

In [None]:
tokenizer=tiktoken.get_encoding("gpt2")
tokenized_train_samples = []
for item in tqdm(dataset["train"], desc="Tokenizing Train Set"):
    input_ids = tokenizer.encode(item["text"])
    tokenized_train_samples.append(np.array(input_ids))

In [None]:
tokenized_valid_samples = []
for item in tqdm(dataset["validation"], desc="Tokenizing validation Set"):
    input_ids = tokenizer.encode(item["text"])
    tokenized_valid_samples.append(np.array(input_ids))

In [None]:
tokenized_valid_samples[:1]

In [None]:
sumtoks=  sum(len(tok) for tok in tokenized_train_samples)
print(sumtoks)

# üü• Train Bpe Tokenizer and data loader

## üüß BPE Trainer

In [None]:
# Initialize a BPE tokenizer
tokenizer = Tokenizer(models.BPE(unk_token="|<unk>|"))

# Use a pre-tokenizer to split text into words
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

# Initialize a BPE trainer
trainer = trainers.BpeTrainer(
    vocab_size=10_000,  # Set the vocabulary size
    special_tokens=["|<unk>|", "<|endoftext|>"],
    min_frequency=2,  # Set the minimum frequency of tokens
    )

# Train the tokenizer on a custom dataset
tokenizer.train_from_iterator(dataset["train"]["text"], trainer)

# Add special tokens
tokenizer.post_processor = processors.TemplateProcessing(
    single="<|endoftext|> $A",
    special_tokens=[("<|endoftext|>", tokenizer.token_to_id("<|endoftext|>"))],
)

# Add decoder
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False)

# Save the trained tokenizer
tokenizer.save("bpe-tokenizer_tinystories.json")

#
print(f"üéâ Tokenizer training complete!")
print(f"üîπ Vocabulary size: {tokenizer.get_vocab_size():,} tokens")

üéâ Tokenizer training complete!
üîπ Vocabulary size: 10,000 tokens


In [None]:
# Initialize a BPE tokenizer
tokenizer = Tokenizer.from_file("bpe-tokenizer_tinystories.json")
print(f"üéâ Tokenizer training complete!")
print(f"üîπ Vocabulary size: {tokenizer.get_vocab_size():,} tokens")

üéâ Tokenizer training complete!
üîπ Vocabulary size: 10,000 tokens


In [88]:
sent = 'They played together all day and became best friends.'
tokens = tokenizer.encode(sent)
print(tokens.ids)
print(tokens.tokens)

pprint(tokenizer.decode(tokens.ids))

[1, 546, 667, 462, 378, 252, 161, 1042, 725, 375, 15]
['<|endoftext|>', 'They', 'ƒ†played', 'ƒ†together', 'ƒ†all', 'ƒ†day', 'ƒ†and', 'ƒ†became', 'ƒ†best', 'ƒ†friends', '.']
'They played together all day and became best friends.'


## üüß Save and load Tokens with BPE tokenizer

In [None]:
# Tokenization {train}
tokenized_train_samples = []
for item in tqdm(dataset["train"], desc="Tokenizing Train Set"):
    input_ids = tokenizer.encode(item["text"]).ids
    tokenized_train_samples.append(np.array(input_ids))

In [None]:
tokenized_train_samples_concat=[]
tokenized_train_samples_concat = np.concatenate(tokenized_train_samples)
len(tokenized_train_samples_concat)

In [None]:
# Save tokens as a pytorch file
torch.save(torch.tensor(tokenized_train_samples_concat), 'tokenized-train-samples_vocab-10k.pt')

In [None]:
# Tokenization {validation}
tokenized_valid_samples = []
for item in tqdm(dataset["validation"], desc="Tokenizing Validation Set"):
    input_ids = tokenizer.encode(item["text"]).ids
    tokenized_valid_samples.append(np.array(input_ids))

In [None]:
tokenized_valid_samples_concat=[]
tokenized_valid_samples_concat = np.concatenate(tokenized_valid_samples)
len(tokenized_valid_samples_concat)

In [None]:
# Save tokens as a pytorch file
torch.save(torch.tensor(tokenized_valid_samples_concat), 'tokenized-valid-samples_vocab-10k.pt')

## üüß Custom dataset

In [9]:
class TinyStoriesDataset(Dataset):

    def __init__(self, data, seq_len):
        self.seq_len = seq_len
        self.data = prepare_data(data, seq_len+1)

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        sample = self.data[idx]
        return sample[:-1], sample[1:]

In [10]:
train_set = TinyStoriesDataset(tokenized_train_samples, 128)
train_set.data.shape, len(train_set), train_set[0]

AttributeError: 'list' object has no attribute 'shape'

In [None]:
%timeit next(iter(train_set))

In [None]:
b=next(iter(train_set))

In [None]:
len(b)

## üüß DataLoader

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
tokenizer = Tokenizer.from_file("/content/drive/MyDrive/temp/bpe-tokenizer_tinystories.json")
tokenizer

In [13]:
# tokenized_train_samples = torch.load('/content/drive/MyDrive/temp/tokenized-train-samples_vocab-10k.pt')
tokenized_valid_samples = torch.load('/content/drive/MyDrive/temp/tokenized-valid-samples_vocab-10k.pt')

# train_set = TinyStoriesDataset(tokenized_train_samples, seq_len=128)
valid_set = TinyStoriesDataset(tokenized_valid_samples, seq_len=128)

In [14]:
# train_loader = DataLoader(train_set, batch_size=32, shuffle=True, pin_memory=True) #, num_workers=2)
valid_loader = DataLoader(valid_set, batch_size=32, shuffle=False, pin_memory=True) #, num_workers=2)

In [15]:
x_batch, y_batch = next(iter(valid_loader))
x_batch.shape, y_batch.shape

(torch.Size([32, 128]), torch.Size([32, 128]))

In [None]:
print(x_batch[0,:])
print('\n',y_batch[0,:])

In [None]:
len(train_loader), len(valid_loader)

In [57]:
len(train_loader) / (20*60)

93.865

In [18]:
train_iter = iter(train_loader)

In [None]:
%timeit next(train_iter)

## üüß EDA

In [16]:
token_count_stories=[]
for tokns in tokenized_train_samples:
    token_count_stories.append(len(tokns))

In [None]:
token_count_stories_np=np.array(token_count_stories)

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(token_count_stories, bins=50, kde=True)
plt.xlabel('Token Count')
plt.ylabel('Frequency')
plt.title('Distribution of Token Counts')
plt.show()

In [None]:
np.sort(token_count_stories_np)[:1000]

# üü• Transformer Model from scratch

In [17]:
class MultiHeadAtention(torch.nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.qkv_proj = torch.nn.Linear(embed_dim, 3 * embed_dim)
        self.out_proj = torch.nn.Linear(embed_dim, embed_dim)
    # ÿßÿ≠ÿ™ŸÖÿßŸÑÿß ÿÆÿ∑ÿß ÿßÿ®ÿπÿßÿØ ÿØÿßÿ±ÿØ ÿ≤ŸÖÿßŸÜ ÿ™ÿ±€åŸÜ ŸÖŸÖ⁄©ŸÜ ÿßÿ≥ÿ™ ÿ™ÿ±€åŸÜ ŸÜÿ¥ŸàÿØ
    def forward(self, x):
        batch_size, seq_len, embed_dim = x.size()
        k,q,v = self.qkv_proj(x).view(batch_size, seq_len, 3, self.num_heads, self.head_dim).transpose(1,2).chunk(3)
        # F.scaled_dot_product_attention(q,k,v)
        # return self.out_proj(x)
        return q

In [None]:
x=torch.range(1,24).view(2,3,4)
print(x)

# /x=x.transpose(1,0)
print(x)
#

In [None]:
print(x.shape)
y= MultiHeadAtention(4,2)(x)
y.shape

# üî¥ **Model from scratch - Howsam**

## üü† Define

In [21]:
import time
from dataclasses import dataclass

from datasets import load_dataset
from tokenizers import Tokenizer

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F

In [22]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

## üü† Embedding

In [23]:
wte = nn.Embedding(tokenizer.get_vocab_size(), 100)
wte(torch.tensor([1, 2,3])).shape

torch.Size([3, 100])

In [24]:
seq_len = 128
wpe = nn.Embedding(seq_len, 100)
wpe(torch.tensor([1, 2, 100])).shape

torch.Size([3, 100])

In [113]:
wpe(torch.arange(x_batch.shape[1])).shape

torch.Size([128, 100])

In [25]:
x = wte(x_batch) + wpe(torch.arange(x_batch.shape[1]))
x.shape

torch.Size([32, 128, 100])

## üü† Scaled Dot-Product Attention

In [16]:
q = k = v = x
print(q.shape)

mask = torch.tril(torch.ones(seq_len, seq_len))

scores = q @ k.transpose(-2, -1) / (k.shape[-1]**0.5)
scores.masked_fill_(mask ==0, float(-torch.inf))
scores = scores.softmax(dim=-1)
print(scores.shape)

z = scores @ v
z.shape

NameError: name 'x' is not defined

In [None]:
# scores = torch.randn(3, 5, 5)
# mask = torch.tril(torch.ones(5, 5))
# scores.masked_fill_(mask ==0, float(-torch.inf))
# scores = scores.softmax(dim=-1)
# scores

In [None]:
def scaled_dot_product_attention(q, k, v):
    mask = torch.tril(torch.ones(q.shape[-2], q.shape[-2])).to(device)
    scores = q @ k.transpose(-2, -1) / (k.shape[-1]**0.5)
    scores.masked_fill_(mask==0, float(-torch.inf))
    scores = scores.softmax(dim=-1)
    z = scores @ v
    return z

In [None]:
scaled_dot_product_attention(x.to(device), x.to(device), x.to(device)).shape

In [17]:
q = torch.randn((128, 1024, 768), device=device)
k = torch.randn((128, 1024, 768), device=device)
v = torch.randn((128, 1024, 768), device=device)
q.shape

torch.Size([128, 1024, 768])

In [18]:
scaled_dot_product_attention(q, k, v).shape

NameError: name 'scaled_dot_product_attention' is not defined

In [None]:
# calculate_time(scaled_dot_product_attention, (q, k, v), num_runs=20)
calculate_time_cpu(scaled_dot_product_attention, (q, k, v), num_runs=20)

In [None]:
F.scaled_dot_product_attention(q, k, v, is_causal=True).shape

In [None]:
torch.abs(scaled_dot_product_attention(q, k, v) - F.scaled_dot_product_attention(q, k, v, is_causal=True)).max()

In [None]:
calculate_time(F.scaled_dot_product_attention, (q, k, v), num_runs=20)

## üü† Multi Head Attention

In [None]:
# class MultiHeadAttention(nn.Module):

#     def __init__(self):
#         super().__init__()
#         self.fc1 = nn.Linear(100, 1000)
#         self.fc2 = nn.Linear(1000, 100)
#         self.fc3 = nn.Linear(1000, 100)

#     def forward(self, x):
#         y = F.relu(self.fc1(x))
#         y1 = self.fc2(y)
#         y2 = self.fc3(y)
#         return F.relu(torch.concat([y1, y2], dim=-1))

In [None]:
# mha = MultiHeadAttention()
# num_trainable_params(mha)
# mha.forward(torch.rand(10, 100)).shape

In [83]:
# x=torch.randn(2,4)
# print(x)
# lx=nn.Linear(4,8,bias=False)
# y1 =x@ lx.weight.T
# y2=lx(x)
# print(y1.softmax(dim=-1).argmax(dim=0))
# print(y1.softmax(dim=-1))
# print(lx.weight.T.shape)

In [19]:
class GPTConfig:
    n_embd: int = 100
    n_head: int = 5

config = GPTConfig()
config.n_embd

100

In [26]:
class MultiHeadAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.n_embd = config.n_embd
        self.n_head = config.n_head
        self.head_size = self.n_embd // self.n_head

        self.qkv_proj = nn.Linear(self.n_embd, 3*self.n_embd, bias=False)

        self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False)
        self.c_proj.residual = True

    def forward(self, x):
        B, T, C = x.shape
        q, k, v = self.qkv_proj(x).view(B, T, 3*self.n_head, self.head_size).transpose(1, 2).chunk(3, dim=-3)

        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        y = y.transpose(1, 2).contiguous().view(B, T, C)

        y = self.c_proj(y)
        return y #,q, k, v

In [None]:
# mha = MultiHeadAttention(config)
# y,q, k, v= mha(x)
# print("X:",x.shape)
# print("qkv_proj:",mha.qkv_proj.weight.T.shape)
# print("c_proj:",mha.c_proj.weight.T.shape)

# print("q:",q.shape)
# print(k.shape)
# print(v.shape)
# print("y:",y.shape)

In [None]:
calculate_time(mha.to(device), (x.to(device),), num_runs=20)

## üü† Feed Forward (MLP)

In [21]:
class GPTConfig:
    n_embd: int = 100
    n_head: int = 5
    f_expnd: float = 4

config = GPTConfig()
config.n_embd

100

In [22]:
class FeedForward(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.n_embd = config.n_embd
        self.f_expnd = config.f_expnd

        self.up_proj = nn.Linear(self.n_embd, int(self.f_expnd*self.n_embd), bias=False)
        self.down_proj = nn.Linear(int(self.f_expnd*self.n_embd), self.n_embd, bias=False)
        self.down_proj.residual = True

    def forward(self, x):
        return self.down_proj(F.gelu(self.up_proj(x)))

In [None]:
# x1=torch.randn(2,4,100)
# plt.hist(x1.flatten(), bins=50);
# plt.show()
# plt.hist(F.gelu(x1).flatten(), bins=50);
# plt.show()
# F.gelu(x1).min(dim=-1)

In [23]:
feedfor = FeedForward(config)
print(feedfor(x).shape)
feedfor.up_proj.weight.T.shape

NameError: name 'x' is not defined

In [25]:
num_trainable_params(feedfor)*1000

80.0

In [None]:
calculate_time(mlp, (x, ), num_runs=20)

## üü† Decoder Block

In [28]:
class DecoderBlock(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.n_embd = config.n_embd

        self.ln1 = nn.LayerNorm(config.n_embd)
        self.mha = MultiHeadAttention(config)

        self.ln2 = nn.LayerNorm(config.n_embd)
        self.mlp = FeedForward(config)

    def forward(self, x):
        x = x + self.mha(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

In [29]:
decoder = DecoderBlock(config)
y = decoder(x)
print(x.shape)
plt.hist(x.detach().flatten(), bins=50);
plt.show()
plt.hist(y.detach().flatten(), bins=50);
plt.show()

means_before = x.mean(dim=-1)  # shape: (32, 128)
means_after = y.mean(dim=-1)

stds_before = x.std(dim=-1)
stds_after = y.std(dim=-1)

plt.hist(means_before.detach().flatten(), bins=50);
plt.show()
plt.hist(means_after.detach().flatten(), bins=50);
plt.show()

NameError: name 'config' is not defined

In [None]:
norml=nn.LayerNorm(100)
xrnd=torch.range(1,100)
print(xrnd.min().item(), xrnd.max().item(), xrnd.mean().item(), xrnd.std().item())
print(norml(xrnd).min().item(), norml(xrnd).max().item(),norml(xrnd).mean().item(),9 , norml(xrnd).std().item())
plt.hist(xrnd, bins=10);
plt.show()
plt.hist(norml(xrnd).detach(), bins=10);
plt.show()

In [30]:
num_trainable_params(decoder) * 1e3

NameError: name 'decoder' is not defined

In [49]:
calculate_time_cpu(decoder, (x, ), num_runs=20) * 1e3

33.37346315383911

## üü† GPT

In [31]:
class GPTConfig:
    vocab_size: int = 10_000
    seq_len: int = 128
    n_layer: int = 12
    n_embd: int = 100
    n_head: int = 5
    f_expnd: float = 4


config = GPTConfig()
config.n_embd

100

In [32]:
class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.seq_len, config.n_embd)
        # self.decoders = nn.Sequential(*[DecoderBlock(config) for _ in range(config.n_layer)])
        self.decoders = nn.ModuleList([DecoderBlock(config) for _ in range(config.n_layer)])
        self.lnf = nn.LayerNorm(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.lm_head.weight = self.wte.weight
        # self.lm_head.weight.data.uniform_(-1/self.lm_head.in_features**0.5, 1/self.lm_head.in_features**0.5)
        # nn.init.uniform_(self.lm_head.weight, -1/self.lm_head.in_features**0.5, 1/self.lm_head.in_features**0.5)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        std = 0.02
        if isinstance(module, nn.Linear):
            if hasattr(module, 'residual'):
                std *= (2*self.config.n_layer)**-0.5
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=std)

    def forward(self, idx):
        B, T = idx.shape

        x = self.wte(idx) + self.wpe(torch.arange(T, device=device))

        # x = self.decoders(x)
        for decoder in self.decoders:
            x = decoder(x)

        x = self.lnf(x)
        logits = self.lm_head(x)
        return logits

In [33]:
model = GPT(config).to(device)
logits= model(x_batch.to(device))
logits.shape

NameError: name 'FeedForward' is not defined

ü¶æ ÿ™ŸÖÿ±€åŸÜ:

---


ÿ®ÿÆÿ¥ ÿ™ŸàŸÑ€åÿØ ŸÖÿ™ŸÜ ÿ±ÿß ÿ¨ŸÜÿ±€åÿ™ Ÿà ÿ®ÿ±ÿß€å ŸÖÿØŸÑ ÿ¨€å Ÿæ€å ÿ™€å ÿ®Ÿá ÿ¥⁄©ŸÑ€å ÿ≥ÿßÿØŸá Ÿà ÿ®ÿ± ÿßÿ≥ÿßÿ≥ ÿ¢ÿ±⁄Ø ŸÖ⁄©ÿ≥ Ÿæ€åÿßÿØŸá‚Äåÿ≥ÿßÿ≤€å ⁄©ŸÜ€åÿØ

In [34]:
gen_tokens= torch.argmax(F.softmax(logits,dim=-1), dim=-1)
pprint(tokenizer.decode(x_batch[0,:].tolist()))
print('\n\n')
pprint(tokenizer.decode(gen_tokens[0,:].tolist()))

NameError: name 'logits' is not defined

In [35]:
model.lm_head.weight.T.shape , model.wte.weight.T.shape

NameError: name 'model' is not defined

In [32]:
num_trainable_params(model), num_trainable_params(model.decoders), num_trainable_params(model.lm_head)

(2.4578, 1.4448, 1.0)

In [None]:
calculate_time(model, (x_batch.to(device),), num_runs=100) * 1e3

## üü† Initialization

In [34]:
model = GPT(
    GPTConfig).to(device)

In [None]:
plt.hist(model.decoders[0].mha.c_proj.weight.flatten().detach().cpu(), bins=50);

In [None]:
0.02 * (2*4)**-0.5 * 3

0.021213203435596427

In [None]:
plt.hist(model.wpe.weight.flatten()[:100_000].detach().cpu(), bins=50);

In [None]:
plt.hist(model)

In [None]:
plt.hist(model.decoders[2].mlp.down_proj.weight.flatten().detach().cpu(), bins=50);

# üü• GPT model implement with nn.torch transformer

ü¶æ ÿ™ŸÖÿ±€åŸÜ


ÿ¥ÿ®⁄©Ÿá ÿ¨€å Ÿæ€å ÿ™€å ÿ±ÿß ÿ®ÿß ÿßÿ≥ÿ™ŸÅÿßÿØŸá ÿßÿ≤ ÿØÿ≥ÿ™Ÿàÿ±ÿßÿ™ ÿ¢ŸÖÿßÿØŸá Ÿæÿß€å ÿ™Ÿàÿ±⁄Ü Ÿæ€åÿßÿØŸá‚Äåÿ≥ÿßÿ≤€å ⁄©ŸÜ€åÿØ Ÿà ÿ≤ŸÖÿßŸÜ ÿßÿ¨ÿ±ÿß€å ÿ¢ŸÜ ÿ±ÿß ÿ®ÿß ŸÖÿØŸÑ Ÿæ€åÿßÿØŸá‚Äåÿ≥ÿßÿ≤€å‚Äåÿ¥ÿØŸá ÿßÿ≤ ÿµŸÅÿ± ŸÖŸÇÿß€åÿ≥Ÿá ⁄©ŸÜ€åÿØ.

In [92]:
transformer = torch.nn.Transformer(d_model=100, nhead=5, num_encoder_layers=0, num_decoder_layers=12, dim_feedforward=400,
                     dropout=0.1, activation="gelu", custom_encoder=None, custom_decoder=None, layer_norm_eps=1e-05,
                     batch_first=False, norm_first=False, bias=True, device=None, dtype=None)



In [97]:
decoder_layer = nn.TransformerDecoderLayer(d_model= 100, nhead=5, dim_feedforward=400)
transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=12)
memory = torch.rand(10, 32, 100)
tgt = torch.rand(20, 32, 100)
out = transformer_decoder(tgt, memory)
out.shape

torch.Size([20, 32, 100])

In [51]:
class GPTTorch(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.seq_len, config.n_embd)
        decoder_layer = nn.TransformerDecoderLayer(d_model= config.n_embd, nhead=config.n_head,
                                                   dim_feedforward=config.f_expnd*config.n_embd,activation=F.gelu,norm_first=True
                                                   )
        # decoder_layer.self_attn.is_causal = True
        self.decoders = nn.TransformerDecoder(decoder_layer, num_layers=12)
        # self.decoders = nn.ModuleList([DecoderBlock(config) for _ in range(config.n_layer)])
        self.lnf = nn.LayerNorm(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.lm_head.weight = self.wte.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        std = 0.02
        if isinstance(module, nn.Linear):
            if hasattr(module, 'residual'):
                std *= (2*self.config.n_layer)**-0.5
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=std)

    def forward(self, idx):
        B, T = idx.shape

        x = self.wte(idx) + self.wpe(torch.arange(T, device=device))

        # x = self.decoders(x)
        # for decoder in self.decoders:
        x = self.decoders(x ,memory=None)

        x = self.lnf(x)
        logits = self.lm_head(x)
        return logits

In [52]:
model = GPTTorch(config).to(device)
logits= model(x_batch.to(device))
logits.shape

AttributeError: 'NoneType' object has no attribute 'is_nested'

In [109]:
gen_tokens= torch.argmax(F.softmax(logits,dim=-1), dim=-1)
pprint(tokenizer.decode(x_batch[0,:].tolist()))
print('\n\n')
pprint(tokenizer.decode(gen_tokens[0,:].tolist()))

('Spot. Spot saw the shiny car and said, "Wow, Kitty, your car is so bright '
 'and clean!" Kitty smiled and replied, "Thank you, Spot. I polish it every '
 'day."\n'
 '\n'
 'After playing with the car, Kitty and Spot felt thirsty. They found a small '
 'pond with clear water. They drank the water and felt very happy. They played '
 'together all day and became best friends.Once upon a time, in a big forest, '
 'there lived a rhinoceros named Roxy. Roxy loved to climb. She climbed trees, '
 'rocks, and hills. One day, Roxy found an icy hill. She')



(' throws singer becoming disgust mouse disgust disgust His earn farm ideas '
 'disgustothyextext farmendant farm farm farm farmendantendantendant opera smo '
 'farm glided intelligink don√¢iments intellig smo smoiments≈ìWhere≈ìWhere '
 'plantroom att bru rang rang don√¢ don√¢ att successful promisedext meowing '
 'slower as blood h att blood hhen cuts h htedink hhenhenMaya exc beetle att '
 'slower gap h h Carl asounced hawkhenhenlephante

ü¶æ ÿ™ŸÖÿ±€åŸÜ

⁄©ÿßŸÜŸÅ€å⁄Ø ŸÖÿØŸÑ‚ÄåŸáÿß€å ÿ™ÿß€åŸÜ€å ÿßÿ≥ÿ™Ÿàÿ±€å ÿ±ÿß ÿØÿ± €å⁄© ÿ¨ÿØŸàŸÑ ÿÆŸÑÿßÿµŸá ⁄©ŸÜ€åÿØ Ÿà ÿ®ÿ±ÿßÿ≥ÿßÿ≥ ÿß€åŸÜ ⁄©ÿßŸÜŸÅ€å⁄Ø‚ÄåŸáÿß ŸÖÿØŸÑ ÿ±ÿß ÿ®ÿ≥ÿßÿ≤€åÿØ.

Howsam |vocab_size:10_000| seq_len:128| n_layer:12| n_embd:100|n_head:5| f_expnd: 4  

GPT-14 |vocab_size:50304| seq_len:1024| n_layer:12| n_embd:768|n_head:12| f_expnd: 4| dropout: 0.0| bias:True

gpt_neo | vocab_size:50257| seq_len:256| n_layer:8| n_embd:256|n_head:16| f_expnd: --| dropout: 0.1| torch_dtype:float16


ü¶æ  ÿ™ŸÖÿ±€åŸÜ


ÿßŸÑÿ®ÿ™Ÿá! ÿØÿ± ÿß€åŸÜÿ¨ÿß ÿÆŸÑÿßÿµŸá‚Äåÿß€å ÿßÿ≤ **ÿß€åÿØŸá‚ÄåŸáÿß€å ÿßÿµŸÑ€å** ÿ≥Ÿá ŸÖÿØŸÑ ŸÖÿπÿ±ŸàŸÅ ÿßÿ≤ ÿØŸà ÿÆÿßŸÜŸàÿßÿØŸá ŸÖÿÆÿ™ŸÑŸÅ ÿ¢Ÿàÿ±ÿØŸá ÿ¥ÿØŸá ÿßÿ≥ÿ™:

---

## üîπ ÿÆÿßŸÜŸàÿßÿØŸá Encoder-only (ŸÖÿØŸÑ‚ÄåŸáÿß€å€å ŸÖÿ´ŸÑ BERT)

### üìå 1. **BERT (Bidirectional Encoder Representations from Transformers)**

* **ŸÜŸàÿπ**: Encoder-only
* **ÿß€åÿØŸá ÿßÿµŸÑ€å**: €åÿßÿØ⁄Ø€åÿ±€å ŸÜŸÖÿß€åÿ¥‚ÄåŸáÿß€å ÿπŸÖ€åŸÇ ÿ≤ÿ®ÿßŸÜ€å ÿßÿ≤ ŸÖÿ™ŸÜ ÿ®ÿß ÿÆŸàÿßŸÜÿØŸÜ **ÿØŸà ÿ∑ÿ±ŸÅŸá** (bidirectional).
* **ÿ±Ÿàÿ¥ ÿ¢ŸÖŸàÿ≤ÿ¥**:

  * **Masked Language Modeling (MLM)**: ÿ®ÿπÿ∂€å ÿßÿ≤ ÿ™Ÿà⁄©ŸÜ‚ÄåŸáÿß ÿØÿ± ÿ¨ŸÖŸÑŸá ŸÖÿßÿ≥⁄© ŸÖ€å‚Äåÿ¥ŸàŸÜÿØÿå Ÿà ŸÖÿØŸÑ ÿ®ÿß€åÿØ ÿ¢ŸÜ‚ÄåŸáÿß ÿ±ÿß Ÿæ€åÿ¥‚Äåÿ®€åŸÜ€å ⁄©ŸÜÿØ.
  * **Next Sentence Prediction (NSP)**: ÿ¢€åÿß ÿ¨ŸÖŸÑŸá ÿØŸàŸÖ ÿØŸÜÿ®ÿßŸÑŸá ÿ¨ŸÖŸÑŸá ÿßŸàŸÑ Ÿáÿ≥ÿ™ €åÿß ŸÜŸáÿü
* **⁄©ÿßÿ±ÿ®ÿ±ÿØŸáÿß**: ÿØÿ±⁄© ÿ≤ÿ®ÿßŸÜÿå ÿ∑ÿ®ŸÇŸá‚Äåÿ®ŸÜÿØ€å ŸÖÿ™ŸÜÿå Named Entity Recognition Ÿà ...

---

### üìå 2. **RoBERTa (Robustly Optimized BERT Pretraining Approach)**

* **ŸÜŸàÿπ**: Encoder-only (ŸÖÿ®ÿ™ŸÜ€å ÿ®ÿ± BERT)
* **ÿß€åÿØŸá ÿßÿµŸÑ€å**: ÿ®Ÿáÿ®ŸàÿØ ⁄©€åŸÅ€åÿ™ ÿ¢ŸÖŸàÿ≤ÿ¥ BERT ÿ®ÿß ÿ≠ÿ∞ŸÅ NSPÿå ÿßÿ≥ÿ™ŸÅÿßÿØŸá ÿßÿ≤ ÿØÿßÿØŸá ÿ®€åÿ¥ÿ™ÿ± Ÿà batchŸáÿß€å ÿ®ÿ≤ÿ±⁄Ø‚Äåÿ™ÿ±.
* **ÿ™ŸÅÿßŸàÿ™ ÿ®ÿß BERT**:

  * ŸÅŸÇÿ∑ ÿßÿ≤ MLM ÿßÿ≥ÿ™ŸÅÿßÿØŸá ŸÖ€å‚Äå⁄©ŸÜÿØ.
  * ÿßÿ≤ shuffling ÿ®Ÿáÿ™ÿ± ÿØÿßÿØŸá‚ÄåŸáÿß Ÿà dynamic masking ÿ®Ÿáÿ±Ÿá ŸÖ€å‚Äåÿ®ÿ±ÿØ.
* **⁄©ÿßÿ±ÿ®ÿ±ÿØŸáÿß**: ŸÖÿ¥ÿßÿ®Ÿá BERT ÿßŸÖÿß ÿ®ÿß ÿØŸÇÿ™ ÿ®ÿßŸÑÿßÿ™ÿ± ÿØÿ± ÿ®ÿ≥€åÿßÿ±€å ÿßÿ≤ ÿ™ÿ≥ÿ™‚ÄåŸáÿß.

---

### üìå 3. **ELECTRA**

* **ŸÜŸàÿπ**: Encoder-only (ŸàŸÑ€å ÿ®ÿß ŸÖ⁄©ÿßŸÜ€åÿ≤ŸÖ ŸÖÿ™ŸÅÿßŸàÿ™)
* **ÿß€åÿØŸá ÿßÿµŸÑ€å**: ÿ®Ÿá‚Äåÿ¨ÿß€å Ÿæ€åÿ¥‚Äåÿ®€åŸÜ€å ÿ™Ÿà⁄©ŸÜ‚ÄåŸáÿß€å ŸÖÿßÿ≥⁄©‚Äåÿ¥ÿØŸá (ŸÖÿßŸÜŸÜÿØ BERT)ÿå €åÿßÿØ ŸÖ€å‚Äå⁄Ø€åÿ±ÿØ ÿ™ÿ¥ÿÆ€åÿµ ÿØŸáÿØ ⁄©Ÿá **ÿ¢€åÿß €å⁄© ÿ™Ÿà⁄©ŸÜ ŸàÿßŸÇÿπ€å ÿßÿ≥ÿ™ €åÿß ÿ¨ÿß€å⁄Øÿ≤€åŸÜ ÿ¥ÿØŸá**.
* **ÿ±Ÿàÿ¥ ÿ¢ŸÖŸàÿ≤ÿ¥**:

  * €å⁄© generator ÿ™Ÿà⁄©ŸÜ‚ÄåŸáÿß€å ÿ¨ÿπŸÑ€å ÿ™ŸàŸÑ€åÿØ ŸÖ€å‚Äå⁄©ŸÜÿØ.
  * €å⁄© discriminator €åÿßÿØ ŸÖ€å‚Äå⁄Ø€åÿ±ÿØ ÿ™ÿ¥ÿÆ€åÿµ ÿØŸáÿØ ⁄©ÿØÿßŸÖ ÿ™Ÿà⁄©ŸÜ‚ÄåŸáÿß ÿ™ÿ∫€å€åÿ± ⁄©ÿ±ÿØŸá‚ÄåÿßŸÜÿØ.
* **ŸÖÿ≤€åÿ™**: ÿ≥ÿ±€åÿπ‚Äåÿ™ÿ± Ÿà ⁄©ÿßÿ±ÿ¢ŸÖÿØÿ™ÿ± ÿßÿ≤ BERT ÿßÿ≤ ŸÜÿ∏ÿ± €åÿßÿØ⁄Ø€åÿ±€å.

---

## üîπ ÿÆÿßŸÜŸàÿßÿØŸá Decoder-only (ŸÖÿØŸÑ‚ÄåŸáÿß€å€å ŸÖÿ´ŸÑ GPT)

### üìå 1. **GPT-2**

* **ŸÜŸàÿπ**: Decoder-only
* **ÿß€åÿØŸá ÿßÿµŸÑ€å**: ŸÖÿØŸÑ‚Äåÿ≥ÿßÿ≤€å ÿ≤ŸÜÿ¨€åÿ±Ÿá‚Äåÿß€å ÿ≤ÿ®ÿßŸÜ ÿ®Ÿá‚ÄåÿµŸàÿ±ÿ™ **⁄ÜŸæ ÿ®Ÿá ÿ±ÿßÿ≥ÿ™** (causal), ÿ®ÿß Ÿæ€åÿ¥‚Äåÿ®€åŸÜ€å ÿ™Ÿà⁄©ŸÜ ÿ®ÿπÿØ€å.
* **ÿ±Ÿàÿ¥ ÿ¢ŸÖŸàÿ≤ÿ¥**:

  * ŸÅŸÇÿ∑ ÿßÿ≤ **Language Modeling (Auto-regressive)** ÿßÿ≥ÿ™ŸÅÿßÿØŸá ŸÖ€å‚Äå⁄©ŸÜÿØ.
* **⁄©ÿßÿ±ÿ®ÿ±ÿØŸáÿß**: ÿ™ŸàŸÑ€åÿØ ŸÖÿ™ŸÜÿå ÿÆŸÑÿßÿµŸá‚Äåÿ≥ÿßÿ≤€åÿå Ÿæÿßÿ≥ÿÆ ÿ®Ÿá Ÿæÿ±ÿ≥ÿ¥ÿå ÿ™ÿ±ÿ¨ŸÖŸá Ÿà ...

---

### üìå 2. **GPT-3**

* **ŸÜŸàÿπ**: Decoder-only (ÿ™Ÿàÿ≥ÿπŸá‚Äå€åÿßŸÅÿ™Ÿá‚Äåÿ™ÿ± ÿßÿ≤ GPT-2)
* **ÿß€åÿØŸá ÿßÿµŸÑ€å**: ŸÖŸÇ€åÿßÿ≥‚ÄåÿØŸá€å ÿ¥ÿØ€åÿØ (175B Ÿæÿßÿ±ÿßŸÖÿ™ÿ±) + ÿ™ŸàÿßŸÜÿß€å€å ÿßŸÜÿ¨ÿßŸÖ ⁄ÜŸÜÿØŸàÿ∏€åŸÅŸá‚Äåÿß€å ÿ®ÿØŸàŸÜ fine-tuning.
* **Ÿà€å⁄ò⁄Ø€å ÿÆÿßÿµ**: €åÿßÿØ⁄Ø€åÿ±€å ÿßÿ≤ ÿ∑ÿ±€åŸÇ ŸÜŸÖŸàŸÜŸá (Few-shot / Zero-shot Learning).
* **⁄©ÿßÿ±ÿ®ÿ±ÿØŸáÿß**: ⁄Üÿ™‚Äåÿ®ÿßÿ™ÿå ÿ®ÿ±ŸÜÿßŸÖŸá‚ÄåŸÜŸà€åÿ≥€å ÿÆŸàÿØ⁄©ÿßÿ±ÿå ŸÖŸÇÿßŸÑŸá‚ÄåŸÜŸà€åÿ≥€å Ÿà...

---

### üìå 3. **LLaMA (Large Language Model Meta AI)**

* **ŸÜŸàÿπ**: Decoder-only
* **ÿß€åÿØŸá ÿßÿµŸÑ€å**: ÿ∑ÿ±ÿßÿ≠€å ŸÖÿØŸÑ‚ÄåŸáÿß€å ÿ≥ÿ®⁄©‚Äåÿ™ÿ± ÿßŸÖÿß ÿ®ÿ≥€åÿßÿ± ⁄©ÿßÿ±ÿ¢ŸÖÿØ (ŸÖÿπŸÖŸàŸÑÿßŸã ÿ®ÿß Ÿæÿßÿ±ÿßŸÖÿ™ÿ±Ÿáÿß€å ⁄©ŸÖÿ™ÿ± ÿßÿ≤ GPT-3 ÿßŸÖÿß ÿπŸÖŸÑ⁄©ÿ±ÿØ ŸÖÿ¥ÿßÿ®Ÿá).
* **ŸÖÿ≤€åÿ™**: ÿ®ÿßÿ≤ ÿ®ŸàÿØŸÜ ŸÖÿØŸÑ‚ÄåŸáÿß Ÿà ÿ™ŸàÿßŸÜÿß€å€å ÿßÿ¨ÿ±ÿß ÿ±Ÿà€å ÿ≥ÿÆÿ™‚ÄåÿßŸÅÿ≤ÿßÿ± ŸÖÿ≠ÿØŸàÿØ.
* **⁄©ÿßÿ±ÿ®ÿ±ÿØŸáÿß**: ⁄Üÿ™‚Äåÿ®ÿßÿ™ÿå ÿ™ÿ≠ŸÇ€åŸÇÿßÿ™€åÿå ÿßÿ®ÿ≤ÿßÿ±Ÿáÿß€å Ÿæÿ±ÿØÿßÿ≤ÿ¥ ÿ≤ÿ®ÿßŸÜ ÿ®ÿßÿ≤.

---

## ‚úÖ ÿ¨ŸÖÿπ‚Äåÿ®ŸÜÿØ€å ÿ¨ÿØŸàŸÑ‚ÄåŸàÿßÿ±:

| ŸÖÿØŸÑ     | ŸÜŸàÿπ ŸÖÿπŸÖÿßÿ±€å   | ÿ±Ÿàÿ¥ €åÿßÿØ⁄Ø€åÿ±€å ÿßÿµŸÑ€å           | ⁄©ÿßÿ±ÿ®ÿ±ÿØ ⁄©ŸÑ€åÿØ€å                |
| ------- | ------------ | -------------------------- | --------------------------- |
| BERT    | Encoder-only | MLM + NSP                  | ÿØÿ±⁄© ÿ≤ÿ®ÿßŸÜ                    |
| RoBERTa | Encoder-only | MLM (ÿ®Ÿá€åŸÜŸá‚Äåÿ¥ÿØŸá)            | ÿ∑ÿ®ŸÇŸá‚Äåÿ®ŸÜÿØ€å Ÿà ÿØÿ±⁄© Ÿæ€åÿ¥ÿ±ŸÅÿ™Ÿá     |
| ELECTRA | Encoder-only | Discriminator-Generator    | ÿ¢ŸÖŸàÿ≤ÿ¥ ÿ≥ÿ±€åÿπ‚Äåÿ™ÿ± Ÿà ÿØŸÇ€åŸÇ‚Äåÿ™ÿ±     |
| GPT-2   | Decoder-only | Auto-regressive LM         | ÿ™ŸàŸÑ€åÿØ ŸÖÿ™ŸÜ                   |
| GPT-3   | Decoder-only | Few-shot + Auto-regressive | Ÿæÿßÿ≥ÿÆ‚ÄåÿØŸá€å ⁄ÜŸÜÿØ⁄©ÿßÿ±Ÿá            |
| LLaMA   | Decoder-only | Efficient auto-regressive  | ÿ™ÿ≠ŸÇ€åŸÇÿßÿ™€å Ÿà ⁄©ÿßÿ±ÿ®ÿ±ÿØŸáÿß€å ÿ≥ÿ®⁄©‚Äåÿ™ÿ± |

---

ÿß⁄Øÿ± ÿÆŸàÿßÿ≥ÿ™€å ŸÖÿØŸÑ‚ÄåŸáÿß€å€å ÿßÿ≤ ÿÆÿßŸÜŸàÿßÿØŸá Encoder-Decoder ŸÖÿ´ŸÑ T5 €åÿß BART ÿ±Ÿà ŸáŸÖ ÿ®ÿ±ÿ±ÿ≥€å ⁄©ŸÜ€åŸÖÿå ŸÅŸÇÿ∑ ÿ®⁄ØŸà.


ü¶æ ÿ™ŸÖÿ±€åŸÜ

ÿ®ÿß ⁄ÜŸá ÿ™ÿ∫€å€åÿ±ÿßÿ™€å ŸÖ€å‚Äåÿ™ŸàÿßŸÜ GPT ÿ±ÿß ÿ®Ÿá ŸÖÿØŸÑ€å ŸÖÿ¥ÿßÿ®Ÿá Llama 3.2 ÿ™ÿ®ÿØ€åŸÑ ⁄©ÿ±ÿØÿü ÿß€åÿØŸá‚ÄåŸáÿß€å ÿÆŸàÿØ ÿ±ÿß ÿ®€åÿßŸÜ ⁄©ŸÜ€åÿØ.

ÿß€åŸÜ ⁄©ÿØ €å⁄©€å ÿßÿ≤ ÿ®ÿÆÿ¥‚ÄåŸáÿß€å ŸÖŸáŸÖ **ŸÖÿπŸÖÿßÿ±€å ŸÖÿØŸÑ LLaMA 3** ÿßÿ≤ ÿ¥ÿ±⁄©ÿ™ **Meta AI** ÿ±ÿß Ÿæ€åÿßÿØŸá‚Äåÿ≥ÿßÿ≤€å ŸÖ€å‚Äå⁄©ŸÜÿØ. ÿß€åŸÜ ŸÖÿπŸÖÿßÿ±€å ÿßÿ≤ ŸÜŸàÿπ **Decoder-only** Ÿà ÿ®ÿ± Ÿæÿß€åŸá Transformer ÿßÿ≥ÿ™ÿå ÿ®ÿß ÿ®Ÿá€åŸÜŸá‚Äåÿ≥ÿßÿ≤€å‚ÄåŸáÿß€å€å ŸÖÿßŸÜŸÜÿØ **Rotary Embeddings**ÿå **ŸÖÿØ€åÿ±€åÿ™ ÿ≠ÿßŸÅÿ∏Ÿá (KV Caching)**ÿå Ÿà **ŸÖÿØŸÑ‚Äåÿ≥ÿßÿ≤€å ŸÖŸàÿßÿ≤€å (Model Parallelism)** ÿ®ÿß ÿßÿ≥ÿ™ŸÅÿßÿØŸá ÿßÿ≤ ⁄©ÿ™ÿßÿ®ÿÆÿßŸÜŸá `fairscale`.

ÿ®€åÿß€å€åÿØ ÿ¢ŸÜ ÿ±ÿß ⁄ØÿßŸÖ ÿ®Ÿá ⁄ØÿßŸÖ Ÿà ÿ®Ÿá ÿ≤ÿ®ÿßŸÜ ÿ≥ÿßÿØŸá ÿ™ÿ≠ŸÑ€åŸÑ ⁄©ŸÜ€åŸÖ:

---

## üß© ÿ≥ÿßÿÆÿ™ÿßÿ± ⁄©ŸÑ€å ⁄©ÿØ

| ÿ®ÿÆÿ¥                                                         | ÿ™Ÿàÿ∂€åÿ≠                                                        |
| ----------------------------------------------------------- | ------------------------------------------------------------ |
| `RMSNorm`                                                   | ŸÜÿ±ŸÖÿßŸÑ‚Äåÿ≥ÿßÿ≤€å RMSÿå ÿ¨ÿß€å⁄Øÿ≤€åŸÜ€å ÿ≥ÿ®⁄©‚Äåÿ™ÿ± Ÿà Ÿæÿß€åÿØÿßÿ±ÿ™ÿ± ÿ®ÿ±ÿß€å LayerNorm    |
| `apply_scaling`, `precompute_freqs_cis`, `apply_rotary_emb` | Ÿæ€åÿ¥‚ÄåŸæÿ±ÿØÿßÿ≤ÿ¥ ŸÖŸàŸÇÿπ€åÿ™‚Äå€åÿßÿ®€å ⁄Üÿ±ÿÆÿ¥€å (RoPE)                          |
| `Attention`                                                 | Ÿæ€åÿßÿØŸá‚Äåÿ≥ÿßÿ≤€å ⁄©ÿßŸÖŸÑ Attention ÿ®ÿß ⁄©ÿ¥ (KV cache) Ÿà Ÿæÿ¥ÿ™€åÿ®ÿßŸÜ€å ÿßÿ≤ GQA |
| `FeedForward`                                               | ÿ¥ÿ®⁄©Ÿá FFN ÿØŸà ÿ¥ÿßÿÆŸá‚Äåÿß€å ÿ®ÿß ÿ≥€å⁄ØŸÖŸà€åÿØ ÿ∫€åÿ±ÿÆÿ∑€å (SiLU)                 |
| `TransformerBlock`                                          | €å⁄© ÿ®ŸÑŸà⁄© ⁄©ÿßŸÖŸÑ ÿ¥ÿßŸÖŸÑ Attention + FFN                            |
| `Transformer`                                               | ŸÖÿØŸÑ ⁄©ÿßŸÖŸÑ LLaMA ÿ¥ÿßŸÖŸÑ ŸÑÿß€åŸá‚ÄåŸáÿß€å ŸÖÿ™ÿπÿØÿØÿå embedding Ÿà ÿÆÿ±Ÿàÿ¨€å ŸÜŸáÿß€å€å  |

---

## üîπ ÿ¨ÿ≤ÿ¶€åÿßÿ™ ŸÖŸáŸÖ Ÿáÿ± ÿ®ÿÆÿ¥

### üß† 1. `RMSNorm`

```python
class RMSNorm(nn.Module):
```

ŸÜÿ±ŸÖÿßŸÑ‚Äåÿ≥ÿßÿ≤€å ¬´ÿ±€åÿ¥Ÿá ŸÖ€åÿßŸÜ⁄Ø€åŸÜ ŸÖÿ±ÿ®ÿπÿßÿ™¬ªÿå ÿ®ÿØŸàŸÜ ⁄©ŸÖ‚Äå⁄©ÿ±ÿØŸÜ ŸÖ€åÿßŸÜ⁄Ø€åŸÜ. ÿß€åŸÜ ŸÜÿ±ŸÖÿßŸÑ‚Äåÿ≥ÿßÿ≤€å ÿ≥ÿ±€åÿπ‚Äåÿ™ÿ± Ÿà Ÿæÿß€åÿØÿßÿ±ÿ™ÿ± ÿßÿ≥ÿ™ ŸÜÿ≥ÿ®ÿ™ ÿ®Ÿá LayerNorm.

---

### üåÄ 2. `Rotary Embeddings`

```python
def apply_rotary_emb(xq, xk, freqs_cis): ...
```

ÿß€åÿØŸá RoPE ÿß€åŸÜ ÿßÿ≥ÿ™ ⁄©Ÿá ŸÖŸàŸÇÿπ€åÿ™ ÿ™Ÿà⁄©ŸÜ‚ÄåŸáÿß ÿ±ÿß ÿ®Ÿá‚ÄåÿµŸàÿ±ÿ™ ÿ™Ÿàÿßÿ®ÿπ ÿ≥€åŸÜŸàÿ≥€å Ÿæ€å⁄Üÿ¥€å Ÿàÿßÿ±ÿØ ŸÅÿ∂ÿß€å attention ⁄©ŸÜ€åŸÖ. ÿß€åŸÜ ÿ®ÿßÿπÿ´ ŸÖ€å‚Äåÿ¥ŸàÿØ ÿßÿ∑ŸÑÿßÿπÿßÿ™ ÿ™ÿ±ÿ™€åÿ®€å (position) ÿØÿ± dot-product attention ŸÑÿ≠ÿßÿ∏ ÿ¥ŸàÿØ.

* `precompute_freqs_cis` ŸÅÿ±⁄©ÿßŸÜÿ≥‚ÄåŸáÿß ÿ±ÿß ÿ®ÿ±ÿß€å ŸÖŸàŸÇÿπ€åÿ™‚ÄåŸáÿß Ÿæ€åÿ¥‚ÄåŸÖÿ≠ÿßÿ≥ÿ®Ÿá ŸÖ€å‚Äå⁄©ŸÜÿØ.
* `apply_rotary_emb` ÿß€åŸÜ ŸÅÿ±⁄©ÿßŸÜÿ≥‚ÄåŸáÿß ÿ±ÿß ÿ®Ÿá query Ÿà key ÿßÿπŸÖÿßŸÑ ŸÖ€å‚Äå⁄©ŸÜÿØ.

---

### ‚ö° 3. `Attention`

```python
class Attention(nn.Module): ...
```

ŸÑÿß€åŸá attention ŸÖÿØŸÑ ÿ®ÿß Ÿæÿ¥ÿ™€åÿ®ÿßŸÜ€å ÿßÿ≤ ŸÖŸàÿßÿ±ÿØ ÿ≤€åÿ±:

* **GQA**: Grouped Query Attention (ŸÖÿ´ŸÑ LLaMA 2/3).
* **Model Parallelism**: ÿ®ÿß `ColumnParallelLinear` Ÿà `RowParallelLinear`.
* **KV Cache**: ŸÜ⁄ØŸá‚ÄåÿØÿßÿ¥ÿ™ŸÜ key/value ÿ®ÿ±ÿß€å ÿ¨ŸÑŸà⁄Ø€åÿ±€å ÿßÿ≤ ŸÖÿ≠ÿßÿ≥ÿ®Ÿá ÿØŸàÿ®ÿßÿ±Ÿá ÿØÿ± ÿ∑ŸàŸÑ decoding.
* **Rotary Embeddings**: ÿ®ÿß `apply_rotary_emb` ÿ±Ÿà€å xq/xk.
* **Causal Masking**: ÿ®ÿß `mask` ÿØÿ± dot-product attention ÿ®ÿ±ÿß€å ÿ¨ŸÑŸà⁄Ø€åÿ±€å ÿßÿ≤ ŸÜ⁄ØÿßŸá ÿ®Ÿá ÿ¢€åŸÜÿØŸá.

---

### ‚öôÔ∏è 4. `FeedForward`

```python
class FeedForward(nn.Module): ...
```

€å⁄© ŸÑÿß€åŸá FFN ⁄©Ÿá Ÿàÿ±ŸàÿØ€å ÿ±ÿß ÿ®Ÿá hidden dimension ⁄Øÿ≥ÿ™ÿ±ÿ¥ ŸÖ€å‚ÄåÿØŸáÿØ Ÿà ÿ®ÿß ÿ≥€å⁄ØŸÖŸà€åÿØ SiLU Ÿà ÿ∂ÿ±ÿ® pointwise ÿÆÿ±Ÿàÿ¨€å ÿ™ŸàŸÑ€åÿØ ŸÖ€å‚Äå⁄©ŸÜÿØ. ÿßÿ≤ €≥ ŸÑÿß€åŸá ÿÆÿ∑€å ÿßÿ≥ÿ™ŸÅÿßÿØŸá ÿ¥ÿØŸá (w1ÿå w2ÿå w3) ÿ®ÿ±ÿß€å ÿßŸÅÿ≤ÿß€åÿ¥ ÿßŸÜÿπÿ∑ÿßŸÅ‚ÄåŸæÿ∞€åÿ±€å.

---

### üß± 5. `TransformerBlock`

```python
class TransformerBlock(nn.Module): ...
```

€å⁄© ÿ®ŸÑÿß⁄© ⁄©ÿßŸÖŸÑ ÿ¥ÿßŸÖŸÑ:

* `RMSNorm ‚Üí Attention ‚Üí Residual`
* `RMSNorm ‚Üí FFN ‚Üí Residual`

ÿ®ÿß ÿ™ÿ±ÿ™€åÿ® ŸÖÿ¥ÿßÿ®Ÿá ŸÖÿπŸÖÿßÿ±€å LLaMA (norm-first).

---

### üèóÔ∏è 6. `Transformer`

```python
class Transformer(nn.Module): ...
```

ŸÖÿØŸÑ ŸÜŸáÿß€å€å ÿ¥ÿßŸÖŸÑ:

* Embedding ÿßŸàŸÑ€åŸá ÿ®ÿß `VocabParallelEmbedding`
* Ÿæÿ¥ÿ™Ÿá‚Äåÿß€å ÿßÿ≤ `TransformerBlock`s
* Normalization ŸÜŸáÿß€å€å
* ÿÆÿ±Ÿàÿ¨€å `ColumnParallelLinear` ÿ®Ÿá ÿßŸÜÿØÿßÿ≤Ÿá‚Äå€å Ÿàÿß⁄ò⁄ØÿßŸÜ

ŸáŸÖ⁄ÜŸÜ€åŸÜ:

* `precompute_freqs_cis`: ŸÅÿ±⁄©ÿßŸÜÿ≥‚ÄåŸáÿß€å RoPE ÿßÿ≤ ŸÇÿ®ŸÑ ŸÖÿ≠ÿßÿ≥ÿ®Ÿá ŸÖ€å‚Äåÿ¥ŸàÿØ.
* `@torch.inference_mode()`: ŸÜÿ¥ÿßŸÜ ŸÖ€å‚ÄåÿØŸáÿØ ⁄©Ÿá ÿß€åŸÜ `forward()` ÿ®ÿ±ÿß€å inference ÿßÿ≥ÿ™.
* `mask`: ÿ®ÿ±ÿß€å causal attention ÿßÿ≥ÿ™ŸÅÿßÿØŸá ŸÖ€å‚Äåÿ¥ŸàÿØ.

---



ü¶æ ÿ™ŸÖÿ±€åŸÜ

ŸÖŸÇÿßŸÑÿßÿ™ Transformerÿå BERTÿå GPT-2 Ÿà GPT-3 ÿ±ÿß ŸÖÿ∑ÿßŸÑÿπŸá ⁄©ŸÜ€åÿØ Ÿà ÿÆŸÑÿßÿµŸá‚Äåÿß€å ÿßÿ≤ ÿ¢ŸÜŸáÿß ÿ®ŸÜŸà€åÿ≥€åÿØ. ŸÜ€åÿßÿ≤€å ÿ®Ÿá ŸÖÿ∑ÿßŸÑÿπŸá ÿÆÿ∑ ÿ®Ÿá ÿÆÿ∑ ŸÜ€åÿ≥ÿ™ÿõ ŸÖÿ±Ÿàÿ± ÿß€åÿØŸá‚ÄåŸáÿß€å ÿßÿµŸÑ€å ⁄©ÿßŸÅ€å ÿßÿ≥ÿ™. ÿü

---

## üî∑ 1. **Transformer (Vaswani et al., 2017) ‚Äì "Attention is All You Need"**

### üß† ÿß€åÿØŸá‚Äå€å ÿßÿµŸÑ€å:

> ⁄©ŸÜÿßÿ± ⁄Øÿ∞ÿßÿ¥ÿ™ŸÜ ÿ≥ÿßÿÆÿ™ÿßÿ±Ÿáÿß€å ŸÇÿØ€åŸÖ€å ŸÖÿ´ŸÑ RNN Ÿà LSTM Ÿà ÿßÿ≥ÿ™ŸÅÿßÿØŸá ÿµÿ±ŸÅ ÿßÿ≤ **ŸÖ⁄©ÿßŸÜ€åÿ≤ŸÖ Attention** ÿ®ÿ±ÿß€å ŸÖÿØŸÑ‚Äåÿ≥ÿßÿ≤€å ÿØŸÜÿ®ÿßŸÑŸá‚ÄåŸáÿß (sequences).

### üìå ŸÜ⁄©ÿßÿ™ ⁄©ŸÑ€åÿØ€å:

* **Self-Attention:** Ÿáÿ± ÿ™Ÿà⁄©ŸÜ ÿ®ÿß ÿ≥ÿß€åÿ± ÿ™Ÿà⁄©ŸÜ‚ÄåŸáÿß ÿ™ÿπÿßŸÖŸÑ ÿØÿßÿ±ÿØ ÿ™ÿß ÿ≤ŸÖ€åŸÜŸáŸî ŸÖÿπŸÜÿß€å€å ÿ±ÿß ÿ®Ÿáÿ™ÿ± ÿØÿ±⁄© ⁄©ŸÜÿØ.
* **Multi-Head Attention:** ⁄ÜŸÜÿØ€åŸÜ attention ŸÖŸàÿßÿ≤€å ÿßÿ¨ÿ±ÿß ŸÖ€å‚Äåÿ¥ŸàŸÜÿØ ÿ™ÿß ÿßŸÜŸàÿßÿπ ŸÖÿÆÿ™ŸÑŸÅ ÿ±Ÿàÿßÿ®ÿ∑ ÿ±ÿß ÿ®€åÿßŸÖŸàÿ≤ÿØ.
* **ÿ®ÿØŸàŸÜ RNN €åÿß CNN**: ⁄©ÿßŸÖŸÑÿßŸã ŸÖŸàÿßÿ≤€å‚Äåÿ≥ÿßÿ≤€å‚Äåÿ¥ÿØŸáÿå ÿ≥ÿ±€åÿπ‚Äåÿ™ÿ± ÿ®ÿ±ÿß€å ÿ¢ŸÖŸàÿ≤ÿ¥.
* ŸÖÿØŸÑ ÿ¥ÿßŸÖŸÑ €å⁄© **encoder-decoder** ÿ®ÿß ŸÑÿß€åŸá‚ÄåŸáÿß€å attention Ÿà feed-forward ÿßÿ≥ÿ™.

### üåç ÿ™ÿ£ÿ´€åÿ±:

Ÿæÿß€åŸá‚Äå⁄Øÿ∞ÿßÿ± ÿ™ŸÖÿßŸÖ ŸÖÿØŸÑ‚ÄåŸáÿß€å ÿ®ÿ≤ÿ±⁄Ø ÿ®ÿπÿØ€å ŸÖÿ´ŸÑ BERTÿå GPTÿå T5 Ÿà LLaMA.

---

## üî∑ 2. **BERT (Devlin et al., 2018) ‚Äì "Bidirectional Encoder Representations from Transformers"**

### üß† ÿß€åÿØŸá‚Äå€å ÿßÿµŸÑ€å:

> ŸÖÿØŸÑ ÿ≤ÿ®ÿßŸÜ€å **ÿØŸàÿ∑ÿ±ŸÅŸá (bidirectional)** ⁄©Ÿá ⁄©ŸÑ ÿ¨ŸÖŸÑŸá ÿ±ÿß (ŸÇÿ®ŸÑ Ÿà ÿ®ÿπÿØ) ÿØÿ±⁄© ŸÖ€å‚Äå⁄©ŸÜÿØÿõ ÿ¢ŸÖŸàÿ≤ÿ¥ ÿ®Ÿá‚ÄåÿµŸàÿ±ÿ™ ÿßÿ≤‚ÄåŸæ€åÿ¥ (pretraining) Ÿà ÿ®ÿπÿØ ÿßÿ≥ÿ™ŸÅÿßÿØŸá ÿØÿ± Ÿàÿ∏ÿß€åŸÅ ŸÖÿÆÿ™ŸÑŸÅ.

### üìå ŸÜ⁄©ÿßÿ™ ⁄©ŸÑ€åÿØ€å:

* ÿßÿ≤ **encoder**Ÿáÿß€å ÿ™ÿ±ŸÜÿ≥ŸÅŸàÿ±ŸÖÿ± ÿßÿ≥ÿ™ŸÅÿßÿØŸá ŸÖ€å‚Äå⁄©ŸÜÿØ (ŸÜŸá decoder).
* **Masked Language Modeling (MLM):** ÿØÿ± ÿ≠€åŸÜ ÿ¢ŸÖŸàÿ≤ÿ¥ÿå ÿ®ÿ±ÿÆ€å ÿ™Ÿà⁄©ŸÜ‚ÄåŸáÿß ŸæŸÜŸáÿßŸÜ ŸÖ€å‚Äåÿ¥ŸàŸÜÿØ Ÿà ŸÖÿØŸÑ ÿ®ÿß€åÿØ ÿ¢ŸÜ‚ÄåŸáÿß ÿ±ÿß ÿ≠ÿØÿ≥ ÿ®ÿ≤ŸÜÿØ.
* **Next Sentence Prediction (NSP):** ÿ™ÿ¥ÿÆ€åÿµ ÿØŸáÿØ ⁄©Ÿá ÿ¢€åÿß ÿØŸà ÿ¨ŸÖŸÑŸá‚Äå€å Ÿæÿ¥ÿ™‚Äåÿ≥ÿ±ŸáŸÖ ÿ®Ÿá ŸáŸÖ ŸÖÿ±ÿ®Ÿàÿ∑‚ÄåÿßŸÜÿØ €åÿß ŸÜŸá.
* ŸÖŸÜÿßÿ≥ÿ® ÿ®ÿ±ÿß€å **ÿØÿ±⁄© ŸÖÿ™ŸÜ**ÿå ŸÜŸá ÿ™ŸàŸÑ€åÿØ ŸÖÿ™ŸÜ.

### üåç ÿ™ÿ£ÿ´€åÿ±:

ÿßŸÜŸÇŸÑÿßÿ®€å ÿØÿ± **ÿØÿ±⁄© ÿ≤ÿ®ÿßŸÜ** (ŸÖÿ´ŸÑÿßŸã Ÿæÿßÿ≥ÿÆ‚Äå ÿ®Ÿá ÿ≥ŸàÿßŸÑÿå ÿ∑ÿ®ŸÇŸá‚Äåÿ®ŸÜÿØ€åÿå ÿ™ÿ±ÿ¨ŸÖŸá). ÿßŸÑŸáÿßŸÖ‚Äåÿ®ÿÆÿ¥ RoBERTaÿå ALBERTÿå Ÿà DistilBERT.

---

## üî∑ 3. **GPT-2 (Radford et al., 2019) ‚Äì "Language Models are Unsupervised Multitask Learners"**

### üß† ÿß€åÿØŸá‚Äå€å ÿßÿµŸÑ€å:

> €å⁄© ŸÖÿØŸÑ **ÿ≤ÿ®ÿßŸÜ ÿÆŸàÿØÿ±⁄Øÿ±ÿ≥€åŸà (Auto-regressive)** ⁄©Ÿá ŸÅŸÇÿ∑ ÿ®ÿß Ÿæ€åÿ¥‚Äåÿ®€åŸÜ€å ⁄©ŸÑŸÖŸá ÿ®ÿπÿØ€åÿå ŸÖ€å‚Äåÿ™ŸàÿßŸÜÿØ ÿ∑€åŸÅ Ÿàÿ≥€åÿπ€å ÿßÿ≤ Ÿàÿ∏ÿß€åŸÅ ÿ≤ÿ®ÿßŸÜ€å ÿ±ÿß ÿßŸÜÿ¨ÿßŸÖ ÿØŸáÿØ.

### üìå ŸÜ⁄©ÿßÿ™ ⁄©ŸÑ€åÿØ€å:

* ÿßÿ≤ **decoder-only Transformer** ÿßÿ≥ÿ™ŸÅÿßÿØŸá ŸÖ€å‚Äå⁄©ŸÜÿØ (ÿ®ÿ±ÿÆŸÑÿßŸÅ BERT ⁄©Ÿá encoder-only ÿßÿ≥ÿ™).
* ŸÅŸÇÿ∑ ÿ®ÿß **language modeling ŸÖÿπŸÖŸàŸÑ€å** ÿ¢ŸÖŸàÿ≤ÿ¥ ŸÖ€å‚Äåÿ®€åŸÜÿØ (ÿ®ÿØŸàŸÜ ŸÖÿßÿ≥⁄© €åÿß NSP).
* ÿ±Ÿà€å ÿØ€åÿ™ÿßÿ≥ÿ™ ÿπÿ∏€åŸÖ **WebText** ÿ¢ŸÖŸàÿ≤ÿ¥ ÿØ€åÿØŸá.
* Ÿá€å⁄Ü fine-tuning ÿÆÿßÿµ€å ÿßŸÜÿ¨ÿßŸÖ ŸÜŸÖ€å‚Äåÿ¥ŸàÿØÿõ ŸÅŸÇÿ∑ ÿßÿ≤ **prompting** ÿßÿ≥ÿ™ŸÅÿßÿØŸá ŸÖ€å‚Äåÿ¥ŸàÿØ.

### üåç ÿ™ÿ£ÿ´€åÿ±:

ŸÜÿ¥ÿßŸÜ ÿØÿßÿØ ⁄©Ÿá €å⁄© ŸÖÿØŸÑ ÿ®ÿ≤ÿ±⁄Ø ÿ≤ÿ®ÿßŸÜ€å ŸÖ€å‚Äåÿ™ŸàÿßŸÜÿØ ÿ®ÿØŸàŸÜ ÿ¢ŸÖŸàÿ≤ÿ¥ ÿßÿÆÿ™ÿµÿßÿµ€åÿå **⁄ÜŸÜÿØ Ÿàÿ∏€åŸÅŸá ŸÖÿÆÿ™ŸÑŸÅ ÿ±ÿß ŸáŸÖ‚Äåÿ≤ŸÖÿßŸÜ ÿßŸÜÿ¨ÿßŸÖ ÿØŸáÿØ** (zero-shot, one-shot, few-shot).

---

## üî∑ 4. **GPT-3 (Brown et al., 2020) ‚Äì "Language Models are Few-Shot Learners"**

### üß† ÿß€åÿØŸá‚Äå€å ÿßÿµŸÑ€å:

> ÿ®ÿß **ÿßŸÅÿ≤ÿß€åÿ¥ ÿ¥ÿØ€åÿØ ÿßŸÜÿØÿßÿ≤Ÿá ŸÖÿØŸÑ Ÿà ÿØÿßÿØŸá‚ÄåŸáÿß**ÿå ŸÖ€å‚Äåÿ™ŸàÿßŸÜ €åÿßÿØ⁄Ø€åÿ±€å few-shot ÿ±ÿß ÿ®Ÿáÿ®ŸàÿØ ÿØÿßÿØ ÿ®ÿØŸàŸÜ ŸÜ€åÿßÿ≤ ÿ®Ÿá fine-tuning.

### üìå ŸÜ⁄©ÿßÿ™ ⁄©ŸÑ€åÿØ€å:

* 175 ŸÖ€åŸÑ€åÿßÿ±ÿØ Ÿæÿßÿ±ÿßŸÖÿ™ÿ± (ÿ®ÿ≤ÿ±⁄Ø‚Äåÿ™ÿ±€åŸÜ ÿØÿ± ÿ≤ŸÖÿßŸÜ ÿÆŸàÿØ).
* ŸáŸÖÿßŸÜ ÿ≥ÿßÿÆÿ™ÿßÿ± GPT-2 (decoder-only, auto-regressive) ŸàŸÑ€å ÿ®ÿ≥€åÿßÿ± ÿ®ÿ≤ÿ±⁄Ø‚Äåÿ™ÿ±.
* ŸÅŸÇÿ∑ ÿ®ÿß prompting ŸÖ€å‚Äåÿ™ŸàÿßŸÜ ŸÖÿØŸÑ ÿ±ÿß ÿ®ÿ±ÿß€å ÿ≠ŸÑ ŸÖÿ≥ÿßÿ¶ŸÑ€å ŸÖÿ´ŸÑ ÿ™ÿ±ÿ¨ŸÖŸáÿå ÿÆŸÑÿßÿµŸá‚Äåÿ≥ÿßÿ≤€åÿå ⁄©ÿØŸÜŸà€åÿ≥€åÿå Ÿà... ŸáÿØÿß€åÿ™ ⁄©ÿ±ÿØ.
* ÿØ€å⁄Øÿ± ŸÜ€åÿßÿ≤ ÿ®Ÿá ÿ¢ŸÖŸàÿ≤ÿ¥ ŸÖÿ¨ÿØÿØ (fine-tuning) ŸÜ€åÿ≥ÿ™ ‚Äî ŸÅŸÇÿ∑ ŸÖÿ´ÿßŸÑ ÿ®ÿØŸá Ÿà ŸÖÿØŸÑ €åÿßÿØ ŸÖ€å‚Äå⁄Ø€åÿ±ÿØ.

### üåç ÿ™ÿ£ÿ´€åÿ±:

ŸÜÿ¥ÿßŸÜ ÿØÿßÿØ ⁄©Ÿá ŸÖŸÇ€åÿßÿ≥‚ÄåÿØŸá€å ÿπÿ∏€åŸÖ ŸÖ€å‚Äåÿ™ŸàÿßŸÜÿØ ÿ®Ÿá **€åÿßÿØ⁄Ø€åÿ±€å ÿπŸÖŸàŸÖ€å Ÿà ŸÖŸÜÿπÿ∑ŸÅ** ŸÖŸÜÿ¨ÿ± ÿ¥ŸàÿØÿõ ŸÜŸÇÿ∑ŸáŸî ÿπÿ∑ŸÅ ÿØÿ± ÿ≥ÿßÿÆÿ™ ŸÖÿØŸÑ‚ÄåŸáÿß€å ŸÖŸàŸÑÿØ ÿπŸÖŸàŸÖ€å ŸÖÿ´ŸÑ ChatGPT Ÿà Copilot.

---

## üìä ŸÖŸÇÿß€åÿ≥Ÿá ⁄©Ÿàÿ™ÿßŸá:

| ŸÖÿØŸÑ         | ÿ≥ÿßÿÆÿ™ÿßÿ±            | ŸÜŸàÿπ €åÿßÿØ⁄Ø€åÿ±€å                 | ŸáÿØŸÅ               | ⁄©ÿßÿ±ÿ®ÿ±ÿØ ÿßÿµŸÑ€å               |
| ----------- | ----------------- | --------------------------- | ----------------- | ------------------------- |
| Transformer | Encoder + Decoder | Supervised (ÿ™ÿ±ÿ¨ŸÖŸá)          | ŸÖÿπŸÖÿßÿ±€å Ÿæÿß€åŸá       | Ÿæÿß€åŸáŸî ŸáŸÖŸá ŸÖÿØŸÑ‚ÄåŸáÿß€å ÿ®ÿπÿØ€å    |
| BERT        | Encoder-only      | Pretraining + Fine-tuning   | ÿØÿ±⁄© ÿ≤ÿ®ÿßŸÜ          | ÿ∑ÿ®ŸÇŸá‚Äåÿ®ŸÜÿØ€åÿå Ÿæÿßÿ≥ÿÆ‚Äåÿ®Ÿá‚Äåÿ≥ŸàÿßŸÑ   |
| GPT-2       | Decoder-only      | ŸÅŸÇÿ∑ Pretraining             | ÿ™ŸàŸÑ€åÿØ ŸÖÿ™ŸÜ         | ŸÖÿ™ŸÜ‚ÄåŸÜŸà€åÿ≥€åÿå multi-task     |
| GPT-3       | Decoder-only      | ŸÅŸÇÿ∑ Pretraining + Prompting | few-shot learning | €åÿßÿØ⁄Ø€åÿ±€å ŸÖŸÜÿπÿ∑ŸÅÿå general AI |

---

ÿß⁄Øÿ± ÿ®ÿÆŸàÿß€åÿå ŸÖ€å‚Äåÿ™ŸàŸÜŸÖ ÿß€åŸÜ ÿßÿ∑ŸÑÿßÿπÿßÿ™ ÿ±Ÿà ÿ™Ÿà€å ÿ¨ÿØŸàŸÑ PDF €åÿß ŸÜŸÖŸàÿØÿßÿ± ⁄Øÿ±ÿßŸÅ€å⁄©€å ŸáŸÖ ÿØÿ±ÿ®€åÿßÿ±ŸÖÿå €åÿß ÿÆŸÑÿßÿµŸá ÿßŸÜ⁄ØŸÑ€åÿ≥€å‚Äåÿ¥ŸàŸÜ ÿ±Ÿà ŸáŸÖ ÿßÿ±ÿßÿ¶Ÿá ÿ®ÿØŸÖ.
