<a href="https://colab.research.google.com/github/ZYF-B/Pytorch_learning/blob/main/GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
import matplotlib.pyplot as plt
%matplotlib inline


torch.manual_seed(1024)

<torch._C.Generator at 0x7a27b0161ab0>

In [None]:
# Hyperparameters
learning_rate = 1e-3  # step size handed to the optimizer when training is launched
batch_size = 128  # number of sequences drawn per batch
sequence_len = 64  # tokens per sequence; also the size of the positional table and the causal mask
n_head = 4  # attention heads per block (used below to derive head_size)
n_layer = 8  # number of stacked transformer blocks
emb_size = 64  # embedding width used throughout the model
head_size = emb_size//n_head  # per-head width, so the concatenated heads span emb_size again
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use the GPU when one is available

In [None]:
# Fetch the tiny_shakespeare dataset through the Hugging Face `datasets` library.
# NOTE(review): the recorded run shows an interactive "run the custom code? [y/N]"
# prompt at this call; the library message suggests passing trust_remote_code=True
# to avoid it — confirm that is acceptable before changing the call.
raw_datasets = load_dataset('tiny_shakespeare')
# Row 0 of the 'text' column is taken as the full text of each split
# (the recorded output reports one example per split).
train_data = raw_datasets['train']['text'][0]
val_data = raw_datasets['validation']['text'][0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.10k [00:00<?, ?B/s]

The repository for tiny_shakespeare contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/tiny_shakespeare.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/435k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
class CharTokenizer:
    """Character-level tokenizer with one reserved end-of-text token.

    Every distinct character found in ``data`` becomes one token. The special
    token ``'<|e|>'`` is stored under the id ``end_ind``.

    Attributes:
        char2ind: dict mapping each character (and ``'<|e|>'``) to its id.
        ind2char: inverse mapping, id -> character / ``'<|e|>'``.
        end_ind:  id of the end-of-text token.
    """

    def __init__(self, data, end_ind=0):
        """Build the vocabulary.

        Args:
            data: a string or a list of strings; the vocabulary is the sorted
                set of characters it contains.
            end_ind: id reserved for the end-of-text token (default 0).
        """
        # Collect every distinct character, in sorted order.
        chars = sorted(set(''.join(data)))
        if end_ind in range(1, len(chars) + 1):
            # The reserved id falls inside the default character range 1..n.
            # Hand out 0..n while skipping it, so that no character shares an
            # id with the end token (previously the character was silently
            # dropped from ind2char).
            char_ids = [i for i in range(len(chars) + 1) if i != end_ind]
        else:
            # No collision possible: keep the original 1..n numbering.
            char_ids = range(1, len(chars) + 1)
        self.char2ind = dict(zip(chars, char_ids))
        self.char2ind['<|e|>'] = end_ind
        self.ind2char = {v: k for k, v in self.char2ind.items()}
        self.end_ind = end_ind

    def encode(self, x):
        """Map a string to a list of ids, one per character.

        Raises:
            KeyError: if ``x`` contains a character that is not in the vocabulary.
        """
        return [self.char2ind[i] for i in x]

    def decode(self, x):
        """Map ids back to text pieces.

        Args:
            x: a single ``int`` id, or an iterable of ids.

        Returns:
            The matching string for a single ``int``; otherwise a list with one
            string per id (join it to obtain the text).
        """
        if isinstance(x, int):
            return self.ind2char[x]
        return [self.ind2char[i] for i in x]

# Build the tokenizer from the training text and sanity-check it: encode a short
# string, print the vocabulary size, then display every vocabulary entry in id order.
tokenizer = CharTokenizer(train_data)
test_str = 'RES'
# NOTE(review): `re` is also the name of the standard-library regex module; it is
# not imported in the visible cells, but a different name would avoid confusion.
re = tokenizer.encode(test_str)
print(re)
print(len(tokenizer.char2ind))
''.join(tokenizer.decode(range(len(tokenizer.char2ind))))

[31, 18, 32]
66


"<|e|>\n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

In [None]:
@torch.no_grad()
def generate(model, context, tokenizer, max_new_tokens=300):
    # context: (1, T)
    #out = []
    out = context.tolist()[0]
    model.eval()
    for _ in range(max_new_tokens):
        logits = model(context[:, -sequence_len:])    # (1, T, vs) 因为注意力机制，截断长度
        probs = F.softmax(logits[:, -1, :], dim=-1)  # (1, vs)
        ix = torch.multinomial(probs, num_samples=1)  # (1, 1)
        context = torch.concat((context, ix), dim=-1)
        out.append(ix.item())
        if out[-1] == tokenizer.end_ind:
          break
    model.train()
    return out

In [None]:
# Encode each split once into a flat int64 tensor of token ids; batches are sliced from these.
train_datas = torch.tensor(tokenizer.encode(train_data), dtype=torch.long)
val_datas = torch.tensor(tokenizer.encode(val_data), dtype=torch.long)

In [None]:
def get_batch(split, tokenizer):
    """Draw a random batch of input/target windows from one split.

    Args:
        split: ``'train'`` selects ``train_datas``; any other value selects ``val_datas``.
        tokenizer: accepted for call-site symmetry; not used here.

    Returns:
        (x, y): two tensors of shape (batch_size, sequence_len) on ``device``,
        where ``y`` is ``x`` shifted one position to the right in the source data.
    """
    source = val_datas if split != 'train' else train_datas
    # Start offsets are chosen so that the shifted target window still fits.
    starts = torch.randint(len(source) - sequence_len, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + sequence_len
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [None]:
@torch.no_grad()
def estimate_loss(model, tokenizer, eval_iters=100):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split, tokenizer)
            logits = model(X)
            loss = F.cross_entropy(logits.transpose(-2, -1), Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
def train(model, tokenizer, optimizer, max_step = 5000, eval_step = 200):
    """Run the optimisation loop, reporting the estimated losses periodically.

    Args:
        model: module mapping ids (B, T) to logits (B, T, vs).
        tokenizer: forwarded to ``estimate_loss`` and ``get_batch``.
        optimizer: optimizer already bound to the model's parameters.
        max_step: number of optimisation steps to run.
        eval_step: losses are printed every ``eval_step`` steps and on the final step.
    """
    for step in range(max_step):
        should_report = step % eval_step == 0 or step == max_step - 1
        if should_report:
            losses = estimate_loss(model, tokenizer=tokenizer)
            print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        inputs, targets = get_batch('train', tokenizer=tokenizer)
        # cross_entropy expects the class axis second: (B, vs, T) against targets (B, T).
        predictions = model(inputs).transpose(-2, -1)
        loss = F.cross_entropy(predictions, targets)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

In [None]:
def attention(query, key, value, dropout, mask=None):
    """Scaled dot-product attention.

    Args:
        query, key, value: tensors of shape (B, T, H).
        dropout: callable applied to the attended output before it is returned.
        mask: optional (T, T) tensor; positions where it equals 0 are excluded.

    Returns:
        Tensor of shape (B, T, H).
    """
    _, _, head_dim = query.shape
    scores = torch.matmul(query, key.transpose(-2, -1)) / head_dim ** 0.5  # (B, T, T)
    if mask is not None:
        # Blocked positions get -inf so they receive zero weight after the softmax.
        blocked = mask == 0
        scores = scores.masked_fill(blocked, float('-inf'))
    weights = scores.softmax(dim=-1)       # (B, T, T)
    attended = torch.matmul(weights, value)  # (B, T, H)
    return dropout(attended)

In [None]:
class MaskedSelfAttention(nn.Module):
    """One causal (left-to-right) self-attention head."""

    def __init__(self, emb_size, head_size):
        """Create the projections and the causal mask.

        Args:
            emb_size: input width C.
            head_size: output width H of this head.
        """
        super().__init__()
        # Bias-free projections, registered in the order key, query, value.
        for name in ('key', 'query', 'value'):
            setattr(self, name, nn.Linear(emb_size, head_size, bias=False))
        # Lower-triangular matrix: position t may only attend to positions <= t.
        causal = torch.ones(sequence_len, sequence_len).tril()
        self.register_buffer('tril', causal)
        self.dp = nn.Dropout(0.2)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C) and return a tensor of shape (B, T, H)."""
        _, seq_len, _ = x.shape
        # Cut the mask down to the actual sequence length.
        causal_mask = self.tril[:seq_len, :seq_len]
        return attention(self.query(x), self.key(x), self.value(x), self.dp, causal_mask)

In [None]:
class MaskedMultiHeadAttention(nn.Module):
    """Several causal attention heads run side by side, then mixed by a linear layer."""

    def __init__(self, emb_size, head_size):
        """Create ``emb_size // head_size`` heads plus the output projection.

        Args:
            emb_size: model width C.
            head_size: width of each individual head.
        """
        super().__init__()
        num_heads = emb_size // head_size
        self.heads = nn.ModuleList()
        for _ in range(num_heads):
            self.heads.append(MaskedSelfAttention(emb_size, head_size))
        self.proj = nn.Linear(emb_size, emb_size)
        self.dp = nn.Dropout(0.2)

    def forward(self, x):
        """Map ``x`` of shape (B, T, C) to a tensor of shape (B, T, C)."""
        head_outputs = []
        for head in self.heads:
            head_outputs.append(head(x))
        # Concatenating the heads along the feature axis restores width C.
        merged = torch.concat(head_outputs, dim=-1)  # (B, T, C)
        return self.dp(self.proj(merged))            # (B, T, C)

In [None]:
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: expand to 4x the width, GELU, project back, dropout."""

    def __init__(self, emb_size):
        """Create both linear layers for a model of width ``emb_size``."""
        super().__init__()
        hidden_size = 4 * emb_size
        self.ln1 = nn.Linear(emb_size, hidden_size)
        self.ln2 = nn.Linear(hidden_size, emb_size)
        self.dp = nn.Dropout(0.2)

    def forward(self, x):
        """Map ``x`` of shape (B, T, C) to a tensor of the same shape."""
        hidden = self.ln1(x)          # (B, T, 4C)
        hidden = F.gelu(hidden)       # (B, T, 4C)
        projected = self.ln2(hidden)  # (B, T, C)
        return self.dp(projected)

In [None]:
class Block(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each inside a residual branch."""

    def __init__(self, emb_size, head_size):
        """Create the two layer norms, the attention module and the MLP.

        Args:
            emb_size: model width C.
            head_size: width of each attention head.
        """
        super().__init__()
        self.l1 = nn.LayerNorm(emb_size)
        self.mha = MaskedMultiHeadAttention(emb_size, head_size)
        self.l2 = nn.LayerNorm(emb_size)
        self.ff = FeedForward(emb_size)

    def forward(self, x):
        """Map ``x`` of shape (B, T, C) to a tensor of the same shape."""
        # Each sub-layer sees a normalised input; its output is added back onto x.
        attended = self.mha(self.l1(x))
        x = x + attended
        transformed = self.ff(self.l2(x))
        return x + transformed

In [None]:
class GPT(nn.Module):
    """Decoder-only transformer language model over token ids.

    Token and learned positional embeddings are summed, passed through
    ``n_layer`` blocks, layer-normalised and projected to vocabulary logits.
    """

    def __init__(self, vs):
        """Build the model.

        Args:
            vs: vocabulary size (number of token ids and of output logits).
        """
        super().__init__()
        self.token_emb = nn.Embedding(vs, emb_size)
        self.pos_emb = nn.Embedding(sequence_len, emb_size)
        block = [Block(emb_size, head_size) for _ in range(n_layer)]
        self.blocks = nn.Sequential(*block)
        self.l = nn.LayerNorm(emb_size)
        self.lm = nn.Linear(emb_size, vs)

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: integer tensor of token ids, shape (B, T).

        Returns:
            Tensor of logits, shape (B, T, vs).

        Raises:
            ValueError: if T exceeds the number of positions in the positional table.
        """
        B, T = x.shape
        max_len = self.pos_emb.num_embeddings
        if T > max_len:
            # Fail early with a readable message rather than an out-of-range
            # embedding lookup further down.
            raise ValueError(
                f"sequence length {T} exceeds the maximum supported length {max_len}"
            )
        pos = torch.arange(0, T, dtype=torch.long, device=x.device)
        token_embeddings = self.token_emb(x)        # (B, T, C)
        position_embeddings = self.pos_emb(pos)     # (T, C), broadcast over the batch
        h = token_embeddings + position_embeddings  # (B, T, C)
        h = self.blocks(h)                          # (B, T, C)
        logits = self.lm(self.l(h))                 # (B, T, vs)
        return logits

In [None]:
# Build the model with one output logit per vocabulary entry and move it to the selected device.
model = GPT(len(tokenizer.char2ind)).to(device)
# Display the module tree together with the total number of parameters.
model, sum(p.numel() for p in model.parameters())

(GPT(
   (token_emb): Embedding(66, 64)
   (pos_emb): Embedding(64, 64)
   (blocks): Sequential(
     (0): Block(
       (l1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
       (mha): MaskedMultiHeadAttention(
         (heads): ModuleList(
           (0-3): 4 x MaskedSelfAttention(
             (key): Linear(in_features=64, out_features=16, bias=False)
             (query): Linear(in_features=64, out_features=16, bias=False)
             (value): Linear(in_features=64, out_features=16, bias=False)
             (dp): Dropout(p=0.2, inplace=False)
           )
         )
         (proj): Linear(in_features=64, out_features=64, bias=True)
         (dp): Dropout(p=0.2, inplace=False)
       )
       (l2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
       (ff): FeedForward(
         (ln1): Linear(in_features=64, out_features=256, bias=True)
         (ln2): Linear(in_features=256, out_features=64, bias=True)
         (dp): Dropout(p=0.2, inplace=False)
       )
     )
   

In [None]:
# Sample from the model using 'def' as the prompt; at this point in the notebook
# train() has not been called yet.
context = torch.tensor(tokenizer.encode('def'), device=device).unsqueeze(0)  # (1, T)
print(''.join(tokenizer.decode(generate(model, context, tokenizer))))

defmY!,el<|e|>


In [None]:
# Estimated loss on both splits, displayed as a reference point before train() is called below.
estimate_loss(model, tokenizer)

{'train': tensor(4.3207), 'val': tensor(4.3121)}

In [None]:
# Train with AdamW at the configured learning rate, using train()'s default step counts.
train(model=model, tokenizer=tokenizer, optimizer=optim.AdamW(model.parameters(), lr=learning_rate))

step 0: train loss 4.3203, val loss 4.3111
step 200: train loss 2.3663, val loss 2.3693
step 400: train loss 2.1264, val loss 2.1454
step 600: train loss 1.9616, val loss 2.0230
step 800: train loss 1.8433, val loss 1.9292
step 1000: train loss 1.7577, val loss 1.8661
step 1200: train loss 1.6977, val loss 1.8196
step 1400: train loss 1.6553, val loss 1.7854
step 1600: train loss 1.6222, val loss 1.7436
step 1800: train loss 1.5936, val loss 1.7097
step 2000: train loss 1.5673, val loss 1.6876
step 2200: train loss 1.5500, val loss 1.6730
step 2400: train loss 1.5306, val loss 1.6586
step 2600: train loss 1.5164, val loss 1.6424
step 2800: train loss 1.5040, val loss 1.6307
step 3000: train loss 1.4922, val loss 1.6231
step 3200: train loss 1.4790, val loss 1.6099
step 3400: train loss 1.4738, val loss 1.5972
step 3600: train loss 1.4616, val loss 1.5931
step 3800: train loss 1.4582, val loss 1.5817
step 4000: train loss 1.4496, val loss 1.5770
step 4200: train loss 1.4431, val loss 1.

In [None]:
# Sample up to 500 new tokens after the training call above, using 'B' as the prompt.
context = torch.tensor(tokenizer.encode('B'), device=device).unsqueeze(0)  # (1, T)
print(''.join(tokenizer.decode(generate(model, context, tokenizer, max_new_tokens=500))))

BENVOLIO:
He country
Splacks the moke o'll, bewless that to the goss?

OXFORD:
Marry, that!

ANGELO:
Ay, and lady, I preve to record,
Of be a time, ply your oscounts cheave me of be
To your son woman of thy death, your make and bried by my load,
But then! Now do me harm'd the dark'd up.
Make, then I sir,
Spit food thy hand, is his wrongs,
Is take your bother tears there's fair without and as man,
on to old, by my woman, if being with it for then.

KING RICHARD II:
Do that uson the with life. He w
