### 大致观察文本

In [1]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [2]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115393


In [3]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



### 文本由多少不同的字符构成
因为要构建以字符为单位的分词

In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


### 编码整个text

In [5]:
# create a mapping from character to integers
stoi = { ch:i for i,ch in enumerate(chars) }  # 0: '\n', 1: ' ', ...
itos = { i:ch for i,ch in enumerate(chars) }  
encode = lambda s: [stoi[c] for c in s]  # take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l])  # take a list of integers, output a string

print(encode('hii there'))
print(decode(encode('hii there')))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [6]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115393]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

### 划分训练集和测试集

In [7]:
# Let's now split up the data into train and validation sets
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

### 划分训练时的chunk
肯定不能一次把整个text都丢到模型里，要分割成小的chunk来训练

In [8]:
block_size = 8  # 虽然有9个字符，但是只有8个样例
train_data[:block_size+1]  # 输入18，目标预测47；输入18,47，目标预测56...

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context}, the target is {target}")

when input is tensor([18]), the target is 47
when input is tensor([18, 47]), the target is 56
when input is tensor([18, 47, 56]), the target is 57
when input is tensor([18, 47, 56, 57]), the target is 58
when input is tensor([18, 47, 56, 57, 58]), the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]), the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58


### 将数据集整理成batch

In [10]:
torch.manual_seed(1337)
batch_size = 4  
block_size = 8  # 用来预测的最大context

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print("inputs:")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)

print("-----------")

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context.tolist()}, the target is {target}")

inputs:
torch.Size([4, 8])
tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])
targets:
torch.Size([4, 8])
tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])
-----------
when input is [53], the target is 59
when input is [53, 59], the target is 6
when input is [53, 59, 6], the target is 1
when input is [53, 59, 6, 1], the target is 58
when input is [53, 59, 6, 1, 58], the target is 56
when input is [53, 59, 6, 1, 58, 56], the target is 47
when input is [53, 59, 6, 1, 58, 56, 47], the target is 40
when input is [53, 59, 6, 1, 58, 56, 47, 40], the target is 59
when input is [49], the target is 43
when input is [49, 43], the target is 43
when input is [49, 43, 43], the target is 54
when input is [49, 43, 43, 54], the target is 1
when input is [49, 43, 43, 54

In [11]:
print(xb)  # our input to the transformer

tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])


### 模型

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)  # (B, T) --> (B, T, C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)  # 展平，(4,8,65) --> (32,65)即每一个字符的下一个字符的概率分布
            targets = targets.view(B*T)   # 展平，(4, 8) --> (32)即真实的下一个字符的索引
            loss = F.cross_entropy(logits, targets)
        return logits, loss
        
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C) 
            # sample from the distribution
            id_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat([idx, id_next], dim=1)  # (B, T+1)
        return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8948, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [13]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
batch_size = 32
for step in range(10000):
    # Sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    print(loss.item())

4.764118671417236
4.738727569580078
4.7658371925354
4.807843208312988
4.7020087242126465
4.76759147644043
4.6634721755981445
4.77570915222168
4.658776760101318
4.74403715133667
4.7658867835998535
4.6947174072265625
4.80064582824707
4.606237888336182
4.7218828201293945
4.690646648406982
4.623412609100342
4.695773124694824
4.813107013702393
4.552684783935547
4.622650623321533
4.685900688171387
4.725422382354736
4.685242652893066
4.626786231994629
4.619174480438232
4.633866786956787
4.7050042152404785
4.705442428588867
4.700276851654053
4.671611309051514
4.742623329162598
4.756971836090088
4.732963562011719
4.724040508270264
4.707589626312256
4.6813507080078125
4.668712139129639
4.49385929107666
4.706875801086426
4.740492343902588
4.691798210144043
4.711052894592285
4.701241493225098
4.592638969421387
4.659737586975098
4.700632572174072
4.653691291809082
4.629376411437988
4.632992744445801
4.640700817108154
4.726120948791504
4.582829475402832
4.607274055480957
4.649630069732666
4.67586135

In [15]:
print(decode(m.generate(idx, max_new_tokens=300)[0].tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henouratucenonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprrouthercc.
hathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h haybet--s n prids, r loncave w hollular s O:
HIs; ht 


### 模拟self-attention

In [55]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [56]:
# 写法1：
# We want x[b, t] = mean_{i<=t} x[b, i]
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]  # (t, C)
        xbow[b, t] = torch.mean(xprev, dim=0) 

In [57]:
# 写法2：
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = wei @ x
torch.allclose(xbow, xbow2)

False

In [58]:
# 写法3：softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow2, xbow3)

True

In [59]:
# 写法4：self-attention
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# 单头self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)
q = query(x)
v = value(x)

wei = q @ k.transpose(-2, -1)
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
out = wei @ v

out.shape

torch.Size([4, 8, 32])

In [19]:
torch.manual_seed(1337)
a = torch.ones(3, 3)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print('a=')
print(a)
print('------')
print('b=')
print(b)
print('------')
print('c=')
print(c)

a=
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
------
b=
tensor([[5., 7.],
        [2., 0.],
        [5., 3.]])
------
c=
tensor([[12., 10.],
        [12., 10.],
        [12., 10.]])
