In [1]:
# Read the entire training corpus into memory as a single string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [2]:
# Vocabulary: every distinct character that occurs in the corpus, sorted.
# sorted() accepts any iterable, so the set needs no intermediate list() copy.
chars = sorted(set(text))
vocab_size = len(chars)
print(chars)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [3]:
# Character <-> integer id lookup tables built from the sorted vocabulary.
s2i = {ch: i for i, ch in enumerate(chars)}
i2s = dict(enumerate(chars))


def encode(s):
    """Map a string to the list of integer ids of its characters.

    Raises KeyError if `s` contains a character that is not in `chars`.
    """
    return [s2i[c] for c in s]


def decode(l):
    """Map an iterable of integer ids back to the string they spell.

    Raises KeyError if an id is not a key of `i2s`.
    """
    return ''.join(i2s[i] for i in l)

In [4]:
# Illustration only: shows how a sub-word tokenizer splits the same kind of
# text. `encoder` is not used by the other cells shown in this notebook.
import tiktoken
encoder = tiktoken.get_encoding('gpt2') # Byte Pair Encoding (BPE), vocabulary size 50257
# The three ids displayed below show this 9-character string becomes 3 tokens.
encoder.encode("hii there")

[71, 4178, 612]

In [5]:
import torch
# Encode the whole corpus as one 1-D tensor of int64 character ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [6]:
# Split the token stream at the 90% mark: the first part is `train`,
# the remaining tail is `test`.
n = int(0.9 * len(data))
train, test = data[:n], data[n:]

In [7]:
# Illustration only: one window of block_size tokens holds block_size
# training examples, one for every prefix length.
block_size = 8
# y is x shifted by one position, so y[i] is the token that follows x[:i+1].
x, y = train[:block_size], train[1: block_size + 1]
for i in range(block_size):
    print('context: ', x[:i + 1], ', target: ', y[i])

context:  tensor([18]) , target:  tensor(47)
context:  tensor([18, 47]) , target:  tensor(56)
context:  tensor([18, 47, 56]) , target:  tensor(57)
context:  tensor([18, 47, 56, 57]) , target:  tensor(58)
context:  tensor([18, 47, 56, 57, 58]) , target:  tensor(1)
context:  tensor([18, 47, 56, 57, 58,  1]) , target:  tensor(15)
context:  tensor([18, 47, 56, 57, 58,  1, 15]) , target:  tensor(47)
context:  tensor([18, 47, 56, 57, 58,  1, 15, 47]) , target:  tensor(58)


In [8]:
batch_size = 4  # number of sequences get_batch stacks into one batch
block_size = 8  # length, in tokens, of each sampled sequence
torch.manual_seed(1337)  # seed torch's RNG so the random sampling below is repeatable

def get_batch(isTrain):
    """Sample a random batch of input windows and their shifted targets.

    Reads the module-level `train`, `test`, `batch_size` and `block_size`
    at call time.

    Args:
        isTrain: truthy to sample from `train`, falsy to sample from `test`.

    Returns:
        A pair (x, y) of tensors, each of shape (batch_size, block_size).
        y[b, t] is the token that follows x[b, t] in the source split.
    """
    source = train if isTrain else test
    # The exclusive upper bound keeps the one-step-shifted target window
    # inside the split.
    starts = torch.randint(len(source) - block_size, (batch_size, ))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start: start + block_size])
        targets.append(source[start + 1: start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch(True)

# Print every (context, target) example packed into the sampled batch.
for batch in range(batch_size): # batch dimension
    for block in range(block_size): # time dimension
        context, target = xb[batch, :block + 1], yb[batch, block]
        print('context: ', context.tolist(), ', target: ', target)
        

context:  [24] , target:  tensor(43)
context:  [24, 43] , target:  tensor(58)
context:  [24, 43, 58] , target:  tensor(5)
context:  [24, 43, 58, 5] , target:  tensor(57)
context:  [24, 43, 58, 5, 57] , target:  tensor(1)
context:  [24, 43, 58, 5, 57, 1] , target:  tensor(46)
context:  [24, 43, 58, 5, 57, 1, 46] , target:  tensor(43)
context:  [24, 43, 58, 5, 57, 1, 46, 43] , target:  tensor(39)
context:  [44] , target:  tensor(53)
context:  [44, 53] , target:  tensor(56)
context:  [44, 53, 56] , target:  tensor(1)
context:  [44, 53, 56, 1] , target:  tensor(58)
context:  [44, 53, 56, 1, 58] , target:  tensor(46)
context:  [44, 53, 56, 1, 58, 46] , target:  tensor(39)
context:  [44, 53, 56, 1, 58, 46, 39] , target:  tensor(58)
context:  [44, 53, 56, 1, 58, 46, 39, 58] , target:  tensor(1)
context:  [52] , target:  tensor(58)
context:  [52, 58] , target:  tensor(1)
context:  [52, 58, 1] , target:  tensor(58)
context:  [52, 58, 1, 58] , target:  tensor(46)
context:  [52, 58, 1, 58, 46] , 

In [18]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model: each token looks up the logits for its successor.

    The only parameter is a (vocab_size, vocab_size) embedding table whose
    row `i` holds the unnormalised scores (logits) for the token that follows
    token `i`.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: integer token ids of shape (batch, block).
            targets: optional token ids with the same shape as `index`.

        Returns:
            A pair (logits, loss). Without targets, logits has shape
            (batch, block, vocab) and loss is None. With targets, logits is
            returned flattened to (batch * block, vocab) and loss is the
            scalar cross-entropy between those logits and the targets.
        """
        logits = self.token_embedding_table(index)  # (batch, block, vocab)
        if targets is None:
            return logits, None

        # F.cross_entropy expects the class dimension second, i.e. (N, C)
        # logits with (N,) targets, so fold batch and block together.
        # reshape (rather than view) also accepts non-contiguous tensors.
        batch, block, vocab = logits.shape
        logits = logits.reshape(batch * block, vocab)
        targets = targets.reshape(batch * block)
        return logits, F.cross_entropy(logits, targets)

    @torch.no_grad()  # sampling needs no gradients; avoid building an autograd graph
    def generate(self, index, max_new_tokens):
        """Extend each sequence in `index` by sampling `max_new_tokens` tokens.

        Args:
            index: integer token ids of shape (batch, block) used as the
                starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Tensor of shape (batch, block + max_new_tokens): the original
            context followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(index)  # (batch, block, vocab); loss is None here
            last_logits = logits[:, -1, :]  # prediction at the last position, (batch, vocab)
            probs = F.softmax(last_logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)  # (batch, 1)
            index = torch.cat((index, index_next), dim=1)  # (batch, block + 1)
        return index
    
model = BigramLanguageModel(vocab_size)
output, loss = model(xb, yb)
# Sample 100 new tokens starting from a single token with id 0 and decode them to text.
print(decode(model.generate(index = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist())) # prints gibberish because the model has not been trained yet


dM'D'qf'orlWA;cZvPzNZT!-&Bdvq3TMqFL'ptgdSOMtekNRpygSv
hvumYJ'p.YJqf-
DgwwPf!TW,izClsWVZ
&O?kaBhtTh&;


In [19]:
# AdamW over all of the model's parameters with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [23]:
# get_batch reads the global batch_size at call time, so this enlarges the
# batches used from here on.
batch_size = 32
for steps in range(10000):
    # Clear the gradients left over from the previous step.
    optimizer.zero_grad(set_to_none=True)
    xb, yb = get_batch(True)
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()
# Loss of the final mini-batch only.
print(loss.item())

2.379244565963745


In [27]:
# Sample 400 new tokens starting from a single token with id 0 and decode them to text.
print(decode(model.generate(index = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=400)[0].tolist())) # looks slightly better than the untrained output


Fook'tharund arer, t th rou, henthueaff thifl ce, intid imelfe cor sesofoch ce
CENI hawaceleoolt.
YORT: wllithamondslo's?
I mavear tld'ld th s INIARCion heere s, bu o ave g:
Yitaifurta seathinthe or' peenourd thaith t.
MENou h st winreindowis h bos yseseco he no ounngouchinelayo s the,
D isthet.
Tid,

Fore, per tof llsishavimes so'she:
QUS:
G Eveveouesth Pr titre ad rikndothincin odowavesumr-d by,
