In [154]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm

# 查看数据集

In [155]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [156]:
print("num of chars", len(text))

num of chars 1115394


In [157]:
# First 1000 chars
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


# 构造Vocab

In [158]:
# All unique chars that occur
# We'all use char as token
chars = sorted(list(set(text)))
print(''.join(chars))
print(len(chars))
vocab_size = len(chars)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [159]:
# Simple tokenizer example (just using index) (More complex, see OpenAI tiktoken)
# Write the encode and decode function
stoi = {character:index for index, character in enumerate(chars)}
itos = {index:character for index, character in enumerate(chars)}
encode = lambda chars: [stoi[x] for x in chars]
decode = lambda ints: [itos[x] for x in ints]

test_str = "From either sides the river lies"
test_str_code = encode(test_str)
test_str_decode = decode(test_str_code)
print(test_str_code)
print(test_str_decode)

[18, 56, 53, 51, 1, 43, 47, 58, 46, 43, 56, 1, 57, 47, 42, 43, 57, 1, 58, 46, 43, 1, 56, 47, 60, 43, 56, 1, 50, 47, 43, 57]
['F', 'r', 'o', 'm', ' ', 'e', 'i', 't', 'h', 'e', 'r', ' ', 's', 'i', 'd', 'e', 's', ' ', 't', 'h', 'e', ' ', 'r', 'i', 'v', 'e', 'r', ' ', 'l', 'i', 'e', 's']


# 训练和测试集、批量抽取

In [160]:
# Encode the entire dataset
# Split the train and validation set of the dataset
# Transfer to tensor
data = torch.tensor(encode(text), dtype=torch.long)
split_point = int(0.9 * len(data))
train_data = data[:split_point]
test_data = data[split_point:]

In [161]:
# Write the get_batch function
# Draw chunks of data and understand how to use a chunk of data consider varying length

batch_size = 8
block_size = 32
device = "cuda" if torch.cuda.is_available() else "cpu"

def get_batch(data):
    indices = torch.randint(len(data)-block_size, (batch_size, ))
    x = [data[start:start+block_size] for start in indices]
    y = [data[start+1:start+block_size+1] for start in indices]
    x, y = torch.stack(x), torch.stack(y)
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch(train_data)
print(x)

tensor([[39, 56, 43,  1, 49, 52, 53, 61, 52,  1, 61, 43, 50, 50,  1, 43, 52, 53,
         59, 45, 46,  1, 58, 53, 53,  8,  0,  0, 25, 17, 26, 17],
        [43, 42, 57,  6,  1, 61, 43,  1, 51, 59, 57, 58,  1, 39, 50, 57, 53,  1,
         58, 43, 50, 50,  1, 46, 47, 51,  0, 53, 59, 56,  1, 52],
        [57,  1, 56, 43, 39, 57, 53, 52,  1, 61, 47, 58, 46,  1, 46, 47, 51,  8,
          0,  0, 15, 24, 13, 30, 17, 26, 15, 17, 10,  0, 35, 46],
        [ 8,  0,  0, 26, 59, 56, 57, 43, 10,  0, 13, 52,  1, 46, 53, 52, 53, 59,
         56,  2,  1, 61, 43, 56, 43,  1, 52, 53, 58,  1, 21,  1],
        [57, 11,  1, 58, 46, 43,  1, 50, 39, 42, 63,  0, 61, 47, 42, 53, 61,  1,
         53, 44,  1, 34, 47, 58, 56, 39, 60, 47, 53, 11,  1, 31],
        [ 1, 53, 44,  1, 42, 47, 50, 42, 53, 57,  1, 39, 52, 42,  1, 44, 39, 42,
         47, 52, 45, 57,  6,  1,  5, 48, 59, 51, 54,  1, 46, 43],
        [43, 56, 43,  1, 21,  1, 51, 53, 60, 43,  6,  0, 35, 46, 39, 58,  1, 51,
         63,  1, 58, 53, 52, 45, 59, 

In [162]:
# The way how a chunk of data is used: Enumerate all possible prediction context length
x_example = train_data[:block_size]
y_example = train_data[1:block_size+1]

for t in range(block_size):
    x = x_example[:t+1]
    y = y_example[t]
    print("----------------------")
    print("Training input is ", x)
    print("Target is ", y)

----------------------
Training input is  tensor([18])
Target is  tensor(47)
----------------------
Training input is  tensor([18, 47])
Target is  tensor(56)
----------------------
Training input is  tensor([18, 47, 56])
Target is  tensor(57)
----------------------
Training input is  tensor([18, 47, 56, 57])
Target is  tensor(58)
----------------------
Training input is  tensor([18, 47, 56, 57, 58])
Target is  tensor(1)
----------------------
Training input is  tensor([18, 47, 56, 57, 58,  1])
Target is  tensor(15)
----------------------
Training input is  tensor([18, 47, 56, 57, 58,  1, 15])
Target is  tensor(47)
----------------------
Training input is  tensor([18, 47, 56, 57, 58,  1, 15, 47])
Target is  tensor(58)
----------------------
Training input is  tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])
Target is  tensor(47)
----------------------
Training input is  tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])
Target is  tensor(64)
----------------------
Training input is  tensor([1

# 简单BigramLanguageModel

In [163]:
# Test the data using simple language model BigramLanguageModel
# Embedding in Bigram is just a nxn matrix represents the transition probability from token to token
"""
BigramLanguageModel
-------------------
Forward:
Input: x, tensor of training data with shape (B, T).
Input: target, tensor of labeling data with shape (B, T). Default target = None
Output: logits, rows of the probability of each token in the data x, shape (B, T, C), C is the length of the vocab.
Output: loss, cross entropy of logits and targets. Note that input logits have to be reshaped to use crossEntropyLoss.

Generate: write a generic version that considers the history
Input: x, tensor (B, T)
Input: max_new_tokens
Output: x' after expanding
"""

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, x, target=None):
        logits = self.token_embedding_table(x)
        
        if target is None:
            loss = 0
        else:
            B, T, C = logits.shape # C represents channels
            logits_reshaped = logits.view(B*T, C)
            target = target.view(-1)
            loss = F.cross_entropy(logits_reshaped, target)

        return logits, loss

    def generate(self, x, predict_len):
        for step in range(predict_len):
            logits, _ = self(x)
            logits = logits[:, -1, :] # Draw the logits of last time step. Now the shape is (B, C)
            probs = F.softmax(logits, dim=1)
            char_new = torch.multinomial(probs, 1)
            x = torch.cat([x, char_new], dim=1)
        return x

model = BigramLanguageModel(len(chars))
model = model.to(device)
x, y = get_batch(train_data)
logits, loss = model(x, y)
print(logits.shape)
print(loss)

# Use .tolist() method to transform the tensor
print(''.join(decode(x[0].tolist())))
x_predicted = model.generate(x, 8)
print(''.join(decode(x_predicted[0].tolist())))

torch.Size([8, 32, 65])
tensor(4.6995, grad_fn=<NllLossBackward0>)
un to strike,
We'll never leave 
un to strike,
We'll never leave jS!wxEg.


In [164]:
# Train the Bigram model

optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

epochs = 1000
for _ in tqdm(range(epochs)):
    x, y = get_batch(train_data)
    _, loss = model(x, y)
    optimizer.zero_grad(set_to_none=True) # Save memory, and faster
    loss.backward()
    optimizer.step()
        
print(loss)

100%|██████████| 1000/1000 [00:00<00:00, 1368.96it/s]

tensor(3.6132, grad_fn=<NllLossBackward0>)





In [165]:
x, y = get_batch(train_data)

# Use .tolist() method to transform the tensor
print(''.join(decode(x[0].tolist())))
x_predicted = model.generate(x, 100)
print(''.join(decode(x_predicted[0].tolist())))

nd ply his book, welcome his fri
nd ply his book, welcome his friKg:p,lFrfjoU?CLJw$SBG,DYet baGrBenml!xgucQig wjZ:VT3va3;kCru-WjZh,?r3;gPBH;E'r
yiOGUMc'DMqyoKXIioowy


# 计算attention的向量化方法

In [166]:
# For a (B, T, C) tensor
# By averaging over the prefix tokens to summarize the past info and ignore the future info (just like autoregressive)
# Ver1: Double loops version
# Track the dim can help coding

torch.manual_seed(42)

B = 4
T = 5
C = 3
x = torch.randn((B, T, C))

print(x[0])

x_loop = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t, C)
        x_loop[b][t] = torch.mean(xprev, 0)

print(x_loop[0])

tensor([[ 1.9269,  1.4873,  0.9007],
        [-2.1055,  0.6784, -1.2345],
        [-0.0431, -1.6047, -0.7521],
        [ 1.6487, -0.3925, -1.4036],
        [-0.7279, -0.5594, -0.7688]])
tensor([[ 1.9269,  1.4873,  0.9007],
        [-0.0893,  1.0829, -0.1669],
        [-0.0739,  0.1870, -0.3620],
        [ 0.3568,  0.0421, -0.6224],
        [ 0.1398, -0.0782, -0.6517]])


In [167]:
# Ver2: Matrix mul version. Efficient, using lower triangle torch.tril(torch.ones(3, 3)) and normalize it
# Use torch.allclose() to check these two

wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
print(wei)
x_MM = wei @ x
print(x_MM[0])

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]])
tensor([[ 1.9269,  1.4873,  0.9007],
        [-0.0893,  1.0829, -0.1669],
        [-0.0739,  0.1870, -0.3620],
        [ 0.3568,  0.0421, -0.6224],
        [ 0.1398, -0.0782, -0.6517]])


In [168]:
# Ver3: Set -inf (it is a mask) and then softmax to get the normalized lower triangular
# Why use this? A perspective of affinities. Weight matrix can represent how each token is interesting to each other (In this example it's just 0)
# Use [masked_fill] method
wei = torch.zeros((T, T))
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, value=-float('inf'))
wei = F.softmax(wei, dim=-1)
print(wei)
x_sm = wei @ x
print(x_sm[0])

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]])
tensor([[ 1.9269,  1.4873,  0.9007],
        [-0.0893,  1.0829, -0.1669],
        [-0.0739,  0.1870, -0.3620],
        [ 0.3568,  0.0421, -0.6224],
        [ 0.1398, -0.0782, -0.6517]])


In [169]:
# Ver4: Self-attention
# query * key to get the weights that represents the affinities(relationship) from token to token
# When training, the model will automatically learn the affinities from token to token.
# A simple one head attention.

"""
Important notes about atttention:
1. a communication machanism that can be describe by direct graph with block_size node (the edge represents the affinities.
    not necessarily to be a lower triangle. Encoder part needs tokens to talk to each other.
2. no notion of space. positional code is needed.
3. Raw value x is like private data, and V is like public data for aggregation
4. Self-attention: QKV from same source x. Cross-attention: outer source QK.
"""
torch.manual_seed(42)
head_size = 5
Q = nn.Linear(C, head_size)
K = nn.Linear(C, head_size)
V = nn.Linear(C, head_size)

q = Q(x) # (B, T, head_size)
k = K(x)

wei = q @ k.transpose(-2, -1) # (B, T, T)
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, value=-float('inf')) * head_size**(-0.5)
wei = F.softmax(wei, dim=-1)
print(wei[0])

v = V(x)
print(v[0])
x_att = wei @ v # A good way of thinking this mat mul is row perspective
print(x_att[0])

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1491, 0.8509, 0.0000, 0.0000, 0.0000],
        [0.6266, 0.0590, 0.3144, 0.0000, 0.0000],
        [0.2801, 0.0278, 0.2398, 0.4523, 0.0000],
        [0.2760, 0.1401, 0.2008, 0.2031, 0.1801]], grad_fn=<SelectBackward0>)
tensor([[ 0.5241,  1.4449,  1.0499, -1.2772, -0.4092],
        [-0.4937, -0.3647, -1.0037,  1.0804,  0.6383],
        [ 0.3009, -0.1853, -0.4107,  1.5519, -1.2764],
        [ 0.2153,  1.2557,  0.0874,  0.4091, -1.5597],
        [ 0.0270, -0.1853, -0.5358,  1.1992, -0.4808]],
       grad_fn=<SelectBackward0>)
tensor([[ 0.5241,  1.4449,  1.0499, -1.2772, -0.4092],
        [-0.3419, -0.0948, -0.6974,  0.7288,  0.4821],
        [ 0.3938,  0.8256,  0.4695, -0.2486, -0.6200],
        [ 0.3026,  0.9182,  0.2073,  0.2294, -1.1084],
        [ 0.1845,  0.5321, -0.0121,  0.4095, -0.6831]],
       grad_fn=<SelectBackward0>)


In [170]:
# Why scaling using query and key dimension matters? 
# We wants the attention to combine more info. If not scaling, it will tend to focus on the largest one.
# Essentially, by scalling, the final weight matrix will have low variance.
example = torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])
print(torch.softmax(example, dim=-1))
print(torch.softmax(example * 8, dim=-1)) # Not what we want

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])


# Attention组件

In [171]:
# Implement a self-attention Head: qkv layer, tril register_buffer, dropout_layer
"""
Input: (B, T, embed_size) data, head_size
Output: (B, T, head_size)
"""

embed_size = 8
dropout = 0.1

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.query = nn.Linear(embed_size, head_size)
        self.key = nn.Linear(embed_size, head_size)
        self.value = nn.Linear(embed_size, head_size)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        q = self.query(x)
        k = self.key(x)
        wei = q @ k.transpose(-1, -2) * head_size**(-0.5)
        print(wei.shape)
        print(tril.shape)
        wei = wei.masked_fill(tril == 0, value=-float('inf'))
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        
        v = self.value(x)
        b = wei @ v # (B, T, head_size)
        return b

x = torch.randn((B, T, embed_size))
att_head = Head(head_size=8)
b = att_head(x)
print(b.shape)

torch.Size([4, 5, 5])
torch.Size([5, 5])
torch.Size([4, 5, 8])


In [172]:
# Implement a multi-head attention with projection (Linear trans)
# Projection layer here is to map the size back to embeding_size, which is then compatible for residual connetion
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(embed_size, embed_size)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        b = torch.cat([h(x) for h in self.heads], dim=-1)
        b = self.proj(b)
        b = self.dropout(b)
        return b

x = torch.randn((B, T, embed_size))
multi_att_head = MultiHeadAttention(num_heads=4, head_size=embed_size//4)
b = multi_att_head(x)
print(b.shape)

torch.Size([4, 5, 5])
torch.Size([5, 5])
torch.Size([4, 5, 5])
torch.Size([5, 5])
torch.Size([4, 5, 5])
torch.Size([5, 5])
torch.Size([4, 5, 5])
torch.Size([5, 5])
torch.Size([4, 5, 8])


In [173]:
"""
Implement the feed forward networks after attention layer
"""

class FeedForward(nn.Module):
    def __init__(self, embed_size):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(embed_size, 4 * embed_size),
            nn.ReLU(),
            nn.Linear(4 * embed_size, embed_size),
            nn.Dropout(dropout),
        )
    
    def forward(self, x):
        return self.fc(x)

In [174]:
"""
Implement attention block class with residual connection
- Note that we choose num_heads as hyper parameter instead of choosing head_size. The head_size is computed from embed_size//num_heads
? Why choose layer norm? And how to use LayerNorm of 1d and 2d.
"""

class Block(nn.Module):
    def __init__(self, embed_size, num_heads):
        super().__init__()
        head_size = embed_size // num_heads       
        self.att = MultiHeadAttention(num_heads, head_size)
        self.fc = FeedForward(embed_size)
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)
    
    def forward(self, x):
        x = x + self.ln1(self.att(x))
        x = x + self.ln2(self.fc(x))
        return x

x = torch.randn((B, T, embed_size))
block = Block(embed_size, num_heads=4)
print(block(x).shape)

torch.Size([4, 5, 5])
torch.Size([5, 5])
torch.Size([4, 5, 5])
torch.Size([5, 5])
torch.Size([4, 5, 5])
torch.Size([5, 5])
torch.Size([4, 5, 5])
torch.Size([5, 5])
torch.Size([4, 5, 8])


# Attention用于语言模型

In [175]:
"""
Modification on BigramModel -- 3 Linear layers
1. token_embedding layer (vocab_size, embed_size)
2. position_embedding layer (block_size, embed_size)
3. language model head used to compute logits (embed_size, block_size)
4. Pluggin the attention block (use multi-head directly)
5. Pluggin the last feedforward layer

Make sure you understand the embedding here, which is no longer the same as original BigrameModel (which represents the transition probability)
    nn.Embedding creates a lookup table that converts indices (usually token IDs) into dense vectors of fixed size. 
    It's commonly used as the first layer in NLP tasks to convert tokens to continuous representations.
So this model is actually not a BigramModel anymore.

Just to remind1: loss is computed through cross_entropy. before using F.cross_entropy, you only need to reshape the logits and target.
Just to remind2: use F.softmax to convert the logits to the probability and then use torch.multinomial to predict the next token.

? Why token_embed and pos_embed use nn.Embedding instead of nn.Linear
? Understand the embedding in NLP tasks, why it is necessary.
"""

num_layers = 6
num_heads = 4

class BigramModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_size)
        self.pos_embed = nn.Embedding(block_size, embed_size)
        self.blocks = nn.Sequential(*[Block(embed_size, num_heads) for _ in range(num_layers)])
        self.ln_f = nn.LayerNorm(embed_size)
        self.lm_head = nn.Linear(embed_size, vocab_size)

    def forward(self, x, target=None):
        B, T = x.shape

        tok_emb = self.token_embed(x) # (B, T, C)
        pos_emb = self.pos_embed(torch.arange(T, device=device)) # (T, C)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x) # (B, T, C)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if target is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits_temp = logits.reshape(B*T, C)
            target_temp = target.reshape(B*T)
            loss = F.cross_entropy(logits_temp, target_temp)

        return logits, loss


    def generate(self, x, num_predict):
        for t in range(num_predict):
            x_cond = x[:,-block_size:] # (B, T)
            logits, _ = self(x_cond) # (B, T, vocab_size)
            logits = logits[:,-1,:] # (B, vocab_size)
            probs = F.softmax(logits, dim=-1) # (B, vocab_size)
            x_next = torch.multinomial(probs, 1) # (B, 1)
            x = torch.cat([x, x_next], dim=1) # (B, T+1)
        return x

# 最后测试

In [176]:
"""
Func for train and eval the model.
Use random batch from train and eval dataset to evaluate the mean losses.
Output: list, contains two scalars represent the 'train' and 'eval' losses.
"""

eval_iters = 200

@torch.no_grad()
def estimate_loss():
    out = []
    model.eval()
    for split in [train_data, test_data]: # This is not efficient!! Thats why we would prefer using string to choose the dataset
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss
        out.append(losses.mean())     
    return out

In [177]:
batch_size = 16
block_size = 32
max_iters = 5000
eval_iters = 200
lr = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
embed_size = 64
num_heads = 4
num_layers = 4
dropout = 0.0

torch.manual_seed(1337)

model = BigramModel()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

for it in range(max_iters):
    if it % eval_iters == 0 or it == max_iters - 1:
        losses = estimate_loss()
        print(f'Step{it}, Train loss: {losses[0]}, Val loss: {losses[1]}')

    X, Y = get_batch(train_data)
    logits, loss = model(X, Y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

torch.Size([16, 32, 32])
torch.Size([5, 5])


RuntimeError: The size of tensor a (5) must match the size of tensor b (32) at non-singleton dimension 2