# 1 Load Data

In [None]:
import torch
from rich.jupyter import print
X = torch.load('data/X.tensor')
X.shape

# 2 Load Model

In [41]:
n_layer = 12
n_head = 12
n_embd = 768
dropout = 0.1
bias = False

model_args =  dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=1024,
                  bias=bias, vocab_size=64, dropout=dropout)

from model import GPTConfig, GPT
gptconf = GPTConfig(**model_args)
model = GPT(gptconf).to('cpu')
print("预测词表大小", model.config.vocab_size) # BPE词表

number of parameters: 85.00M


# 3 Greedy Generation

In [36]:
# 推2个token
from torch .nn import functional as F
idx =  X[1,:10].reshape(1,10)
print("预测词表大小", model.config.vocab_size) # BPE词表
print("输入X:", idx)
print("输入X长度:", idx.shape)
for _ in range(2):
    print(f'----------{_+1} token----------')
    logits, _ = model(idx)
    print('输出Logits:', logits.shape)
    probs = F.softmax(logits, dim=-1)
    print('输出Probs:', probs.shape)
    idx_next = torch.argmax(probs, dim=-1)
    print('输出下一个Token:', idx_next.shape)
    print('token:', idx_next)
    
    idx = torch.cat((idx, idx_next), dim=-1)
    print("当前的token长度:", len(idx[0]))
    print("当前的token序列:", idx)
    
    

In [38]:
torch.set_printoptions(precision=4)
torch.set_printoptions(sci_mode=False)
torch.set_printoptions(linewidth=100)
'''
torch.set_printoptions 是 PyTorch 中用来设置张量打印选项的函数。每个参数的作用如下：

precision: 设置浮点数的打印精度。例如，precision=4 表示打印浮点数时保留四位小数。

sci_mode: 控制科学计数法的使用。当设置为 False 时，张量将以普通数字格式打印，而不是科学计数法。默认情况下，如果数字太大或太小，PyTorch 会自动使用科学计数法。

linewidth: 设置每行的最大字符数。超过这个长度的行将会被换行。设置为 linewidth=100 表示每行最多打印 100 个字符。

这些选项有助于提高打印输出的可读性，尤其在处理较大的张量时。
'''

'\ntorch.set_printoptions 是 PyTorch 中用来设置张量打印选项的函数。每个参数的作用如下：\n\nprecision: 设置浮点数的打印精度。例如，precision=4 表示打印浮点数时保留四位小数。\n\nsci_mode: 控制科学计数法的使用。当设置为 False 时，张量将以普通数字格式打印，而不是科学计数法。默认情况下，如果数字太大或太小，PyTorch 会自动使用科学计数法。\n\nlinewidth: 设置每行的最大字符数。超过这个长度的行将会被换行。设置为 linewidth=100 表示每行最多打印 100 个字符。\n\n这些选项有助于提高打印输出的可读性，尤其在处理较大的张量时。\n'

## 3.1 Top-K GPT-2

In [81]:
temperature = 1.0
top_k = 5
print(model.config.vocab_size) # BPE词表
print("input_ids", idx)

In [85]:
from torch.nn import functional as F

idx = X[1,:10].reshape(1, 10)
print(idx.shape)
for _ in range(10):
    idx_cond = idx if idx.size(1) <= model.config.block_size else idx[:,-model.config.block_size]
    
    logits, _ = model(idx_cond)
    print('logits:',logits.shape)
    
    print("no - 1", logits[:, -1, :].shape)  
    logits = logits[:, -1, :] / temperature
    
    if top_k:
        # 找到top k更高的概率
        v, i = torch.topk(logits, min(top_k, logits.size(1)))
        print('top_k v:', v, v[:, -1])
        print('i', i)
        
        logits[logits < v[:, -1]] = -float('Inf')
        print('top_k logits', logits.shape)
        print('logits', logits)
    
    # softmax
    probs = F.softmax(logits, dim=-1)
    print('probs', probs.shape)
    print(probs)
    
    #sample
    idx_next = torch.multinomial(probs, num_samples=1)
    print('idx_next', idx_next)
    
    idx = torch.cat((idx, idx_next), dim=-1)
    print('new_idx', idx.shape)
    
    print("generate: length :", len(idx[0]))
    print("generate:", idx)

In [90]:
idx = X[1,:10].reshape(1, 10)
model.generate(idx, 10, 1.0, 5) # temparature 1.0 ,  Top_K None 

tensor([[ 0, 13, 52, 42,  1, 26, 43, 56, 53,  1, 59, 36,  8,  0,  0,  7,  7, 25,  1, 43]])

## 3.2 Sample

In [116]:
dict_map = {0:'🐯', 1:'🐷', 2:'🐱'}
probs = torch.tensor([0.7,0.2,0.1])
print('probs', probs)

print('根据概率选next token')
for _ in range(10):
    sample = torch.multinomial(probs, num_samples=1)[0]
    print(f'第{_+1}次抽样结果为{dict_map[int(sample)]}')

## 3.3 Sample with temperature

In [123]:
# temp > 1，概率分布更平滑，temp < 1，概率分布更陡峭
temperature = 0.1
dict_map = {0:'🐯', 1:'🐷', 2:'🐱'}
probs = torch.tensor([0.7,0.2,0.1])
print('probs', probs)
probs = probs / temperature
print('probs with temperature', probs)

print('根据概率选next token')
for _ in range(10):
    sample = torch.multinomial(probs, num_samples=1)[0]
    print(f'第{_+1}次抽样结果为{dict_map[int(sample)]}')

## 3.4  Top K

In [133]:
T = 0.1
top_k = 2
dict_map = {0:'🐯', 1:'🐷', 2:'🐱'}
probs = torch.tensor([0.7,0.2,0.1])
probs /= T
print('probs/T', probs)
probs = F.softmax(probs, dim=-1)
print('softmax_probs', probs)
probs, _ = torch.topk(probs, top_k)
print('topk_probs', probs)
probs = F.softmax(probs, dim=-1)
print('topk_softmax_probs', probs)

print('根据概率选next token')
for _ in range(10):
    sample = torch.multinomial(probs, num_samples=1)
    print(f'第{_+1}次抽样结果为{dict_map[int(sample)]}')

In [145]:
idx = X[1,:10].reshape(1, 10) 
print(idx)
print(idx.shape)

## 3.5 repetition penalty

In [158]:
penalty = 2.0
for _ in range(256):
    logits, _ = model(idx)
    print(logits.shape)
    print(logits)
    logits = logits[:, -1, :]
    original_logits = logits.clone()
    
    print(logits.shape)
    logits_idx = torch.gather(logits, 1, idx)
    print(logits_idx)
    logits_idx = torch.where(logits_idx < 0, logits_idx * penalty, logits_idx / penalty).clone()
    print(logits_idx)
    logits = logits.scatter_(1, idx, logits_idx)
    print(logits)

    probs = F.softmax(logits, dim=-1)
    print(probs)
    idx_next = torch.multinomial(probs, num_samples=1)
    idx = torch.cat((idx, idx_next), dim=-1)

    break

original_logits - logits

tensor([[0.1487, 0.4662, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0318, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.1401, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0551, 0.2451,
         0.0000, 0.0000, 0.4048, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.3478, 0.0543, 0.0000,
         0.0000, 0.5049, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
       grad_fn=<SubBackward0>)

# 4. Huggingface Transformer Generate

In [159]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

torch_device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("gpt2")

model_GPT = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id).to(torch_device)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [160]:
print(tokenizer)

In [173]:
text = "pneumonoultramicroscopicsilicovolcanoconiosis "

inputs = tokenizer(text, return_tensors='pt').get('input_ids')
print(inputs)
print(inputs[0,0])
print(tokenizer.decode(inputs[0,0]))
print(tokenizer.decode(inputs[0]))

In [175]:
inputs = tokenizer("hello world", return_tensors='pt')
inputs1 = tokenizer(" hello world", return_tensors='pt')
print(inputs)
print(inputs1)

## 4.1 Generate Beam search (num_beams, top_k, top_p, temperature, repetition_penalty)
### 4.1.1 activate beam search and early_stopping

In [199]:
model_inputs = tokenizer('I enjoy walking with my cute dog', return_tensors='pt').to(torch_device)

beam_output = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    num_beams=5,
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0]))
help(model.generate)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Help on method generate in module model:

generate(idx, max_new_tokens, temperature=1.0, top_k=None) method of model.GPT instance
    Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
    the sequence max_new_tokens times, feeding the predictions back into the model each time.
    Most likely you'll want to make sure to be in model.eval() mode of operation for this.



### 4.1.2 set no_repeat_ngram_size to 2

In [198]:
## Generate Beam Searching
# set no_repeat_ngram_size to 2
beam_output = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


### 4.1.3 set return_num_sequences > 1

In [206]:

beam_outputs = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=5,
    early_stopping=True
)

# now we have 3 output sequences
print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(beam_outputs):
    print(f'{i}:{tokenizer.decode(beam_output, skip_special_tokens=True)}')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


### 4.1.4 sampling

In [215]:
from transformers import set_seed

set_seed(13)
sample_output = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=0#
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


### 4.1.5 sampling temparature

In [216]:
sample_output = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=0,
    temperature=0.6,
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


### 4.1.6 top-k sampling

In [217]:
set_seed(42)

# set top_k to 50
sample_output = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=50
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


### 4.1.7 Top-p (nucleus) sampling

In [218]:
set_seed(42)

# set top_k to 50
sample_output = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_p=0.92,
    top_k=50
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [221]:
sample_outputs = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3,
    repetition_penalty=1.2
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# 5 Decoder loss

In [222]:
import torch
import math

In [223]:
batch_size = 1  # 一句话

x = torch.randn(batch_size, 4, 512) #input :  batch_size, length, embd_dim
y = torch.randint(low=0, high=32000, size=(batch_size, 4), dtype=torch.long)

print(x.shape)
print(y.shape)
print(y)

In [224]:
q = torch.randn(512, 512)  
k = torch.randn(512, 512)
v = torch.randn(512, 512)
o = torch.randn(512, 512)

mask=torch.tril(torch.ones(1, 4, 4))
print(mask)

In [225]:
# scaled dot produc attention 
Q, K, V = x@q, x@k, x@v
scores = Q@K.transpose(1,2) / math.sqrt(512)
scores = scores.masked_fill(mask==0, float(-10000.0))
weights = F.softmax(scores, dim=-1)
attn = weights @ V
attn = attn @ o
attn.shape

torch.Size([1, 4, 512])

In [230]:
# mlp
mlp_up = torch.randn(512, 1024)
mlp_down = torch.randn(1024, 512)

mlp = attn @ mlp_up @ mlp_down
mlp.shape

torch.Size([1, 4, 512])

In [233]:
lm_head = torch.randn(512, 32000)
logits = mlp @ lm_head
logits.shape

torch.Size([1, 4, 32000])

In [235]:
probs = torch.softmax(logits, dim=2)

print(probs.shape)

In [256]:
# 计算交叉熵损失
print(probs.shape)
print(y.shape)
print('prob:',probs.view(-1, probs.size(-1)).shape)
print('y:',y.view(-1).shape)
loss = F.cross_entropy(probs.view(-1, probs.size(-1)), y.view(-1))
print('loss:', loss)