# 第五章 在无标签数据集上预训练

## 5.1 评估文本生成大模型
### 5.1.1 用GPT来生成文本

In [1]:
import torch
from previous_chapters import *
import torch.nn.functional as F
# Model hyperparameters for the 124M GPT configuration used throughout this chapter.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # vocabulary size
    context_length=256,    # shortened context length (original: 1024)
    emb_dim=768,           # embedding dimension
    n_heads=12,            # number of attention heads
    n_layers=12,           # number of layers
    drop_rate=0.1,         # dropout rate
    qkv_bias=False,        # query-key-value bias
)

# Fix the random seed before constructing the model so the run is reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval();  # evaluation mode: disable dropout during inference

In [2]:
import tiktoken

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return the ids as a tensor with a leading batch axis.

    ``<|endoftext|>`` is passed to the tokenizer as an allowed special token.
    """
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Insert dimension 0 so the result looks like a batch containing one sequence.
    return torch.unsqueeze(torch.tensor(ids), 0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids to a string after squeezing out dimension 0 (the batch axis)."""
    flat_ids = token_ids.squeeze(dim=0).tolist()
    return tokenizer.decode(flat_ids)


# Worked example: generate 10 new tokens from a short prompt and print the decoded text.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = 'every effort moves you'

prompt_ids = text_to_token_ids(start_context, tokenizer)
token_ids = generate_text_simple(
    model=model,
    idx=prompt_ids,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M['context_length'],
)

generated_text = token_ids_to_text(token_ids, tokenizer)
print("output text:\n", generated_text)


output text:
 every effort moves you rentingetic minion mobilized Macicone heterogeneity achaRAM


### 5.1.2&3 计算训练集和验证集的损失

In [3]:
# Read the whole text file into a single string.
file_path = 'the-verdict.txt'
with open(file_path, mode='r', encoding='utf-8') as corpus_file:
    text_data = corpus_file.read()

# Report the dataset size in characters and in tokens.
total_characters, total_tokens = len(text_data), len(tokenizer.encode(text_data))
print(f"Total characters: {total_characters}")
print(f"Total tokens: {total_tokens}")

Total characters: 20479
Total tokens: 5145


In [4]:
# Split by character position: the first 90% of the text is for training, the remainder is held out.
train_ratio = 0.9
split_idx = int(train_ratio * total_characters)
train_data, test_data = text_data[:split_idx], text_data[split_idx:]

### 建立数据加载器

In [5]:
torch.manual_seed(123)

# Training loader: batches of 2, window length and stride both equal to the context length,
# shuffled, and with an incomplete final batch dropped.
train_loader = create_dataloader_v1(train_data, 2, max_length=GPT_CONFIG_124M['context_length'],
                                    stride=GPT_CONFIG_124M['context_length'],
                                    shuffle=True,
                                    drop_last=True,
                                    num_workers=2)

# Evaluation loader: same windowing, but deterministic and complete.
# - shuffle=False: evaluate_model only reads the first `eval_iter` batches, so shuffling
#   would score a different random subset on every evaluation.
# - drop_last=False: the held-out split is small; dropping a partial batch could leave the
#   loader empty, which makes calc_loss_loader return NaN.
test_loader = create_dataloader_v1(test_data, 2, max_length=GPT_CONFIG_124M['context_length'],
                                   stride=GPT_CONFIG_124M['context_length'],
                                   shuffle=False,
                                   drop_last=False,
                                   num_workers=2)
                                                                        

In [6]:
# Sanity-check the data loaders by printing the input/target shapes of every batch.
for title, loader in (("Train loader:", train_loader), ("\nTest loader:", test_loader)):
    print(title)
    for x, y in loader:
        print(x.shape, y.shape)

Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Test loader:
torch.Size([2, 256]) torch.Size([2, 256])


#### 实现工具函数，计算批次的损失值

In [7]:
def calc_loss_batch(inputs, targets, model, device):
    """Return the cross-entropy loss of ``model`` on a single batch.

    Both tensors are moved to ``device`` before the forward pass. The model output has its
    first two dimensions merged and the targets are flattened, so the loss is computed over
    every position of the batch at once.
    """
    batch_inputs, batch_targets = inputs.to(device), targets.to(device)
    logits = model(batch_inputs)
    return F.cross_entropy(logits.flatten(0, 1), batch_targets.flatten())

def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average the per-batch loss over the first ``num_batches`` batches of ``data_loader``.

    Args:
        data_loader: Sized iterable yielding ``(inputs, targets)`` pairs.
        model: Model passed through to ``calc_loss_batch``.
        device: Device passed through to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate. ``None`` means every batch;
            larger values are capped at ``len(data_loader)``.

    Returns:
        The mean of the per-batch losses as a float, or ``nan`` when there is nothing to
        evaluate (empty loader, or ``num_batches`` <= 0).
    """
    available = len(data_loader)
    if num_batches is None:
        num_batches = available
    else:
        num_batches = min(num_batches, available)

    # Covers both an empty loader and a non-positive request; without this guard a request
    # for 0 batches would divide by zero below.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.0
    for i, (inputs, targets) in enumerate(data_loader):
        if i >= num_batches:
            break
        total_loss += calc_loss_batch(inputs, targets, model, device).item()
    return total_loss / num_batches


### 计算损失值

In [8]:
# Choose the compute device. The second GPU ("cuda:1") is still preferred when it exists;
# on a single-GPU machine fall back to "cuda:0" instead of failing on a missing device,
# and use the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")
model.to(device)

# Gradients are not needed for this measurement, so disable autograd tracking.
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    test_loss = calc_loss_loader(test_loader, model, device)
print('Training loss:',train_loss)
print('Test loss:',test_loss)

Training loss: 10.987583266364204
Test loss: 10.98110580444336


## 5.2 训练大语言模型

In [9]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)``, each computed over at most ``eval_iter`` batches.

    The model is switched to evaluation mode for the measurement and put back into
    training mode before returning, so the caller can continue training.
    """
    model.eval()
    # Gradient tracking is switched off while the losses are computed.
    with torch.no_grad():
        losses = tuple(
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return losses


def generate_and_print_sample(model, tokenizer, device, start_context):
    """Generate 50 new tokens from ``start_context`` and print the result on a single line.

    The model is in evaluation mode during generation and returned to training mode afterwards.
    """
    model.eval()
    # The context size is read from the first dimension of the positional-embedding weights.
    max_context = model.pos_emb.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        generated = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=max_context,
        )
    sample = token_ids_to_text(generated, tokenizer)
    # Replace newlines with spaces to keep the printed sample compact.
    print(sample.replace("\n", " "))
    model.train()

In [10]:
def train_model_simple(model, train_loader, val_loader, optimizer, device,
                       num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` for ``num_epochs`` epochs, logging losses and printing a sample per epoch.

    Every ``eval_freq`` optimizer steps (starting with step 0) the train/validation losses are
    evaluated over at most ``eval_iter`` batches, recorded, and printed. After each epoch a
    text sample generated from ``start_context`` is printed.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three lists with one entry per
        evaluation; the last holds the number of input tokens processed up to that point.
    """
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen = 0
    global_step = -1  # incremented before use, so the first step is numbered 0

    for epoch_idx in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            # Standard update: clear old gradients, backpropagate, apply the step.
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()

            tokens_seen += input_batch.numel()
            global_step += 1

            # Only evaluate on every `eval_freq`-th step.
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter
            )
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            track_tokens_seen.append(tokens_seen)
            print(f'Ep{epoch_idx+1} (Step{global_step:06d}):'
                  f'Train loss {train_loss:.3f},'
                  f' Val loss {val_loss:.3f},')

        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen


In [None]:
# Rebuild the model with a fixed seed and train it for 10 epochs with AdamW.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 10
# Evaluate every 5 steps on at most 5 batches per loader; the held-out loader serves as validation.
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, test_loader, optimizer, device,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context='Every effort moves you',
    tokenizer=tokenizer,
)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator


def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    """Plot training and validation loss against epochs, with a second x-axis for tokens seen.

    The figure is saved as ``loss-plot.pdf`` and then displayed.
    """
    fig, loss_ax = plt.subplots(figsize=(5, 3))

    # Bottom x-axis: both loss curves plotted against epochs.
    loss_ax.plot(epochs_seen, train_losses, label="Training loss")
    loss_ax.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
    loss_ax.set(xlabel="Epochs", ylabel="Loss")
    loss_ax.legend(loc="upper right")
    # Only show integer tick labels on the epoch axis.
    loss_ax.xaxis.set_major_locator(MaxNLocator(integer=True))

    # Top x-axis: shares the y-axis; a fully transparent curve is drawn only to align its ticks.
    token_ax = loss_ax.twiny()
    token_ax.plot(tokens_seen, train_losses, alpha=0)
    token_ax.set_xlabel("Tokens seen")

    fig.tight_layout()  # adjust layout to make room
    plt.savefig("loss-plot.pdf")
    plt.show()

# Spread the recorded evaluation points evenly over [0, num_epochs] for the epoch axis, then plot.
epochs_tensor = torch.linspace(start=0, end=num_epochs, steps=len(train_losses))
plot_losses(
    epochs_seen=epochs_tensor,
    tokens_seen=tokens_seen,
    train_losses=train_losses,
    val_losses=val_losses,
)