In [1]:
import os
import torch
import random
import numpy as np
from datasets import load_from_disk
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer, 
)

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

DATA_PATH = "/home/ubuntu/data/dataset/wikitext_dataset"
dataset = load_from_disk(DATA_PATH)
train_data = dataset['train']
valid_data = dataset['validation']

MODEL_PATH = "/home/ubuntu/data/model/gpt2_model"
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_PATH)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    ).to(DEVICE)

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!


In [2]:
from tqdm import tqdm

def compute_standard_ppl_with_sliding_window(model, tokenizer, dataset, device="cuda"):
    """
    结合了：
    1. 滑动窗口 (Stride) -> 保证每个 token 都有足够的上文
    2. 全局聚合 (Global Aggregation) -> 符合 PPL 标准定义
    """
    model.eval()
    
    # 1. 拼接全量文本
    encodings = tokenizer("\n\n".join(dataset["text"]), return_tensors="pt")
    
    max_length = model.config.n_positions
    stride = 512  # 滑动步长，通常设为 max_length 的一半或更小
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    
    # tqdm 进度条
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # 我们这一轮实际要评估的 token 长度
        
        # 获取当前窗口的 input_ids
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100 

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            
            # outputs.loss 是平均 loss，我们需要还原成 sum loss
            # 因为最后一个 batch 的 trg_len 可能不等于 stride
            neg_log_likelihood = outputs.loss * trg_len

        nlls.append(neg_log_likelihood)
        
        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    # 3. 全局计算 PPL
    ppl = torch.exp(torch.stack(nlls).sum() / seq_len) # 总 NLL / 总长度
    return ppl.item()

model.eval()

ppl_score = compute_standard_ppl_with_sliding_window(model, tokenizer, valid_data, DEVICE)
print(f"Validation Perplexity: {ppl_score:.2f}")

Token indices sequence length is longer than the specified maximum sequence length for this model (251048 > 1024). Running this sequence through the model will result in indexing errors
  0%|          | 0/491 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
100%|█████████▉| 489/491 [00:03<00:00, 162.72it/s]

Validation Perplexity: 26.62





In [3]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

MAX_LENGTH = 1024
EPOCHS = 10
TRAIN_BATCH_SIZE = 16
GRADIENT_ACCUMULATION = 1
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.05
WARMUP_RATIO = 0.03

use_bf16 = torch.cuda.is_bf16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

def group_texts(examples):
    """将文本拼接并切块 (Packing)"""
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    
    if total_length >= MAX_LENGTH:
        total_length = (total_length // MAX_LENGTH) * MAX_LENGTH
    
    result = {
        k: [t[i : i + MAX_LENGTH] for i in range(0, total_length, MAX_LENGTH)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

def train_model(model, train_dataset, eval_dataset, tokenizer, output_dir):
    """
    模型微调函数：包含自动列清理、BF16加速、Epoch级评估
    """
    # 内部预处理函数
    def preprocess_dataset(dataset):
        # 关键：获取当前所有列名 (text, entropy 等)，以便稍后移除
        column_names = dataset.column_names

        def tokenize_function(examples):
            # 批量处理：给每个文本末尾加上 EOS
            return tokenizer(
                [t + tokenizer.eos_token for t in examples["text"]]
            )
        
        # 1. Tokenize 并移除旧列
        tokenized = dataset.map(
            tokenize_function, 
            batched=True, 
            num_proc=8,
            remove_columns=column_names # 彻底清理，防止 group_texts 报错
        )
        
        # 2. Packing
        packed = tokenized.map(
            group_texts, 
            batched=True,
            num_proc=8
        )
        return packed

    lm_train_dataset = preprocess_dataset(train_dataset)
    lm_eval_dataset = preprocess_dataset(eval_dataset)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=EPOCHS,
        
        per_device_train_batch_size=TRAIN_BATCH_SIZE, 
        gradient_accumulation_steps=GRADIENT_ACCUMULATION,
        bf16=use_bf16,
        fp16=use_fp16,
        dataloader_num_workers=8,
        optim="adamw_torch_fused",

        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        save_total_limit=1,
        
        report_to="none", 
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        lr_scheduler_type="cosine",
        warmup_ratio=WARMUP_RATIO,
        logging_steps=20,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=lm_train_dataset,
        eval_dataset=lm_eval_dataset,
    )
    
    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    return model

real_output_dir = "model_real_trained"
real_model = train_model(model, train_data, valid_data, tokenizer, real_output_dir)
real_model.eval()
ppl_score = compute_standard_ppl_with_sliding_window(real_model, tokenizer, valid_data, DEVICE)
print(f"Validation Perplexity: {ppl_score:.2f}")

Epoch,Training Loss,Validation Loss
1,3.407,3.219363
2,3.3177,3.167754
3,3.287,3.146741
4,3.2891,3.135971
5,3.2457,3.132008
6,3.2787,3.129983
7,3.2617,3.128894
8,3.2425,3.128394
9,3.2612,3.128713
10,3.2562,3.128685


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
100%|█████████▉| 489/491 [00:03<00:00, 140.86it/s]

Validation Perplexity: 21.66



