## 1. Data Preparation


In [None]:
import torch
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [None]:
# Step 1: read the uploaded text file into memory
file_path = 'gpt2/data/tinyshakespeare.txt'
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Step 2: split the text on newlines and keep only lines that are not blank
data = [line for line in text.strip().split('\n') if line.strip()]

# Step 3: wrap the lines in a HuggingFace Dataset with a single 'text' column
dataset = Dataset.from_dict({'text': data})

# Step 4: load the BERT tokenizer from the local "local_bert_base_uncased" directory/name
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("local_bert_base_uncased")

# Step 5: encoding function (do NOT rename input_ids!)
def preprocess_function(examples):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    Args:
        examples: a mapping with a ``'text'`` entry holding the strings to encode
            (this is what ``dataset.map(..., batched=True)`` passes in).

    Returns:
        Whatever ``tokenizer`` returns for those texts, truncated and padded to
        exactly 256 tokens per example.
    """
    encode_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 256,
    }
    return tokenizer(examples['text'], **encode_options)

# Step 6: tokenize the whole dataset in batches
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Step 7: hold out 20% of the lines as the test set.
# A fixed seed keeps the train/test partition identical across kernel restarts,
# so the loss curve and test accuracy reported below can be reproduced.
SPLIT_SEED = 42
split_dataset = tokenized_datasets.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

# Step 8: expose only the tokenizer outputs as PyTorch tensors (keep input_ids!)
model_columns = ["input_ids", "attention_mask", "token_type_ids"]
train_dataset.set_format(type="torch", columns=model_columns)
test_dataset.set_format(type="torch", columns=model_columns)

# Step 9: print one example from each split as a sanity check
print(f"Train Dataset Sample: {train_dataset[0]}")
print(f"Test Dataset Sample: {test_dataset[0]}")



Map:   0%|          | 0/32777 [00:00<?, ? examples/s]

Train Dataset Sample: {'input_ids': tensor([  101,  2026,  7267,  1029,  2030,  2026,  4113,  1029,  2011,  2115,
        14436,  1005,  2310, 11272,  1010,  1005,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator configured for masked language modelling with a 15% masking probability.
# The training/evaluation loops read a 'labels' entry from the batches it produces.
mlm_settings = {"mlm": True, "mlm_probability": 0.15}
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, **mlm_settings)


In [None]:
from torch.utils.data import DataLoader

batch_size = 64

# Both loaders share the batch size and the MLM collator; only the training loader shuffles.
loader_options = dict(batch_size=batch_size, collate_fn=data_collator)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)


In [None]:
import torch
import transformers
from transformers import BertModel, BertConfig, BertForMaskedLM

# Architecture hyperparameters for the small BERT trained from scratch
d_model = 512
nhead = 4
dim_feedforward = 1024
dropout = 0.1
num_layers = 4
vocab_size = tokenizer.vocab_size

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class BertMLMLogits(torch.nn.Module):
    """BERT with a masked-LM head whose forward pass returns the logits tensor.

    A bare ``BertModel`` has no language-modelling head and returns a ModelOutput
    object, so the training and evaluation cells (which call ``.view`` /
    ``torch.argmax`` on the model's return value and compare against token-id
    labels) cannot work with it. This wrapper builds ``BertForMaskedLM`` and
    unwraps ``.logits`` so ``model(...)`` directly yields per-token vocabulary scores.
    """

    def __init__(self, config):
        super().__init__()
        self.mlm = BertForMaskedLM(config)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return the MLM logits for ``input_ids`` (expected shape: batch, seq, vocab_size)."""
        outputs = self.mlm(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return outputs.logits


model = BertMLMLogits(
    config=BertConfig(
        hidden_size=d_model,
        num_attention_heads=nhead,
        intermediate_size=dim_feedforward,
        num_hidden_layers=num_layers,
        vocab_size=vocab_size,
        # must be >= the max_length used when tokenizing (256)
        max_position_embeddings=256,
        type_vocab_size=2,
        hidden_dropout_prob=dropout,
        attention_probs_dropout_prob=dropout
    )
).to(device)

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
import torch.nn as nn

# Hyperparameters: epoch count and the step budget derived from it
num_epochs = 10
total_steps = num_epochs * len(train_dataloader)
warmup_steps = int(0.1 * total_steps)  # warm up for the first 10% of steps

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-4)

# Loss function; positions labelled -100 are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=-100)

# Scheduler: linear warmup over `warmup_steps`, spanning `total_steps` in total
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=warmup_steps,
)


In [None]:
import gc

# Training loop: one pass over train_dataloader per epoch, reporting the mean batch loss
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}", leave=False)

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass
        logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Loss: flatten to (tokens, vocab_size) vs (tokens,); -100 labels are ignored by `criterion`
        loss = criterion(logits.view(-1, vocab_size), labels.view(-1))

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()
        scheduler.step()  # advance the learning-rate schedule once per batch

        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_dataloader)
    print(f"\nEpoch {epoch+1} Average Loss: {avg_loss:.4f}")

# Free memory. Run the Python garbage collector first so that dropped tensors are
# actually released before asking the CUDA allocator to return its cached blocks.
gc.collect()
# `device` may be the CPU; the CUDA cleanup calls are only valid when CUDA is available.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()


                                                                     


Epoch 1 Average Loss: 7.1503


                                                                     


Epoch 2 Average Loss: 6.0828


                                                                     


Epoch 3 Average Loss: 5.8736


                                                                     


Epoch 4 Average Loss: 5.7278


                                                                     


Epoch 5 Average Loss: 5.6159


                                                                     


Epoch 6 Average Loss: 5.4809


                                                                     


Epoch 7 Average Loss: 5.4362


                                                                     


Epoch 8 Average Loss: 5.3115


                                                                     


Epoch 9 Average Loss: 5.2245


                                                                      


Epoch 10 Average Loss: 5.1801


192

In [39]:
# Evaluate masked-token prediction accuracy on the held-out split
model.eval()
total = 0
correct = 0

# The collator masks tokens at random each time a batch is built; seeding the global
# torch RNG here is intended to make the reported accuracy repeatable between runs.
torch.manual_seed(0)

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Keyword arguments, matching the call in the training loop
        logits = model(input_ids=input_ids, attention_mask=attention_mask)  # expected: (batch, seq, vocab_size)
        predictions = torch.argmax(logits, dim=-1)

        # Only score positions whose label is not -100
        mask = labels != -100
        correct += (predictions[mask] == labels[mask]).sum().item()
        total += mask.sum().item()

# Free memory; the CUDA cleanup calls are only valid when CUDA is available.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# Avoid ZeroDivisionError when no position was scored (e.g. an empty test set)
accuracy = correct / total if total > 0 else float("nan")
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 19.64%
